## Dataset information:
https://islp.readthedocs.io/en/latest/datasets/Carseats.html

## Load libraries:

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt

## Read the dataset

In [13]:
# Load the dataset
df = pd.read_csv('5110.csv')


# Check for missing values
print(df.isna().any())



# Convert categorical variables to dummies and drop specific columns
df.describe()
df.head




Unnamed: 0    False
id            False
dt            False
hh            False
kwh           False
dtype: bool


<bound method NDFrame.head of        Unnamed: 0    id                dt  hh   kwh
0               0  5110  15FEB08:13:00:00  26  0.75
1               1  5110  15FEB08:13:30:00  27  0.25
2               2  5110  15FEB08:14:00:00  28  0.30
3               3  5110  15FEB08:14:30:00  29  0.20
4               4  5110  15FEB08:15:00:00  30  0.25
...           ...   ...               ...  ..   ...
40177       40177  5110  29SEP10:20:30:00  41  0.25
40178       40178  5110  29SEP10:21:00:00  42  0.25
40179       40179  5110  29SEP10:21:30:00  43  0.30
40180       40180  5110  29SEP10:22:00:00  44  0.20
40181       40181  5110  29SEP10:22:30:00  45  0.25

[40182 rows x 5 columns]>

In [16]:

# Drop the 'Unnamed: 0' and 'hh' columns, and convert 'dt' to datetime
df['dt'] = pd.to_datetime(df['dt'], format='%d%b%y:%H:%M:%S')
df = df.drop(['Unnamed: 0', 'hh'], axis=1)

# Feature engineering from 'dt': creating year, month, day, hour features
df['year'] = df['dt'].dt.year
df['month'] = df['dt'].dt.month
df['day'] = df['dt'].dt.day
df['hour'] = df['dt'].dt.hour

# Define features (X) and target (y)
X = df.drop(['id', 'dt', 'kwh'], axis=1)  # Dropping 'id' as it seems to be an identifier
y = df['kwh']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train a Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predictions using the Random Forest model
y_pred_rf = rf_model.predict(X_test)

# Evaluation
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_test, y_pred_rf)

# Print out the metrics
print(f"Random Forest MSE: {mse_rf}")
print(f"Random Forest RMSE: {rmse_rf}")
print(f"Random Forest R^2: {r2_rf}")


Random Forest MSE: 0.04043675519314826
Random Forest RMSE: 0.2010889235963738
Random Forest R^2: 0.4192625000849639
