In [24]:


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
## Note: all the baseline models used here give poor-to-medium R^2 scores, which makes this a suitable dataset in the sense that it is not very predictable by default.

# Load the raw readings; the path is resolved relative to the notebook's
# working directory.
data = pd.read_csv("energydata_complete.csv")

# Optional quick-look diagnostics, kept disabled:
#print("Data head:\n", data.head())
#print("\nData info:\n", data.info())
#print("\nMissing values:\n", data.isnull().sum())


# ---- Pre-processing ----

# Regression models need numeric inputs, so the timestamp column is parsed once
# and reduced to two numeric calendar features.
timestamps = pd.to_datetime(data['date'])

# Append the derived features, then drop the raw timestamp together with
# rv1/rv2 (declared as random variables in the dataset description).
data = data.assign(
    hour=timestamps.dt.hour,
    day_of_week=timestamps.dt.dayofweek,
).drop(columns=['date', 'rv1', 'rv2'])


# 'Appliances' is the prediction target; every remaining column is a feature.
y = data['Appliances']
X = data.drop(columns=['Appliances'])

# Hold out 20% of the rows, with a fixed seed so the split is repeatable.
# NOTE(review): train_test_split shuffles by default, while the rows carry
# timestamps -- confirm a shuffled (non-chronological) split is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Fit each baseline regressor and report its RMSE and R^2 on the held-out test split.

# Seed for the stochastic tree ensembles so their reported scores are
# reproducible across kernel restarts.
MODEL_RANDOM_STATE = 42


def describe_baseline(r2_value):
    """Return the one-line verdict printed for a baseline R^2 score.

    Scores below 0.3 are reported as a low baseline, scores above 0.8 as a
    high baseline, and everything else (including NaN) as a middling one.
    """
    if r2_value < 0.3:
        return "low baseline, potential room for great improvement."
    if r2_value > 0.8:
        return "Note: High baseline, less room for improvement."
    return "Baseline performance is meh, decent room for improvement"


def evaluate_model(model, X_fit, y_fit, X_eval, y_eval, header):
    """Fit ``model``, score it on the evaluation split and print a short report.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
        Fitted in place on ``(X_fit, y_fit)``.
    X_fit, y_fit : training features and target.
    X_eval, y_eval : held-out features and target used for scoring.
    header : str
        Line printed verbatim before the metrics.

    Returns
    -------
    tuple
        ``(predictions, rmse, r2)`` for the evaluation split.
    """
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)

    # RMSE is the square root of the mean squared error, so it is expressed in
    # the same units as the target.
    rmse_value = np.sqrt(mean_squared_error(y_eval, predictions))
    r2_value = r2_score(y_eval, predictions)

    print(header)
    print(f"RMSE: {rmse_value:.2f} Wh")
    print(f"R^2: {r2_value:.2f}")
    print(describe_baseline(r2_value))

    return predictions, rmse_value, r2_value


# The linear models are trained on the standardized features.
baseline_model = LinearRegression()
y_pred, rmse, r2 = evaluate_model(
    baseline_model, X_train_scaled, y_train, X_test_scaled, y_test,
    "\n--- Linear Regression---",
)

ridge_model = Ridge()
y_pred_ridge, rmse_ridge, r2_ridge = evaluate_model(
    ridge_model, X_train_scaled, y_train, X_test_scaled, y_test,
    "\n--- Ridge Regression ---",
)

lasso_model = Lasso()
y_pred_lasso, rmse_lasso, r2_lasso = evaluate_model(
    lasso_model, X_train_scaled, y_train, X_test_scaled, y_test,
    "\n--- Lasso Regression ---",
)

# The tree ensembles are trained on the unscaled features and are seeded so
# their scores do not change from run to run.
rf_model = RandomForestRegressor(random_state=MODEL_RANDOM_STATE)
y_pred_rf, rmse_rf, r2_rf = evaluate_model(
    rf_model, X_train, y_train, X_test, y_test,
    "\n--- Random Forest Regressor ---",
)

gbr_model = GradientBoostingRegressor(random_state=MODEL_RANDOM_STATE)
y_pred_gbr, rmse_gbr, r2_gbr = evaluate_model(
    gbr_model, X_train, y_train, X_test, y_test,
    "\n--- Gradient Boosting Regressor ---",
)












--- Linear Regression---
RMSE: 91.13 Wh
R^2: 0.17
low baseline, potential room for great improvement.

--- Ridge Regression ---
RMSE: 91.13 Wh
R^2: 0.17
low baseline, potential room for great improvement.

--- Lasso Regression ---
RMSE: 91.96 Wh
R^2: 0.16
low baseline, potential room for great improvement.

--- Random Forest Regressor ---
RMSE: 65.72 Wh
R^2: 0.57
Baseline performance is meh, decent room for improvement

--- Gradient Boosting Regressor ---
RMSE: 83.28 Wh
R^2: 0.31
Baseline performance is meh, decent room for improvement
