In [63]:
#Libraries
import pandas as pd
import numpy as np
from etl.utils import read_sql_table
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm

In [None]:
df = read_sql_table("gold_cpw")

###Variable manipulation
# Convert HourDK to datetime if it's not already in datetime format
df['hour_utc'] = pd.to_datetime(df['hour_utc'])

# Extract the hour from the HourDK column
df['hour'] = df['hour_utc'].dt.hour

#Subsetting variables
df = df[['hour', 'consumption_kwh', 'spot_price_dkk', 'temp_mean_past1h', 'wind_speed_past1h',
    'humidity_past1h', 'precip_past1h']]
#Converting variables to float64
df['hour'] = df['hour'].astype('float64')
df['spot_price_dkk'] = df['spot_price_dkk'].astype('float64')

print(df.head().to_string(), "\n")
print(df.info())

In [None]:
##Checking for missing values
print(df.isna().any())
#No missing values

In [None]:
##Splitting dataset
X = df[['hour', 'spot_price_dkk', 'temp_mean_past1h', 'wind_speed_past1h', 'humidity_past1h', 'precip_past1h']]
y = df[['consumption_kwh']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
print(X_train)

In [None]:
####Linear Regression####
##Training and evaluating linear regression model without preprocessing
est = sm.OLS(y_train, sm.add_constant(X_train)) #model with constant
est_fit = est.fit()
print(est_fit.summary()) #All but hour and precip_past1h are significant
#In a zero-intercept model, all but precip_past1h and spot_price_dkk are significant


In [None]:
#Linear regression with scaling
X_train_scale = StandardScaler().fit_transform(X_train)
est = sm.OLS(y_train, sm.add_constant(X_train_scale))
est_fit = est.fit()
print(est_fit.summary())
#Conclude on variable importance based on coefficients...

In [70]:
###Prediction accuracy - linear regression
X_test_scale = StandardScaler().fit_transform(X_test)
y_pred_lm = est_fit.predict(sm.add_constant(X_test_scale))
mse_lm = mean_squared_error(y_test, y_pred_lm)
print(round(mse_lm)) #3,498,841,479

3498841479


In [64]:
###Random Forest
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train) #Train model
y_pred_rf = rf_model.predict(X_test) #Test
mse_rf = mean_squared_error(y_test, y_pred_rf)
print(round(mse_rf)) #1,581,182,860
#Performs significantly better than lm


  return fit_method(estimator, *args, **kwargs)


1581182860


In [74]:
###Support Vector Machines
svc_model = SVR()
svc_model.fit(X_train, y_train) #Train
y_pred_svc = svc_model.predict(X_test) #Test
mse_svc = mean_squared_error(y_test, y_pred_svc) #MSE
print(round(mse_svc)) #3,906,756,583

3906756583


  y = column_or_1d(y, warn=True)


In [77]:
#Comparing models
mse_data = {'Model': ['Linear Model', 'Random Forrest', 'Support Vector Machine'],
            'MSE': [mse_lm, mse_rf, mse_svc]
    }

mse_df = pd.DataFrame(mse_data) #Make dataframe of MSE data

print(mse_df)


                    Model           MSE
0            Linear Model  3.498841e+09
1          Random Forrest  1.581183e+09
2  Support Vector Machine  3.906757e+09
