In [109]:
#Libraries
import pandas as pd
import numpy as np
from etl.utils import read_sql_table
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import statsmodels.api as sm

In [None]:
df = read_sql_table("gold_cpw")

###Variable manipulation
# Convert HourDK to datetime if it's not already in datetime format
df['hour_utc'] = pd.to_datetime(df['hour_utc'])

# Extract the hour from the HourDK column
df['hour'] = df['hour_utc'].dt.hour

#Subsetting variables
df = df[['hour', 'consumption_kwh', 'spot_price_dkk', 'temp_mean_past1h', 'wind_speed_past1h',
    'humidity_past1h', 'precip_past1h']]
#Converting variables to float64
df['hour'] = df['hour'].astype('float64')
df['spot_price_dkk'] = df['spot_price_dkk'].astype('float64')

print(df.head().to_string(), "\n")
print(df.info())

In [None]:
##Checking for missing values
print(df.isna().any())
#No missing values

In [None]:
##Splitting dataset
X = df[['hour', 'spot_price_dkk', 'temp_mean_past1h', 'wind_speed_past1h', 'humidity_past1h', 'precip_past1h']]
y = df[['consumption_kwh']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
print(X_train)

In [None]:
####Linear Regression####
##Training and evaluating linear regression model without preprocessing
est = sm.OLS(y_train, sm.add_constant(X_train)) #model with constant
est_fit = est.fit()
print(est_fit.summary()) #All but hour and precip_past1h are significant
#In a zero-intercept model, all but precip_past1h and spot_price_dkk are significant


In [None]:
#Linear regression with scaling
X_train_scale = StandardScaler().fit_transform(X_train)
est = sm.OLS(y_train, sm.add_constant(X_train_scale))
est_fit = est.fit()
print(est_fit.summary())
#Conclude on variable importance based on coefficients...

In [100]:
###Prediction accuracy - linear regression
X_test_scale = StandardScaler().fit_transform(X_test)
y_pred_lm = est_fit.predict(sm.add_constant(X_test_scale))
mse_lm = mean_squared_error(y_test, y_pred_lm)
print(round(mse_lm)) #3,498,841,479

3866886527


In [101]:
###Random Forest
#Documentation https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
rf_model = RandomForestRegressor(n_estimators = 200, bootstrap=True) #200 trees
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test) #Test
mse_rf = mean_squared_error(y_test, y_pred_rf)
print(round(mse_rf)) #1,540,289,685
#Performs significantly better than lm



  return fit_method(estimator, *args, **kwargs)


1251774367


In [114]:
###Support Vector Machines
svm_model = SVR()
svm_model.fit(X_train, y_train) #Train
y_pred_svm = svm_model.predict(X_test) #Test
mse_svm = mean_squared_error(y_test, y_pred_svm) #MSE
print(round(mse_svm)) #3,906,756,583

4752477125


  y = column_or_1d(y, warn=True)


In [103]:
#Comparing models
mse_data = {'Model': ['Linear Model', 'Random Forrest', 'Support Vector Machine'],
            'MSE': [mse_lm, mse_rf, mse_svm]
    }

mse_df = pd.DataFrame(mse_data) #Make dataframe of MSE data

print(mse_df)


                    Model           MSE
0            Linear Model  3.866887e+09
1          Random Forrest  1.251774e+09
2  Support Vector Machine  4.752477e+09


Implementing cross validation on the three models

In [110]:
#Linear model CV
# Define a pipeline with StandardScaler and LinearRegression
pipeline = make_pipeline(StandardScaler(), LinearRegression())

# Perform cross-validation
mse_scores_lm = -cross_val_score(pipeline, X_train, y_train, cv=10, scoring='neg_mean_squared_error')

# Calculate mean and standard deviation of MSE scores
mean_mse_lm = np.mean(mse_scores_lm)
std_mse_lm = np.std(mse_scores_lm)

# Print mean squared error from cross-validation
print("Mean Squared Error (Cross-validation):", round(mean_mse_lm))
print("Standard Deviation of MSE (Cross-validation):", round(std_mse_lm))

Mean Squared Error (Cross-validation): 3371586478
Standard Deviation of MSE (Cross-validation): 694220397


In [None]:
#Random forest CV
rf_model = RandomForestRegressor(n_estimators=200, bootstrap=True)

#Cross-validation
mse_scores_rf = -cross_val_score(rf_model, X_train, y_train, cv=10, scoring='neg_mean_squared_error')

# Calculate mean and standard deviation of MSE scores
mean_mse_rf = np.mean(mse_scores_rf)
std_mse_rf = np.std(mse_scores_rf)

# Train model on full training data
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_rf = rf_model.predict(X_test)

# Calculate mse on the test set
mse_rf = mean_squared_error(y_test, y_pred_rf)

# Print mse from cv and on test set
print("Mean Squared Error (Cross-validation):", round(mean_mse_rf))
print("Standard Deviation of MSE (Cross-validation):", round(std_mse_rf)) #Large variation in mse scores.
#Suggests that the splits from the small dataset has large influence on performance.
print("Mean Squared Error (Test set):", round(mse_rf))

In [113]:
#Support Vector Machines CV
svm_model = SVR()

#CV
mse_scores_svm = -cross_val_score(svm_model, X_train, y_train, cv=10, scoring='neg_mean_squared_error')

# Calculate mean and standard deviation of MSE scores
mean_mse_svm = np.mean(mse_scores_svm)
std_mse_svm = np.std(mse_scores_svm)

print("Mean Squared Error (Cross-validation):", round(mean_mse_svm))
print("Standard Deviation of MSE (Cross-validation):", round(std_mse_svm))

Mean Squared Error (Cross-validation): 3753792972
Standard Deviation of MSE (Cross-validation): 915109063


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
