In [91]:
#Libraries
import pandas as pd
import numpy as np
from etl.utils import read_sql_table
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import statsmodels.api as sm

In [154]:
df = read_sql_table("gold_cpw")

###Variable manipulation
# Convert HourDK to datetime if it's not already in datetime format
df['hour_utc'] = pd.to_datetime(df['hour_utc'])

# Extract the hour from the HourDK column
df['hour'] = df['hour_utc'].dt.hour

#Subsetting variables
df = df[['hour', 'consumption_kwh', 'spot_price_dkk', 'temp_mean_past1h', 'wind_speed_past1h',
    'humidity_past1h', 'precip_past1h']]

#Converting spot price to float64
df['spot_price_dkk'] = df['spot_price_dkk'].astype('float64')

#Dummy encoding 'hour'
hour_dummies = pd.get_dummies(df['hour'], prefix='hour').astype('float64')

#Concatenate this with original dataframe
df = pd.concat([df, hour_dummies], axis=1)

#Drop hour
df = df.drop('hour', axis=1)

#print(df.head().to_string(), "\n")
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 647 entries, 0 to 646
Data columns (total 30 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   consumption_kwh    647 non-null    float64
 1   spot_price_dkk     647 non-null    float64
 2   temp_mean_past1h   647 non-null    float64
 3   wind_speed_past1h  647 non-null    float64
 4   humidity_past1h    647 non-null    float64
 5   precip_past1h      647 non-null    float64
 6   hour_0             647 non-null    float64
 7   hour_1             647 non-null    float64
 8   hour_2             647 non-null    float64
 9   hour_3             647 non-null    float64
 10  hour_4             647 non-null    float64
 11  hour_5             647 non-null    float64
 12  hour_6             647 non-null    float64
 13  hour_7             647 non-null    float64
 14  hour_8             647 non-null    float64
 15  hour_9             647 non-null    float64
 16  hour_10            647 non

In [155]:
##Checking for missing values
print(df.isna().any())
#No missing values

consumption_kwh      False
spot_price_dkk       False
temp_mean_past1h     False
wind_speed_past1h    False
humidity_past1h      False
precip_past1h        False
hour_0               False
hour_1               False
hour_2               False
hour_3               False
hour_4               False
hour_5               False
hour_6               False
hour_7               False
hour_8               False
hour_9               False
hour_10              False
hour_11              False
hour_12              False
hour_13              False
hour_14              False
hour_15              False
hour_16              False
hour_17              False
hour_18              False
hour_19              False
hour_20              False
hour_21              False
hour_22              False
hour_23              False
dtype: bool


In [169]:
##Splitting dataset
X = df.drop('consumption_kwh', axis=1)
y = df[['consumption_kwh']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
#print(y_train)

     consumption_kwh
0         143408.914
1         136154.493
2         130648.111
3         126554.756
4         131530.892
..               ...
642       187155.927
643       167505.560
644       157276.047
645       151904.994
646       134374.880

[647 rows x 1 columns]


In [167]:
####Linear Regression####
##Training and evaluating linear regression model without preprocessing
est = sm.OLS(y_train, sm.add_constant(X_train)) #model with constant
est_fit = est.fit()
print(est_fit.summary()) #All but hour and precip_past1h are significant
#In a zero-intercept model, all but precip_past1h and spot_price_dkk are significant


                            OLS Regression Results                            
Dep. Variable:        consumption_kwh   R-squared:                       0.410
Model:                            OLS   Adj. R-squared:                  0.376
Method:                 Least Squares   F-statistic:                     12.09
Date:                Wed, 15 May 2024   Prob (F-statistic):           3.56e-40
Time:                        17:29:30   Log-Likelihood:                -6306.6
No. Observations:                 517   AIC:                         1.267e+04
Df Residuals:                     488   BIC:                         1.279e+04
Df Model:                          28                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const              1.966e+05   2.57e+0

In [158]:
#Linear regression with scaling
X_train_scale = StandardScaler().fit_transform(X_train)
est = sm.OLS(y_train, sm.add_constant(X_train_scale))
est_fit = est.fit()
print(est_fit.summary())

                            OLS Regression Results                            
Dep. Variable:        consumption_kwh   R-squared:                       0.410
Model:                            OLS   Adj. R-squared:                  0.376
Method:                 Least Squares   F-statistic:                     12.09
Date:                Wed, 15 May 2024   Prob (F-statistic):           3.56e-40
Time:                        17:22:32   Log-Likelihood:                -6306.6
No. Observations:                 517   AIC:                         1.267e+04
Df Residuals:                     488   BIC:                         1.279e+04
Df Model:                          28                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       2.006e+05   2174.081     92.264      0.0

In [159]:
###Prediction accuracy - linear regression
X_test_scale = StandardScaler().fit_transform(X_test)
y_pred_lm = est_fit.predict(sm.add_constant(X_test_scale))
rmse_lm = root_mean_squared_error(y_test, y_pred_lm)
print(round(rmse_lm))

52962


In [160]:
###Random Forest
#Documentation: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
rf_model = RandomForestRegressor(n_estimators = 200, bootstrap=True) #200 trees
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test) #Test
rmse_rf = root_mean_squared_error(y_test, y_pred_rf)
print(round(rmse_rf))

  return fit_method(estimator, *args, **kwargs)


48017


In [161]:
###Support Vector Machines
#Documentation: https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html
svm_model = SVR(kernel='linear', C=1000)
svm_model.fit(X_train, y_train) #Train
y_pred_svm = svm_model.predict(X_test) #Test
rmse_svm = root_mean_squared_error(y_test, y_pred_svm) #RMSE
print(round(rmse_svm))

  y = column_or_1d(y, warn=True)


59363


In [162]:
#Comparing models
rmse_values = [rmse_lm, rmse_rf, rmse_svm]
rmse_data = {'Model': ['Linear Model', 'Random Forrest', 'Support Vector Machine'],
            'RMSE': [round(value) for value in rmse_values]
    }

rmse_df = pd.DataFrame(rmse_data) #Make dataframe of RMSE data

print(round(rmse_df))

                    Model   RMSE
0            Linear Model  52962
1          Random Forrest  48017
2  Support Vector Machine  59363


Implementing cross validation on the three models

In [163]:
#Linear model CV
# Define a pipeline with StandardScaler and LinearRegression
pipeline = make_pipeline(StandardScaler(), LinearRegression())

# Perform cross-validation
rmse_scores_lm = -cross_val_score(pipeline, X_train, y_train, cv=10, scoring='neg_root_mean_squared_error')

# Calculate mean and standard deviation of RMSE scores
mean_rmse_lm = np.mean(rmse_scores_lm)
std_rmse_lm = np.std(rmse_scores_lm)

# Print mean squared error from cross-validation
print("Root Mean Squared Error (Cross-validation):", round(mean_rmse_lm))
print("Standard Deviation of RMSE (Cross-validation):", round(std_rmse_lm))

Root Mean Squared Error (Cross-validation): 50861
Standard Deviation of RMSE (Cross-validation): 9730


In [164]:
#Random forest CV
rf_model = RandomForestRegressor(n_estimators=200, bootstrap=True)

#Cross-validation
rmse_scores_rf = -cross_val_score(rf_model, X, y, cv=10, scoring='neg_root_mean_squared_error')

# Calculate mean and standard deviation of RMSE scores
mean_rmse_rf = np.mean(rmse_scores_rf)
std_rmse_rf = np.std(rmse_scores_rf)

# Train model on full training data
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_rf = rf_model.predict(X_test)

# Calculate rmse on the test set
rmse_rf = root_mean_squared_error(y_test, y_pred_rf)

# Print rmse from cv and on test set
print("Root Mean Squared Error (Cross-validation):", round(mean_rmse_rf))
print("Standard Deviation of RMSE (Cross-validation):", round(std_rmse_rf)) #Large variation in rmse scores.
#Suggests that the splits from the small dataset has large influence on performance.
print("Root Mean Squared Error (Test set):", round(rmse_rf))

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


Root Mean Squared Error (Cross-validation): 52270
Standard Deviation of RMSE (Cross-validation): 20524
Root Mean Squared Error (Test set): 48321


In [165]:
#Support Vector Machines CV
svm_model = SVR(C=1000)

#CV
rmse_scores_svm = -cross_val_score(svm_model, X, y, cv=10, scoring='neg_root_mean_squared_error')

# Calculate mean and standard deviation of RMSE scores
mean_rmse_svm = np.mean(rmse_scores_svm)
std_rmse_svm = np.std(rmse_scores_svm)

print("Root Mean Squared Error (Cross-validation):", round(mean_rmse_svm))
print("Standard Deviation of RMSE (Cross-validation):", round(std_rmse_svm))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Root Mean Squared Error (Cross-validation): 58424
Standard Deviation of RMSE (Cross-validation): 26623


In [166]:
#Comparing CV models
mean_rmse_values = [mean_rmse_lm, mean_rmse_rf, mean_rmse_svm]
mean_std_values = [std_rmse_lm, std_rmse_rf, std_rmse_svm]
rmse_data = {'CV Model': ['Linear Model', 'Random Forrest', 'Support Vector Machine'],
            'Mean RMSE': [round(value) for value in mean_rmse_values],
            'Std': [round(value) for value in mean_std_values]
        }

rmse_df = pd.DataFrame(rmse_data) #Make dataframe of RMSE and std data

print(rmse_df)

                 CV Model  Mean RMSE    Std
0            Linear Model      50861   9730
1          Random Forrest      52270  20524
2  Support Vector Machine      58424  26623
