# Importing Repositories

In [116]:
import os
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import holidays
warnings.filterwarnings("ignore")
sns.set_theme(style="darkgrid")
plt.rcParams['figure.figsize']=(20,10)

# Functions

In [117]:
def missing_zero_values_table(df): 
        zero_val = (df == 0.00).astype(int).sum(axis=0)
        mis_val = df.isnull().sum()
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        mz_table = pd.concat([zero_val, mis_val, mis_val_percent], axis=1)
        mz_table = mz_table.rename(
        columns = {0 : 'Zero Values', 1 : 'Missing Values', 2 : '% of Total Values'})
        mz_table['Total Zero Missing Values'] = mz_table['Zero Values'] + mz_table['Missing Values']
        mz_table['% Total Zero Missing Values'] = 100 * mz_table['Total Zero Missing Values'] / len(df)
        mz_table['Data Type'] = df.dtypes
        mz_table = mz_table[
            mz_table.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
        print ("Our selected dataframe has " + str(df.shape[1]) + " columns and " + str(df.shape[0]) + " Rows.\n"      
            "There are " + str(mz_table.shape[0]) +
              " columns that have missing values.")
    
        return mz_table

# Importing data files

## Paths for files

In [118]:
path_price = r'C:\Users\2836589S\OneDrive - University of Glasgow\Data\DayAheadPrices_12.1.D'
path_totalload = r'C:\Users\2836589S\OneDrive - University of Glasgow\Data\DayAheadTotalLoadForecast'
path_windsolar = r'C:\Users\2836589S\OneDrive - University of Glasgow\Data\DayAheadGenerationForecastForWindAndSolar_14.1.D'

## Price Timeseries Importing

In [119]:

df_price = pd.read_csv(r'C:\Users\2836589S\OneDrive - University of Glasgow\Data\Data Frames\price_timeseries_outliers.csv', 
parse_dates=['DateTime'])


In [120]:
df_price = df_price[['DateTime', 'Year', 'Month','Week', 'Day', 'Hour', 'Price', 'Outlier']]
df_price

Unnamed: 0,DateTime,Year,Month,Week,Day,Hour,Price,Outlier
0,2015-01-01 00:00:00,2015,1,1,3,0,0.00,N
1,2015-01-01 01:00:00,2015,1,1,3,1,0.00,N
2,2015-01-01 02:00:00,2015,1,1,3,2,0.00,N
3,2015-01-01 03:00:00,2015,1,1,3,3,0.00,N
4,2015-01-01 04:00:00,2015,1,1,3,4,0.00,N
...,...,...,...,...,...,...,...,...
52529,2020-12-31 18:00:00,2020,12,53,3,18,71.30,N
52530,2020-12-31 19:00:00,2020,12,53,3,19,61.04,N
52531,2020-12-31 20:00:00,2020,12,53,3,20,60.39,N
52532,2020-12-31 21:00:00,2020,12,53,3,21,57.96,N


## Wind and Solar Timeseries Importing

In [121]:
f_windsolar = glob.glob(path_windsolar + "/*.csv")
dwindsolar = [pd.read_csv(f, sep='\t', parse_dates=['DateTime']) for f in f_windsolar]

df_windsolar = pd.concat(dwindsolar)
df_windsolar = df_windsolar[df_windsolar['AreaCode'] == '10YGB----------A']
df_windsolar = df_windsolar[['DateTime', 'ProductionType', 'AggregatedGenerationForecast']]
df_windsolar.sort_values(by='DateTime', ascending=True)
df_windsolar.reset_index(drop=True, inplace=True)

In [122]:
df_windsolar

Unnamed: 0,DateTime,ProductionType,AggregatedGenerationForecast
0,2014-12-30 00:00:00,Solar,0.00
1,2014-12-30 01:00:00,Solar,0.00
2,2014-12-30 02:00:00,Solar,0.00
3,2014-12-30 03:00:00,Solar,0.00
4,2014-12-30 04:00:00,Solar,0.00
...,...,...,...
509107,2021-06-14 19:00:00,Wind Offshore,1563.23
509108,2021-06-14 20:00:00,Wind Offshore,1590.47
509109,2021-06-14 21:00:00,Wind Offshore,1574.14
509110,2021-06-14 22:00:00,Wind Offshore,1450.72


### Splitting the WindSolar Dataset into 3 Datasets per Solar, Wind Offshore, Wind Onshore

In [123]:
df_windsolar['ProductionType'].unique()

array(['Solar', 'Wind Offshore', 'Wind Onshore'], dtype=object)

In [124]:
df_solar = df_windsolar[df_windsolar['ProductionType'] == 'Solar']
df_solar.sort_values(by='DateTime', ascending=True)
df_solar.reset_index(drop=True, inplace=True)
df_solar.rename(columns={'AggregatedGenerationForecast':'SolarGeneration'}, inplace=True)
df_solar = df_solar[['DateTime','SolarGeneration']]
df_solar

Unnamed: 0,DateTime,SolarGeneration
0,2014-12-30 00:00:00,0.0
1,2014-12-30 01:00:00,0.0
2,2014-12-30 02:00:00,0.0
3,2014-12-30 03:00:00,0.0
4,2014-12-30 04:00:00,0.0
...,...,...
169699,2021-06-14 19:00:00,227.0
169700,2021-06-14 20:00:00,11.5
169701,2021-06-14 21:00:00,0.0
169702,2021-06-14 22:00:00,0.0


In [125]:
# Taking of the outliers
q1 , q3 = np.percentile(df_solar['SolarGeneration'], [25,75])
iqr = q3 - q1
ul = q3 + 1.5*iqr
ll = q1 - 1.5*iqr
df_solar = df_solar[(df_solar['SolarGeneration'] > ll) | (df_solar['SolarGeneration'] < ul) ]

In [126]:
df_wind_off = df_windsolar[df_windsolar['ProductionType'] == 'Wind Offshore']
df_wind_off.sort_values(by='DateTime', ascending=True)
df_wind_off.reset_index(drop=True, inplace=True)
df_wind_off.rename(columns={'AggregatedGenerationForecast':'WindOffGeneration'}, inplace=True)
df_wind_off = df_wind_off[['DateTime', 'WindOffGeneration']]
df_wind_off

Unnamed: 0,DateTime,WindOffGeneration
0,2014-12-30 00:00:00,996.58
1,2014-12-30 01:00:00,1059.20
2,2014-12-30 02:00:00,1123.27
3,2014-12-30 03:00:00,1177.54
4,2014-12-30 04:00:00,1188.21
...,...,...
169699,2021-06-14 19:00:00,1563.23
169700,2021-06-14 20:00:00,1590.47
169701,2021-06-14 21:00:00,1574.14
169702,2021-06-14 22:00:00,1450.72


In [127]:
# Taking care of the outliers
q1 , q3 = np.percentile(df_wind_off['WindOffGeneration'], [25,75])
iqr = q3 - q1
ul = q3 + 1.5*iqr
ll = q1 - 1.5*iqr
df_wind_off = df_wind_off[(df_wind_off['WindOffGeneration'] > ll) | (df_wind_off['WindOffGeneration']) ]

In [128]:
df_wind_on = df_windsolar[df_windsolar['ProductionType'] == 'Wind Onshore']
df_wind_on.sort_values(by='DateTime', ascending=True)
df_wind_on.reset_index(drop=True, inplace=True)
df_wind_on.rename(columns={'AggregatedGenerationForecast':'WindOnGeneration'}, inplace=True)
df_wind_on = df_wind_on[['DateTime','WindOnGeneration']]
df_wind_on

Unnamed: 0,DateTime,WindOnGeneration
0,2014-12-30 00:00:00,2363.80
1,2014-12-30 01:00:00,2413.88
2,2014-12-30 02:00:00,2450.75
3,2014-12-30 03:00:00,2500.78
4,2014-12-30 04:00:00,2536.07
...,...,...
169699,2021-06-14 19:00:00,4896.71
169700,2021-06-14 20:00:00,3928.19
169701,2021-06-14 21:00:00,3277.03
169702,2021-06-14 22:00:00,2880.91


In [129]:
# Taking care of the outliers
q1 , q3 = np.percentile(df_wind_on['WindOnGeneration'], [25,75])
iqr = q3 - q1
ul = q3 + 1.5*iqr
ll = q1 - 1.5*iqr
df_wind_on = df_wind_on[(df_wind_on['WindOnGeneration'] > ll) | (df_wind_on['WindOnGeneration']) ]

## Total Load Timeseries Importing

In [130]:
f_totalload = glob.glob(path_totalload + "/*.csv")
dtotalload = [pd.read_csv(f, encoding='utf-16', sep='\t', parse_dates=['DateTime']) for f in f_totalload]

df_totalload = pd.concat(dtotalload)
df_totalload = df_totalload[df_totalload['AreaCode'] == '10YGB----------A']
df_totalload = df_totalload[['DateTime', 'TotalLoadValue']]
df_totalload.reset_index(drop=True, inplace=True)


In [131]:
df_totalload.sort_values(by=['DateTime'], ascending=True)
df_totalload.reset_index(drop=True, inplace=True)
df_totalload

Unnamed: 0,DateTime,TotalLoadValue
0,2014-12-29 00:00:00,28798.0
1,2014-12-29 02:00:00,29534.0
2,2014-12-29 00:30:00,29961.0
3,2014-12-29 02:30:00,29627.0
4,2014-12-29 04:30:00,26717.0
...,...,...
202693,2020-09-30 14:30:00,34726.0
202694,2020-09-30 16:30:00,36430.0
202695,2020-09-30 18:30:00,36366.0
202696,2020-09-30 20:30:00,29449.0


# Merging Datasets

In [132]:
df_price = df_price[df_price['Outlier'] == 'N']
df_merged = pd.merge(df_price, df_solar, how='left', on='DateTime')
df_merged = pd.merge(df_merged, df_wind_off, how='left', on='DateTime')
df_merged = pd.merge(df_merged, df_wind_on, how='left', on='DateTime')

In [133]:
df_merged.drop_duplicates(subset=['DateTime'], keep='first', inplace=True)

In [134]:
df_model = pd.DataFrame()
df_model = df_merged[['Price','Year', 'Month', 'Day', 'Hour', 'SolarGeneration', 'WindOffGeneration',
'WindOnGeneration']]
df_model.set_index(df_merged['DateTime'], inplace=True)
df_model

Unnamed: 0_level_0,Price,Year,Month,Day,Hour,SolarGeneration,WindOffGeneration,WindOnGeneration
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015-01-01 00:00:00,0.00,2015,1,3,0,0.0,2987.05,4013.58
2015-01-01 01:00:00,0.00,2015,1,3,1,0.0,2926.71,4074.53
2015-01-01 02:00:00,0.00,2015,1,3,2,0.0,2867.00,4157.48
2015-01-01 03:00:00,0.00,2015,1,3,3,0.0,2808.72,4229.50
2015-01-01 04:00:00,0.00,2015,1,3,4,0.0,2769.59,4133.56
...,...,...,...,...,...,...,...,...
2020-12-31 18:00:00,71.30,2020,12,3,18,0.0,2644.97,3028.41
2020-12-31 19:00:00,61.04,2020,12,3,19,0.0,2707.55,3046.72
2020-12-31 20:00:00,60.39,2020,12,3,20,0.0,2695.01,3053.60
2020-12-31 21:00:00,57.96,2020,12,3,21,0.0,2829.57,3086.61


## Transforming Solar, Wind Onshore and Wind Offshore Generation values

In [135]:
df_model['SolarGeneration'] = df_model['SolarGeneration'].transform('sqrt')
df_model['WindOffGeneration'] = df_model['WindOffGeneration'].transform('sqrt')
df_model['WindOnGeneration'] = df_model['WindOnGeneration'].transform('sqrt')

# Lasso Regression

In [136]:
lags = list(range(1,667))

In [137]:
def lag_col(df, variable, lags):
    for lag in lags:
        df[str(variable) + '_' + str(lag)] = df[variable].shift(lag)
    return df

In [138]:
lag_col(df_model, 'SolarGeneration', lags)
lag_col(df_model, 'WindOffGeneration', lags)
lag_col(df_model, 'WindOnGeneration', lags)

Unnamed: 0_level_0,Price,Year,Month,Day,Hour,SolarGeneration,WindOffGeneration,WindOnGeneration,SolarGeneration_1,SolarGeneration_2,...,WindOnGeneration_657,WindOnGeneration_658,WindOnGeneration_659,WindOnGeneration_660,WindOnGeneration_661,WindOnGeneration_662,WindOnGeneration_663,WindOnGeneration_664,WindOnGeneration_665,WindOnGeneration_666
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-01 00:00:00,0.00,2015,1,3,0,0.0,54.653911,63.352822,,,...,,,,,,,,,,
2015-01-01 01:00:00,0.00,2015,1,3,1,0.0,54.099076,63.832045,0.0,,...,,,,,,,,,,
2015-01-01 02:00:00,0.00,2015,1,3,2,0.0,53.544374,64.478524,0.0,0.000000,...,,,,,,,,,,
2015-01-01 03:00:00,0.00,2015,1,3,3,0.0,52.997358,65.034606,0.0,0.000000,...,,,,,,,,,,
2015-01-01 04:00:00,0.00,2015,1,3,4,0.0,52.626894,64.292768,0.0,0.000000,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-12-31 18:00:00,71.30,2020,12,3,18,0.0,51.429272,55.030991,0.0,18.493242,...,62.768543,61.548924,60.452626,59.124191,57.716462,55.615106,53.171609,51.387547,50.427473,49.425499
2020-12-31 19:00:00,61.04,2020,12,3,19,0.0,52.034123,55.197101,0.0,0.000000,...,64.089235,62.768543,61.548924,60.452626,59.124191,57.716462,55.615106,53.171609,51.387547,50.427473
2020-12-31 20:00:00,60.39,2020,12,3,20,0.0,51.913486,55.259388,0.0,0.000000,...,66.144236,64.089235,62.768543,61.548924,60.452626,59.124191,57.716462,55.615106,53.171609,51.387547
2020-12-31 21:00:00,57.96,2020,12,3,21,0.0,53.193703,55.557268,0.0,0.000000,...,67.855508,66.144236,64.089235,62.768543,61.548924,60.452626,59.124191,57.716462,55.615106,53.171609


In [139]:
df_model.drop(columns=['Year','Month','Day','Hour'], inplace=True)

In [140]:
df_model.dropna(inplace=True)

In [141]:
df_model

Unnamed: 0_level_0,Price,SolarGeneration,WindOffGeneration,WindOnGeneration,SolarGeneration_1,SolarGeneration_2,SolarGeneration_3,SolarGeneration_4,SolarGeneration_5,SolarGeneration_6,...,WindOnGeneration_657,WindOnGeneration_658,WindOnGeneration_659,WindOnGeneration_660,WindOnGeneration_661,WindOnGeneration_662,WindOnGeneration_663,WindOnGeneration_664,WindOnGeneration_665,WindOnGeneration_666
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-30 22:00:00,34.75,0.0,52.314434,61.839389,0.0,0.000000,0.000000,0.000000,20.396078,26.739484,...,65.442188,64.310108,63.262153,62.639764,63.287361,64.292768,65.034606,64.478524,63.832045,63.352822
2015-01-30 23:00:00,37.04,0.0,51.604263,62.640961,0.0,0.000000,0.000000,0.000000,0.000000,20.396078,...,68.319617,65.442188,64.310108,63.262153,62.639764,63.287361,64.292768,65.034606,64.478524,63.832045
2015-01-31 00:00:00,36.86,0.0,44.168314,71.781195,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,69.889699,68.319617,65.442188,64.310108,63.262153,62.639764,63.287361,64.292768,65.034606,64.478524
2015-01-31 01:00:00,33.01,0.0,44.359666,73.086456,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,70.940398,69.889699,68.319617,65.442188,64.310108,63.262153,62.639764,63.287361,64.292768,65.034606
2015-01-31 02:00:00,31.46,0.0,44.520332,73.516801,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,71.690306,70.940398,69.889699,68.319617,65.442188,64.310108,63.262153,62.639764,63.287361,64.292768
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-12-31 18:00:00,71.30,0.0,51.429272,55.030991,0.0,18.493242,36.837481,46.260134,50.368641,50.635956,...,62.768543,61.548924,60.452626,59.124191,57.716462,55.615106,53.171609,51.387547,50.427473,49.425499
2020-12-31 19:00:00,61.04,0.0,52.034123,55.197101,0.0,0.000000,18.493242,36.837481,46.260134,50.368641,...,64.089235,62.768543,61.548924,60.452626,59.124191,57.716462,55.615106,53.171609,51.387547,50.427473
2020-12-31 20:00:00,60.39,0.0,51.913486,55.259388,0.0,0.000000,0.000000,18.493242,36.837481,46.260134,...,66.144236,64.089235,62.768543,61.548924,60.452626,59.124191,57.716462,55.615106,53.171609,51.387547
2020-12-31 21:00:00,57.96,0.0,53.193703,55.557268,0.0,0.000000,0.000000,0.000000,18.493242,36.837481,...,67.855508,66.144236,64.089235,62.768543,61.548924,60.452626,59.124191,57.716462,55.615106,53.171609


## Scikit Learn approach with LassoCV

In [142]:
X = df_model.drop(columns='Price')
y = df_model.Price

In [143]:
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=5)

In [144]:
# Taken from Sklearn.model_selection documentation

for train, test in tscv.split(X):
    print("%s %s" % (train, test))

[   0    1    2 ... 8106 8107 8108] [ 8109  8110  8111 ... 16211 16212 16213]
[    0     1     2 ... 16211 16212 16213] [16214 16215 16216 ... 24316 24317 24318]
[    0     1     2 ... 24316 24317 24318] [24319 24320 24321 ... 32421 32422 32423]
[    0     1     2 ... 32421 32422 32423] [32424 32425 32426 ... 40526 40527 40528]
[    0     1     2 ... 40526 40527 40528] [40529 40530 40531 ... 48631 48632 48633]


In [145]:
from sklearn.linear_model import LassoCV

reg = LassoCV(cv=tscv, random_state=0).fit(X,y)

In [146]:
# R squared

reg.score(X,y)

0.3008032858110984

In [147]:
# Alpha value of our regressor

reg.alpha_

2.3626045371945605

In [148]:
# Matrix of alpha values used during the fitting


reg.alphas_

array([102.27151043,  95.37875292,  88.95054419,  82.95557522,
        77.36464709,  72.15052881,  67.28782465,  62.75285047,
        58.52351838,  54.57922911,  50.90077174,  47.47023008,
        44.27089545,  41.28718527,  38.50456716,  35.90948821,
        33.48930889,  31.23224155,  29.127293  ,  27.16421094,
        25.33343404,  23.62604537,  22.03372899,  20.5487294 ,
        19.16381381,  17.8722369 ,  16.66770795,  15.54436022,
        14.49672236,  13.5196918 ,  12.60850982,  11.75873845,
        10.96623884,  10.22715104,   9.53787529,   8.89505442,
         8.29555752,   7.73646471,   7.21505288,   6.72878246,
         6.27528505,   5.85235184,   5.45792291,   5.09007717,
         4.74702301,   4.42708955,   4.12871853,   3.85045672,
         3.59094882,   3.34893089,   3.12322416,   2.9127293 ,
         2.71642109,   2.5333434 ,   2.36260454,   2.2033729 ,
         2.05487294,   1.91638138,   1.78722369,   1.66677079,
         1.55443602,   1.44967224,   1.35196918,   1.26

In [149]:
predictors = X.columns.to_list()

coef_table = pd.DataFrame(list(predictors)).copy()
coef_table.insert(len(coef_table.columns), "Coefficients", reg.coef_.transpose())
coef_table.sort_values(by='Coefficients', ascending=False).reset_index(drop=True).head(20)

Unnamed: 0,0,Coefficients
0,SolarGeneration_7,0.057168
1,WindOnGeneration_666,0.046112
2,WindOnGeneration_17,0.033175
3,SolarGeneration_8,0.031703
4,WindOnGeneration_53,0.026094
5,SolarGeneration_649,0.021393
6,SolarGeneration_18,0.020215
7,WindOnGeneration_468,0.018002
8,WindOnGeneration_647,0.017115
9,WindOnGeneration_283,0.016908


## Pycaret

In [150]:
from pycaret.regression import *

In [151]:
s = setup(data=df_model, target='Price', fold_strategy=tscv, fold=5)


Unnamed: 0,Description,Value
0,Session id,1820
1,Target,Price
2,Target type,Regression
3,Data shape,"(48634, 2002)"
4,Train data shape,"(34043, 2002)"
5,Test data shape,"(14591, 2002)"
6,Numeric features,2001
7,Preprocess,True
8,Imputation type,simple
9,Numeric imputation,mean


In [152]:
lasso = create_model('lasso')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,8.4512,121.0422,11.0019,0.2853,0.3178,2.0159
1,8.5452,124.3796,11.1526,0.2749,0.3179,1.6857
2,8.6002,122.9505,11.0883,0.3073,0.3073,1.4194
3,8.4429,121.1816,11.0083,0.2985,0.312,0.8224
4,8.4657,121.5607,11.0255,0.302,0.2921,0.9124
Mean,8.5011,122.2229,11.0553,0.2936,0.3094,1.3712
Std,0.0615,1.2726,0.0575,0.0118,0.0095,0.4535


In [153]:
evaluate_model(lasso)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…