# Importing Repositories

In [121]:
import os
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import holidays
warnings.filterwarnings("ignore")
sns.set_theme(style="darkgrid")
plt.rcParams['figure.figsize']=(20,10)

# Functions

In [122]:
def missing_zero_values_table(df): 
        zero_val = (df == 0.00).astype(int).sum(axis=0)
        mis_val = df.isnull().sum()
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        mz_table = pd.concat([zero_val, mis_val, mis_val_percent], axis=1)
        mz_table = mz_table.rename(
        columns = {0 : 'Zero Values', 1 : 'Missing Values', 2 : '% of Total Values'})
        mz_table['Total Zero Missing Values'] = mz_table['Zero Values'] + mz_table['Missing Values']
        mz_table['% Total Zero Missing Values'] = 100 * mz_table['Total Zero Missing Values'] / len(df)
        mz_table['Data Type'] = df.dtypes
        mz_table = mz_table[
            mz_table.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
        print ("Our selected dataframe has " + str(df.shape[1]) + " columns and " + str(df.shape[0]) + " Rows.\n"      
            "There are " + str(mz_table.shape[0]) +
              " columns that have missing values.")
    
        return mz_table

# Importing data files

## Paths for files

In [123]:
path_price = r'C:\Users\2836589S\OneDrive - University of Glasgow\Data\DayAheadPrices_12.1.D'
path_totalload = r'C:\Users\2836589S\OneDrive - University of Glasgow\Data\DayAheadTotalLoadForecast'
path_windsolar = r'C:\Users\2836589S\OneDrive - University of Glasgow\Data\DayAheadGenerationForecastForWindAndSolar_14.1.D'

## Price Timeseries Importing

In [124]:

df_price = pd.read_csv(r'C:\Users\2836589S\OneDrive - University of Glasgow\Data\Data Frames\price_timeseries_outliers.csv', 
parse_dates=['DateTime'])


In [125]:
df_price = df_price[['DateTime', 'Year', 'Month','Week', 'Day', 'Hour', 'Price', 'Outlier']]
df_price

Unnamed: 0,DateTime,Year,Month,Week,Day,Hour,Price,Outlier
0,2015-01-01 00:00:00,2015,1,1,3,0,0.00,N
1,2015-01-01 01:00:00,2015,1,1,3,1,0.00,N
2,2015-01-01 02:00:00,2015,1,1,3,2,0.00,N
3,2015-01-01 03:00:00,2015,1,1,3,3,0.00,N
4,2015-01-01 04:00:00,2015,1,1,3,4,0.00,N
...,...,...,...,...,...,...,...,...
52529,2020-12-31 18:00:00,2020,12,53,3,18,71.30,N
52530,2020-12-31 19:00:00,2020,12,53,3,19,61.04,N
52531,2020-12-31 20:00:00,2020,12,53,3,20,60.39,N
52532,2020-12-31 21:00:00,2020,12,53,3,21,57.96,N


## Wind and Solar Timeseries Importing

In [126]:
f_windsolar = glob.glob(path_windsolar + "/*.csv")
dwindsolar = [pd.read_csv(f, sep='\t', parse_dates=['DateTime']) for f in f_windsolar]

df_windsolar = pd.concat(dwindsolar)
df_windsolar = df_windsolar[df_windsolar['AreaCode'] == '10YGB----------A']
df_windsolar = df_windsolar[['DateTime', 'ProductionType', 'AggregatedGenerationForecast']]
df_windsolar.sort_values(by='DateTime', ascending=True)
df_windsolar.reset_index(drop=True, inplace=True)

In [127]:
df_windsolar

Unnamed: 0,DateTime,ProductionType,AggregatedGenerationForecast
0,2014-12-30 00:00:00,Solar,0.00
1,2014-12-30 01:00:00,Solar,0.00
2,2014-12-30 02:00:00,Solar,0.00
3,2014-12-30 03:00:00,Solar,0.00
4,2014-12-30 04:00:00,Solar,0.00
...,...,...,...
509107,2021-06-14 19:00:00,Wind Offshore,1563.23
509108,2021-06-14 20:00:00,Wind Offshore,1590.47
509109,2021-06-14 21:00:00,Wind Offshore,1574.14
509110,2021-06-14 22:00:00,Wind Offshore,1450.72


### Splitting the WindSolar Dataset into 3 Datasets per Solar, Wind Offshore, Wind Onshore

In [128]:
df_windsolar['ProductionType'].unique()

array(['Solar', 'Wind Offshore', 'Wind Onshore'], dtype=object)

In [129]:
df_solar = df_windsolar[df_windsolar['ProductionType'] == 'Solar']
df_solar.sort_values(by='DateTime', ascending=True)
df_solar.reset_index(drop=True, inplace=True)
df_solar.rename(columns={'AggregatedGenerationForecast':'SolarGeneration'}, inplace=True)
df_solar = df_solar[['DateTime','SolarGeneration']]
df_solar

Unnamed: 0,DateTime,SolarGeneration
0,2014-12-30 00:00:00,0.0
1,2014-12-30 01:00:00,0.0
2,2014-12-30 02:00:00,0.0
3,2014-12-30 03:00:00,0.0
4,2014-12-30 04:00:00,0.0
...,...,...
169699,2021-06-14 19:00:00,227.0
169700,2021-06-14 20:00:00,11.5
169701,2021-06-14 21:00:00,0.0
169702,2021-06-14 22:00:00,0.0


In [130]:
# Taking of the outliers
q1 , q3 = np.percentile(df_solar['SolarGeneration'], [25,75])
iqr = q3 - q1
ul = q3 + 1.5*iqr
ll = q1 - 1.5*iqr
df_solar = df_solar[(df_solar['SolarGeneration'] > ll) | (df_solar['SolarGeneration'] < ul) ]

In [131]:
df_wind_off = df_windsolar[df_windsolar['ProductionType'] == 'Wind Offshore']
df_wind_off.sort_values(by='DateTime', ascending=True)
df_wind_off.reset_index(drop=True, inplace=True)
df_wind_off.rename(columns={'AggregatedGenerationForecast':'WindOffGeneration'}, inplace=True)
df_wind_off = df_wind_off[['DateTime', 'WindOffGeneration']]
df_wind_off

Unnamed: 0,DateTime,WindOffGeneration
0,2014-12-30 00:00:00,996.58
1,2014-12-30 01:00:00,1059.20
2,2014-12-30 02:00:00,1123.27
3,2014-12-30 03:00:00,1177.54
4,2014-12-30 04:00:00,1188.21
...,...,...
169699,2021-06-14 19:00:00,1563.23
169700,2021-06-14 20:00:00,1590.47
169701,2021-06-14 21:00:00,1574.14
169702,2021-06-14 22:00:00,1450.72


In [132]:
# Taking care of the outliers
q1 , q3 = np.percentile(df_wind_off['WindOffGeneration'], [25,75])
iqr = q3 - q1
ul = q3 + 1.5*iqr
ll = q1 - 1.5*iqr
df_wind_off = df_wind_off[(df_wind_off['WindOffGeneration'] > ll) | (df_wind_off['WindOffGeneration']) ]

In [133]:
df_wind_on = df_windsolar[df_windsolar['ProductionType'] == 'Wind Onshore']
df_wind_on.sort_values(by='DateTime', ascending=True)
df_wind_on.reset_index(drop=True, inplace=True)
df_wind_on.rename(columns={'AggregatedGenerationForecast':'WindOnGeneration'}, inplace=True)
df_wind_on = df_wind_on[['DateTime','WindOnGeneration']]
df_wind_on

Unnamed: 0,DateTime,WindOnGeneration
0,2014-12-30 00:00:00,2363.80
1,2014-12-30 01:00:00,2413.88
2,2014-12-30 02:00:00,2450.75
3,2014-12-30 03:00:00,2500.78
4,2014-12-30 04:00:00,2536.07
...,...,...
169699,2021-06-14 19:00:00,4896.71
169700,2021-06-14 20:00:00,3928.19
169701,2021-06-14 21:00:00,3277.03
169702,2021-06-14 22:00:00,2880.91


In [134]:
# Taking care of the outliers
q1 , q3 = np.percentile(df_wind_on['WindOnGeneration'], [25,75])
iqr = q3 - q1
ul = q3 + 1.5*iqr
ll = q1 - 1.5*iqr
df_wind_on = df_wind_on[(df_wind_on['WindOnGeneration'] > ll) | (df_wind_on['WindOnGeneration']) ]

## Total Load Timeseries Importing

In [135]:
f_totalload = glob.glob(path_totalload + "/*.csv")
dtotalload = [pd.read_csv(f, encoding='utf-16', sep='\t', parse_dates=['DateTime']) for f in f_totalload]

df_totalload = pd.concat(dtotalload)
df_totalload = df_totalload[df_totalload['AreaCode'] == '10YGB----------A']
df_totalload = df_totalload[['DateTime', 'TotalLoadValue']]
df_totalload.reset_index(drop=True, inplace=True)


In [136]:
df_totalload.sort_values(by=['DateTime'], ascending=True)
df_totalload.reset_index(drop=True, inplace=True)
df_totalload

Unnamed: 0,DateTime,TotalLoadValue
0,2014-12-29 00:00:00,28798.0
1,2014-12-29 02:00:00,29534.0
2,2014-12-29 00:30:00,29961.0
3,2014-12-29 02:30:00,29627.0
4,2014-12-29 04:30:00,26717.0
...,...,...
202693,2020-09-30 14:30:00,34726.0
202694,2020-09-30 16:30:00,36430.0
202695,2020-09-30 18:30:00,36366.0
202696,2020-09-30 20:30:00,29449.0


# Merging Datasets

In [137]:
df_price = df_price[df_price['Outlier'] == 'N']
df_merged = pd.merge(df_price, df_solar, how='left', on='DateTime')
df_merged = pd.merge(df_merged, df_wind_off, how='left', on='DateTime')
df_merged = pd.merge(df_merged, df_wind_on, how='left', on='DateTime')

In [138]:
df_merged.drop_duplicates(subset=['DateTime'], keep='first', inplace=True)

In [139]:
df_model = pd.DataFrame()
df_model = df_merged[['Price','Year', 'Month', 'Day', 'Hour', 'SolarGeneration', 'WindOffGeneration',
'WindOnGeneration']]
df_model.set_index(df_merged['DateTime'], inplace=True)
df_model

Unnamed: 0_level_0,Price,Year,Month,Day,Hour,SolarGeneration,WindOffGeneration,WindOnGeneration
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015-01-01 00:00:00,0.00,2015,1,3,0,0.0,2987.05,4013.58
2015-01-01 01:00:00,0.00,2015,1,3,1,0.0,2926.71,4074.53
2015-01-01 02:00:00,0.00,2015,1,3,2,0.0,2867.00,4157.48
2015-01-01 03:00:00,0.00,2015,1,3,3,0.0,2808.72,4229.50
2015-01-01 04:00:00,0.00,2015,1,3,4,0.0,2769.59,4133.56
...,...,...,...,...,...,...,...,...
2020-12-31 18:00:00,71.30,2020,12,3,18,0.0,2644.97,3028.41
2020-12-31 19:00:00,61.04,2020,12,3,19,0.0,2707.55,3046.72
2020-12-31 20:00:00,60.39,2020,12,3,20,0.0,2695.01,3053.60
2020-12-31 21:00:00,57.96,2020,12,3,21,0.0,2829.57,3086.61


## Transforming Solar, Wind Onshore and Wind Offshore Generation values

In [140]:
df_model['SolarGeneration'] = df_model['SolarGeneration'].transform('sqrt')
df_model['WindOffGeneration'] = df_model['WindOffGeneration'].transform('sqrt')
df_model['WindOnGeneration'] = df_model['WindOnGeneration'].transform('sqrt')

# Lasso Regression

In [141]:
lags = list(range(1,690))

In [142]:
def lag_col(df, variable, lags):
    for lag in lags:
        df[str(variable) + '_' + str(lag)] = df[variable].shift(lag)
    return df

In [143]:
lag_col(df_model, 'SolarGeneration', lags)
lag_col(df_model, 'WindOffGeneration', lags)
lag_col(df_model, 'WindOnGeneration', lags)

Unnamed: 0_level_0,Price,Year,Month,Day,Hour,SolarGeneration,WindOffGeneration,WindOnGeneration,SolarGeneration_1,SolarGeneration_2,...,WindOnGeneration_680,WindOnGeneration_681,WindOnGeneration_682,WindOnGeneration_683,WindOnGeneration_684,WindOnGeneration_685,WindOnGeneration_686,WindOnGeneration_687,WindOnGeneration_688,WindOnGeneration_689
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-01 00:00:00,0.00,2015,1,3,0,0.0,54.653911,63.352822,,,...,,,,,,,,,,
2015-01-01 01:00:00,0.00,2015,1,3,1,0.0,54.099076,63.832045,0.0,,...,,,,,,,,,,
2015-01-01 02:00:00,0.00,2015,1,3,2,0.0,53.544374,64.478524,0.0,0.000000,...,,,,,,,,,,
2015-01-01 03:00:00,0.00,2015,1,3,3,0.0,52.997358,65.034606,0.0,0.000000,...,,,,,,,,,,
2015-01-01 04:00:00,0.00,2015,1,3,4,0.0,52.626894,64.292768,0.0,0.000000,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-12-31 18:00:00,71.30,2020,12,3,18,0.0,51.429272,55.030991,0.0,18.493242,...,34.042180,33.996029,34.665689,35.777367,37.412698,39.173843,41.234452,43.266962,45.798799,48.506082
2020-12-31 19:00:00,61.04,2020,12,3,19,0.0,52.034123,55.197101,0.0,0.000000,...,35.543494,34.042180,33.996029,34.665689,35.777367,37.412698,39.173843,41.234452,43.266962,45.798799
2020-12-31 20:00:00,60.39,2020,12,3,20,0.0,51.913486,55.259388,0.0,0.000000,...,37.125598,35.543494,34.042180,33.996029,34.665689,35.777367,37.412698,39.173843,41.234452,43.266962
2020-12-31 21:00:00,57.96,2020,12,3,21,0.0,53.193703,55.557268,0.0,0.000000,...,38.868882,37.125598,35.543494,34.042180,33.996029,34.665689,35.777367,37.412698,39.173843,41.234452


In [144]:
df_model.drop(columns=['Year', 'Month','Day','Hour'], inplace=True)

In [146]:
df_model.dropna(inplace=True)

In [155]:
df_model

Unnamed: 0_level_0,Price,SolarGeneration,WindOffGeneration,WindOnGeneration,SolarGeneration_1,SolarGeneration_2,SolarGeneration_3,SolarGeneration_4,SolarGeneration_5,SolarGeneration_6,...,WindOnGeneration_680,WindOnGeneration_681,WindOnGeneration_682,WindOnGeneration_683,WindOnGeneration_684,WindOnGeneration_685,WindOnGeneration_686,WindOnGeneration_687,WindOnGeneration_688,WindOnGeneration_689
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-02-01 00:00:00,38.03,0.0,54.879778,80.640871,0.0,0.000000,0.000000,0.000000,0.000000,18.110770,...,65.442188,64.310108,63.262153,62.639764,63.287361,64.292768,65.034606,64.478524,63.832045,63.352822
2015-02-01 01:00:00,34.86,0.0,54.670010,83.698805,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,68.319617,65.442188,64.310108,63.262153,62.639764,63.287361,64.292768,65.034606,64.478524,63.832045
2015-02-01 02:00:00,32.50,0.0,54.613551,83.752731,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,69.889699,68.319617,65.442188,64.310108,63.262153,62.639764,63.287361,64.292768,65.034606,64.478524
2015-02-01 03:00:00,29.20,0.0,54.405698,83.452382,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,70.940398,69.889699,68.319617,65.442188,64.310108,63.262153,62.639764,63.287361,64.292768,65.034606
2015-02-01 04:00:00,28.10,0.0,54.711882,82.363099,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,71.690306,70.940398,69.889699,68.319617,65.442188,64.310108,63.262153,62.639764,63.287361,64.292768
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-12-31 18:00:00,71.30,0.0,51.429272,55.030991,0.0,18.493242,36.837481,46.260134,50.368641,50.635956,...,34.042180,33.996029,34.665689,35.777367,37.412698,39.173843,41.234452,43.266962,45.798799,48.506082
2020-12-31 19:00:00,61.04,0.0,52.034123,55.197101,0.0,0.000000,18.493242,36.837481,46.260134,50.368641,...,35.543494,34.042180,33.996029,34.665689,35.777367,37.412698,39.173843,41.234452,43.266962,45.798799
2020-12-31 20:00:00,60.39,0.0,51.913486,55.259388,0.0,0.000000,0.000000,18.493242,36.837481,46.260134,...,37.125598,35.543494,34.042180,33.996029,34.665689,35.777367,37.412698,39.173843,41.234452,43.266962
2020-12-31 21:00:00,57.96,0.0,53.193703,55.557268,0.0,0.000000,0.000000,0.000000,18.493242,36.837481,...,38.868882,37.125598,35.543494,34.042180,33.996029,34.665689,35.777367,37.412698,39.173843,41.234452


## Split to train and test

In [151]:
X, y = df_model.drop('Price', axis=1), df_model.Price

In [152]:
lambdas = np.logspace(np.log10(1e-4), np.log10(35), 100)

In [154]:
from sklearn.model_selection import train_test_split

X = df_model.drop('Price', axis=1)
y = df_model.Price

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=.3, random_state=1121218)

In [158]:
numerical_features = X_train.select_dtypes(include='number').columns.tolist()

In [160]:
numerical_features

['SolarGeneration',
 'WindOffGeneration',
 'WindOnGeneration',
 'SolarGeneration_1',
 'SolarGeneration_2',
 'SolarGeneration_3',
 'SolarGeneration_4',
 'SolarGeneration_5',
 'SolarGeneration_6',
 'SolarGeneration_7',
 'SolarGeneration_8',
 'SolarGeneration_9',
 'SolarGeneration_10',
 'SolarGeneration_11',
 'SolarGeneration_12',
 'SolarGeneration_13',
 'SolarGeneration_14',
 'SolarGeneration_15',
 'SolarGeneration_16',
 'SolarGeneration_17',
 'SolarGeneration_18',
 'SolarGeneration_19',
 'SolarGeneration_20',
 'SolarGeneration_21',
 'SolarGeneration_22',
 'SolarGeneration_23',
 'SolarGeneration_24',
 'SolarGeneration_25',
 'SolarGeneration_26',
 'SolarGeneration_27',
 'SolarGeneration_28',
 'SolarGeneration_29',
 'SolarGeneration_30',
 'SolarGeneration_31',
 'SolarGeneration_32',
 'SolarGeneration_33',
 'SolarGeneration_34',
 'SolarGeneration_35',
 'SolarGeneration_36',
 'SolarGeneration_37',
 'SolarGeneration_38',
 'SolarGeneration_39',
 'SolarGeneration_40',
 'SolarGeneration_41',
 'S