## Replication of electricity price forecasting

### Days of the week/ Holidays

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date
import holidays
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [2]:
from electricity_price_predictor.data import get_shifted_price, get_shifted_load

In [3]:
price = get_shifted_price()

In [4]:
price = price.loc[: '2020-11-23 16:00:00']

In [5]:
load = get_shifted_load()

In [6]:
# pd.date_range(start = '2015-01-01', end = '2020-11-22' ).difference(load.index)

In [7]:
merged = price.merge(load, how='inner', on='time')

In [8]:
merged_D = merged.resample('D').mean()

In [9]:
merged_D.isnull().sum()

price    0
load     0
dtype: int64

In [10]:
#create an object with all the holidays in denmakr
denmark_holidays = holidays.CountryHoliday('DK')

In [11]:
def get_holidays(start='1/1/2015', stop='31/12/2020', country='DK', frequency='D'):
    """
    Takes in a start and stop date and a country.
    
    Produces a dataframe with a daily date time index and columns:
    day_of_week - numerical day of the week identifier 0 for monday
    holiday_bool - boolean true or false for holiday
    holiday_name - name of the holiday if holiday_bool is true
    
    Returns a dataframe
    """
    
    #generate the range of daily dates
    dates = pd.date_range(start=start, end=stop, freq=frequency)
    
    #create the holiday object
    country_holidays = holidays.CountryHoliday(country)

    #create a list for the holiday bool and name
    holiday_list = []
    
    #loop through the dates
    for date in dates:
        #true if holiday in object, false otherwise
        holiday_bool = date in country_holidays
        holiday_names = country_holidays.get(date)
        
        holiday_list.append([holiday_bool, holiday_names])
        
    #create return dataframe
    holidays_data = pd.DataFrame(holiday_list, index=dates, columns=['holiday_bool', 'holiday_name'])
                  
    return holidays_data

In [12]:
hld_df = get_holidays()

In [13]:
hld_df = hld_df.loc[: '2020-11-23']

In [14]:
hld_df

Unnamed: 0,holiday_bool,holiday_name
2015-01-01,True,Nytårsdag
2015-01-02,False,
2015-01-03,False,
2015-01-04,False,
2015-01-05,False,
...,...,...
2020-11-19,False,
2020-11-20,False,
2020-11-21,False,
2020-11-22,False,


In [15]:
# def get_holidays(start='1/1/2015', stop='31/12/2020', country='DK'):
#     """
#     Takes in a start and stop date and a country.
    
#     Produces a dataframe with a daily date time index and columns:
#     day_of_week - numerical day of the week identifier 0 for monday
#     holiday_bool - boolean true or false for holiday
#     holiday_name - name of the holiday if holiday_bool is true
    
#     Returns a dataframe
#     """
    
#     #generate the range of daily dates
#     dates = pd.date_range(start=start, end=stop)
    
#     #create the holiday object
#     country_holidays = holidays.CountryHoliday(country)
    
#     #create a dataframe of weekday categories
#     days = pd.DataFrame(list(dates.weekday), index=dates, columns=['weekday_id'])

#     #create a list for the holiday bool and name
#     holiday_list = []
    
#     #loop through the dates
#     for date in dates:
#         #true if holiday in object, false otherwise
#         holiday_bool = date in country_holidays
#         holiday_names = country_holidays.get(date)
        
#         holiday_list.append([holiday_bool, holiday_names])
        
#     #create return dataframe
#     holidays_data = pd.DataFrame(holiday_list, index=dates, columns=['holiday_bool', 'holiday_name'])
          
#     #join the days and the holidays_data dataframes
#     data = pd.concat([days, holidays_data], axis=1)
                 
                    
#     return data

In [16]:
# holiday_df = get_holidays()

In [17]:
# holiday_df = holiday_df.loc[: '2020-11-23']

In [18]:
def get_days_dummies(start='1/1/2015', stop='31/12/2020', frequency='D'):
    """
    Takes in a start and stop date and frequency.
    
    Produces a dataframe with a date time index at the frequency input and columns:
    weekday_id - numerical day of the week identifier 0 for monday
    
    Returns a dataframe
    """
    
    #generate the range of daily dates
    dates = pd.date_range(start=start, end=stop, freq=frequency)
    
    #create a dataframe of weekday categories
    days = pd.DataFrame(list(dates.weekday), index=dates, columns=['weekday_id'])
    
    days = pd.get_dummies(days['weekday_id'])
    
    columns = ['mon', 'tue', 'wed', 'thur', 'fri', 'sat', 'sun']
    
    days.columns = columns
    
    return days

In [19]:
dow_df = get_days_dummies(start='2015-01-01', stop='2020-12-31')

In [20]:
dow_df = dow_df.loc[: '2020-11-23']

In [21]:
dow_df

Unnamed: 0,mon,tue,wed,thur,fri,sat,sun
2015-01-01,0,0,0,1,0,0,0
2015-01-02,0,0,0,0,1,0,0
2015-01-03,0,0,0,0,0,1,0
2015-01-04,0,0,0,0,0,0,1
2015-01-05,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...
2020-11-19,0,0,0,1,0,0,0
2020-11-20,0,0,0,0,1,0,0
2020-11-21,0,0,0,0,0,1,0
2020-11-22,0,0,0,0,0,0,1


In [22]:
# merged_all = merged_D.merge(holiday_df, how='left',left_index=True, right_index=True)

In [23]:
merged_all = merged_D.merge(dow_df, how='left',left_index=True, right_index=True)

In [24]:
merged_all_1 = merged_all.merge(hld_df, how='left',left_index=True, right_index=True)

In [25]:
 merged_all_1['holiday_bool'] = merged_all_1['holiday_bool'].astype('int')

In [26]:
merged_all_1

Unnamed: 0_level_0,price,load,mon,tue,wed,thur,fri,sat,sun,holiday_bool,holiday_name
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2015-01-01,16.310417,2114.208333,0,0,0,1,0,0,0,1,Nytårsdag
2015-01-02,4.970417,2360.416667,0,0,0,0,1,0,0,0,
2015-01-03,15.291667,2196.791667,0,0,0,0,0,1,0,0,
2015-01-04,21.064167,2142.250000,0,0,0,0,0,0,1,0,
2015-01-05,37.997500,2601.625000,1,0,0,0,0,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...
2020-11-19,1.047083,2705.500000,0,0,0,1,0,0,0,0,
2020-11-20,33.003750,2715.333333,0,0,0,0,1,0,0,0,
2020-11-21,1.046250,2429.583333,0,0,0,0,0,1,0,0,
2020-11-22,1.158333,2330.458333,0,0,0,0,0,0,1,0,


In [27]:
# X = merged_all_1.drop(columns=['price','holiday_name', 'holiday_bool'])

# df['weather/temp/snow'], df['coal_price'], df['dow'], df['holidays'], df['month']

In [46]:
model2 = smf.ols(formula='price ~ load_norm + mon + tue + wed + thur + fri + sat + sun + holiday_bool', data=merged_all_1).fit()

check VIF 

In [47]:
model2.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.086
Model:,OLS,Adj. R-squared:,0.082
Method:,Least Squares,F-statistic:,25.17
Date:,"Thu, 26 Nov 2020",Prob (F-statistic):,2.18e-37
Time:,17:31:18,Log-Likelihood:,-8427.7
No. Observations:,2154,AIC:,16870.0
Df Residuals:,2145,BIC:,16920.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,23.7974,0.885,26.904,0.000,22.063,25.532
load_norm,8.1383,1.887,4.313,0.000,4.438,11.838
mon,4.5697,0.691,6.610,0.000,3.214,5.926
tue,4.6649,0.711,6.562,0.000,3.271,6.059
wed,4.6079,0.714,6.455,0.000,3.208,6.008
thur,4.7435,0.712,6.659,0.000,3.347,6.141
fri,5.0113,0.663,7.554,0.000,3.710,6.312
sat,0.9969,0.687,1.452,0.147,-0.350,2.344
sun,-0.7968,0.692,-1.151,0.250,-2.154,0.561

0,1,2,3
Omnibus:,28.908,Durbin-Watson:,0.397
Prob(Omnibus):,0.0,Jarque-Bera (JB):,32.861
Skew:,0.227,Prob(JB):,7.32e-08
Kurtosis:,3.399,Cond. No.,4130000000000000.0


In [30]:
X = merged_all_1['load']
y = merged_all_1['price']

In [31]:
model = sm.OLS(y,X).fit()
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared (uncentered):,0.867
Model:,OLS,Adj. R-squared (uncentered):,0.867
Method:,Least Squares,F-statistic:,14010.0
Date:,"Thu, 26 Nov 2020",Prob (F-statistic):,0.0
Time:,17:27:16,Log-Likelihood:,-8455.1
No. Observations:,2154,AIC:,16910.0
Df Residuals:,2153,BIC:,16920.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
load,0.0136,0.000,118.377,0.000,0.013,0.014

0,1,2,3
Omnibus:,25.225,Durbin-Watson:,0.399
Prob(Omnibus):,0.0,Jarque-Bera (JB):,33.155
Skew:,0.158,Prob(JB):,6.32e-08
Kurtosis:,3.52,Cond. No.,1.0


In [43]:
model.params

load    0.013627
dtype: float64

In [32]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
# from sklearn.metrics import 

In [33]:
normalizer = MinMaxScaler()
normalizer.fit(merged_all_1[['load']])
merged_all_1['load_norm'] = normalizer.transform(merged_all_1[['load']])

In [34]:
merged_all_1

Unnamed: 0_level_0,price,load,mon,tue,wed,thur,fri,sat,sun,holiday_bool,holiday_name,load_norm
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2015-01-01,16.310417,2114.208333,0,0,0,1,0,0,0,1,Nytårsdag,0.391026
2015-01-02,4.970417,2360.416667,0,0,0,0,1,0,0,0,,0.567083
2015-01-03,15.291667,2196.791667,0,0,0,0,0,1,0,0,,0.450079
2015-01-04,21.064167,2142.250000,0,0,0,0,0,0,1,0,,0.411078
2015-01-05,37.997500,2601.625000,1,0,0,0,0,0,0,0,,0.739564
...,...,...,...,...,...,...,...,...,...,...,...,...
2020-11-19,1.047083,2705.500000,0,0,0,1,0,0,0,0,,0.813843
2020-11-20,33.003750,2715.333333,0,0,0,0,1,0,0,0,,0.820874
2020-11-21,1.046250,2429.583333,0,0,0,0,0,1,0,0,,0.616542
2020-11-22,1.158333,2330.458333,0,0,0,0,0,0,1,0,,0.545660


In [35]:
X=merged_all_1[['load', 'mon', 'tue', 'wed', 'thur','fri','sat','sun','holiday_bool']]
y = merged_all_1['price']

In [36]:
X.values

array([[2.11420833e+03, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00],
       [2.36041667e+03, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.19679167e+03, 0.00000000e+00, 0.00000000e+00, ...,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [2.42958333e+03, 0.00000000e+00, 0.00000000e+00, ...,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.33045833e+03, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [2.76600000e+03, 1.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [37]:
reg = LinearRegression().fit(X.values, y.values)

In [38]:
reg.score(X, y.values)

0.08581406191574337

In [39]:
predictions = reg.predict(X)

In [40]:
predictions

array([24.9302422 , 33.42378596, 28.45719764, ..., 29.81192373,
       27.44138713, 35.34253726])

In [41]:
y.values-predictions

array([ -8.61982553, -28.45336929, -13.16553097, ..., -28.76567373,
       -26.28305379, -28.13606667])

In [42]:
for x, y in zip()

SyntaxError: invalid syntax (<ipython-input-42-806951d49420>, line 1)

transform_to_windows
converts the data from row data into windowed rows where each row is a day with 24 columns representing each hour of the day.