In [1]:
from electricity_price_predictor.data import get_shifted_load, get_shifted_price, get_weather
import holidays
import pandas as pd

In [2]:
def get_holidays(start='1/1/2015', stop='23/11/2020', country='DK', frequency='D'):
    """
    Takes in a start and stop date and a country.
    Produces a dataframe with a daily date time index and columns:
    day_of_week - numerical day of the week identifier 0 for monday
    holiday_bool - boolean true or false for holiday
    holiday_name - name of the holiday if holiday_bool is true
    Returns a dataframe
    """
    #generate the range of daily dates
    dates = pd.date_range(start=start, end=stop, freq=frequency)
    #create the holiday object
    country_holidays = holidays.CountryHoliday(country)
    #create a list for the holiday bool and name
    holiday_list = []
    #loop through the dates
    for date in dates:
        #true if holiday in object, false otherwise
        holiday_bool = date in country_holidays
        holiday_names = country_holidays.get(date)
        holiday_list.append([holiday_bool, holiday_names])
    #create return dataframe
    holidays_data = pd.DataFrame(holiday_list, index=dates, columns=['holiday_bool', 'holiday_name'])
    return holidays_data


def get_days_dummies(start='1/1/2015', stop='23/11/2020', frequency='D'):
    """
    Takes in a start and stop date and frequency.
    Produces a dataframe with a date time index at the frequency input and columns:
    weekday_id - numerical day of the week identifier 0 for monday
    Returns a dataframe
    """
    #generate the range of daily dates
    dates = pd.date_range(start=start, end=stop, freq=frequency)
    #create a dataframe of weekday categories
    days = pd.DataFrame(list(dates.weekday), index=dates, columns=['weekday_id'])
    days = pd.get_dummies(days['weekday_id'])
    columns = ['mon', 'tue', 'wed', 'thur', 'fri', 'sat', 'sun']
    days.columns = columns
    return days

In [3]:
price = get_shifted_price()
load = get_shifted_load()

In [4]:
price = price.loc[:'2020-11-23 16:00:00']

In [5]:
df = price.merge(load, on='time')
df = df.resample('D').mean()
df = df.reset_index().drop('time', axis=1)

In [8]:
df.tail()

Unnamed: 0,price,load
2149,1.047083,2705.5
2150,33.00375,2715.333333
2151,1.04625,2429.583333
2152,1.158333,2330.458333
2153,7.206471,2766.0


In [9]:
weather = get_weather()
holidays = get_holidays()
dayofweek = get_days_dummies()

In [18]:
weather = weather.sort_index()
weather = weather.loc[:'2020-11-23 16:00:00']

In [20]:
weather = weather.resample('D').mean()
weather = weather.reset_index().drop('dt', axis=1)
dayofweek = dayofweek.reset_index().drop('index', axis=1)
holidays = holidays.reset_index().drop('index', axis=1)

In [21]:
print(holidays.shape)
print(dayofweek.shape)
print(weather.shape)
print(df.shape)

(2154, 2)
(2154, 7)
(2154, 6)
(2154, 2)


In [22]:
df = df.merge(holidays, 
              right_index=True, left_index=True).merge(dayofweek, right_index=True, left_index=True).drop('holiday_name', axis=1)

In [23]:
df['holiday_bool'] = df['holiday_bool'].astype('int64')

In [28]:
df = df.merge(weather, right_index=True, left_index=True)

In [29]:
X = df.drop('price', axis=1)
y = df.price

In [33]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression

In [34]:
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

In [35]:
model_2 = LinearRegression()
model_2.fit(X_scaled, y)
model_2.score(X_scaled, y)

0.37461512330976243

In [36]:
weather

Unnamed: 0,temp,feels_like,humidity,clouds_all,wind_speed,wind_deg
0,6.531499,0.070990,92.857741,91.531577,7.762724,230.669642
1,6.516418,-2.268839,84.475949,68.241743,10.727671,256.258159
2,4.117653,-2.365009,84.460324,42.119344,6.808548,269.906298
3,3.413488,-1.761679,80.915235,36.723827,4.657854,289.299073
4,4.752763,1.423782,94.189036,89.442873,2.842280,222.575805
...,...,...,...,...,...,...
2149,7.011803,0.472592,75.156521,28.890113,7.285472,265.325664
2150,4.028544,-0.088680,78.171198,21.466404,3.165625,247.966775
2151,8.117413,3.327187,93.313982,50.146651,5.922855,213.375186
2152,7.777668,2.726794,82.615943,28.811757,5.605095,249.433786


In [56]:
df.columns[1:]

Index(['load', 'holiday_bool', 'mon', 'tue', 'wed', 'thur', 'fri', 'sat',
       'sun', 'temp', 'feels_like', 'humidity', 'clouds_all', 'wind_speed',
       'wind_deg'],
      dtype='object')

In [59]:
to_drop =['holiday_bool', 'wind_deg', 'clouds_all', 'humidity']
new_df = df.drop(columns=to_drop)
new_x = new_df.drop('price', axis=1)

In [61]:
col = new_df.columns

In [66]:
formula = ' + '.join(col[1:])
formula = f"price ~ {formula}"

In [67]:
formula

'price ~ load + mon + tue + wed + thur + fri + sat + sun + temp + feels_like + wind_speed'

In [68]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [69]:
model2 = smf.ols(formula=formula, data=df).fit()

In [70]:
model2.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.315
Model:,OLS,Adj. R-squared:,0.312
Method:,Least Squares,F-statistic:,98.54
Date:,"Thu, 26 Nov 2020",Prob (F-statistic):,4.94e-168
Time:,18:12:00,Log-Likelihood:,-8116.9
No. Observations:,2154,AIC:,16260.0
Df Residuals:,2143,BIC:,16320.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,17.6591,3.565,4.954,0.000,10.668,24.650
load,0.0122,0.001,8.537,0.000,0.009,0.015
mon,3.0086,0.846,3.557,0.000,1.350,4.667
tue,3.3097,0.900,3.679,0.000,1.545,5.074
wed,3.2765,0.905,3.620,0.000,1.502,5.052
thur,3.0914,0.889,3.476,0.001,1.348,4.835
fri,3.4130,0.782,4.364,0.000,1.879,4.947
sat,1.6794,0.587,2.860,0.004,0.528,2.831
sun,-0.1197,0.585,-0.205,0.838,-1.266,1.027

0,1,2,3
Omnibus:,52.836,Durbin-Watson:,0.331
Prob(Omnibus):,0.0,Jarque-Bera (JB):,58.279
Skew:,0.355,Prob(JB):,2.21e-13
Kurtosis:,3.382,Cond. No.,1.01e+19


In [71]:
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

In [55]:
vif()

(2155, 6)