## Replication of electricity price forecasting

### Days of the week / Holidays

In [None]:
from electricity_price_predictor.data import get_shifted_load, get_shifted_price, get_weather, get_holidays, get_days_dummies
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
import statsmodels.formula.api as smf
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

In [None]:
last_date = '2020-11-23 16:00:00'

In [None]:
dayofweek = get_days_dummies()
holidays = get_holidays()
weather = get_weather().loc[:last_date]
price = get_shifted_price().loc[:last_date]
load = get_shifted_load()

In [None]:
# price and load into one df
df = price.merge(load, on='time')

## Downsample

In [None]:
weather = weather.resample('D').mean()
weather = weather.reset_index().drop('dt', axis=1)

In [None]:
df = df.resample('D').mean()
df = df.reset_index().drop('time', axis=1)

In [None]:
dayofweek = dayofweek.reset_index().drop('index', axis=1)
holidays = holidays.reset_index().drop('index', axis=1)

In [None]:
print(holidays.shape)
print(dayofweek.shape)
print(weather.shape)
print(df.shape)

## Merging dataframes

In [None]:
# Combine the price/load frame with the holiday, weekday-dummy and weather
# frames. All four frames were given a fresh positional index in the cells
# above, so rows are matched purely by position (default inner merge on the
# index).
# NOTE(review): positional matching is only correct if every frame starts on
# the same day — confirm against the loaders.
df = df.merge(holidays, left_index=True, right_index=True)
df = df.merge(dayofweek, left_index=True, right_index=True)
df = df.merge(weather, left_index=True, right_index=True)

# The holiday name column is not kept in the modelling frame.
df = df.drop(columns='holiday_name')

# Store the holiday flag as an integer column.
df['holiday_bool'] = df['holiday_bool'].astype('int64')

In [None]:
def shift_by_days(data, num_days):
    """
    Shift a pandas series (or frame) by ``num_days`` rows.

    Thin wrapper around ``data.shift``. A positive ``num_days`` moves values
    towards later rows (lagged values); a negative one moves them towards
    earlier rows. Positions that are vacated are filled with missing values
    by pandas. The shift is counted in rows, so it equals a shift in days
    only when ``data`` holds one row per day.

    Parameters
    ----------
    data : object exposing a pandas-style ``shift`` method
    num_days : number of rows to shift by

    Returns
    -------
    The shifted object returned by ``data.shift``.
    """
    return data.shift(periods=num_days)

In [None]:
df.columns

In [None]:
df_new = df[['load', 'holiday_bool', 'feels_like', 'wind_speed', ]]

In [None]:
df['price_t_1'] = shift_by_days(df['price'], 1)

In [None]:
df['price_t_7'] = shift_by_days(df['price'], 7)

In [None]:
df = df.dropna()

In [None]:
df.columns

In [None]:
# Remove the weekday dummy columns from the working frame.
# NOTE(review): the hard-coded `model_3` formula further down still lists
# mon..sun (as well as temp and humidity, which are dropped in the next
# cells) — confirm the intended cell order.
df=df.drop(columns=['mon', 'tue', 'wed', 'thur', 'fri',
       'sat', 'sun'])

In [None]:
# NOTE(review): `price` is removed here, but later cells read it again
# (`df.drop('price', axis=1)`, `df.price`, and the `price ~ ...` formulas).
# Executed top to bottom, those cells will not find the column — confirm
# whether this drop is meant to be kept.
df = df.drop(columns=['price'])

In [None]:
df = df.drop(columns=['temp'])

In [None]:
df = df.drop(columns=['humidity'])

In [None]:
df.head()

In [None]:
df.corr().style.background_gradient(cmap='coolwarm')

In [None]:
df.shape

In [None]:
# Variance inflation factor for each column of `df_new`, listed from the
# highest VIF to the lowest.
# NOTE(review): no intercept column is added to `df_new` before calling
# `vif` — confirm that is intended.
df1 = pd.DataFrame(
    {
        "vif_index": [
            vif(df_new.values, position) for position in range(df_new.shape[1])
        ],
        "features": df_new.columns,
    }
)
df1[['features', 'vif_index']].sort_values(by='vif_index', ascending=False)

## Define features and scale

In [None]:
X = df.drop('price', axis=1)
y = df.price

In [None]:
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

## Model


### LinearReg Sklearn

In [None]:
model_2 = LinearRegression()
model_2.fit(X_scaled, y)
model_2.score(X_scaled, y)  # R2

In [None]:
model_2.intercept_

### smf statsmodels

In [None]:
# Regressors for the formula: every column except the target. Selecting by
# name (instead of `df.columns[1:]`) does not rely on `price` being the first
# column, so the target can never leak into the right-hand side and no
# feature is silently skipped.
col = df.columns[df.columns != 'price']

In [None]:
formula = ' + '.join(col)
formula = f"price ~ {formula}"
formula

In [None]:
model_3 = smf.ols('price ~ load + holiday_bool + mon + tue + wed + thur + fri + sat + sun + temp + feels_like + humidity + clouds_all + wind_speed + price_t_1 + price_t_7', data=df).fit()
model_3.summary()

In [None]:
model = smf.ols(formula=formula, data=df).fit()

In [None]:
model.summary()

In [None]:
shift_by_days(merged_D.price, -1).head(10)

In [None]:
# Create an object with all the holidays in Denmark.
# NOTE(review): `holidays` has to be the python-holidays module here, but no
# import of it is visible in this notebook and the name `holidays` is bound
# to a DataFrame in an earlier cell — confirm the import.
denmark_holidays = holidays.CountryHoliday('DK')

In [None]:
def get_holidays(start='1/1/2015', stop='31/12/2020', country='DK', frequency='D'):
    """
    Build a holiday lookup table over a range of timestamps.

    Parameters
    ----------
    start, stop : first and last timestamp, passed to ``pd.date_range``
    country : country code handed to ``holidays.CountryHoliday``
    frequency : ``freq`` argument of ``pd.date_range``

    Returns
    -------
    A dataframe indexed by the generated timestamps with columns:
    holiday_bool - whether the timestamp is contained in the holiday calendar
    holiday_name - the calendar's ``.get()`` result for that timestamp
    """
    # timestamps to look up
    timestamps = pd.date_range(start=start, end=stop, freq=frequency)

    # holiday calendar for the requested country
    # NOTE(review): `holidays` must resolve to the python-holidays module;
    # elsewhere in this notebook the same name is bound to a DataFrame and no
    # import is visible — confirm.
    calendar = holidays.CountryHoliday(country)

    # one [is_holiday, name] pair per timestamp
    records = [[stamp in calendar, calendar.get(stamp)] for stamp in timestamps]

    return pd.DataFrame(
        records,
        index=timestamps,
        columns=['holiday_bool', 'holiday_name'],
    )

In [None]:
hld_df = get_holidays()

In [None]:
hld_df = hld_df.loc[: '2020-11-23']

In [None]:
hld_df

In [None]:
# def get_holidays(start='1/1/2015', stop='31/12/2020', country='DK'):
#     """
#     Takes in a start and stop date and a country.
    
#     Produces a dataframe with a daily date time index and columns:
#     day_of_week - numerical day of the week identifier 0 for monday
#     holiday_bool - boolean true or false for holiday
#     holiday_name - name of the holiday if holiday_bool is true
    
#     Returns a dataframe
#     """
    
#     #generate the range of daily dates
#     dates = pd.date_range(start=start, end=stop)
    
#     #create the holiday object
#     country_holidays = holidays.CountryHoliday(country)
    
#     #create a dataframe of weekday categories
#     days = pd.DataFrame(list(dates.weekday), index=dates, columns=['weekday_id'])

#     #create a list for the holiday bool and name
#     holiday_list = []
    
#     #loop through the dates
#     for date in dates:
#         #true if holiday in object, false otherwise
#         holiday_bool = date in country_holidays
#         holiday_names = country_holidays.get(date)
        
#         holiday_list.append([holiday_bool, holiday_names])
        
#     #create return dataframe
#     holidays_data = pd.DataFrame(holiday_list, index=dates, columns=['holiday_bool', 'holiday_name'])
          
#     #join the days and the holidays_data dataframes
#     data = pd.concat([days, holidays_data], axis=1)
                 
                    
#     return data

In [None]:
# holiday_df = get_holidays()

In [None]:
# holiday_df = holiday_df.loc[: '2020-11-23']

In [None]:
def get_days_dummies(start='1/1/2015', stop='31/12/2020', frequency='D'):
    """
    Build one-hot weekday indicators for a range of timestamps.

    Parameters
    ----------
    start, stop : first and last timestamp, passed to ``pd.date_range``
    frequency : ``freq`` argument of ``pd.date_range``

    Returns
    -------
    A dataframe indexed by the generated timestamps with the seven columns
    mon, tue, wed, thur, fri, sat, sun; for each row the column matching the
    timestamp's weekday is set and the others are not.

    All seven columns are always present, even when the requested range does
    not contain every weekday (e.g. a range shorter than a week).
    """
    day_names = ['mon', 'tue', 'wed', 'thur', 'fri', 'sat', 'sun']

    # generate the range of dates
    dates = pd.date_range(start=start, end=stop, freq=frequency)

    # weekday id per timestamp (0 = monday), declared as a categorical with
    # all seven ids so get_dummies emits a column for every weekday instead
    # of only those that happen to occur in the range
    weekday_id = pd.Series(
        pd.Categorical(dates.weekday, categories=list(range(len(day_names)))),
        index=dates,
    )

    days = pd.get_dummies(weekday_id)
    days.columns = day_names

    return days

In [None]:
dow_df = get_days_dummies(start='2015-01-01', stop='2020-12-31')

In [None]:
dow_df = dow_df.loc[: '2020-11-23']

In [None]:
dow_df

In [None]:
# merged_all = merged_D.merge(holiday_df, how='left',left_index=True, right_index=True)

In [None]:
merged_all = merged_D.merge(dow_df, how='left',left_index=True, right_index=True)

In [None]:
merged_all_1 = merged_all.merge(hld_df, how='left',left_index=True, right_index=True)

In [None]:
 merged_all_1['holiday_bool'] = merged_all_1['holiday_bool'].astype('int')

In [None]:
merged_all_1

In [None]:
# X = merged_all_1.drop(columns=['price','holiday_name', 'holiday_bool'])

# df['weather/temp/snow'], df['coal_price'], df['dow'], df['holidays'], df['month']

In [None]:
# OLS of price on normalised load, the weekday dummies and the holiday flag.
# NOTE(review): `load_norm` is only added to `merged_all_1` in a later cell
# (the MinMaxScaler cell), so that cell has to run before this one.
# NOTE(review): all seven weekday dummies are included together with the
# formula's implicit intercept; the dummies sum to one per row, so the design
# is linearly dependent — consider dropping one dummy.
model2 = smf.ols(formula='price ~ load_norm + mon + tue + wed + thur + fri + sat + sun + holiday_bool', data=merged_all_1).fit()

Check the VIF.

In [None]:
model2.summary()

In [None]:
X = merged_all_1['load']
y = merged_all_1['price']

In [None]:
# `sm` is not imported anywhere in this notebook; bring statsmodels' main API
# into scope here so the cell does not fail with a NameError.
import statsmodels.api as sm

# OLS of price (y) on load (X).
# NOTE(review): no constant is added to X, so this fit has no intercept —
# confirm that is intended.
model = sm.OLS(y,X).fit()
model.summary()

In [None]:
model.params

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
# from sklearn.metrics import 

In [None]:
normalizer = MinMaxScaler()
normalizer.fit(merged_all_1[['load']])
merged_all_1['load_norm'] = normalizer.transform(merged_all_1[['load']])

In [None]:
merged_all_1

In [None]:
X=merged_all_1[['load', 'mon', 'tue', 'wed', 'thur','fri','sat','sun','holiday_bool']]
y = merged_all_1['price']

In [None]:
X.values

In [None]:
reg = LinearRegression().fit(X.values, y.values)

In [None]:
# R^2 on the training data. The model was fitted on `X.values`, so score it
# on the same plain-array form rather than on the DataFrame.
reg.score(X.values, y.values)

In [None]:
# In-sample predictions. The model was fitted on `X.values`, so predict from
# the same plain-array form rather than from the DataFrame.
predictions = reg.predict(X.values)

In [None]:
predictions

In [None]:
y.values-predictions

In [None]:
# NOTE(review): unfinished statement — `zip()` has no iterables and the loop
# has no colon or body, so this cell does not parse. As written it would also
# rebind the name `y` used for the target above.
for x, y in zip()

`transform_to_windows`
converts the hourly row data into windowed rows, where each row is one day and its 24 columns hold the values for each hour of that day.