## Replication of electricity price forecasting

### Days of the week / Holidays

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

from electricity_price_predictor.data import get_shifted_load, get_shifted_price, get_all, get_wind_prod, get_weather, get_holidays, get_days_dummies, get_coal_price

In [2]:
# Bounds of the label slice applied to each input series when it is loaded.
first_date = '2015-11-25 00:00:00'
last_date = '2020-11-23 00:00:00'

In [None]:
# Load the combined dataset through the project helper.
# NOTE(review): `df` is rebound by the price/load merge further down, so this
# frame only feeds the inspection cell that follows — confirm it is still needed.
df  = get_all()

In [None]:
df

In [3]:
def _in_window(frame):
    """Return the rows of ``frame`` labelled between ``first_date`` and ``last_date``."""
    return frame.loc[first_date:last_date]


# Load each source and restrict it to the same date window.
dayofweek = _in_window(get_days_dummies())
holidays = _in_window(get_holidays())
weather = _in_window(get_weather())
price = _in_window(get_shifted_price())
load = _in_window(get_shifted_load())
coal = _in_window(get_coal_price())
wind = _in_window(get_wind_prod())

In [None]:
# df = df.loc[first_date:last_date]

In [None]:
# df

In [4]:
# Combine the price and load frames on their shared 'time' key.
df = pd.merge(price, load, on='time')

## Downsample

In [None]:
# weather_D = weather[weather.index.hour==11]
# weather 

In [5]:
# Average the weather observations per calendar day, then move the index out
# and discard the resulting 'dt' column so the frame is indexed by position.
weather = (
    weather
    .resample('D')
    .mean()
    .reset_index()
    .drop(columns='dt')
)

In [None]:
weather

In [6]:
# Daily means of price and load; the 'time' column produced by reset_index()
# is dropped, leaving a positional index.
df = (
    df
    .resample('D')
    .mean()
    .reset_index()
    .drop(columns='time')
)

In [7]:
# Replace the existing index with a positional one and discard the
# 'index' column that reset_index() produces.
dayofweek = dayofweek.reset_index().drop(columns='index')
holidays = holidays.reset_index().drop(columns='index')

In [8]:
holidays

Unnamed: 0,holiday,holiday_name
0,0,
1,0,
2,0,
3,0,
4,0,
...,...,...
1821,0,
1822,0,
1823,0,
1824,0,


In [None]:
# Daily mean coal price; the 'time' column produced by reset_index() is
# dropped, leaving a positional index.
coal = (
    coal
    .resample('D')
    .mean()
    .reset_index()
    .drop(columns='time')
)

In [None]:
# Report the (rows, columns) of every frame, one per line, before joining.
print(
    f"'holidays:' {holidays.shape}",
    f"'day of week:' {dayofweek.shape}",
    f"'weather:' {weather.shape}",
    f"'price, load:' {df.shape}",
    f"'coal:' {coal.shape}",
    f"'wind:' {wind.shape}",
    sep="\n",
)

## Merging dataframes

In [None]:
# Attach each auxiliary frame column-wise, in this order. DataFrame.join
# aligns on the index, which is positional for the frames reset above.
for extra in (coal, holidays, dayofweek, weather):
    df = df.join(extra)

# Drop the holiday_name column from the joined frame.
df = df.drop(columns='holiday_name')

# df['holiday_bool'] = df['holiday_bool'].astype('int64')

In [None]:
# df = df.merge(
#     holidays, right_index= True, left_index=True).merge(
#     dayofweek, right_index= True, left_index=True).merge(
#     weather, right_index= True, left_index= True).merge(
#     coal, right_index=True, left_index=True).drop('holiday_name', axis=1)

# df['holiday_bool'] = df['holiday_bool'].astype('int64')

In [None]:
df

In [None]:
def shift_by_days(data, num_days, periods_per_day=1):
    """
    Lag a regularly spaced time series by a whole number of days.

    Parameters
    ----------
    data : pandas.Series or pandas.DataFrame
        Series (or frame) to shift.
    num_days : int
        Number of days to shift by. Positive values move observations to
        later rows.
    periods_per_day : int, default 1
        Number of rows that make up one day in ``data``: 1 for daily data,
        24 for hourly data. The default keeps the original behaviour of
        shifting by ``num_days`` rows.

    Returns
    -------
    Same type as ``data``
        ``data`` shifted by ``num_days * periods_per_day`` rows; rows with no
        source observation are filled with NaN by ``shift``.
    """
    return data.shift(num_days * periods_per_day)

In [None]:
# Lagged price feature: the price from one row earlier.
df = df.assign(price_t_1=shift_by_days(df['price'], 1))

In [None]:
# Lagged price feature: the price from seven rows earlier.
df = df.assign(price_t_7=shift_by_days(df['price'], 7))

In [None]:
# Keep only rows without any missing value (the lagged columns leave NaN in
# the first rows).
df = df.dropna(axis=0, how='any')

In [None]:
df

In [None]:
# Reduced feature set examined in the second VIF table.
selected_features = [
    'coal_price',
    'holiday',
    'total_prod',
    'feels_like',
    'wind_share',
    'price_t_7',
    'price_t_1',
]
df_new = df.loc[:, selected_features]

In [None]:
# Pairwise correlations of all columns, shaded with the coolwarm colour map.
corr_matrix = df.corr()
corr_matrix.style.background_gradient(cmap='coolwarm')

In [None]:
# Variance inflation factor for every column of df, highest first.
# NOTE(review): df still contains the target `price` here and no constant
# column is added before calling vif — confirm that is intended.
df1 = pd.DataFrame({
    "vif_index": [vif(df.values, position) for position in range(df.shape[1])],
    "features": df.columns,
})
df1[['features', 'vif_index']].sort_values(by='vif_index', ascending=False)

In [None]:
# Variance inflation factor for the reduced feature set, highest first.
# NOTE(review): no constant column is added before calling vif — confirm that
# is intended.
df1 = pd.DataFrame({
    "vif_index": [vif(df_new.values, position) for position in range(df_new.shape[1])],
    "features": df_new.columns,
})
df1[['features', 'vif_index']].sort_values(by='vif_index', ascending=False)

## Define features and scale

In [None]:
# Target is the price column; every other column is a feature.
y = df['price']
X = df.drop(columns=['price'])

In [None]:
# Learn each feature's min/max from X, then rescale X with those statistics.
scaler = MinMaxScaler().fit(X)
X_scaled = scaler.transform(X)

## Data visualization

In [None]:
# Summary statistics rounded to whole numbers.
# DataFrame.applymap is deprecated in recent pandas; rounding the whole frame
# and casting to integers gives the same table without the element-wise lambda.
df.describe().round().astype('int64')

In [None]:
# Box plot of the price column.
df.boxplot(column='price')

In [None]:
# Overlay the distributions of the electricity price and the coal price.
# Both series share one axes, so each gets a label and a legend is drawn;
# the placeholder title is replaced by a descriptive one.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(df['price'], kde=True, label='price', ax=ax)
sns.histplot(df['coal_price'], kde=True, label='coal_price', ax=ax)
ax.set(title='Distribution of price and coal_price', xlabel='value')
ax.legend();

In [None]:
# Price against yesterday's price (left) and against the coal price (right).
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
plt.figure(figsize=(15,10))
plt.subplot(2,2,1)
sns.regplot(x='price_t_1', y='price', data=df, ci=95)
plt.subplot(2,2,2)
# Fixed seed so the 10-row subsample (and therefore the plot) is reproducible.
sns.regplot(x='coal_price', y='price', data=df.sample(10, random_state=0), ci=95)

In [None]:
# Price against coal price, one point per row of df.
sns.scatterplot(data=df, x='coal_price', y='price')

In [None]:
# Price against coal price with a fitted regression line.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
sns.regplot(x='coal_price', y='price', data=df)

## Model


### Linear regression (scikit-learn)

In [2]:
# Linear regression on the scaled features; fit() returns the estimator itself.
model_2 = LinearRegression().fit(X_scaled, y)
model_2.score(X_scaled, y)  # R2

NameError: name 'LinearRegression' is not defined

In [1]:
# In-sample R^2 of model_2. The original cell referenced the bound method
# without calling it, which only displays the method object.
model_2.score(X_scaled, y)

NameError: name 'model_2' is not defined

### OLS with the statsmodels formula API (smf)

In [None]:
# Every column except the target. Excluding 'price' by name (rather than by
# position) keeps the target off the right-hand side of the formula even if
# the column order of df changes.
col = df.columns.drop('price')

In [None]:
col

In [None]:
# Build "price ~ a + b + ..." from the column names in `col`.
rhs_terms = ' + '.join(col)
formula = f"price ~ {rhs_terms}"
formula

In [None]:
# OLS of price on a hand-picked set of regressors, fitted on all rows of df.
model_3_formula = 'price ~ load + holiday + coal_price + feels_like + total_prod + wind_speed + price_t_1 + price_t_7'
model_3 = smf.ols(formula=model_3_formula, data=df).fit()
model_3.summary()

In [None]:
# Smaller OLS specification without load and lagged prices.
# The holiday indicator column is named 'holiday' in df; the original
# 'holiday_bool' term refers to a column that is never created.
# NOTE(review): 'clouds_all' is assumed to come from the weather frame — confirm.
model_3_1 = smf.ols('price ~  holiday + feels_like + coal_price  + clouds_all + wind_speed', data=df).fit()
model_3_1.summary()

In [None]:
# Positional split: rows before position 1796 train the model, the remaining
# rows are held out for testing.
split_at = 1796
train = df.iloc[:split_at, :]
test = df.iloc[split_at:, :]

In [None]:
# Same specification as model_3, fitted on the training rows only.
model_4_formula = 'price ~ load + holiday + coal_price + feels_like + total_prod + wind_speed + price_t_1 + price_t_7'
model_4 = smf.ols(formula=model_4_formula, data=train).fit()
model_4.summary()

In [None]:
# Out-of-sample predictions for the held-out rows.
# `test` is a slice of df, so assigning a column onto it in place triggers
# pandas' SettingWithCopyWarning; assign() builds a new frame instead.
test = test.assign(pred_price=model_4.predict(test))

In [None]:
# Mean absolute percentage error on the held-out rows, in percent.
# The denominator uses |price| too: with a signed denominator, any negative
# price would contribute a negative term and cancel against the others.
abs_pct_error = (test['price'] - test['pred_price']).abs() / test['price'].abs()
MAPE = abs_pct_error.mean() * 100

In [None]:
MAPE

In [None]:
df.columns

In [None]:
model_3.predict()

In [None]:
# Distribution of model_3's residuals.
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is the replacement and is already used elsewhere in this notebook.
residuals = model_3.resid
sns.histplot(residuals, kde=True, stat='density');

In [None]:
# In-sample predictions of model_3: predict() is called without new data, so
# the rows the model was fitted on are used.
predictions = model_3.predict()

In [None]:
# Residuals vs. fitted values scatterplot.
# x and y are passed by keyword (seaborn >= 0.12 rejects positional data
# vectors), and the axis labels name the modelled quantity, price.
sns.scatterplot(x=predictions, y=residuals)
plt.xlabel('Predicted price')
plt.ylabel('Residual price')

In [None]:
from pandas.plotting import lag_plot
from pandas.plotting import autocorrelation_plot


In [None]:
lag_plot(df['price'], lag=1)

In [None]:
df['price'].plot()

In [None]:
df['coal_price'].plot()

In [None]:
# Autocorrelation of the 'temp' column across lags.
# NOTE(review): 'temp' is presumably one of the weather columns joined into df — confirm.
autocorrelation_plot(df['temp'])

In [None]:
# Partial regression plots for model_3, drawn onto one tall figure.
partregress_canvas = plt.figure(figsize=(10,36))
fig = sm.graphics.plot_partregress_grid(model_3, fig=partregress_canvas)

In [None]:
# OLS using the formula string assembled earlier in the notebook.
ols_spec = smf.ols(data=df, formula=formula)
model = ols_spec.fit()

In [None]:
model.summary()

In [None]:
# Split the rows of df into a training and a testing frame.
# The original line unpacked the function object itself instead of calling it.
# test_size and random_state mirror the X/y split used elsewhere in this
# notebook.
# NOTE(review): this is a shuffled split; for time-ordered data confirm that
# shuffling is intended.
data_train, data_test = train_test_split(df, test_size=0.3, random_state=4)

In [None]:
# This cell was fully commented out, but the cells that fit and score
# lin_model need X_train, X_test, y_train and y_test, so it is restored.
# train_test_split is imported with the other imports at the top.

# Ready X and y
X = df.drop(['price', 'feels_like'], axis=1)
y = df['price']

# Split into Train/Test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=4)

In [None]:
# Fit the linear model on the training split; fit() returns the estimator.
lin_model = LinearRegression().fit(X_train, y_train)

# R^2 of the fitted model on the same training split
lin_model.score(X_train, y_train)

In [None]:
# Score the model on the Testing data
lin_model.score(X_test,y_test)

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# In-sample predictions of model_3 (no new data is passed to predict()).
y_pred = model_3.predict()

In [None]:
import math

# Root mean squared error between the observed prices `y` and `y_pred`.
mse = mean_squared_error(y_true=y, y_pred=y_pred)
rmse = math.sqrt(mse)

In [None]:
rmse