# SARIMAX

In [184]:
from statsmodels.tsa.stattools import adfuller
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [185]:
# Load IPython's autoreload extension and set it to mode 2, so imported modules
# (e.g. the local electricity_price_predictor package) are reloaded before each
# cell runs and edits to them are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Data exploration

In [250]:
from electricity_price_predictor.data_2 import get_shifted_price

In [255]:
# Load the price frame from the project's data helper and preview its first five rows.
df = get_shifted_price()
df.head(n=5)

Unnamed: 0_level_0,price
time,Unnamed: 1_level_1
2015-01-01 00:00:00,25.02
2015-01-01 01:00:00,18.29
2015-01-01 02:00:00,16.04
2015-01-01 03:00:00,14.6
2015-01-01 04:00:00,14.95


In [256]:
# Count missing values per column (isna is the canonical spelling of isnull).
df.isna().sum()

price    0
dtype: int64

In [252]:
# Summary statistics for the numeric columns of df. The output below shows a
# negative minimum price, which matters later: log transforms and multiplicative
# decomposition are not defined for values <= 0.
df.describe()

Unnamed: 0,price
count,51720.0
mean,31.107946
std,15.005782
min,-58.8
25%,22.35
50%,30.18
75%,39.97
max,200.04


### Decomposition

- components:
 - Trend
 - Seasonal
 - cyclical 

- choose:
 - additive or multiplicative decomposition? Plot the residuals of both models and decide.

- statsmodels.tsa [Deterministic Processes](https://www.statsmodels.org/stable/tsa.html#)

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Additive decomposition (y = Trend + Seasonal + Residuals).
# Decompose the `price` column explicitly instead of the whole frame: a later
# cell adds a `log` column to df, and re-running this cell on the full frame
# would then decompose both columns (and fail on the NaNs in `log`).
# The data is hourly, so period=24*365 samples spans one year.
result_add = seasonal_decompose(df['price'], model='additive', period=24 * 365)
# Trailing ';' keeps Jupyter from rendering the returned Figure a second time.
result_add.plot();

In [None]:
# Multiplicative decomposition (y = Trend * Seasonal * Residuals).
# A multiplicative model is only defined for strictly positive values, and
# seasonal_decompose raises a ValueError otherwise. `price` contains negative
# values (see the minimum reported by df.describe()), so shift the series by a
# constant that brings its minimum to 1 before decomposing. When the series is
# already strictly positive the offset is 0 and nothing changes.
min_price = df['price'].min()
price_offset = 0.0 if min_price > 0 else 1.0 - min_price
price_positive = df['price'] + price_offset

# NOTE: the components describe the *shifted* series; subtract `price_offset`
# from anything recomposed out of them to get back to the original price scale.
result_mul = seasonal_decompose(price_positive, model='multiplicative', period=24 * 365)
result_mul.plot();

### Explore stationarity

In [None]:
# Log-transform the price for the stationarity plots.
# np.log is only defined for positive inputs and `price` contains non-positive
# values (negative minimum in df.describe()). Mask those explicitly so they
# become NaN, instead of letting np.log emit RuntimeWarnings and produce -inf
# for zero prices (which would poison the differenced series).
positive_price = df['price'].where(df['price'] > 0)
df['log'] = np.log(positive_price)

In [None]:
# The series is hourly, so one daily seasonal cycle is 24 observations.
# (A lag of 12 does not correspond to any seasonal period of this data; the
# rest of the notebook also reasons in multiples of 24 hours.)
seasonal_lag = 24
log_price = df['log']
log_seasonal_diff = log_price.diff(seasonal_lag)

fig, axs = plt.subplots(2, 2, figsize=(20, 8))

# Original (log-transformed) series
axs[0, 0].plot(log_price)
axs[0, 0].set_title('Original Series')

# Normal differencing
axs[0, 1].plot(log_price.diff(1))
axs[0, 1].set_title('1st Order Differencing')

# Seasonal differencing
axs[1, 0].plot(log_seasonal_diff)
axs[1, 0].set_title(f'Seasonal differencing of period {seasonal_lag}')

# Seasonal + normal differencing
axs[1, 1].plot(log_seasonal_diff.diff(1))
axs[1, 1].set_title(f'First order diff of seasonal differencing {seasonal_lag}');

In [None]:
# Augmented Dickey-Fuller test on the price, allowing at most 24 lags
# (one day of hourly observations).
adf_fields = ['Test Statistic', 'p-value', '#Lags', 'Observations']
day_ADF = adfuller(df.price, maxlag=24)
# Only the first four entries of the result tuple are reported.
day_summary = pd.Series(day_ADF[:4], index=adf_fields)
print(day_summary)

In [None]:
# Augmented Dickey-Fuller test on the price, allowing at most 24*7 lags
# (one week of hourly observations).
adf_fields = ['Test Statistic', 'p-value', '#Lags', 'Observations']
week_ADF = adfuller(df.price, maxlag=24 * 7)
# Only the first four entries of the result tuple are reported.
week_summary = pd.Series(week_ADF[:4], index=adf_fields)
print(week_summary)

In [None]:
# ADF test allowing up to one month of lags (24 hours * 30 days).
# The original note says this needs GCP / Google Colab to run, so it is
# guarded by a flag instead of being hidden in a string literal (which was also
# echoed as the cell's output). Set RUN_MONTHLY_ADF = True on a big machine.
# The lag bound was 24*7*30, i.e. 30 weeks rather than a month.
RUN_MONTHLY_ADF = False

if RUN_MONTHLY_ADF:
    month_ADF = adfuller(df.price, maxlag=24 * 30)
    print(pd.Series(month_ADF[0:4], index=['Test Statistic','p-value','#Lags','Observations']))

### Autocorrelation

In [None]:
# Plot the autocorrelation (left) and partial autocorrelation (right) of the
# price for the first 50 lags.
# plot_acf was used without being imported, and the frame has no `value`
# column -- the series lives in `price`.
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

fig, axes = plt.subplots(1, 2, figsize=(16, 3))
plot_acf(df['price'], lags=50, ax=axes[0])
# Trailing ';' suppresses the duplicate figure Jupyter would render from the return value.
plot_pacf(df['price'], lags=50, ax=axes[1], color='r');

## hyperparameters

In [None]:
# Search the (S)ARIMA hyper-parameters with pmdarima; the lower the AIC the better.
# NOTE(review): `train` is only assigned in the "SARIMAX Model" section further
# down, so on a fresh kernel this cell fails unless that split runs first --
# confirm the intended cell order.
# NOTE(review): m=12 is the seasonal period; the data is hourly -- confirm 12 is intended.
import pmdarima as pm

nonseasonal_bounds = dict(start_p=0, max_p=1, max_d=1, start_q=0, max_q=1)
seasonal_bounds = dict(start_P=0, max_P=2, max_D=1, start_Q=0, max_Q=2)

smodel = pm.auto_arima(
    train,
    seasonal=True,
    m=12,
    trace=True,
    error_action='ignore',
    suppress_warnings=True,
    **nonseasonal_bounds,
    **seasonal_bounds,
)

## SARIMAX Model

In [None]:
# Chronological train/test split: the first 150 observations are used for
# training, everything after them is held out for testing.
# NOTE(review): the original comment says the goal is to predict the last 50
# points, but on the ~51.7k-row frame loaded above this leaves far more than 50
# rows in `test` -- confirm the intended split.
# NOTE(review): no `linearized` column is visible on df in this notebook (only
# `price`, plus `log` added earlier) -- confirm where it is created.
split_at = 150
train, test = df.linearized[:split_at], df.linearized[split_at:]

In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX


def _day_of_week(index):
    """Return the weekday number (Monday=0) of every timestamp in ``index``.

    The result is a Series aligned on ``index`` so it can be used as ``exog``.
    Assumes the index holds timestamps (the `time` index shown by df.head())
    -- TODO confirm.
    """
    return pd.Series(pd.to_datetime(index).dayofweek, index=index, name='day_of_week')


# Build model
# Fixes relative to the first draft:
# - the keyword was misspelled `engod`; SARIMAX expects `endog`;
# - df has no `electricity_price` / `day_of_week` columns, so the weekday
#   regressor is derived from the index;
# - the model is fit on `train` only. Fitting on the full series and then
#   forecasting len(test) steps would predict beyond the end of the data and
#   leak the test period into the fit.
# NOTE(review): the seasonal period is 12 although the data is hourly -- confirm.
sarima = SARIMAX(endog=train,
                 exog=_day_of_week(train.index),
                 order=(3, 0, 0), seasonal_order=(0, 1, 2, 12)
                )
sarima = sarima.fit()

# Forecast
# A model with exogenous regressors needs their future values for every
# out-of-sample step, so pass the weekdays of the test period.
results = sarima.get_forecast(len(test), exog=_day_of_week(test.index))
forecast = results.predicted_mean
# The confidence level belongs to conf_int(), not get_forecast().
confidence_int = results.conf_int(alpha=0.05)  # 95% confidence

## Recomposition

In [None]:
# Re-compose back to the initial time series: undo the log with np.exp and
# multiply by the seasonal component of the multiplicative decomposition.
seasonal_train = result_mul.seasonal[0:150]
seasonal_test = result_mul.seasonal[150:]

forecast_recons = np.exp(forecast) * seasonal_test
train_recons = np.exp(train) * seasonal_train
test_recons = np.exp(test) * seasonal_test

# `confidence_int` may be a DataFrame, which cannot be indexed with [:, 0].
# Convert to a plain array first so column 0 / column 1 (lower / upper bound)
# can be selected positionally, whatever type conf_int() returned.
conf_bounds = np.exp(np.asarray(confidence_int))
lower_recons = conf_bounds[:, 0] * seasonal_test
upper_recons = conf_bounds[:, 1] * seasonal_test

# Plot
# NOTE(review): plot_forecast is neither defined nor imported in the visible
# notebook -- confirm where it comes from and its argument order.
plot_forecast(forecast_recons, train_recons, test_recons, lower_recons.values, upper_recons.values)

## Inference validity

In [None]:
# Plot the residuals to see whether they are normally distributed.
# The fitted results object in this notebook is `sarima`; `arima` was never
# defined, so the original line raised a NameError.
residuals = pd.DataFrame(sarima.resid)

# Left: residuals over time; right: kernel density estimate of their distribution.
fig, ax = plt.subplots(1, 2, figsize=(16, 3))
residuals.plot(title="Residuals", ax=ax[0])
residuals.plot(kind='kde', title='Density', ax=ax[1])
plt.show()

# Prophet

[reference for model_b](https://towardsdatascience.com/multivariate-time-series-forecasting-using-random-forest-2372f3ecbad1)