In [9]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import seaborn as sns
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
import matplotlib.pyplot as plt
from scipy import signal
from pandas.plotting import autocorrelation_plot
from pandas.plotting import lag_plot
from pmdarima.arima.utils import ndiffs
from pmdarima.arima import auto_arima
from pmdarima.arima import ADFTest
from pmdarima import acf
from math import sqrt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=ValueError)

In [None]:
# Load the weekly Amazon export and reduce it to the KIND client's sales series.
# NOTE(review): absolute, machine-specific path — consider a configurable data dir.
df = pd.read_csv('G:/My Drive/IN/Data/Prophet/amazon.csv', parse_dates=['week'])
# .copy() so the column assignment below writes to an independent frame rather
# than to a filtered selection (avoids SettingWithCopyWarning).
df = df[df['client_name'] == 'KIND'].copy()
df = df.drop_duplicates(subset=['sales', 'week'])
# Re-coerce in case parse_dates left 'week' as a non-datetime dtype.
df['week'] = pd.to_datetime(df['week'])
df = df.sort_values(by=['week'])
# Index by date (index is named 'index') while keeping 'week' as a regular column.
df = df.set_index(df['week'].rename('index'))
df = df[['week', 'sales']]

The following metrics are excellent in different situations; for example, RMSE is
excellent for comparing similar models.

In [None]:
def forecast_accuracy(forecast, actual):
    """Compute a set of point-forecast accuracy metrics.

    Both inputs are converted to flat float NumPy arrays and compared
    position by position, so pandas objects are matched by order rather
    than by index label.

    Parameters
    ----------
    forecast : array-like
        Predicted values (list, ndarray, Series or single-column DataFrame).
    actual : array-like
        Observed values, same number of elements as ``forecast``.

    Returns
    -------
    dict
        Keys ``'MAPE'``, ``'ME'``, ``'MAE'``, ``'MPE'``, ``'RMSE'``,
        ``'corr'`` and ``'minmax'``.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    # Work on plain arrays: Series[:, None] is not supported by current pandas,
    # and Series arithmetic would silently align on index labels.
    forecast = np.asarray(forecast, dtype=float).ravel()
    actual = np.asarray(actual, dtype=float).ravel()
    if forecast.shape != actual.shape:
        raise ValueError(
            "forecast and actual must have the same length, got "
            f"{forecast.shape[0]} and {actual.shape[0]}"
        )

    error = forecast - actual
    mape = np.mean(np.abs(error) / np.abs(actual))  # MAPE
    me = np.mean(error)                             # ME
    mae = np.mean(np.abs(error))                    # MAE
    mpe = np.mean(error / actual)                   # MPE
    rmse = np.mean(error ** 2) ** .5                # RMSE
    corr = np.corrcoef(forecast, actual)[0, 1]      # corr
    # Element-wise smaller/larger of each (forecast, actual) pair.
    mins = np.minimum(forecast, actual)
    maxs = np.maximum(forecast, actual)
    minmax = 1 - np.mean(mins / maxs)               # minmax
    return {'MAPE': mape, 'ME': me, 'MAE': mae,
            'MPE': mpe, 'RMSE': rmse,
            'corr': corr, 'minmax': minmax}

An ARIMA model has three parameters of interest: p, d, and q. d is the number
of times the series is differenced (each value minus the previous one) to make
it stationary. p is the number of lagged observations (autoregressive terms)
used for prediction. q is the order of the moving-average component, i.e. the
number of lagged forecast errors included in the model.

In [None]:
def plot_df(df, x, y, title="", xlabel='Date', ylabel='Sales', dpi=100):
    """Draw ``y`` against ``x`` as a blue line on a new 12x4 inch figure and show it.

    ``df`` is accepted for call-site compatibility but is not read; the data
    plotted comes entirely from ``x`` and ``y``.
    """
    figure = plt.figure(figsize=(12, 4), dpi=dpi)
    axis = figure.gca()
    axis.plot(x, y, color='blue')
    axis.set(title=title, xlabel=xlabel, ylabel=ylabel)
    plt.show()


plot_df(df, df['week'], df['sales'], title='Sales Over Time')

An increasing trend is clearly visible, though the seasonality is not
quite as obvious, so we will test for it later.

The Augmented Dickey-Fuller Test checks for the important
condition of stationarity. This test has failed, meaning we have to
correct for non-stationarity.

In [None]:
# Augmented Dickey-Fuller test (pmdarima's ADFTest) at the 5% significance level.
adf_test = ADFTest(alpha = .05)
# Last expression of the cell, so its return value is shown as the cell output.
adf_test.should_diff(df['sales'])

In [None]:
# Positional split: first 170 rows for training, last 40 rows for testing.
train = df.iloc[:170]
test = df.iloc[-40:]
# Plot only the numeric 'sales' column; passing the whole frame would also
# try to draw the datetime 'week' column as y-values.
plt.plot(train['sales'], label='Train')
plt.plot(test['sales'], label='Test')
plt.title('Train/Test Split')
plt.xlabel('Date')
plt.ylabel('Sales')
plt.legend(loc='upper left')
plt.show()

Value = Base Level + Trend + Seasonality + Error - Additive Decomposition
Value = Base Level x Trend x Seasonality x Error - Multiplicative Decomposition

In [None]:
# Decompose the sales values both ways, using a 52-observation period.
sales_values = df['sales'].values
multiplicative_decomposition = seasonal_decompose(sales_values, model='multiplicative',
                                                  period=52)
additive_decomposition = seasonal_decompose(sales_values, model='additive', period=52)

# Enlarge the default figure size before the decomposition figures are created.
plt.rcParams.update({'figure.figsize': (20,14)})
for decomposition, heading in (
    (multiplicative_decomposition, 'Multiplicative Decomposition'),
    (additive_decomposition, 'Additive Decomposition'),
):
    decomposition.plot().suptitle(heading, fontsize=16)
    # Leave room at the top of the figure for the suptitle.
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

Both plots have a positive trend and display some seasonality, but the additive
decomposition seems to have significantly better residuals.

In [None]:
# Use a wide, short default figure for the line plots that follow.
plt.rcParams['figure.figsize'] = (12,4)
# Remove the linear trend from the sales values (scipy.signal.detrend defaults).
detrended = signal.detrend(df['sales'].to_numpy())
plt.plot(detrended)
plt.title('Sales Detrended', fontsize=16)

The plot no longer seems to be increasing in trend, hence we have detrended it.

In [None]:
# Estimate the multiplicative seasonal component (52-observation period) and
# divide it out of the raw sales values.
result_mul = seasonal_decompose(df['sales'].values, model='multiplicative', period=52)
deseasonalized = df['sales'].values / result_mul.seasonal
plt.plot(deseasonalized)
plt.title('Sales Deseasonalized', fontsize=16)
# Render the figure (the original ended with an argument-less plt.plot(),
# which draws nothing).
plt.show()

The plot does not show a strong removal of seasonality, so we move on to some testing.

The following plot does not have any drastic spikes that suggest strong seasonality.

In [None]:
# Set the default figure size and resolution for the plots that follow.
plt.rcParams.update({'figure.figsize':(10,4), 'figure.dpi':120})
# pandas' autocorrelation plot of the sales values, passed as a plain list.
autocorrelation_plot(df['sales'].tolist())

Autocorrelation refers to correlation of a series to its own lags. Partial autocorrelation
refers to the correlation to a lag without reference to the lags between.

In [None]:
# Side-by-side ACF (left) and PACF (right) of the sales values up to lag 52.
fig, axes = plt.subplots(1, 2, figsize=(16, 3), dpi=100)
acf_axis, pacf_axis = axes
sales_list = df['sales'].tolist()
plot_acf(sales_list, lags=52, ax=acf_axis)
plot_pacf(sales_list, lags=52, ax=pacf_axis)

Along with the above plots, the following metrics will quantify each degree of
lag. Perfect autocorrelation is one, and positive means that the next value in
the series will likely be higher.

In [None]:
# Autocorrelation of the sales series at lags 1 through 12 (weeks).
sales_series = df['sales']
(ac1, ac2, ac3, ac4, ac5, ac6,
 ac7, ac8, ac9, ac10, ac11, ac12) = (
    sales_series.autocorr(lag=lag) for lag in range(1, 13)
)

# One label per lag, printed in order next to its coefficient.
lag_labels = [
    "One week Lag: ", "Two week Lag: ", "Three week Lag: ", "Four Week Lag: ",
    "Five Week Lag: ", "Six Week Lag: ", "Seven Week Lag: ", "Eight Week Lag: ",
    "Nine Week Lag: ", "Ten Week Lag: ", "Eleven Week Lag: ", "Twelve Week Lag: ",
]
lag_values = (ac1, ac2, ac3, ac4, ac5, ac6, ac7, ac8, ac9, ac10, ac11, ac12)
for lag_label, lag_value in zip(lag_labels, lag_values):
    print(lag_label, lag_value)

The rather strong correlations at all twelve weekly lags show that autocorrelation
is something to be mindful of in this model.

In [None]:
# Hide left y-ticks and pad subplot titles for the lag-plot grid.
plt.rcParams.update({'ytick.left' : False, 'axes.titlepad':10})
fig, axes = plt.subplots(3, 4, figsize=(16, 8), sharex=True, sharey=True, dpi=100)
# One scatter of sales against its lagged self per panel, lags 1..12.
for lag, ax in enumerate(axes.flatten()[:12], start=1):
    lag_plot(df['sales'], lag=lag, ax=ax, c='blue')
    ax.set_title('Lag ' + str(lag))
fig.suptitle('Lag Plots of Sales', y=1.05)
fig.tight_layout()
plt.show()

The linear pattern, which weakens as the lag increases, agrees with the
lag scores above and suggests positive autocorrelation that is strongest at the first week.

In [None]:
# Positional split of the sales series: first 105 observations train, rest test.
train = df['sales'][:105]
test = df['sales'][105:]
# auto_arima searches candidate orders and returns the best model already
# fitted on `train`, so no separate .fit() call is needed.
model = auto_arima(train, trace=True, error_action='ignore', suppress_warnings=True)
forecast = model.predict(n_periods=len(test))
# Strip any index the forecast carries before attaching the test dates;
# building a DataFrame from a labelled Series with a different index would
# reindex it and could yield all-NaN predictions.
forecast = pd.DataFrame({'Prediction': np.asarray(forecast)}, index=test.index)

SARIMAX MODEL

In [None]:
# SARIMAX with order (1, 1, 1) and an all-zero seasonal order, i.e. no seasonal
# terms, built on the full sales series (not only the training slice).
# NOTE(review): freq='W' must agree with the dates in the index — confirm.
mod = sm.tsa.statespace.SARIMAX(df['sales'],
                                order=(1, 1, 1),
                                seasonal_order=(0, 0, 0, 0),
                                enforce_stationarity=True,
                                enforce_invertibility=True,
                                freq='W')
results = mod.fit()
# Diagnostic plots for the fitted model.
results.plot_diagnostics(figsize=(12, 12))
plt.show()

SARIMAX Diagnostics

In [None]:
# `forecast` is the single-column 'Prediction' frame built in the auto_arima
# cell; squeeze() collapses it to a Series before scoring against `test`.
# Note that this scores the auto_arima forecast, not the SARIMAX `results`.
forecast = forecast.squeeze()
forecast_accuracy(forecast, test)

In [None]:
# Display the summary of the fitted SARIMAX results as the cell output.
results.summary()

In [None]:
# Two-column working frame (date and sales) used by the cells below.
df_small = df[['week', 'sales']]

In [None]:
# statsmodels ADF test on the non-null sales values; the first two entries of
# the result are the test statistic and its p-value.
result = adfuller(df_small.sales.dropna())
adf_statistic, adf_p_value = result[0], result[1]
print('ADF Statistic: %f' % adf_statistic)
print('p-value: %f' % adf_p_value)

In this case, we fail the ADF test and will examine differencing
as a method to make our series stationary.

In [None]:
plt.rcParams.update({'figure.figsize':(12,8), 'figure.dpi':120})

# Original series plus its first and second differences, each computed once.
sales_level = df_small.sales
sales_diff1 = sales_level.diff()
sales_diff2 = sales_diff1.diff()

fig, axes = plt.subplots(3, 2, sharex=False)
# (series to draw, panel title, series handed to plot_acf) for each row.
panels = (
    (sales_level, 'Original Series', sales_level),
    (sales_diff1, '1st Order Differencing', sales_diff1.dropna()),
    (sales_diff2, '2nd Order Differencing', sales_diff2.dropna()),
)
for row, (series, heading, acf_input) in enumerate(panels):
    axes[row, 0].plot(series)
    axes[row, 0].set_title(heading)
    plot_acf(acf_input, ax=axes[row, 1])
fig.tight_layout()
plt.show()

From the first-order differencing plot and its autocorrelation plot, we can see
that the series has achieved a decent amount of stationarity.

In [None]:
# Fix Training and Test Data for Remaining Functions
df_small = df_small.sort_values(by=['week'])
df_model = df_small.set_index('week')
df_small = df_small.reset_index()
df_small = df_small[['week', 'sales']]
train = df_small['sales'][:105]
test = df_small['sales'][105:]
train_model = df_model['sales'][:105]
test_model = df_model['sales'][105:]

Now we will test the auto_arima model to see if it performs better.

In [None]:
# Fit on everything except the final 50 observations. The following cells
# forecast 50 periods and score them against the last 50 rows, so fitting on
# the full series would both leak the test data into the model and make the
# forecast horizon start after the test window instead of at its beginning.
train = df_model['sales'].iloc[:-50]
# Stepwise seasonal search with d=1 and D=1 fixed.
# NOTE(review): m=12 is used here while the decompositions above use a
# 52-observation period on this weekly data — confirm the intended season length.
arima_model = auto_arima(train, start_p=0, d=1, start_q=0, max_p=5,
                         max_d=5, max_q=5, start_P=0, D=1, start_Q=0,
                         max_P=5, max_D=5, max_Q=5, m=12, seasonal=True,
                         error_action='warn', trace=True,
                         suppress_warnings=True, stepwise=True,
                         random_state=13, n_fits=50)
arima_model.summary()

In [None]:
# Positional split used for plotting and scoring the seasonal model.
train = df_model['sales'][:160]
test = df_model['sales'][-50:]
# Strip any index the forecast carries before attaching the test dates;
# building a DataFrame from a labelled Series with a different index would
# reindex it and could yield all-NaN predictions.
prediction = pd.DataFrame(np.asarray(arima_model.predict(n_periods=50)),
                          index=test.index)
# Level-shift the forecast so its first point equals the first test value.
# NOTE(review): this uses a test observation to adjust the forecast, which
# flatters the accuracy metrics — confirm this is intended.
shift = test.iloc[0] - prediction.iloc[0,0]
prediction = prediction + shift

In [None]:
# Overlay the training data, held-out data and forecast on one set of axes.
for series, series_label in ((train, 'Train'), (test, 'Test'), (prediction, 'Prediction')):
    plt.plot(series, label=series_label)
plt.gca().set(title='Predicted Sales', xlabel='Date', ylabel='Sales')
plt.legend(loc='upper left', fontsize=8)
plt.show()

Arima Model Statistics

In [None]:
# Collapse the single-column prediction frame to a Series, then score it
# against the held-out observations.
prediction = prediction.squeeze()
forecast_accuracy(prediction, test)

Note the RMSE is significantly higher in the second model.

Just to demonstrate one more train/test plot with error
above and below, voila.

In [None]:
def forecast_to_df(model, steps=12):
    """Return a frame of forecast bounds and point predictions.

    Calls ``model.get_forecast(steps=steps)`` and combines its ``conf_int()``
    frame with its ``predicted_mean``; the resulting columns are named
    ``'lower'``, ``'upper'`` and ``'pred'`` in that order.
    """
    outlook = model.get_forecast(steps=steps)
    # Append the point forecast as a third column next to the two bounds.
    pred_df = outlook.conf_int().assign(pred=outlook.predicted_mean)
    pred_df.columns = ['lower', 'upper', 'pred']
    return pred_df

In [None]:
# Forecast as many steps as there are test observations from the SARIMAX fit.
# NOTE(review): `results` was fitted on the full df['sales'] series, so this
# forecast presumably starts after the last observed date rather than at the
# start of `test` — confirm before comparing the two.
pred_df = forecast_to_df(results, steps = len(test))

# Point forecasts only.
pred = pred_df['pred']

In [None]:
def plot_train_test_pred(train,test,pred_df):
    """Plot train and test series with the forecast and its interval band.

    ``pred_df`` must provide ``'pred'``, ``'lower'`` and ``'upper'`` columns;
    its index is used as the x-axis of the shaded band. Returns the
    ``(fig, ax)`` pair.
    """
    fig, ax = plt.subplots(figsize=(12, 7))

    # Observed data: circle markers on both series.
    for observed, series_label in ((train, 'Train'), (test, 'Test')):
        ax.plot(observed, label=series_label, marker='o')
    # Forecast: thick dashed line.
    ax.plot(pred_df['pred'], label='Prediction', ls='--', linewidth=3)

    # Shade the region between the lower and upper bounds.
    ax.fill_between(x=pred_df.index, y1=pred_df['lower'], y2=pred_df['upper'],
                    alpha=0.3)
    ax.set_title('Model Validation', fontsize=22)
    ax.legend(loc='upper left')
    fig.tight_layout()
    return fig, ax

plot_train_test_pred(train, test, pred_df)