In [3]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import seaborn as sns
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
import matplotlib.pyplot as plt
from scipy import signal
from pandas.plotting import autocorrelation_plot
from pandas.plotting import lag_plot
from pmdarima.arima.utils import ndiffs
from pmdarima.arima import auto_arima
from pmdarima.arima import ADFTest
from pmdarima import acf
from math import sqrt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from IPython.display import display, Markdown
from IPython.display import Math
import datetime
import math
from sklearn.metrics import mean_squared_error

import warnings
# Suppress every FutureWarning so they do not clutter the cell output.
warnings.simplefilter(action='ignore', category=FutureWarning)
# NOTE(review): ValueError is an exception class, not a Warning subclass, so this
# filter will not match any emitted warning. Confirm which warning category was
# actually meant to be silenced here.
warnings.simplefilter(action='ignore', category=ValueError)

In [4]:
# Location of the raw extract. NOTE(review): this is an absolute, machine-specific
# path; point DATA_PATH at your own copy of the file before running.
DATA_PATH = 'C:/Users/norri/Desktop/tyson_EDA.csv'
df = pd.read_csv(DATA_PATH)

# NOTE(review): the cells below index df['ds'] and df['y']; the KeyError recorded in
# the next cell's output indicates this file does not provide a 'y' column, so a
# rename/selection step appears to be missing here -- confirm the source columns.

# A bare expression in the middle of a cell is evaluated and then discarded, so the
# summary statistics are displayed explicitly; info() prints its own report.
display(df.describe())
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54039 entries, 0 to 54038
Data columns (total 48 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   ClientId                                      54039 non-null  int64  
 1   Program Id                                    54039 non-null  int64  
 2   Program Name                                  54039 non-null  object 
 3   Retailers                                     54039 non-null  object 
 4   TacticId                                      54039 non-null  int64  
 5   Tactic                                        54039 non-null  object 
 6   CategoryId                                    54039 non-null  int64  
 7   Tactic Category                               54039 non-null  object 
 8   VendorId                                      53135 non-null  float64
 9   Vendor                                        54039 non-null 

In [5]:
# Share of rows (in percent) whose y is null, followed by the number of rows
# whose y is exactly zero.
print('Missing Values')
print(df['y'].isna().sum() / df.shape[0] * 100)
print('Zeroes')
print(df['y'].eq(0).sum())

Missing Values


KeyError: 'y'

Initially we have over half missing, as well as a few zeroes which can be problematic during forecasting. I'll drop the zeroes since they make up a small part of the dataset.

In [None]:
# Keep every row whose y is not exactly zero (rows with a null y are retained,
# because a null compares as "not equal" to 0).
df = df.loc[df['y'].ne(0)]

In [None]:
# Re-run the check after removing zero rows: percentage of null y values,
# then the count of y values equal to zero.
print('Missing Values')
print(df['y'].isna().sum() / df.shape[0] * 100)
print('Zeroes')
print(df['y'].eq(0).sum())

In [None]:
# Number of distinct non-null values in each column (pandas defaults spelled out).
df.nunique(axis=0, dropna=True)

Because there are so many duplicated y values, I looked through the data: for the same date, the revenue is replicated several times over. Here I group by date and take the average revenue, so that there is a single revenue value per date.

In [None]:
# Collapse the replicated rows to a single value per date: the mean of y within
# each ds group. The column is selected with ['y']; the previous .apply('y') only
# produced the same result through GroupBy.apply's string-dispatch fallback, which
# is easy to misread and not a supported way to pick a column.
df_a = df.groupby('ds')['y'].mean().reset_index()

# Percentage of dates whose averaged y is null, then the count of exact zeroes.
print('Missing Values')
print(df_a['y'].isnull().sum() / len(df_a) * 100)
print('Zeroes')
print((df_a['y'] == 0).sum())

This is something we also had to do when originally modeling Tyson: all of the NAs have to be dropped, which removes about 30,000 of the original observations.

In [None]:
# Drop the dates that have no revenue value. This operates on the per-date
# aggregate (df_a) rather than the raw frame: reassigning from df would throw away
# the one-row-per-date averaging done above, and a bare dropna() on the raw frame
# would also discard rows for nulls in unrelated columns.
# NOTE(review): confirm that filtering the aggregated frame is the intended behavior.
df_a = df_a.dropna(subset=['y'])

The date values aren't daily; they skip a day here or there, so I decided to group them into weeks.

In [None]:
# Build the weekly revenue series without modifying df_a: assign() returns a new
# frame, so this cell can be re-run safely and never writes into a filtered copy.
# The parsed date is shifted back 7 days before weekly binning -- presumably so each
# bin is labelled with the start of its week rather than the end; confirm.
# NOTE(review): summing an empty weekly bin yields 0, so weeks with no rows show up
# as zero revenue instead of missing.
weekly = (
    df_a
    .assign(date=lambda frame: pd.to_datetime(frame['ds']) - pd.to_timedelta(7, unit='d'))
    .groupby(pd.Grouper(key='date', freq='W'))['y']
    .sum()
    .reset_index()
    .rename(columns={'y': 'revenue'})
)

In [None]:
# Percentage of weeks with a null revenue, then the number of weeks whose
# revenue is exactly zero.
print('Missing Values')
print(weekly['revenue'].isna().sum() / weekly.shape[0] * 100)
print('Zeroes')
print(weekly['revenue'].eq(0).sum())

There are still seven weeks at zero, so they have to be dropped before applying the later methods.

In [None]:
# Keep only the weeks whose summed revenue is not exactly zero.
weekly = weekly.loc[weekly['revenue'].ne(0)]

In [None]:
# Same check after removing zero weeks: percentage of null revenue values,
# then the count of zero-revenue weeks.
print('Missing Values')
print(weekly['revenue'].isna().sum() / weekly.shape[0] * 100)
print('Zeroes')
print(weekly['revenue'].eq(0).sum())

In [None]:
def plot_df(df, x, y, title="", xlabel='Date', ylabel='Sales', dpi=100):
    """Draw a single blue line chart of ``y`` against ``x`` and show it.

    Parameters
    ----------
    df : accepted for call compatibility; the function body does not read it.
    x, y : the values plotted on the horizontal and vertical axes.
    title, xlabel, ylabel : text applied to the axes.
    dpi : resolution of the 12x4 inch figure.
    """
    fig, ax = plt.subplots(figsize=(12, 4), dpi=dpi)
    ax.plot(x, y, color='blue')
    ax.set(title=title, xlabel=xlabel, ylabel=ylabel)
    plt.show()


# Weekly revenue over time.
plot_df(df=weekly, x=weekly['date'], y=weekly['revenue'], title='Sales Over Time')

The plot shows massive spikes and is wildly inconsistent. The range from the minimum to the maximum appears to span five orders of magnitude.

In [None]:
# Summary statistics (count, mean, spread, quartiles, extremes) of weekly revenue.
weekly.loc[:, 'revenue'].describe()

In [None]:
# Baseline model: SARIMAX with order (1, 1, 1) and an all-zero seasonal order,
# with stationarity and invertibility both enforced.
mod = sm.tsa.statespace.SARIMAX(
    endog=weekly['revenue'],
    order=(1, 1, 1),
    seasonal_order=(0, 0, 0, 0),
    enforce_stationarity=True,
    enforce_invertibility=True,
)

# Fit the model, then render its diagnostic plots on a 12x12 inch figure.
results = mod.fit()
results.plot_diagnostics(figsize=(12, 12))

The Q-Q plot shows that ARIMA is fitting very poorly here. The following autocorrelation plot is not great, but not terrible. Still, given the uncertainty the data is showing, it is hard to trust it.

In [None]:
# Set the default figure size and resolution, then draw the autocorrelation
# plot of the weekly revenue values.
plt.rcParams['figure.figsize'] = (10, 4)
plt.rcParams['figure.dpi'] = 120
autocorrelation_plot(weekly['revenue'].tolist())

I'll give auto ARIMA a chance to perfect the parameters to see if it's any better.

In [None]:
# Let auto_arima search for the model orders on the full weekly revenue series.
train = weekly['revenue']
mod_auto_arima = auto_arima(
    train,
    # Non-seasonal terms: starting values, fixed differencing d=1, upper bounds.
    start_p=0, start_q=0, d=1,
    max_p=5, max_q=5, max_d=5,
    # Seasonal terms: starting values, fixed differencing D=1, upper bounds,
    # with a seasonal period of m=12 observations.
    seasonal=True, m=12,
    start_P=0, start_Q=0, D=1,
    max_P=5, max_Q=5, max_D=5,
    # Search strategy and reporting.
    stepwise=True, n_fits=50, random_state=13,
    trace=True, error_action='warn', suppress_warnings=True,
)

In [None]:
# Hold-out evaluation: fit on everything except the last HOLDOUT_WEEKS observations
# and forecast exactly that many steps, so the forecast can be compared with actuals.
# Fixes relative to the previous version of this cell:
#   * train/test were inverted (train was the last 12 rows, test the first 12);
#   * the forecast came from mod_auto_arima, which was fitted on the full series
#     (including the hold-out), instead of the model fitted on train here;
#   * auto_arima already returns a fitted model, so the extra fit() call was redundant.
# NOTE(review): the original comment on the test line read "missing pass sixteen
# weeks"; its meaning is unclear -- confirm whether a different hold-out was intended.
HOLDOUT_WEEKS = 12
train = weekly['revenue'][:-HOLDOUT_WEEKS]
test = weekly['revenue'][-HOLDOUT_WEEKS:].reset_index()

model = auto_arima(train, trace=True, error_action='ignore', suppress_warnings=True)
forecast = model.predict(n_periods=len(test))

# Pair forecast and actuals positionally; converting both to plain arrays avoids
# any index misalignment between the prediction and the hold-out slice.
fcst = pd.DataFrame({
    'forecast': np.asarray(forecast),
    'test': test['revenue'].to_numpy(),
})

It's a truly terrible forecast. The 80 preceding values had no real capability of predicting the following twelve.

In [None]:
# Overlay the forecast and the held-out actuals on one set of axes.
fig, ax = plt.subplots()
ax.plot(fcst['forecast'], label='Forecast')
ax.plot(fcst['test'], label='Actuals')
leg = ax.legend()
plt.show()