In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')

import arviz as az
import pymc3 as pm
from theano import tensor as tt

from fitter import Fitter, get_common_distributions, get_distributions
# NOTE(review): there is no top-level module named `mean_squared_error`; this
# was probably meant to be `from sklearn.metrics import mean_squared_error`.
import mean_squared_error

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
import matplotlib.pyplot as plt
from scipy import signal
from pandas.plotting import autocorrelation_plot
from pandas.plotting import lag_plot

from pmdarima.arima import auto_arima
from pmdarima.arima import ADFTest

In [None]:
# Show the working directory so it is clear where relative paths would resolve.
print(os.getcwd())

# NOTE(review): absolute, machine-specific path -- the notebook only runs where
# this exact file exists. Consider a configurable data directory.
df = pd.read_csv('C:/Users/norri/Documents/GitHub/mercury-ds/attribution/test.csv')

# Keep an independent backup. Plain assignment (`df_bu = df`) only creates a
# second name for the same object, so in-place edits to `df` would also change
# the "backup".
df_bu = df.copy()

# info() prints its report directly; describe() returns a frame, so it has to
# be the last expression of the cell to be rendered.
df.info()
df.describe()

In [None]:
def plot_df(df, x, y, title="", xlabel='Date', ylabel='Sales', dpi=100):
    """Draw one blue line of ``y`` against ``x`` on a 12x4 inch figure and show it.

    Parameters
    ----------
    df : accepted for call-site compatibility; the body does not read it.
    x, y : the values plotted on the horizontal and vertical axes.
    title, xlabel, ylabel : text applied to the axes.
    dpi : resolution passed to the figure.
    """
    _, ax = plt.subplots(figsize=(12, 4), dpi=dpi)
    ax.plot(x, y, color='blue')
    ax.set(title=title, xlabel=xlabel, ylabel=ylabel)
    plt.show()


plot_df(df, df['week'], df['sales'], title='Sales Over Time')

In [None]:
# these drop column snippets are not used often here,
# but have been useful, especially with large datasets

# df = df.drop(['DATE',
#     ], axis=1)

# dtype 'O' is pandas' object dtype, so this list holds the non-numeric
# (string / mixed) columns. The printed labels say so; they previously called
# these columns "numerical/continuous".
segment = [var for var in df.columns if df[var].dtype == 'O']
print('There are {} categorical/object variables\n'.format(len(segment)))
print('The categorical/object variables are :\n\n', segment)

# Fraction of missing values in each object column.
print(df[segment].isnull().sum() / len(df))

df.info()
print(df.nunique())

# describe() returns a frame rather than printing, so it goes last in the cell
# to be rendered instead of being silently discarded.
df.describe()

In [None]:
# use VIF along with variable importance to check new dataframes

# VIF and the correlation matrix are computed on numeric columns only; object
# columns cannot be passed to the regression behind variance_inflation_factor.
numeric_df = df.select_dtypes(include=np.number)

# Drop every row that contains NaN or +/-inf in any numeric column.
vif_df = numeric_df[~numeric_df.isin([np.nan, np.inf, -np.inf]).any(axis=1)]

# One VIF value per column, computed against all the other columns.
X = vif_df
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]
print(vif_data)

# Spearman rank correlation between the same columns, rounded for annotation.
corr_vif = vif_df.corr(method="spearman").round(2)

# Hide the upper triangle (including the diagonal): the matrix is symmetric.
mask = np.triu(np.ones_like(corr_vif, dtype=bool))
fig, ax = plt.subplots(figsize=(18, 18))
cmap = sns.diverging_palette(250, 1, as_cmap=True)

# Plot the correlation matrix itself; the mask is built from it, so the shapes
# line up.
sns.heatmap(corr_vif, annot=True, mask=mask, cmap=cmap, vmax=1, vmin=-1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
corr_vif.describe()

In [None]:
# Correlation heatmap of df_num on a 20x20 inch canvas.
# NOTE(review): neither `plot_corr_heatmap` nor `df_num` is imported or defined
# in the cells visible here -- confirm where they come from before relying on
# Restart & Run All.
heatmap_size = (20, 20)
viz = plot_corr_heatmap(df_num, figsize=heatmap_size)
viz.view()

In [None]:
# Augmented Dickey-Fuller check on the sales series at the 5% level; the
# result of should_diff is shown as the cell output.
significance = .05
adf_test = ADFTest(alpha=significance)
adf_test.should_diff(df['sales'])

In [None]:
# One KDE panel per sales measure in df_num, laid out on a 2x3 grid.
figure, axes = plt.subplots(2, 3, figsize=(15, 15))

kde_columns = [
    'Sum of Dollar Sales',
    'Dollar Share of Category',
    'Sum of Dollar Sales Any Display',
    'Sum of Dollar Sales Any Merch',
    'Sum of Dollar Sales No Merch (non-promo)',
    'Sum of Dollar Sales Any Price Reduction',
]

# axes.flat walks the grid row by row, so the panels are filled in list order.
for panel, column in zip(axes.flat, kde_columns):
    sns.kdeplot(ax=panel, x=column, data=df_num)

plt.show()

Fit a set of candidate distributions to each variable to find the one that
best describes the data, along with its fitted parameters.
The result is useful when choosing any data transformations.

In [None]:
# Candidate distributions tried for every variable.
dist_list = ['gamma', 'expon', 'cauchy', 'norm', 'uniform']

# Only numeric columns are fitted: iterating the frame directly would also
# hand text columns to Fitter, which needs numeric samples.
for var in df.select_dtypes(include=np.number).columns:
    dist_test = df[var].dropna().values

    # A column that is entirely missing leaves nothing to fit.
    if dist_test.size == 0:
        print(var)
        print('no non-null values; skipped')
        continue

    # timeout caps the time spent on each candidate distribution
    # (NOTE(review): units/semantics per the fitter docs -- confirm).
    f = Fitter(dist_test, distributions=dist_list, timeout=60)
    f.fit()
    print(var)
    print(f.summary(plot=False))
    print(f.get_best(method='sumsquare_error'))

In [None]:
# Plot the sales series after scipy's detrend (default settings) on a 12x4 canvas.
plt.rcParams['figure.figsize'] = (12, 4)
sales_values = df['sales'].values
detrended = signal.detrend(sales_values)
plt.plot(detrended)
plt.title('Sales Detrended', fontsize=16)

In [None]:
# Multiplicative decomposition with a 52-observation period
# (presumably one year of weekly data -- confirm against the `week` column).
sales_values = df['sales'].values
result_mul = seasonal_decompose(sales_values, model='multiplicative', period=52)

# In a multiplicative model the seasonal component is a factor, so dividing it
# out leaves the deseasonalized series.
deseasonalized = sales_values / result_mul.seasonal
plt.plot(deseasonalized)
plt.title('Sales Deseasonalized', fontsize=16)

# Render the figure; the previous argument-less plt.plot() drew nothing and
# only echoed an empty list as the cell output.
plt.show()

In [None]:
# Autocorrelation plot of the sales series on a 10x4 inch, 120 dpi canvas.
plt.rcParams['figure.figsize'] = (10, 4)
plt.rcParams['figure.dpi'] = 120
sales_list = df['sales'].tolist()
autocorrelation_plot(sales_list)

In [None]:
# this section should come later; for larger datasets takes too long to run
# with too little return in information
cols = 5
num_cols = df.select_dtypes(exclude='object').columns

# Size the grid to the data instead of a fixed 20 rows: a fixed grid produces
# a 25x100 inch figure even for a handful of columns and cannot hold more
# than 100 panels. At least one row is kept so the figure height is never zero.
rows = max(1, int(np.ceil(len(num_cols) / cols)))

# Each panel gets roughly a 5x5 inch cell.
fig = plt.figure(figsize=(cols * 5, rows * 5))
for i, col in enumerate(num_cols):
    # Subplot positions are 1-based.
    ax = fig.add_subplot(rows, cols, i + 1)
    sns.histplot(x=df[col], ax=ax)
fig.tight_layout()
plt.show()