In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# import xgboost as xgb
# from xgboost import XGBRegressor
from fitter import Fitter
# from scipy import signal
from pandas.plotting import autocorrelation_plot
from pmdarima.arima import ADFTest
# from sklearn import preprocessing
# from sklearn.metrics import mean_squared_error
from sklearn.impute import KNNImputer
# from sklearn.preprocessing import MinMaxScaler
# from sklearn.impute import SimpleImputer
# IterativeImputer is experimental: the enabling import below must run before it is imported.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer
from sklearn import linear_model
sns.set_style('darkgrid')
# import arviz as az
# import pymc3 as pm
# from theano import tensor as tt
# from fitter import Fitter, get_common_distributions, get_distributions
# import mean_squared_error
# from statsmodels.tsa.stattools import adfuller
# from statsmodels.tsa.seasonal import seasonal_decompose
# from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
# from statsmodels.tsa.arima.model import ARIMA
# from statsmodels.stats.outliers_influence import variance_inflation_factor
# from statsmodels.tools.tools import add_constant
# from sklearn.metrics import accuracy_score,confusion_matrix
# from pandas.plotting import lag_plot
# from pmdarima.arima import auto_arima
%matplotlib inline
# import shap

In [None]:
# Directory that holds the pivoted CSV exports. Set the MMM_DATA_DIR environment
# variable to run the notebook on another machine; the original location stays
# the default, so nothing changes when the variable is unset.
DATA_DIR = os.environ.get('MMM_DATA_DIR', 'G:/My Drive/To_Do/MMM/Raw_Hain_Data/Pivot')
os.chdir(DATA_DIR)

In [None]:
# Alternative inputs, currently disabled:
# smd_pivot = pd.read_csv('smd_pivot.csv')
# ts_pivot = pd.read_csv('ts_pivot.csv')
# ct = pd.read_csv('ct_spend.csv') # need information on revenue or impressions
# Relative file name, so it is resolved against the current working directory.
dm_pivot = pd.read_csv('dm_pivot_2.csv')

This summary of different statistics for the original dataset can be compared to the .describe()
of the imputed dataset to see how it changed (or didn't change).

In [None]:
# `df` is a second name for the same DataFrame object as `dm_pivot`; no copy is made.
df = dm_pivot
# Last expression of the cell, so the summary-statistics table is shown as the cell output.
df.describe()

In [None]:
def missing_values(df):
    """Print a per-column report of missing values.

    For every column of ``df`` the report lists the number of null entries
    ("Count Missing") and that number as a percentage of the row count,
    rounded to two decimals ("Percent Missing").

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        None. The report is printed, not returned.
    """
    columns = list(df.columns)
    null_counts = df[columns].isnull().sum()
    null_share = np.array(null_counts * 100 / len(df)).round(2)
    report = pd.DataFrame(
        {"Count Missing": null_counts, "Percent Missing": null_share},
        index=columns,
    )
    print(report)


def dickey_fuller(df, column='revenue', alpha=.05):
    """Run pmdarima's Augmented Dickey-Fuller check on one column of ``df``.

    Args:
        df: DataFrame that holds the time series.
        column: Name of the column to test. Defaults to 'revenue', the
            column that used to be hard-coded.
        alpha: Significance level handed to ``ADFTest``. Defaults to .05,
            the value that used to be hard-coded.

    Returns:
        The value returned by ``ADFTest.should_diff`` for the chosen column.
        Per the pmdarima documentation this is a (p-value, should-difference
        flag) pair -- confirm against the installed version.

    Raises:
        KeyError: If ``column`` is not a column of ``df``.
    """
    adf_test = ADFTest(alpha=alpha)
    print('A value of True means that the ADFTest null hypothesis that the time series is non-stationary is correct.')
    return adf_test.should_diff(df[column])


def unique(df):
    """Print a per-column report of distinct values.

    For every column of ``df`` the report lists the number of distinct
    non-null values ("Count Unique") and that number as a percentage of
    the row count, rounded to two decimals ("Percent Unique").

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        None. The report is printed, not returned.
    """
    distinct_counts = df.nunique()
    distinct_share = np.array(100 * distinct_counts / len(df.index)).round(2)
    report = pd.DataFrame(
        {"Count Unique": distinct_counts, "Percent Unique": distinct_share},
        index=list(df.columns),
    )
    print(report)


def corr_plot(df):
    """Draw a heatmap of the pairwise Pearson correlations of ``df``.

    The 'DATE' column is excluded when present. Correlations are rounded to
    two decimals and annotated on the cells; only the lower triangle is
    shown. A new 18x18 inch figure is created for the plot.

    Args:
        df: pandas DataFrame whose columns are correlated with each other.

    Returns:
        None. The heatmap is drawn on a newly created figure.
    """
    # errors='ignore' lets the function also accept frames from which 'DATE'
    # has already been removed, instead of raising KeyError.
    features = df.drop(columns=['DATE'], errors='ignore')
    corr = features.corr(method="pearson").round(2)
    # Mask the upper triangle and the diagonal: they only repeat the lower
    # triangle or show the trivial self-correlation.
    mask = np.triu(np.ones_like(corr, dtype=bool))
    _, ax = plt.subplots(figsize=(18, 18))
    cmap = sns.diverging_palette(250, 1, as_cmap=True)
    # Pass the axes explicitly rather than relying on the "current axes".
    sns.heatmap(corr, annot=True, mask=mask, cmap=cmap,
                vmax=1, vmin=-1, center=0,
                square=True, linewidths=.5, cbar_kws={"shrink": .5},
                ax=ax)


def summary(df):
    """Print the missing-value and unique-value reports and draw the correlation heatmap.

    Args:
        df: pandas DataFrame to summarise; ``corr_plot`` drops its 'DATE' column.

    Returns:
        None.
    """
    # missing_values() and unique() print their own tables and return None,
    # so wrapping them in print() only added a stray "None" line after each.
    missing_values(df)
    unique(df)
    corr_plot(df)

This summary gives us information on missing values, the number of unique values, and correlation before imputation.
In some cases, it suggests that columns of entirely missing data might be dropped right away. Once again, it is valuable
to compare to the imputed result.

In [None]:
# Prints the missing-value and unique-value tables for `df` and draws its correlation heatmap.
summary(df)

This cell drops variables when necessary and creates a few variables that will be used later.

In [None]:
# df = df.drop([], axis=1)
# Keep the date column aside so it can be re-attached after imputation.
temp_week = df['DATE']
# Every column except 'DATE', plus the list of their names.
corr_temp = df.drop(columns=['DATE'])
corr_names = list(corr_temp.columns)

The KNNImputer fills in each missing value from the values of the nearest neighboring rows, where distance is measured on the features that are not missing.
The number of neighboring values can be adjusted to find better fits for the missing values.

In [None]:
# # KNN Imputation
# df_knn = temp_df.filter([], axis=1).copy()
# # Define scaler to set values between 0 and 1
# scaler = MinMaxScaler(feature_range=(0, 1))
# df_knn = pd.DataFrame(scaler.fit_transform(df_knn), columns = df_knn.columns)
# # Define KNN imputer and fill missing values
# knn_imputer = KNNImputer(n_neighbors=12, weights='distance', metric='nan_euclidean')
# df_knn_imputed = pd.DataFrame(knn_imputer.fit_transform(df_knn), columns=df_knn.columns)

In [None]:
# errors='ignore' keeps this cell re-runnable: on a second run 'DATE' is
# already gone, and a plain drop would raise KeyError.
df = df.drop(columns='DATE', errors='ignore')
imputer = KNNImputer(n_neighbors=10, weights='distance', metric='nan_euclidean')
imputed_KNN = imputer.fit_transform(df)
# NOTE(review): by default KNNImputer leaves out columns that are entirely
# NaN, which would make the column count below mismatch -- confirm no such
# column reaches this cell.
# Reuse df's index so the concat lines rows up with temp_week even when df
# does not carry a default RangeIndex.
imputed_KNN = pd.DataFrame(imputed_KNN, columns=df.columns, index=df.index)
KNN_imputation = pd.concat([imputed_KNN, temp_week], axis=1)

Multiple Imputation by Chained Equations (MICE) uses iterations of Bayesian Ridge Linear models
and takes the averages of their results to determine the imputed values. Compared to most other methods,
it performs better with extremely sparse data.

In [None]:
mice_names = tuple(corr_names)
mice_temp = df[corr_names]
df_mice = mice_temp.filter(mice_names, axis=1).copy()

# sample_posterior=True, n_nearest_features and imputation_order='random' all
# draw random numbers; a fixed random_state makes Restart & Run All reproduce
# the same imputed values.
# NOTE(review): min_value=500 is a single scalar, so it floors the imputed
# values of every column -- confirm that is intended for all features.
mice_estimator = IterativeImputer(estimator=linear_model.BayesianRidge(), sample_posterior=True, max_iter=40,
                                n_nearest_features=10, imputation_order='random', min_value=500,
                                random_state=0)
# Reuse df_mice's index so the concat lines rows up with temp_week even when
# the frame does not carry a default RangeIndex.
df_mice_imputed = pd.DataFrame(mice_estimator.fit_transform(df_mice), columns=df_mice.columns,
                               index=df_mice.index)
imputed_mice = pd.concat([df_mice_imputed, temp_week], axis=1)

In [None]:
# imputed_mice.to_csv('smd_mice_1.csv', encoding='utf-8', index=False)
# KNN_imputation.to_csv('smd_KNN_1.csv', encoding='utf-8', index=False)

### Post Hoc Tests

The Augmented Dickey-Fuller function tests whether a time series needs differencing,
in which case it returns True, or whether the time series is already stationary. It also implies that
if the test returns True, it is not a random walk and the imputation is somewhat
decent, at the minimum.

In [None]:
# Rebind `df` to the MICE-imputed frame; the cells below read from `df`.
df = imputed_mice

In [None]:
# Runs the ADF check on df['revenue']; the returned value is shown as the cell output.
dickey_fuller(df)

Autocorrelation is another test for time series; it checks whether the current date's value is correlated
with previous observations. If the blue line is above the dotted line, this suggests that the time series
is autocorrelated and is not random.

In [None]:
# Figure size and resolution applied to figures created from here on.
plt.rcParams['figure.figsize'] = (10, 4)
plt.rcParams['figure.dpi'] = 120
autocorrelation_plot(df['revenue'].tolist())

The following tests plot the distribution of the variables in the imputed dataset,
as well as give you the most likely distribution of its parameters. This can be useful
for testing if the distribution is known or comparing against a similar variable whose
distribution is known. It could also be used in creating synthetic data.

In [None]:
temp_df = df.drop(columns=['DATE'])
dist_list = ['gamma', 'expon', 'cauchy', 'norm', 'uniform']

# Fit every candidate distribution to each column and report the results.
for var, column in temp_df.items():
    # Hand Fitter a plain array with the missing entries removed.
    observed = column.dropna().values
    fitter = Fitter(observed, distributions=dist_list, timeout=60)
    fitter.fit()
    print(var)
    print(fitter.summary(plot=False))
    print(fitter.get_best(method='sumsquare_error'))

In [None]:
# Columns whose density is plotted, one stacked panel per column, top to bottom.
kde_columns = [
    'onlinedisplay_S', 'onlinedisplay_I',
    'onlinevideo_S', 'onlinevideo_I',
    'paidsearch_S', 'paidsearch_I',
    'social_S', 'social_I',
    'revenue',
]
# The panel count follows the list, so adding or removing a column needs no
# further edits.
figure, axes = plt.subplots(len(kde_columns), 1, figsize=(23, 14))
for axis, column in zip(axes, kde_columns):
    sns.kdeplot(ax=axis, x=column, data=df)