#### The purpose of this EDA notebook is the following:
- Better understand the nature of the relationship between the independent variables
- Create an initial model with reasonable economic assumptions that may be dropped in later versions
- Explore imputation methods for missing values so that more data samples are available
- Avoid linear combinations that might be more difficult to spot in the Bayesian Modeling process
- Establish a reasonable measure of variable importance, which along with correlation plots may inform initial hierarchies
- Create visualizations of poor quality data and also establish probability distributions for the likelihood function

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')
from rfpimp import *
from rfpimp import plot_corr_heatmap
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from scipy.optimize import curve_fit
from scipy.stats import linregress
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from multiprocessing import Pool
from sklearn.inspection import permutation_importance
import category_encoders as ce
from catboost import CatBoostRegressor, Pool
import arviz as az
import pymc3 as pm
from theano import shared
from sklearn import preprocessing
import shap
import bambi as bmb
import formulae
from statsmodels.stats.outliers_influence import variance_inflation_factor
from fitter import Fitter, get_common_distributions, get_distributions

In [None]:
# Load the raw export.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
df = pd.read_csv('/home/matt/Documents/cortex_Push.csv')
# info() prints directly, while describe() is only rendered when it is the
# cell's last expression, so it goes last to make both summaries visible.
df.info()
df.describe()

Pandas dataframes will have string, int, and float columns. The following
three sections look at each type in turn for columns that need to be fixed or dropped altogether.

In [None]:
# these drop column snippets are not used often here,
# but have been useful, especially with large datasets

# Drop the columns that are not needed in the rest of the notebook.
df = df.drop(['Program Name', 'Retailers', 'Tactic', 'Vendor', 'Tactic Start Date', 'Tactic End Date', 'Brand'], axis=1)
segment = [var for var in df.columns if df[var].dtype == 'O']
print('There are {} categorical variables\n'.format(len(segment)))
print('The categorical variables are :\n\n', segment)
# Fraction of missing values per categorical column.
print(df[segment].isnull().sum() / len(df))
df_cat = df.select_dtypes(include=object)
df_cat.info()
print(df_cat.nunique())
# describe() is only rendered as the last expression of the cell; in the middle
# of the cell its result was silently discarded.
df_cat.describe()

In [None]:
# Encode the Yes/No flag as 1/0. The result is assigned back to the column:
# replace(..., inplace=True) called through the attribute accessor acts on an
# intermediate object and is not guaranteed to update df (pandas 2.x warns
# about it and Copy-on-Write turns it into a no-op).
# infer_objects() keeps the column numeric on pandas versions where replace()
# no longer downcasts an object column by itself.
df['RMN'] = df['RMN'].replace({'Yes': 1, 'No': 0}).infer_objects()

In [None]:
# One count plot per low-cardinality (at most 25 levels) object column.
for col in df.select_dtypes(include='object').columns:
    if df[col].nunique() > 25:
        # too many levels for a readable bar chart
        continue
    sns.countplot(y=col, data=df)
    plt.show()

In [None]:
# target = 'Total Sales'
# CAT_FEATURES = ['Tactic Category']
#
# df_cat = df.dropna(subset=['Total Sales'])
# df_cat = df_cat[['Total Sales', 'Tactic Category']]
# df_train, df_test = train_test_split(df_cat, test_size=0.3)
# X_train, y_train = df_train.drop(target, axis=1), df_train[target]
# X_test, y_test = df_test.drop(target, axis=1), df_test[target]
#
# catboost_model = CatBoostRegressor(n_estimators=200,
#                                    loss_function = 'RMSE',
#                                    eval_metric = 'RMSE',
#                                    cat_features = CAT_FEATURES, one_hot_max_size=20)
# catboost_model.fit(X_train, y_train, cat_features = CAT_FEATURES,
#                    eval_set = (X_test, y_test),
#                    use_best_model = True,
#                    plot = True)
# shap_values = catboost_model.get_feature_importance(Pool(
#     X_train,
#     label = y_train,
#     cat_features = CAT_FEATURES
# ),type = "ShapValues")
# shap_values = shap_values[:,:-1]
# shap.summary_plot(shap_values, X_train, max_display=50)

In [None]:
# df = df.drop(
#     [], axis=1)
# Integer columns: list them and report their missing-value counts.
integer = [var for var in df.columns if df[var].dtype == 'int64']
print('There are {} integer variables\n'.format(len(integer)))
print('The integer variables are :\n\n', integer)
print(df[integer].isnull().sum())
df_int = df.select_dtypes(include=int)
# The guard keeps describe() away from a frame with no columns.
if len(df_int.columns) > 0:
    df_int.info()
    # Inside an `if` block an expression value is never auto-rendered by the
    # notebook, so the summary has to be printed to be visible at all.
    print(df_int.describe())

In the below section we address the large number of missing values and also
the columns consisting entirely of zeroes, and drop them accordingly

In [None]:
# Remove the base/incremental and share columns before profiling the floats.
df = df.drop(
    columns=[
        'Base $', 'Incr $', 'Base Units', 'Incr Units',
        '$ Shr - Ty Subcategory', 'Units Shr - Ty Category',
        'Units Shr - Ty Subcategory',
    ]
)
fp = [name for name, kind in df.dtypes.items() if kind == 'float64']
print('There are {} float variables\n'.format(len(fp)))
print('The float variables are :\n\n', fp)
# Percentage of missing values per float column; show those above 10 %.
fp_na = df[fp].isna().sum() / len(df) * 100
print(fp_na[fp_na > 10])
# Columns whose values sum to exactly zero.
# NOTE(review): a zero sum is taken to mean "all zeroes"; offsetting positive
# and negative values would also match -- confirm if negatives can occur.
fp_zero = df[fp].sum()
print(fp_zero[fp_zero == 0.0])

In [None]:
# Drop the identifier-style columns and Nielsen_Week_Year, then keep every
# non-object column as the numeric working frame.
df = df.drop(
    columns=[
        'ClientId', 'Program Id', 'TacticId', 'CategoryId',
        'BrandId', 'Nielsen_Week_Year', 'VendorId',
    ]
)
df_num = df.select_dtypes(exclude='object')
df_num.describe()

The following heat maps are important for understanding relationships between variables.
More importantly, the correlation dataframes behind them can be filtered to obtain the lists of column names
that will be key to deciding which variables to explore for feature importance.

In [None]:
# Spearman rank correlations of the numeric columns, rounded for annotation.
corr = df_num.corr(method="spearman").round(2)
# Boolean mask hiding the upper triangle (diagonal included).
mask = np.triu(np.ones(corr.shape, dtype=bool))
f, ax = plt.subplots(figsize=(18, 18))
cmap = sns.diverging_palette(250, 1, as_cmap=True)
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    annot=True,
    vmin=-1,
    vmax=1,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws=dict(shrink=.5),
    ax=ax,
)
corr.describe()

In [None]:
# Second view of the correlations, drawn with rfpimp's plot_corr_heatmap on
# the numeric working frame.
viz = plot_corr_heatmap(df_num, cmap='CMRmap', value_fontsize=14,
                        label_fontsize=14, figsize=(16, 16))
viz.view()

Here we filter the correlation dataframe by a minimum threshold, while
excluding correlations of exactly one so that the variable itself is not included.
Converting the result to a list lets us use it in our feature importance plots.

In [None]:
# check for new dataframes

# Keep only the rows in which every numeric column is finite before computing
# VIFs. `axis` is passed by keyword because newer pandas versions no longer
# accept it positionally (the old `.any(1)` form raises there).
vif_df = df_num[~df_num.isin([np.nan, np.inf, -np.inf]).any(axis=1)]

X = vif_df
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
# One variance inflation factor per column of the filtered frame.
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]
print(vif_data)

corr_vif = vif_df.corr(method="spearman").round(2)
mask = np.triu(np.ones_like(corr_vif, dtype=bool))
f, ax = plt.subplots(figsize=(18, 18))
cmap = sns.diverging_palette(250, 1, as_cmap=True)
# Plot corr_vif, the matrix just computed from the filtered rows; the cell
# previously drew the older `corr` from the unfiltered frame by mistake.
sns.heatmap(corr_vif, annot=True, mask=mask, cmap=cmap, vmax=1, vmin=-1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})
corr_vif.describe()

In [None]:
# keep Total Sales from the VIF calculation, Units due to its relationship to Total Sales
# drop either Total Impressions or Impressions per Week
# drop Any Promo % ACV because of its relationship to all other promos
# corr = corr.drop(['Units', 'Impressions per Week',
#                   'Any Promo %ACV', '%ACV Distribution'], axis=1)

df_num = df_num.drop(
    columns=['Units', 'Impressions per Week', 'Any Promo %ACV', '%ACV Distribution']
)
# Recompute the Spearman matrix on the reduced column set; the importance
# cells further down filter on this `corr`.
corr = df_num.corr(method="spearman").round(2)

In [None]:
def imp_plots(target, features, test_size=0.15, random_state=None):
    """Form three importance plots and return a combined importance score.

    A random forest classifier is fit on the complete rows of the notebook-level
    ``df_num`` frame (cast to ``int32``), and three importance measures are
    drawn on one stacked figure: rfpimp ``importances``, the forest's own
    ``feature_importances_`` and scikit-learn ``permutation_importance``.

    :param target: 'dependent' component (column name in ``df_num``)
    :param features: 'predictive' component (list of column names in ``df_num``)
    :param test_size: share of rows held out for the rfpimp and permutation
        measures (default 0.15, the value used originally)
    :param random_state: seed forwarded to the split, the forest and the
        permutation importance; ``None`` (default) leaves runs unseeded
    :return: Series indexed by feature holding the mean of the absolute values
        of the three importance measures, largest first
    """
    # Complete cases only. The int32 cast drops fractional parts so that the
    # classifier can treat the target values as discrete class labels.
    df_all = df_num.dropna().astype(dtype='int32')
    df_all = df_all[list(features) + [target]]
    df_train, df_test = train_test_split(df_all, test_size=test_size,
                                         random_state=random_state)
    X_train, y_train = df_train.drop(target, axis=1), df_train[target]
    X_test, y_test = df_test.drop(target, axis=1), df_test[target]
    rf = RandomForestClassifier(n_estimators=100, n_jobs=-1,
                                max_features=1.0,
                                min_samples_leaf=10, oob_score=True,
                                random_state=random_state)
    rf.fit(X_train, y_train)

    figure, (ax1, ax2, ax3) = plt.subplots(nrows=3, ncols=1, figsize=(10, 10))

    # 1) rfpimp importance, evaluated on the held-out rows
    imp1 = importances(rf, X_test, y_test)
    plot_importances(imp1, width=16, vscale=4, ax=ax1)

    # 2) importance reported by the fitted forest itself
    imp = pd.DataFrame()
    imp['Feature'] = X_train.columns
    imp['Importance'] = rf.feature_importances_
    imp = imp.sort_values('Importance', ascending=False)
    imp2 = imp.set_index('Feature')
    plot_importances(imp2, width=16, vscale=4, ax=ax2)

    # 3) scikit-learn permutation importance on the held-out rows
    perm_importance = permutation_importance(rf, X_test, y_test,
                                             random_state=random_state)
    perm = pd.DataFrame()
    perm['Feature'] = X_test.columns
    perm['Importance'] = perm_importance.importances_mean
    perm = perm.sort_values('Importance', ascending=False)
    perm = perm.set_index('Feature')
    plot_importances(perm, width=16, vscale=4, ax=ax3)

    # Combine the three measures per feature. The earlier code added the three
    # frames together and then took mean(axis=1) over the single remaining
    # column, which returned the sum; concatenating side by side first makes the
    # row-wise mean a real average. The ranking of features is unchanged.
    combined = pd.concat(
        [imp1['Importance'].abs(), imp2['Importance'].abs(), perm['Importance'].abs()],
        axis=1,
        keys=['rfpimp', 'forest', 'permutation'],
    )
    d = combined.mean(axis=1).sort_values(ascending=False)
    plt.show()
    return d

The following three importance plots look at different ways to measure importance
in relation to predicting our variable of interest. We can continue this
process many times to develop our Bayesian Hierarchy

In [None]:
# Candidate predictors: variables whose Spearman correlation with Total Sales
# exceeds 0.20 in absolute value. abs() has to wrap the correlation itself; it
# used to wrap the comparison, which silently excluded negative correlations.
# The `< 1.0` test removes the target's own row (self-correlation of 1).
corr_imp = corr[(corr['Total Sales'].abs() > .20) & (corr['Total Sales'] < 1.0)]
corr_imp = corr_imp[['Total Sales']]
features = corr_imp.index.tolist()
imp_sales = imp_plots('Total Sales', features)
print(imp_sales)

Once the previous importance plots have given us insight into the most important
variables at that level of the hierarchy, we can continue by choosing the most important
for the next level of the hierarchy

In [None]:
# Candidate predictors: variables whose Spearman correlation with
# Number of UPCs Selling exceeds 0.20 in absolute value. abs() has to wrap the
# correlation itself; it used to wrap the comparison, which silently excluded
# negative correlations. The `< 1.0` test removes the target's own row.
corr_imp = corr[
    (corr['Number of UPCs Selling'].abs() > .20) & (corr['Number of UPCs Selling'] < 1.0)]
corr_imp = corr_imp[['Number of UPCs Selling']]
features = corr_imp.index.tolist()
imp_UPC = imp_plots('Number of UPCs Selling', features)
print(imp_UPC)

In [None]:
# Candidate predictors: variables whose Spearman correlation with
# Price Decr Only %ACV exceeds 0.20 in absolute value. abs() has to wrap the
# correlation itself; it used to wrap the comparison, which silently excluded
# negative correlations. The `< 1.0` test removes the target's own row.
corr_imp = corr[(corr['Price Decr Only %ACV'].abs() > .20) & (corr['Price Decr Only %ACV'] < 1.0)]
corr_imp = corr_imp[['Price Decr Only %ACV']]
features = corr_imp.index.tolist()
imp_price_decr = imp_plots('Price Decr Only %ACV', features)
print(imp_price_decr)

In [None]:
# Candidate predictors: variables whose Spearman correlation with
# Any Promo Units exceeds 0.20 in absolute value. abs() has to wrap the
# correlation itself; it used to wrap the comparison, which silently excluded
# negative correlations. The `< 1.0` test removes the target's own row.
corr_imp = corr[(corr['Any Promo Units'].abs() > .20) & (corr['Any Promo Units'] < 1.0)]
corr_imp = corr_imp[['Any Promo Units']]
features = corr_imp.index.tolist()
imp_promo_unit = imp_plots('Any Promo Units', features)
print(imp_promo_unit)

In [None]:
# Candidate predictors: variables whose Spearman correlation with
# Feat w/o Disp %ACV exceeds 0.20 in absolute value. abs() has to wrap the
# correlation itself; it used to wrap the comparison, which silently excluded
# negative correlations. The `< 1.0` test removes the target's own row.
corr_imp = corr[(corr['Feat w/o Disp %ACV'].abs() > .20) & (corr['Feat w/o Disp %ACV'] < 1.0)]
corr_imp = corr_imp[['Feat w/o Disp %ACV']]
features = corr_imp.index.tolist()
imp_feat_no_disp = imp_plots('Feat w/o Disp %ACV', features)
print(imp_feat_no_disp)

This can be repeated for every variable of interest in the hierarchy.

The distribution fits and KDE plots below are crucial to understanding the distribution
of the likelihood function and to beginning the Bayesian modeling.

In [None]:
# final_vars = ['Number of UPCs Selling', 'Any Promo Units', '%ACV Distribution',
#               'Feat w/o Disp %ACV', 'Price Decr Only %ACV', 'Disp w/o Feat %ACV',
#               'Total Sales', 'Feat & Disp %ACV', 'Weeks', 'RMN','Tactic Category']

# Columns of df that are carried into the distribution fitting and the
# model-building cells that follow.
final_vars = ['Number of UPCs Selling', 'Any Promo Units', '%ACV Distribution',
              'Feat w/o Disp %ACV', 'Price Decr Only %ACV', 'Disp w/o Feat %ACV',
              'Total Sales', 'Feat & Disp %ACV', 'Weeks', 'RMN']

In [None]:
df_final = df[final_vars]

# Fit the common candidate distributions to each selected variable and report
# the best one by sum-of-squares error. Each fit is capped at 45 seconds.
for var in final_vars:
    dist_test = df_final[var].dropna().values
    f = Fitter(
        dist_test,
        distributions=get_common_distributions(),
        timeout=45,
    )
    f.fit()
    print(var)
    print(f.summary())
    print(f.get_best(method='sumsquare_error'))

In [None]:
# az.plot_kde(df['Total Sales'].values, rug=True, label='Total Sales', figsize=(12, 8))
# plt.yticks([0], alpha=0);
# # most likely gamma

In [None]:
# az.plot_kde(df['Number of UPCs Selling'].values, rug=True, label='Number of UPCs Selling', figsize=(12, 8))
# plt.yticks([0], alpha=0);

In [None]:
# az.plot_kde(df['RMN'].values, rug=True, label='Any Promo Units', figsize=(12, 8))
# plt.yticks([0], alpha=0);

In [None]:
# az.plot_kde(df['%ACV Distribution'].values, rug=True, label='%ACV Distribution', figsize=(12, 8))
# plt.yticks([0], alpha=0);

In [None]:
# az.plot_kde(df['Feat w/o Disp %ACV'].values, rug=True, label='Feat w/o Disp %ACV', figsize=(12, 8))
# plt.yticks([0], alpha=0);

In [None]:
# az.plot_kde(df['Price Decr Only %ACV'].values, rug=True, label='Price Decr Only %ACV', figsize=(12, 8))
# plt.yticks([0], alpha=0);

In [None]:
# az.plot_kde(df['Disp w/o Feat %ACV'].values, rug=True, label='Disp w/o Feat %ACV', figsize=(12, 8))
# plt.yticks([0], alpha=0);

In [None]:
# az.plot_kde(df['Feat & Disp %ACV'].values, rug=True, label='Feat & Disp %ACV', figsize=(12, 8))
# plt.yticks([0], alpha=0);

# az.plot_kde(df['Weeks'].values, rug=True, label='Weeks', figsize=(12, 8))
# plt.yticks([0], alpha=0);

In [None]:
# removing all zeroes for the initial testing

# Strictly positive sales only (rows with missing Total Sales fail the
# comparison and are left out as well).
df_sales_nz = df.loc[df['Total Sales'] > 0.0]
# Sanity check: no zero-sales rows should remain, so this prints an empty frame.
print(df_sales_nz.loc[df_sales_nz['Total Sales'] == 0.0])

In [None]:
# Gamma likelihood for the positive Total Sales values, with Exponential priors
# on the Gamma's alpha and beta parameters.
with pm.Model() as model_sales:
    # NOTE(review): the positional argument of pm.Exponential is the rate, so
    # these priors have means of 1/100 and 1/1000 -- confirm that this scale
    # suits the Total Sales data.
    alpha = pm.Exponential('alpha', 100)
    beta = pm.Exponential('beta', 1000)
    g = pm.Gamma('g', alpha=alpha, beta=beta, observed=df_sales_nz['Total Sales'].values)
    # 5000 tuning steps followed by 5000 kept draws, returned as InferenceData.
    trace_sales = pm.sample(5000, tune=5000, return_inferencedata=True)

In [None]:
# Trace plots for the Total Sales gamma model.
az.plot_trace(trace_sales)

In [None]:
# Tabular posterior summary for the Total Sales gamma model.
az.summary(trace_sales)

In [None]:
# Monte Carlo standard error of the posterior estimates.
az.mcse(trace_sales)

In [None]:
# Effective sample size of the posterior draws.
az.ess(trace_sales)

In [None]:
# Forest plot of alpha and beta with the chains combined.
az.plot_forest(trace_sales, var_names=['alpha', 'beta'], combined=True);

In [None]:
# Posterior densities with a 99% highest-density interval.
az.plot_posterior(trace_sales, hdi_prob=0.99);

In [None]:
# Forest plot with the R-hat convergence diagnostic shown alongside.
az.plot_forest(trace_sales, r_hat=True);

In [None]:
# Energy plot for the sampler run of the Total Sales model.
az.plot_energy(trace_sales);

In [None]:
# Posterior predictive check on the mean: simulate data sets from the fitted
# model and compare the mean of each with the observed mean (vertical line).
ppc = pm.sample_posterior_predictive(trace_sales, samples=20000, model=model_sales)
simulated_means = [draw.mean() for draw in ppc['g']]
_, ax = plt.subplots(figsize=(10, 5))
ax.hist(simulated_means, bins=19, alpha=0.5)
ax.axvline(df_sales_nz['Total Sales'].mean())
ax.set_title('Posterior predictive of the mean')
ax.set_xlabel('mean(x)')
ax.set_ylabel('Frequency');

In [None]:
# Strip the characters  # , @ & % / " and spaces from the column names so they
# can be used as plain terms in the model formula further down.
# regex=True is required explicitly: since pandas 2.0 str.replace treats the
# pattern as a literal string by default, so the character class would match
# nothing and the names would stay unchanged.
df_final.columns = df_final.columns.str.replace(r'[#,@&%/" ]', '', regex=True)
print(df_final.columns)

In [None]:
# Regression of TotalSales on the remaining cleaned columns (RMN is not part of
# the formula); rows with missing values are dropped by bambi.
test_model = bmb.Model(
    'TotalSales ~ NumberofUPCsSelling + AnyPromoUnits + ACVDistribution'
    ' + FeatwoDispACV + PriceDecrOnlyACV + DispwoFeatACV + FeatDispACV + Weeks',
    data=df_final,
    dropna=True,
)

test_fitted = test_model.fit(draws=1000, chains=4)
# Posterior predictive samples, requested with kind="pps".
test_model.predict(test_fitted, kind="pps", draws=1000)

In [None]:
# Trace plots for the fitted bambi model, drawn with compact=False.
az.plot_trace(test_fitted, compact=False);

In [None]:
# Tabular posterior summary for the fitted bambi model.
az.summary(test_fitted)

In [None]:
# Posterior predictive check plot for the fitted bambi model.
az.plot_ppc(test_fitted, figsize=(12, 12))

In [None]:
# Energy plot for the sampler run of the bambi model.
az.plot_energy(test_fitted)

In [None]:
# # check for new dataframes
#
# vif_df = df_final[~df_final.isin([np.nan, np.inf, -np.inf]).any(1)]
# vif_df = vif_df.drop(['Units', 'AnyPromoACV', 'TotalSales'], axis=1)
# X = vif_df
# vif_data = pd.DataFrame()
# vif_data["feature"] = X.columns
# vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]
# print(vif_data)
#
# corr = vif_df.corr(method="spearman").round(2)
# mask = np.triu(np.ones_like(corr, dtype=bool))
# f, ax = plt.subplots(figsize=(18, 18))
# cmap = sns.diverging_palette(250, 1, as_cmap=True)
# sns.heatmap(corr, annot=True, mask=mask, cmap=cmap, vmax=1, vmin=-1, center=0,
#             square=True, linewidths=.5, cbar_kws={"shrink": .5})
# corr.describe()

In [None]:
# target = 'Total Sales'
# CAT_FEATURES = ['Tactic Category']
#
# df_cat = df_final.dropna(subset=['Total Sales'])
# # df_cat = df_cat[['Total Sales', 'Tactic Category']]
# df_train, df_test = train_test_split(df_cat, test_size=0.3)
# X_train, y_train = df_train.drop(target, axis=1), df_train[target]
# X_test, y_test = df_test.drop(target, axis=1), df_test[target]
# # X_train, y_train = df_train, df_train[target]
# # X_test, y_test = df_test, df_test[target]
#
# catboost_model = CatBoostRegressor(n_estimators=200,
#                                    loss_function = 'RMSE',
#                                    eval_metric = 'RMSE',
#                                    cat_features = CAT_FEATURES, one_hot_max_size=20)
# catboost_model.fit(X_train, y_train, cat_features = CAT_FEATURES,
#                    eval_set = (X_test, y_test),
#                    use_best_model = True,
#                    plot = True)
# shap_values = catboost_model.get_feature_importance(Pool(
#     X_train,
#     label = y_train,
#     cat_features = CAT_FEATURES
# ),type = "ShapValues")
# shap_values = shap_values[:,:-1]
# shap.summary_plot(shap_values, X_train, max_display=50)