#### The purpose of this EDA notebook is the following:
- Better understand the nature of the relationship between the independent variables
- Create an initial model with reasonable economic assumptions that may be dropped in later versions
- Explore methods of imputation for missing values to provide more data samples
- Avoid linear combinations that might be more difficult to spot in the Bayesian Modeling process
- Establish a reasonable measure of variable importance, which along with correlation plots may inform initial hierarchies
- Create visualizations of poor quality data and also establish probability distributions for the likelihood function

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')
from rfpimp import *
from rfpimp import plot_corr_heatmap
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from scipy.optimize import curve_fit
from sklearn import linear_model
import statsmodels.api as sm
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance
import category_encoders as ce

ModuleNotFoundError: No module named 'category_encoders'

#### To keep this notebook reusable for future EDA, the data is loaded into a generically named DataFrame, `df`, which also makes debugging easier

In [None]:
# Location of the raw extract. Kept in one named constant so the path can be
# changed in a single place when the notebook is run on another machine.
DATA_PATH = 'C:/Users/norri/Desktop/all_ab_data.csv'

df = pd.read_csv(DATA_PATH)

# `info()` prints its report directly, so call it first; `describe()` returns a
# frame and is only rendered when it is the last expression of the cell.
df.info()
df.describe()

### The info function above tells us we have three data types and counts of each. The next section will explore those variables in groups.

In [None]:
# Remove the 'CONCAT' and 'Brewer Value' columns before profiling.
df = df.drop(columns=['CONCAT', 'Brewer Value'])

# Names of every object-dtype (categorical / string) column.
segment = [name for name, dtype in df.dtypes.items() if dtype == 'O']
print(f'There are {len(segment)} categorical variables\n')
print('The categorical variables are :\n\n', segment)

# Fraction of missing entries per categorical column.
print(df[segment].isna().sum().div(len(df)))

# Structural overview first (printed), then summary statistics (rendered as
# the cell's output).
df_cat = df.select_dtypes(include='object')
df_cat.info()
df_cat.describe()


In [None]:
# Remove columns excluded from the analysis before profiling the integers.
df = df.drop(columns=[
    'F81',
    'Product Development Index',
    'Sum of Dollar Sales Checkout Display',
    'Sum of Dollar Sales Outside Display',
    'Sum of Dollar Sales Signage and Feature',
    'Sum of Dollar Sales Signage, Feature and Display',
])

# Names of every int64 column.
integer = [name for name, dtype in df.dtypes.items() if dtype == 'int64']
print('There are {} integer variables\n'.format(len(integer)))
print('The integer variables are :\n\n', integer)

# Missing-value count per integer column.
print(df[integer].isnull().sum())

# Build the integer frame from the same column list that was just reported so
# the printed names and the profiled frame cannot disagree.
df_int = df[integer]

# `describe()` raises on a frame without columns, so only profile when at least
# one integer column exists.
if len(integer) > 0:
    df_int.info()
    # Inside an `if` the value is not the cell's last expression, so it must be
    # displayed explicitly (`display` is provided by the IPython kernel).
    display(df_int.describe())

In [None]:
# Remove share, incremental and merch-lift columns before profiling the floats.
df = df.drop(columns=[
    'Unit Share of SubCategory',
    'Dollar Share of SubCategory',
    'Incremental Dollars',
    'Incremental Dollars % Change vs YA',
    'Incremental Units',
    'Incremental Units % Change vs YA',
    '% Increase in Dollars by Merch Any Special Pack',
    '% Increase in Dollars by Merch Special Pack Only',
    '% Increase in Units by Merch',
    '% Increase in Units by Merch Any Special Pack',
    '% Increase in Units by Merch Special Pack Only',
    '% Incremental Units by Merch Any Special Pack',
    '% Incremental Units by Merch Special Pack Only',
    '% Increase in Units by Merch No Merch',
])

# Names of every float64 column.
fp = [name for name, dtype in df.dtypes.items() if dtype == 'float64']
print(f'There are {len(fp)} float variables\n')
print('The float variables are :\n\n', fp)

# Percentage of missing entries per float column; report those above 10 %.
fp_na = df[fp].isnull().sum().div(len(df)).mul(100)
print(fp_na.loc[fp_na > 10])

# Column totals; report the columns whose total is exactly zero.
fp_zero = df[fp].sum()
print(fp_zero.loc[fp_zero == 0.0])

In [None]:
# Columns the team agreed to exclude from the analysis.
df = df.drop(columns=[
    'Dollar Sales',
    'Sum of Base Dollar Sales',
    'Sum of Incremental Dollars',
    'Base Dollar Sales',
    'Base Dollar Sales % Change vs YA',
    'Base Unit Sales',
])

In [None]:
# Float-only view of the data. `df_num` is a second name for the same frame and
# is what the plotting / modelling cells below read from.
# Note: the year-ago columns are candidates for removal.
df_num = df_fp = df.select_dtypes(include='float')
df_fp.describe()

### Clearly the float variables contain all the missing data in the dataset. In
### cases like this, dropping the rows with missing values is a trade-off to consider
### against dropping an entire column and losing its input into the model

In [None]:
# Histogram of every numeric column, laid out on a grid.
# This section should come later: for larger datasets it takes a long time to
# run and returns comparatively little information.
cols = 5
num_cols = df_num.select_dtypes(exclude='object').columns

# Size the grid from the number of columns actually plotted. A fixed row count
# raises as soon as there are more columns than grid cells and otherwise
# allocates a needlessly large, mostly empty canvas.
rows = max(1, int(np.ceil(len(num_cols) / cols)))

fig = plt.figure(figsize=(cols * 5, rows * 5))
for i, col in enumerate(num_cols):
    ax = fig.add_subplot(rows, cols, i + 1)  # subplot positions are 1-based
    sns.histplot(x=df_num[col], ax=ax)
fig.tight_layout()
plt.show()

In [None]:
# Pairwise Pearson correlations between the float variables. The shaded table
# gives a first clue as to how the large number of variables relate to each
# other and how they might be modelled.
corr = df_num.corr(method="pearson")

# `Styler.set_precision` was deprecated in pandas 1.3 and removed in 2.0; an
# explicit two-decimal format string produces the same rendering and works on
# old and new pandas alike.
corr.style.background_gradient(cmap="coolwarm").format("{:.2f}")

##### Imputation of missing values is too unreliable to base the rest of the
##### model on. Later tests will tell if there is any bias present. This is when
##### the most standout variables should be chosen for the model

In [None]:
# Alternative view of the same correlations using rfpimp's heatmap helper.
# It may be swapped for another visualization later; the large figure makes it
# easier to zoom in and inspect individual cells.
viz = plot_corr_heatmap(df_num, figsize=(20, 20))
viz.view()

In [None]:
# Keep only the variables whose absolute correlation with the target exceeds
# 0.5, and show just the target column of the correlation matrix for them.
corr_imp = corr.loc[
    corr['Sum of Dollar Sales'].abs() > .5,
    ['Sum of Dollar Sales'],
]
print(corr_imp)

In [None]:
# Variables to inspect, in row-major panel order (2 rows x 3 columns).
kde_columns = [
    'Sum of Dollar Sales',
    'Dollar Share of Category',
    'Sum of Dollar Sales Any Display',
    'Sum of Dollar Sales Any Merch',
    'Sum of Dollar Sales No Merch (non-promo)',
    'Sum of Dollar Sales Any Price Reduction',
]

# One kernel-density estimate per panel.
figure, axes = plt.subplots(2, 3, figsize=(15, 15))
for panel, column in zip(axes.flat, kde_columns):
    sns.kdeplot(data=df_num, x=column, ax=panel)
plt.show()

### These KDE plots will be a primary tool in determining the likelihood
### distributions and giving information on the prior

In [None]:
# Predictors and target for the variable-importance model.
features = ['Sum of Dollar Sales Any Merch', 'Dollar Share of Category',
            'Sum of Dollar Sales Any Display',
            'Sum of Dollar Sales No Merch (non-promo)',
            'Sum of Dollar Sales Any Price Reduction',
            'Sum of Dollar Sales Feature and/or Display',
            'Dollar Sales per Pt of Distribution',
            'Units per Store Selling',
            'Avg Weekly Units per Store Selling']
target = 'Sum of Dollar Sales'

# Select the modelled columns BEFORE dropping missing values, so a row is only
# discarded when one of the columns actually used here is missing (not when an
# unrelated float column is). The values are kept as floats: casting dollar
# amounts to int32 truncates them and can overflow.
df_all = df_num[features + [target]].dropna()

# Seeded so the split (and everything derived from it) is reproducible.
df_train, df_test = train_test_split(df_all, test_size=0.15, random_state=13)

X_train, y_train = df_train[features], df_train[target]
X_test, y_test = df_test[features], df_test[target]

# The target is a continuous dollar amount, so this is a regression problem.
# A classifier would treat every distinct value as its own class.
rf = RandomForestRegressor(n_estimators=100, n_jobs=-1,
                           max_features=1.0,
                           min_samples_leaf=10, oob_score=True,
                           random_state=13)
rf.fit(X_train, y_train)

In [None]:
# Feature importances of the fitted forest `rf`, computed by rfpimp on the
# held-out split, then plotted.
# (A pasted copy of the estimator's printed repr used to sit at the top of this
# cell; it only constructed and discarded an unfitted model, so it was removed.)
I = importances(rf, X_test, y_test)
plot_importances(I, width=12, vscale=3)

### This importance plot from the random forest model is one of the key
### ways we'll understand which variables are the most important

#### With an array of all the variables, I could make an importance plot
#### for everything but the categorical variables, but I would have to change
#### the dependent variable each time. Remember that we dropped missing values
#### instead of dropping columns, which we could do with a column that did not
#### make an impression on the correlation plot or importance plot

In [None]:
# Importances stored on the fitted forest (`feature_importances_`), paired with
# the training column names, ranked from most to least important and indexed by
# feature name for plotting.
I = (
    pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': rf.feature_importances_,
    })
    .sort_values('Importance', ascending=False)
    .set_index('Feature')
)
viz = plot_importances(I, width=16, vscale=4)

#### Our categorical variables shouldn't be forgotten; just lower the threshold
#### (currently 30 unique values) if the axis labels start to look messy

In [None]:
# One horizontal count plot per categorical column, skipping columns with more
# than 30 distinct values.
for col in df.select_dtypes(include='object').columns:
    if df[col].nunique() > 30:
        continue
    sns.countplot(data=df, y=col)
    plt.show()

In [None]:
# Working frame for the regression: the target plus the two ACV distribution
# variables. `.copy()` makes it an independent frame, so adding columns to it in
# later cells does not trigger pandas' SettingWithCopyWarning.
df_reg = df[['Sum of Dollar Sales', 'Average Weekly ACV Distribution Feature Only',
     'Average Weekly ACV Distribution Display Only']].copy()

# Correlation heatmap of the untransformed variables.
viz = plot_corr_heatmap(df_reg, cmap='gist_heat_r', value_fontsize=10,
                        figsize=(10, 10))
viz.view()

In [None]:
# Transformed regression variables: log of the target and square roots of the
# two ACV distribution predictors. The frame is rebuilt from `df` rather than
# edited in place, which (a) avoids assigning into a slice of `df` and (b) lets
# this cell be re-run without failing on already-dropped source columns.
df_reg = pd.DataFrame({
    'dollar_log': np.log(df['Sum of Dollar Sales']),
    'feature_sq': np.sqrt(df['Average Weekly ACV Distribution Feature Only']),
    'display_sq': np.sqrt(df['Average Weekly ACV Distribution Display Only']),
})

# Correlation heatmap of the transformed variables.
viz = plot_corr_heatmap(df_reg, cmap='gist_heat_r', value_fontsize=10,
                        figsize=(10, 10))
viz.view()

In [None]:
# Attach the two categorical predictors, then remove unusable rows.
# - `dropna()` returns a new frame; its result must be assigned, otherwise the
#   missing values stay in `df_reg` and reach the estimator.
# - log(0) yields -inf, which the estimators reject just like NaN, so
#   infinities are converted to NaN first and dropped with the rest.
df_reg = (
    df_reg
    .assign(retailer=df['Geography'], brand=df['Brand Value'])
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

# One-hot encode retailer and brand; `use_cat_names=True` names the new
# columns after the category values.
ch_ohe = ce.OneHotEncoder(cols=['retailer', 'brand'], use_cat_names=True)
df_reg_ce = ch_ohe.fit_transform(df_reg)

y_reg = df_reg_ce['dollar_log']
X_reg = df_reg_ce.drop(columns=['dollar_log'])

# Seeded 80/20 split for the scikit-learn fit.
X_train, X_test, y_train, y_test = \
    train_test_split(X_reg, y_reg, test_size = 0.2, random_state = 13)

#### The regression formula is ln(y) = b_0 + b_1 * sqrt(feature) +
#### b_2 * sqrt(display) + b_m * retailer + b_n*brand

In [None]:
# scikit-learn fit on the training split, scored on the held-out split.
reg = LinearRegression()
reg_1 = reg.fit(X_train, y_train)

y_pred = reg_1.predict(X_test)
# `r2_score` is not imported by name in this notebook; reach it through the
# already-imported `sklearn.metrics` module instead.
r_2 = metrics.r2_score(y_test, y_pred)
print(r_2)

# statsmodels fit on the full data set for the coefficient table.
# `sm.OLS` does not add an intercept on its own, so this model has no constant.
# NOTE(review): with complete one-hot sets for both retailer and brand the
# design matrix looks rank-deficient — confirm the coefficients are interpretable.
lr = sm.OLS(y_reg, X_reg)
lr_res = lr.fit()
print(lr_res.summary())

# constant was statistically insignificant