In [1]:
import os
from pathlib import Path

import pandas as pd
import numpy as np
import dtale
import sweetviz as sv
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from statsmodels.stats.outliers_influence import variance_inflation_factor
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
# first_demo is the Facebook-supplied test data; second_demo was created from the largest relevant data we have.
# The data directory defaults to the original location but can be overridden with the
# ROBYN_DATA_DIR environment variable, so the notebook is not tied to one machine.
DATA_DIR = Path(os.environ.get('ROBYN_DATA_DIR', 'C:/Users/norri/Desktop'))
first_demo = pd.read_csv(DATA_DIR / 'dt_simulated.csv')
second_demo = pd.read_csv(DATA_DIR / 'robyn_zeroes.csv')
# Only the last expression of a cell is rendered automatically, so the first frame is
# displayed explicitly; otherwise its rounded preview is computed and thrown away.
display(first_demo.round(2))
second_demo.round(2)

FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/norri/Desktop/dt_simulated.csv'

I initially compared the first weekly demo data from Robyn with the most recent version to check that they were the same, which they were. There have been a few changes to the program, it seems, but not to the data. There was one thing of note: if you examine the correlation between revenue and competitor_sales_B, it is the only high correlation, and in experiment 2, when we dropped it, the entire model changed for the worse.

On a side note, D-Tale and SweetViz are fantastic EDA tools, with a few interactive functions, but SweetViz will open a new browser window for every run it makes.

In [None]:
# Open the interactive D-Tale view of the first demo dataset.
# Note the predictive power score for revenue: it's the only scored variable at all, and it's at .63.
dtale.show(data=first_demo)

In [None]:
# Open the interactive D-Tale view of the second demo dataset.
# There are quite a few more variables correlated with revenue, but nothing much above .5.
dtale.show(data=second_demo)

SweetViz can tell you a great deal about the numerical variables in your data, as well as provide an 'Associations' tab much like a correlation plot.

In [None]:
# Build the SweetViz report for the first demo dataset and write it to an HTML file.
sweet_report = sv.analyze(source=first_demo)
sweet_report.show_html(filepath='sweet_report.html')

The accuracy of this report is certainly affected by all the missing values, but we can compare it to the data after we impute the missings.

In [None]:
# Build the SweetViz report for the second demo dataset (before imputation) and write it to an HTML file.
sweet_report_2 = sv.analyze(source=second_demo)
sweet_report_2.show_html(filepath='sweet_report_2.html')

Because of the missing values, the comparison between the two datasets is not of much use yet.

In [None]:
# Side-by-side SweetViz comparison of the two demo datasets, written to an HTML file.
sweet_report_3 = sv.compare(source=first_demo, compare=second_demo)
sweet_report_3.show_html(filepath='sweet_report_3.html')

To give you a better impression of how much data is missing, I printed the number of missing values in each column (out of 146 rows), followed by the average number of missing values per column.

In [None]:
# Count the missing entries in every column, then report the mean of those per-column counts.
missing_values = second_demo.isna().sum()
print(missing_values)
print(missing_values.mean())

The average is 27, so any variable with 28 or more missing values will have to be dropped, along with its matching spend or impressions column. Otherwise, it could distort the imputation and therefore the correlations that feed into Robyn's predictions.

In [None]:
# Remove the channels judged too sparse to impute, each together with its paired column
# (presumably _I = impressions and _S = spend, per the surrounding notes).
# NOTE(review): the original list named 'text_S' and 'text_I' twice; the duplicates are
# removed here. Confirm that no other channel was meant in their place.
sparse_channel_cols = [
    'text_I', 'text_S',
    'website_I', 'website_S',
    'ecom_I', 'ecom_S',
    'event_I', 'event_S',
    'sweepstakes_I', 'sweepstakes_S',
    'mail_I', 'mail_S',
    'radio_I', 'radio_S',
    'other_I', 'other_S',
]
second_demo = second_demo.drop(columns=sparse_channel_cols)

In [None]:
# Spearman rank correlation between every pair of columns, drawn as a lower-triangle heatmap.
# Non-numeric columns (DATE is still present at this point, presumably as text) are excluded
# explicitly: pandas 2.x raises on them instead of silently dropping them as older versions did.
numeric_cols = second_demo.select_dtypes(include=['number', 'bool'])
corr = numeric_cols.corr(method="spearman").round(2)
# Mask the upper triangle (diagonal included), which only mirrors the lower one.
mask = np.triu(np.ones_like(corr, dtype=bool))
f, ax = plt.subplots(figsize=(12, 12))
cmap = sns.diverging_palette(250, 1, as_cmap=True)
# Draw on the axes created above rather than relying on the implicit current axes.
sns.heatmap(corr, annot=True, cmap=cmap, mask=mask, vmax=1, vmin=-1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
corr.describe()

Here I begin the process of imputing missing values. I have to split the date off for this method to work.

In [None]:
# Tag every row with a 0..n-1 identifier so the dates can be matched back to their rows later.
second_demo['ID'] = range(len(second_demo))
# Take DATE and ID together as an explicit copy. Assigning a new column into a bare
# second_demo[['DATE']] slice triggers pandas' SettingWithCopyWarning.
date_split = second_demo[['DATE', 'ID']].copy()
# Everything except DATE goes to the imputer.
imp_split = second_demo.drop(columns=['DATE'])
col_names = list(imp_split.columns)
second_demo.describe()

In [None]:
# Preview the imputer input as a plain integer array (the result is only displayed, not stored).
# np.int was merely an alias of the builtin int and was removed in NumPy 1.24, so the builtin
# is used directly.
# NOTE(review): missing values (NaN) have no integer representation, so those cells show
# arbitrary numbers in this preview; confirm that this is acceptable for a visual check.
imp_split.to_numpy().astype(int)

IterativeImputer is one of the newest machine-learning imputation methods available, and it is very resource intensive. Its imputations are supposed to be excellent, but you can only do so much when this much data is missing.

In [None]:
# Configuration of the iterative imputer, one setting per line for easier tuning.
imputer_settings = dict(
    random_state=42,                   # fixed seed for repeatable runs
    max_iter=10,
    initial_strategy='most_frequent',
    imputation_order='ascending',
    min_value=15000,                   # lower bound for imputed values
    max_value=64000,                   # upper bound for imputed values
    verbose=2,
    sample_posterior=True,
)
it_imp = IterativeImputer(**imputer_settings)

In [None]:
# Fit the imputer on the DATE-free frame.
it_imp.fit(X=imp_split)

In [None]:
# Apply the fitted imputer to the same frame; the result is a plain array without column labels.
imputed = it_imp.transform(X=imp_split)

I merged the dates back in, and as you can see, the missing values are gone, and they all look pretty reasonable. I'll repeat most of the tests I ran earlier to see if there's anything not obvious.

In [None]:
# Rebuild a labelled frame from the imputed array. The columns are already named by the
# constructor, so the extra set_axis call of the original was a no-op and is dropped.
second_demo = pd.DataFrame(imputed, columns=col_names)
# Re-attach the dates through the row identifier. The key is stated explicitly instead of
# relying on "whatever columns the two frames share", and validate= guards against
# accidental row duplication should ID ever stop being unique.
second_demo = pd.merge(second_demo, date_split, how='inner', on='ID', validate='one_to_one')
# ID was only a helper for the merge.
second_demo = second_demo.drop(columns=['ID'])
second_demo.round(2)

In [None]:
# Open the interactive D-Tale view of the imputed dataset.
# The revenue correlations look much more reasonable, with all of them present but only
# banner spend breaking .5. Also, the power analysis doesn't reveal anything strange towards revenue.
dtale.show(data=second_demo)

The histograms look more normal, and the 'Associations' plot actually works this time, but there are some warnings that the calculations are a bit off.

In [None]:
# Build the SweetViz report for the imputed dataset and write it to an HTML file.
sweet_report_4 = sv.analyze(source=second_demo)
sweet_report_4.show_html(filepath='sweet_report_4.html')

Similar to the plot above, this is to compare how everything changed after imputation.

While I can't run all the same tests on Tyson data or Robyn, I will be doing comparisons. I filled the missings with zeroes for this dataset before the imputation, and will run both in Robyn to see how they perform. For Tyson data, I'll drop any variables that would not be useful to Robyn, and examine the quality that is left.

In [None]:
# corr = second_demo.corr(method="spearman").round(2)
# mask = np.triu(np.ones_like(corr, dtype=bool))
# f, ax = plt.subplots(figsize=(18, 18))
# cmap = sns.diverging_palette(250, 1, as_cmap=True)
# ax = sns.heatmap(corr, annot=True, mask=mask, cmap=cmap, vmax=1, vmin=-1, center=0,
#             square=True, linewidths=.5, cbar_kws={"shrink": .5})
# corr.describe()

Though there's no universal agreement, most sources treat a VIF of 1 as indicating no correlation with the other predictors, a VIF between 1 and 5 as moderate correlation, and a VIF greater than 5 as high correlation. Robyn uses ridge regression, which does deflate the coefficients of high-VIF variables, but since we have limited spots in our Robyn model, VIF could be a good criterion for elimination.

In [None]:
# Variance inflation factor (VIF) for every column except DATE.
temp_split = second_demo.drop(columns=['DATE'])
# Keep only rows that contain no NaN / +inf / -inf in any column.
# axis is passed by keyword: pandas 2.x no longer accepts it positionally in DataFrame.any.
vif_df = temp_split[~temp_split.isin([np.nan, np.inf, -np.inf]).any(axis=1)]
X = vif_df
# NOTE(review): the matrix is handed to statsmodels exactly as is, with no intercept
# column added; confirm whether a constant term should be included for these VIFs.
vif_data = pd.DataFrame({
    "feature": X.columns,
    "VIF": [variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
})
print(vif_data)

In [None]:
# Spearman correlations of the VIF input, drawn as a lower-triangle heatmap.
corr_vif = vif_df.corr(method="spearman").round(2)
# Upper triangle (diagonal included) is hidden because it mirrors the lower one.
mask = np.triu(np.ones_like(corr_vif, dtype=bool))
heatmap_fig, heatmap_ax = plt.subplots(figsize=(18, 18))
cmap = sns.diverging_palette(250, 1, as_cmap=True)
heatmap_style = {
    "annot": True,
    "mask": mask,
    "cmap": cmap,
    "vmax": 1,
    "vmin": -1,
    "center": 0,
    "square": True,
    "linewidths": .5,
    "cbar_kws": {"shrink": .5},
}
sns.heatmap(corr_vif, ax=heatmap_ax, **heatmap_style)
# Remove the display channel pair from the dataset that is exported afterwards.
second_demo = second_demo.drop(columns=['display_I', 'display_S'])

It still is a little high, but it shouldn't affect a Ridge Regression.

In [None]:
# Write the imputed dataset to CSV without the row index.
second_demo.to_csv(path_or_buf='robyn_imputed.csv', index=False)