In [None]:
import itertools

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from prophet import Prophet
from prophet.diagnostics import cross_validation, performance_metrics
from prophet.plot import (
    add_changepoints_to_plot,
    plot_components_plotly,
    plot_cross_validation_metric,
    plot_plotly,
)
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Load the modelling frame and parse the date column `ds`.
# TODO: replace the placeholder paths ('tbd', 'feature', 'display') with the real files.
df = pd.read_csv('tbd')
df['ds'] = pd.to_datetime(df['ds'])
# NOTE(review): a whole DataFrame is assigned to a single column here; this presumably
# relies on each file holding exactly one column in the same row order as `df` — confirm.
df['feature'] = pd.read_csv('feature')
# Was `pd.read.csv(...)`, which raises AttributeError (pandas has no `read` attribute).
df['display'] = pd.read_csv('display')

## Data Exploration

Definition of importance plots if they need to be used.

In [None]:
# NOTE: this helper is intentionally disabled (the whole cell is commented out).
# Before re-enabling it, be aware that it references names that are not imported or
# defined in the visible part of the notebook: `df_num`, `train_test_split`,
# `RandomForestClassifier`, `importances`, `plot_importances` and
# `permutation_importance`. The bare `RandomForestClassifier(...)` expression after
# `rf.fit(...)` builds an estimator that is never assigned or used, and
# `target = target` is a no-op.
# def imp_plots(target, predictors):
#     """Form three importance plots
#
#     :param target:'dependent' component
#     :param predictors:'predictive' component
#     """
#     target = target
#     df_all = df_num.dropna().astype(dtype='int32')
#     df_all = df_all[predictors + [target]]
#     df_train, df_test = train_test_split(df_all, test_size=0.15)
#     X_train, y_train = df_train.drop(target, axis=1), df_train[target]
#     X_test, y_test = df_test.drop(target, axis=1), df_test[target]
#     rf = RandomForestClassifier(n_estimators=100, n_jobs=-1,
#                                 max_features=1.0,
#                                 min_samples_leaf=10, oob_score=True)
#     rf.fit(X_train, y_train)
#     RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
#                            max_depth=None, max_features=1.0, max_leaf_nodes=None,
#                            min_impurity_decrease=0.0, min_samples_leaf=10,
#                            min_samples_split=2,
#                            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
#                            oob_score=True, random_state=None, verbose=0, warm_start=False)
#     figure, (ax1, ax2, ax3) = plt.subplots(nrows=3, ncols=1, figsize=(10, 10))
#     imp1 = importances(rf, X_test, y_test)
#     plot_importances(imp1, width=16, vscale=4, ax=ax1)
#
#     imp = pd.DataFrame()
#     imp['Feature'] = X_train.columns
#     imp['Importance'] = rf.feature_importances_
#     imp = imp.sort_values('Importance', ascending=False)
#     imp2 = imp.set_index('Feature')
#     plot_importances(imp2, width=16, vscale=4, ax=ax2)
#
#     perm_importance = permutation_importance(rf, X_test, y_test)
#     perm = pd.DataFrame()
#     perm['Feature'] = X_test.columns
#     perm['Importance'] = perm_importance.importances_mean
#     perm = perm.sort_values('Importance', ascending=False)
#     perm = perm.set_index('Feature')
#     plot_importances(perm, width=16, vscale=4, ax=ax3)
#     a = imp1.sort_values(by='Feature')
#     b = imp2.sort_values(by='Feature')
#     c = perm.sort_values(by='Feature')
#     d = (np.abs(a) + np.abs(b) + np.abs(c)).sort_values('Importance',
#                                                         ascending=False).mean(axis=1)
#     plt.show()
#     return d

Basic check on the dataframe and determination of missing values. Initially, missing values will be dropped.

In [None]:
# A notebook only renders the LAST expression of a cell automatically, so the
# summary statistics are displayed explicitly; otherwise their output is discarded.
display(df.describe())
# `info()` prints column dtypes and non-null counts itself.
df.info()

In [None]:
# Report the percentage of missing target values and the number of exact zeros in `y`.
y_values = df['y']
missing_pct = y_values.isnull().sum() / len(df) * 100
zero_count = (y_values == 0).sum()

print('Missing Values')
print(missing_pct)
print('Zeroes')
print(zero_count)

To be written when data is corrected: a check on the average, variance, skew, etc. of the two ACV features to check if averaging is appropriate.

This is a crucial section. The data is not reported weekly; it is reported on different days of the week, often multiple times per week. Grouping the revenue by week and summing it provides the structure needed for a forecast. Similar code will have to be written for ACV Feature and ACV Display, but most likely using averages instead of sums.

In [None]:
# Shift every report back one week, then bucket by calendar week and total the revenue.
# NOTE: this cell is not idempotent — re-running it shifts `ds` back by another 7 days.
# NOTE(review): only `y` is aggregated, so any other columns (e.g. `feature`, `display`)
# are no longer present in `df` after this cell; they need their own weekly aggregation
# before they can be used downstream.
df['ds'] = pd.to_datetime(df['ds']) - pd.to_timedelta(7, unit='d')
# The Grouper key must be the datetime column `ds`; it was 'df', which is the name of
# the frame variable rather than a date column.
df = df.groupby(pd.Grouper(key='ds', freq='W'))['y'].sum().reset_index()

This will be a standard correlation map. It will be particularly interesting to see the correlation between ACV Feature and Display, and also important how strongly both are correlated to revenue.

In [None]:
# Spearman correlation over the numeric columns only. The datetime column `ds` is left
# out: it is not a quantity of interest here and non-numeric columns can make
# `DataFrame.corr` fail on recent pandas versions.
corr = df.select_dtypes(include='number').corr(method="spearman").round(2)
# Mask the upper triangle (including the diagonal) so each pair is shown once.
mask = np.triu(np.ones_like(corr, dtype=bool))
f, ax = plt.subplots(figsize=(18, 18))
cmap = sns.diverging_palette(250, 1, as_cmap=True)
# Draw on the axes created above instead of relying on the implicit current axes.
sns.heatmap(corr, annot=True, mask=mask, cmap=cmap, vmax=1, vmin=-1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
ax.set_title('Spearman correlation')
plt.show()
corr.describe()

VIF (variance inflation factor) checks for multicollinearity more accurately than a correlation plot does. A VIF greater than 5 would suggest ACV Feature and Display should not be used together.

In [None]:
# VIF is computed from a numeric design matrix, so keep only numeric columns (this
# excludes the datetime column `ds`) and drop rows containing NaN or +/-inf.
numeric_df = df.select_dtypes(include='number')
# `axis` must be passed by keyword; the positional form `.any(1)` is rejected by pandas 2.x.
vif_df = numeric_df[~numeric_df.isin([np.nan, np.inf, -np.inf]).any(axis=1)]

X = vif_df
# One row per column: its name and its variance inflation factor.
# NOTE(review): every numeric column is included, the target `y` as well — confirm
# whether the VIF should be restricted to the predictor columns.
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]
vif_data

Here is a section that should check ACV Feature and Display to see if the values are randomly dispersed or concentrated in certain locations. I would also suggest multiplying both by 100 to avoid any issues with logarithms or square roots.

## Prophet modeling

In [None]:
# Chronological hold-out split on the date column.
# TODO: replace the 'some_date' placeholder with the real cutoff date.
split_date = 'some_date'
df_train = df[df['ds'] <= split_date]
# Strictly after the cutoff: with `>=` the cutoff row landed in both the training and
# the test set.
df_test = df[df['ds'] > split_date]

This section sets all the parameters for the Prophet model. Setting the growth as linear for now, but worth experimenting with. Changepoints are locations where the rate of change is potentially allowed to change.
Yearly and weekly seasonality are set to 'auto' so Prophet can detect them, and daily seasonality is set to False so that it does not pick up spurious daily patterns. Seasonality mode is set to either additive or multiplicative, whichever fits better.
The following parameters are set to avoid overfitting. Note that at the end of the notebook there is a bit that runs a check on the priors.
Extra regressors must be added before the model is fitted (in the R API this is what setting `fit = FALSE` is for).

In [None]:
# Configure (but do not yet fit) the Prophet model. Fitting happens in a later cell,
# after the extra regressors have been registered.
m = Prophet(
    growth='linear',
    n_changepoints=25,
    changepoint_range=0.8,
    yearly_seasonality='auto',   # keyword was misspelled `yearly_seasonalityc`
    weekly_seasonality='auto',
    daily_seasonality=False,     # must be the boolean, not the string 'False'
    # TODO: pass a holidays DataFrame (columns `holiday` and `ds`) once it exists;
    # the string 'prophet_holidays' is not a valid value for this argument.
    holidays=None,
    seasonality_mode='additive',
    seasonality_prior_scale=10.0,
    holidays_prior_scale=10.0,
    changepoint_prior_scale=0.05,
    mcmc_samples=800,            # full MCMC sampling; expect a slow fit
    interval_width=0.20,         # NOTE(review): 20% uncertainty interval — confirm intended
    uncertainty_samples=500,
    # stan_backend = 'False',
    # `fit=False` was removed: the Python constructor has no `fit` argument; the
    # model is only fitted when `m.fit(...)` is called.
)

Earlier in the code the columns of ACV Feature and Display were added. This should define them so Prophet runs its model with them included. When I did this in R, it took some tweaks to get running properly. It then fits the Prophet model with the parameters set above.

In [None]:
# Register each extra regressor on its own: `add_regressor` takes ONE column name, so
# the single string 'feature, display' would be looked up as one (non-existent) column.
m.add_regressor('feature')
m.add_regressor('display')
# `df_train` must contain `ds`, `y` and both regressor columns.
m.fit(df_train)

This creates a dataframe covering the same dates as the input data, plus four additional weeks.

In [None]:
# Build the frame of dates to predict on: 4 additional weekly periods requested from
# the fitted model.
# NOTE(review): extra regressors are registered on `m` earlier in the notebook, but no
# regressor columns are added to `future` here — presumably they must be supplied
# (for history and the 4 future weeks) before calling `m.predict`; confirm.
future = m.make_future_dataframe(periods=4, freq='W')

Using the fitted Prophet model, this makes projections for the four additional weeks.

In [None]:
# Score every date in `future` with the fitted model.
forecast = m.predict(df=future)

## Model Diagnostics

In [None]:
# Forecast plot produced by the model's own plotting helper.
fig1 = m.plot(fcst=forecast)

In [None]:
# Component plots for the same forecast.
fig2 = m.plot_components(fcst=forecast)

In [None]:
# Interactive (plotly) version of the component plots.
plot_components_plotly(m=m, fcst=forecast)

In [None]:
# Redraw the forecast and overlay the changepoints on that figure's current axes.
fig = m.plot(fcst=forecast)
changepoint_axes = fig.gca()
a = add_changepoints_to_plot(changepoint_axes, m, forecast)

The cross_validation function here uses historical data to measure forecast error. The next function prints off the different measures it uses, like RMSE, SMAPE, etc.

In [None]:
# Cross-validation on the fitted model: 700 days of initial training data, a new
# cutoff every 360 days, and a 30-day horizon per cutoff.
df_cv = cross_validation(
    m,
    horizon='30 days',
    period='360 days',
    initial='700 days',
)

In [None]:
# Summarise the cross-validation results into error metrics.
df_cv_p = performance_metrics(df_cv)
# Show the metrics: an assignment alone renders nothing in the notebook.
df_cv_p.head()

In [None]:
# NOTE: hyper-parameter grid search, intentionally disabled (the whole cell is
# commented out). Before re-enabling it, be aware that `cutoffs` is not defined in the
# visible part of the notebook, that the loop fits on `df` rather than `df_train`, and
# that it rebinds `m` and `df_cv`, replacing the model and cross-validation results
# created in the cells above.
# param_grid = {
#     'changepoint_prior_scale': [0.001, 0.01, 0.1, 0.5],
#     'seasonality_prior_scale': [0.01, 0.1, 1.0, 10.0],
# }
#
# # Generate all combinations of parameters
# all_params = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]
# rmses = []  # Store the RMSEs for each params here
#
# # Use cross validation to evaluate all parameters
# for params in all_params:
#     m = Prophet(**params).fit(df)  # Fit model with given params
#     df_cv = cross_validation(m, cutoffs=cutoffs, horizon='30 days', parallel="processes")
#     df_p = performance_metrics(df_cv, rolling_window=1)
#     rmses.append(df_p['rmse'].values[0])
#
# # Find the best parameters
# tuning_results = pd.DataFrame(all_params)
# tuning_results['rmse'] = rmses
# print(tuning_results)

This plots the forecast and the actuals together so the difference between them can be seen.

In [None]:
# Plot both series against their dates so they share one time axis; plotting against
# the positional index would misalign them because the two frames cover different rows.
fig, ax = plt.subplots(figsize=(12, 6))
# Prophet stores its point forecast in the `yhat` column (there is no `forecast` column).
ax.plot(forecast['ds'], forecast['yhat'], label='Forecast')
ax.plot(df['ds'], df['y'], label='Actuals')
ax.set_xlabel('ds')
ax.set_ylabel('y')
ax.set_title('Forecast vs. actuals')
leg = ax.legend()
plt.show()

Similar metrics to those from the cross_validation function could be used to compare the historical error to the model error.