# 0.0 IMPORTS

In [1]:
import numpy               as np
import pandas              as pd
import math
import random

from scipy import stats as ss

from sklearn.metrics import mean_absolute_error, mean_squared_error

import datetime

import xgboost as xgb

## 0.1 Helper Functions

In [2]:
#Used in section 4.3.2 - EDA, Multivariate Analysis, Categorical Variables
def cat_corr(x, y):
    '''Cross frequencies'''
    cont_mat = pd.crosstab(x, y).to_numpy()
    
    n = cont_mat.sum()
    r, k = cont_mat.shape
    
    '''chi square statistics'''
    chi2 = ss.chi2_contingency(cont_mat)[0]
    
    '''Corrected variables to compensate for bias:'''
    chi2corr = max(0, (chi2/n) - (k - 1)*(r - 1)/(n-1))
    r_corr = r - (r-1)**2/(n-1)
    k_corr = k - (k-1)**2/(n-1)
    
    '''Cramérs V correlation'''''
    cramer_v = np.sqrt((chi2corr)/(min(k_corr-1, r_corr-1)))
    return cramer_v


#Used in section 7 - Machine Learning Modelling
def mean_percentage_error( y, yhat ):
    return np.mean( ( y - yhat ) / y )
     
    
def mean_absolute_percentage_error( y, yhat ):
    return np.mean( np.abs( ( y - yhat ) / y ) )

    
def ml_error( model_name, y, yhat ):
    mae = mean_absolute_error( y, yhat )
    mape = mean_absolute_percentage_error( y, yhat )
    rmse = np.sqrt( mean_squared_error( y, yhat ) )
    
    return pd.DataFrame( { 'Model Name': model_name, 
                           'MAE': mae, 
                           'MAPE': mape,
                           'RMSE': rmse }, index=[0] )


#Used in section 7 - Machine Learning Modelling (cross validation)
def cross_validation(training_data, kfolds, model, model_name, verbose=False):
  mae_list = []
  mape_list = []
  rmse_list = []
  for k in reversed(range(1, kfolds+1)):
    if verbose:
      print('\nKFold Number: {}'.format(k))
    #time intervals of 6 weeks
    validation_start_date = training_data['date'].max() - datetime.timedelta(days=k*6*7)
    validation_end_date = training_data['date'].max() - datetime.timedelta(days=(k-1)*6*7)

    #split data into training and validation
    training = training_data[training_data['date'] < validation_start_date]
    validation = training_data[(training_data['date'] >= validation_start_date) & (training_data['date'] <= validation_end_date)]

    xtraining = training.drop(['date', 'sales'], axis=1)
    ytraining = training['sales']

    xvalidation = validation.drop(['date', 'sales'], axis=1)
    yvalidation = validation['sales']

    #implement a model to get the results
    m = model.fit(xtraining, ytraining)
    yhat = m.predict(xvalidation)
    m_result = ml_error(model_name, np.expm1(yvalidation), np.expm1(yhat))

    mae_list.append(m_result['MAE'])
    mape_list.append(m_result['MAPE'])
    rmse_list.append(m_result['RMSE'])

  return pd.DataFrame({'Model name':model_name,
                       'MAE':np.round(np.mean(mae_list), 2).astype(str) + '+/-' + np.round(np.std(mae_list), 2).astype(str),
                       'MAPE':np.round(np.mean(mape_list), 2).astype(str) + '+/-' + np.round(np.std(mape_list), 2).astype(str),
                       'RMSE':np.round(np.mean(rmse_list), 2).astype(str) + '+/-' + np.round(np.std(rmse_list), 2).astype(str)}, index=[0])

## 0.2 Loading Data

In [3]:
x_training = pd.read_pickle('ml_models_comp/x_training.pkl')
x_testing = pd.read_pickle('ml_models_comp/x_testing.pkl')

# 8.0 Hyperparameter Fine Tuning

## 8.1 Random Search

In [4]:
param = {'n_estimators':[15, 17, 25, 30, 35],
         'eta':[0.01, 0.03],
         'max_depth':[3, 5, 9],
         'subsample':[0.1, 0.5, 0.7],
         'colsample_bytree':[0.3, 0.7, 0.9],
         'min_child_weight':[3, 8, 15],
        }

MAX_EVAL = 2

In [5]:
final_result = pd.DataFrame()

for i in range(MAX_EVAL):
    #randomly selected hyperparameters
    hp = {k:random.sample(v, 1)[0] for k, v in param.items()}
    print(hp)
    #model
    model_xgb = xgb.XGBRegressor(objective='reg:squarederror',
                                 n_estimators=hp['n_estimators'],
                                 eta=hp['eta'],
                                 max_depth=hp['max_depth'],
                                 subsample=hp['subsample'],
                                 colsample_bytree=hp['colsample_bytree'],
                                 min_child_weight=hp['min_child_weight']
                                 )
    result = cross_validation(x_training, 2, model_xgb, 'XGBoost Regressor', verbose=False)
    final_result = pd.concat([final_result, result])

final_result

{'n_estimators': 25, 'eta': 0.03, 'max_depth': 9, 'subsample': 0.1, 'colsample_bytree': 0.9, 'min_child_weight': 15}
{'n_estimators': 30, 'eta': 0.03, 'max_depth': 3, 'subsample': 0.5, 'colsample_bytree': 0.9, 'min_child_weight': 15}


Unnamed: 0,Model name,MAE,MAPE,RMSE
0,XGBoost Regressor,7291.56+/-127.58,0.98+/-0.0,7953.76+/-175.12
0,XGBoost Regressor,7193.76+/-127.06,0.96+/-0.0,7862.32+/-174.85


## 8.2 Final Model

In [6]:
param_tuned = {'n_estimators':25,
         'eta':0.03,
         'max_depth':9,
         'subsample':0.7,
         'colsample_bytree':0.7,
         'min_child_weight':3,
        }

In [7]:
x_train = x_training.drop(['date', 'sales'], axis=1)
y_train = x_training['sales']

x_test = x_testing.drop(['date', 'sales'], axis=1)
y_test = x_testing['sales']

In [8]:
#model
model_xgb_tuned = xgb.XGBRegressor(objective='reg:squarederror',
                                   n_estimators=param_tuned['n_estimators'],
                                   eta=param_tuned['eta'],
                                   max_depth=param_tuned['max_depth'],
                                   subsample=param_tuned['subsample'],
                                   colsample_bytree=param_tuned['colsample_bytree'],
                                   min_child_weight=param_tuned['min_child_weight']
                                  ).fit(x_train, y_train)

yhat_xgb_tuned = model_xgb_tuned.predict(x_test)

xgb_result = ml_error('XGBoost Regressor', np.expm1(y_test), np.expm1(yhat_xgb_tuned))
xgb_result

Unnamed: 0,Model Name,MAE,MAPE,RMSE
0,XGBoost Regressor,6860.731664,0.977765,7502.412248
