In [1]:
"""
0.0 IMPORTS
"""

import pandas as pd
import numpy as np

import datetime

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression, Lasso

import xgboost as xgb

In [2]:
"""
0.1 Functions
"""


def mean_percentage_error(y, yhat):
    """Return the mean signed relative error of ``yhat`` with respect to ``y``.

    Every residual ``y - yhat`` is divided by the matching ``y`` and the
    quotients are averaged. Signs are kept, so errors in opposite directions
    can cancel out. The value is a fraction (it is not multiplied by 100);
    ``y`` must not contain zeros for the result to be meaningful.
    """
    residual = y - yhat
    relative_error = residual / y
    return np.mean(relative_error)


def mean_absolute_percentage_error(y, yhat):
    """Return the mean absolute percentage error (MAPE) as a fraction.

    Each residual ``y - yhat`` is divided by the matching ``y``; the absolute
    values of those quotients are then averaged. The value is not multiplied
    by 100, and ``y`` must not contain zeros for it to be meaningful.
    """
    relative_error = (y - yhat) / y
    absolute_relative_error = np.abs(relative_error)
    return np.mean(absolute_relative_error)


def ml_error(model_name, y, yhat):
    """Summarise the prediction errors of one model as a one-row DataFrame.

    Parameters
    ----------
    model_name
        Label stored in the ``'Model Name'`` column.
    y, yhat
        Observed and predicted values, forwarded unchanged to the metric
        functions.

    Returns
    -------
    pandas.DataFrame
        A single row (index ``0``) with the columns ``'Model Name'``,
        ``'MAE'``, ``'MAPE'`` and ``'RMSE'``; RMSE is the square root of the
        mean squared error.
    """
    metrics = {
        'Model Name': model_name,
        'MAE': mean_absolute_error(y, yhat),
        'MAPE': mean_absolute_percentage_error(y, yhat),
        'RMSE': np.sqrt(mean_squared_error(y, yhat)),
    }
    return pd.DataFrame(metrics, index=[0])


def cross_validation(training_data, kfolds, model, model_name, verbose=False):
    """Score ``model`` with time-ordered cross validation over six-week windows.

    The last ``kfolds`` six-week windows of ``training_data`` are used as
    validation sets, oldest window first. For every window the model is fitted
    on all rows dated strictly before the window and evaluated on the rows
    inside it.

    Parameters
    ----------
    training_data : pandas.DataFrame
        Must contain a ``'date'`` column and a ``'sales'`` target column; every
        other column is used as a feature. Targets and predictions are passed
        through ``np.expm1`` before scoring, so ``'sales'`` is presumably stored
        as ``log1p(sales)`` -- confirm against the preprocessing step.
    kfolds : int
        Number of validation windows; must be at least 1.
    model : estimator
        Object exposing ``fit(X, y)`` (returning the fitted estimator) and
        ``predict(X)``. The same object is passed to ``fit`` on every fold.
    model_name : str
        Label stored in the result.
    verbose : bool, default False
        When true, print the fold number before each fit.

    Returns
    -------
    pandas.DataFrame
        One row (index ``0``) with the columns ``'Model name'``, ``'MAE'``,
        ``'MAPE'`` and ``'RMSE'``. Each metric is a ``'mean+/-std'`` string
        computed over the folds and rounded to two decimals.

    Raises
    ------
    ValueError
        If ``kfolds`` is smaller than 1, or if a fold ends up with no training
        rows or no validation rows (too many folds for the date range covered
        by ``training_data``).
    """
    if kfolds < 1:
        raise ValueError('kfolds must be at least 1, got {}'.format(kfolds))

    fold_days = 6 * 7  # each validation window spans six weeks

    # Loop-invariant: look the column up and scan it for its maximum only once.
    dates = training_data['date']
    last_date = dates.max()

    mae_list = []
    mape_list = []
    rmse_list = []
    for k in reversed(range(1, kfolds+1)):
        if verbose:
            print('\nKFold Number: {}'.format(k))

        validation_start_date = last_date - datetime.timedelta(days=k*fold_days)
        validation_end_date = last_date - datetime.timedelta(days=(k-1)*fold_days)

        # Training rows are strictly older than the validation window.
        # NOTE: both window bounds are inclusive, so two consecutive windows
        # share their boundary date; kept as is so that the reported metrics
        # stay comparable with earlier runs.
        training = training_data[dates < validation_start_date]
        validation = training_data[(dates >= validation_start_date) & (dates <= validation_end_date)]

        if training.empty or validation.empty:
            raise ValueError(
                'fold {} has {} training rows and {} validation rows; '
                'reduce kfolds or provide a longer date range'.format(
                    k, len(training), len(validation)))

        xtraining = training.drop(['date', 'sales'], axis=1)
        ytraining = training['sales']

        xvalidation = validation.drop(['date', 'sales'], axis=1)
        yvalidation = validation['sales']

        # Fit on the past, predict the window, and score on the expm1 scale.
        m = model.fit(xtraining, ytraining)
        yhat = m.predict(xvalidation)
        m_result = ml_error(model_name, np.expm1(yvalidation), np.expm1(yhat))

        # Keep plain scalars rather than one-element Series.
        mae_list.append(m_result['MAE'].iloc[0])
        mape_list.append(m_result['MAPE'].iloc[0])
        rmse_list.append(m_result['RMSE'].iloc[0])

    def _mean_std(values):
        # Format the fold scores as '<mean>+/-<std>' rounded to two decimals.
        return np.round(np.mean(values), 2).astype(str) + '+/-' + np.round(np.std(values), 2).astype(str)

    return pd.DataFrame({'Model name': model_name,
                         'MAE': _mean_std(mae_list),
                         'MAPE': _mean_std(mape_list),
                         'RMSE': _mean_std(rmse_list)}, index=[0])

In [3]:
"""
Load data
"""

# NOTE(security): unpickling can execute arbitrary code, so 'x_training.pkl'
# must come from a trusted source.
x_training = pd.read_pickle('x_training.pkl')

# Fail fast when the columns that cross_validation reads are missing.
assert {'date', 'sales'}.issubset(x_training.columns), \
    "x_training.pkl must contain the 'date' and 'sales' columns"

In [4]:
"""
Compare models' performance - cross validation
"""

# XGBoost regressor, scored with 5 folds of cross_validation
model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    eta=0.01,
    max_depth=10,
    subsample=0.7,
    colsample_bytree=0.9,
)
xgb_cv = cross_validation(x_training, 5, model, 'XGBoost', verbose=True)

# Show the summary row as a grid-formatted table in the cell output
print(xgb_cv.to_markdown(tablefmt='grid'))

# Persist the summary row to disk
xgb_cv.to_pickle("xgb.pkl")


KFold Number: 5

KFold Number: 4

KFold Number: 3

KFold Number: 2

KFold Number: 1
+----+--------------+------------------+-------------+------------------+
|    | Model name   | MAE              | MAPE        | RMSE             |
|  0 | XGBoost      | 1177.07+/-245.14 | 0.16+/-0.02 | 1707.26+/-355.68 |
+----+--------------+------------------+-------------+------------------+
