# Training the XGBoost model with tuned hyperparameters

In [None]:
# Load IPython's autoreload extension so edited modules are picked up without restarting the kernel.
%load_ext autoreload

# Mode 2: reload all imported modules before each cell is executed.
%autoreload 2

In [None]:
#Import the necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import math
import seaborn as sns

import xgboost as xgb
from sklearn import metrics, model_selection

import tikzplotlib

In [None]:
# Load the feature table used for training and validation (the 't_0' column is
# used as the regression target further down).
# NOTE(review): unpickling can execute arbitrary code; only load pickle files from a trusted source.
df_features = pd.read_pickle("df_features_final_var.pkl")
# Bare expression: shows the frame as the cell output.
df_features

In [None]:
# Correlation matrix of the columns in the feature table, drawn as an annotated heatmap.
correlation_matrix = df_features.corr()
# Labels of the columns that take part in the correlation matrix.
labels_corr_features = correlation_matrix.index
plt.figure(figsize=(20, 20))
# Reuse the matrix computed above: re-selecting the same columns and calling
# .corr() a second time yields the identical matrix at twice the cost.
g = sns.heatmap(correlation_matrix, annot=True)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a validation set; the fixed random_state makes
# the split reproducible between runs.
train, validation = train_test_split(
    df_features,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Separate the regression target 't_0' from the remaining feature columns,
# for both the training and the validation partition.
X_train = train.drop(columns=['t_0'])
X_validation = validation.drop(columns=['t_0'])

y_train = train['t_0']
y_validation = validation['t_0']

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

# Baseline: XGBoost regressor with the library's default hyperparameters.
model = xgb.XGBRegressor()
model.fit(X_train, y_train)

# Predict on both partitions so train and validation scores can be compared.
y_predict = model.predict(X_validation)
y_predict_train = model.predict(X_train)

print('Mean Absolute error validation set:', mean_absolute_error(y_validation, y_predict))
print('Mean Absolute error train set:', mean_absolute_error(y_train, y_predict_train))
print('R2 validation set:', r2_score(y_validation, y_predict))
print('R2 train set:', r2_score(y_train, y_predict_train))
# mean_squared_error returns the MSE; take the square root so the printed
# value really is the RMSE announced by the label.
print('RMSE validation set:', np.sqrt(mean_squared_error(y_validation, y_predict)))
print('RMSE train set:', np.sqrt(mean_squared_error(y_train, y_predict_train)))


### Hyperparameter tuning setup

In [None]:
#specify parameter range

# base_score = [0.25,0.5,0.75,1]
# n_estimators = [10,100,500,800,1000,1200,1500]
# max_depth = [2,3,4,5,8,10,15]
# booster = ['gbtree','gblinear']
# learning_rate = [0.01,0.05,0.1,0.15,0.2]
# min_child_weight = [1,2,3,4]


# hyperparameter_grid = {
#     'n_estimators':n_estimators,
#     'max_depth': max_depth,
#     'learning_rate': learning_rate,
#     'min_child_weight': min_child_weight,
#     'booster': booster,
#     'base_score':base_score
# }

In [None]:
# Rebind `model` to a fresh, unfitted default regressor; this is the estimator
# referenced by the (commented-out) randomized hyperparameter search.
model = xgb.XGBRegressor()

In [None]:
# Hyperparameter tuning

# random_cv = model_selection.RandomizedSearchCV(estimator = model,
#                                               param_distributions=hyperparameter_grid,
#                                               cv = 5,
#                                               n_iter = 50,
#                                               scoring='neg_mean_absolute_error',
#                                               n_jobs = 4,
#                                               return_train_score=True,
#                                               random_state=42)

# random_cv.fit(X_train,y_train)
# random_cv.best_estimator_


# dict_results = random_cv.cv_results_

# plotting the hyperparameters
# df_results = pd.DataFrame(dict_results)

# neg_mean_abs_val = df_results['mean_test_score'].values
# plt.plot(neg_mean_abs_val)
# plt.xlabel('Hyper-parameter configuration number')
# plt.ylabel('Negative mean absolute error')

In [None]:
# Model with the second-best hyperparameters.
#
# Changes relative to the estimator repr this was pasted from:
# - 'reg:linear' is the deprecated alias of 'reg:squarederror' (same squared-error
#   loss), so the current name is used.
# - silent=None, nthread=None, seed=None and missing=None only restated "use the
#   default"; silent/nthread/seed are deprecated (superseded by verbosity/n_jobs/
#   random_state, which are set explicitly below), so they are omitted.
xgb_2best_model = xgb.XGBRegressor(
    base_score=1,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=1,
    gamma=0,
    importance_type='gain',
    learning_rate=0.05,
    max_delta_step=0,
    max_depth=10,
    min_child_weight=3,
    n_estimators=100,
    n_jobs=1,
    objective='reg:squarederror',
    random_state=0,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    subsample=1,
    verbosity=1,
)

In [None]:
# Model with the best hyperparameters.
#
# Changes relative to the estimator repr this was pasted from:
# - 'reg:linear' is the deprecated alias of 'reg:squarederror' (same squared-error
#   loss), so the current name is used.
# - silent=None, nthread=None, seed=None and missing=None only restated "use the
#   default"; silent/nthread/seed are deprecated (superseded by verbosity/n_jobs/
#   random_state, which are set explicitly below), so they are omitted.
xgb_best_model = xgb.XGBRegressor(
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=1,
    gamma=0,
    importance_type='gain',
    learning_rate=0.2,
    max_delta_step=0,
    max_depth=15,
    min_child_weight=2,
    n_estimators=10,
    n_jobs=1,
    objective='reg:squarederror',
    random_state=0,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    subsample=1,
    verbosity=1,
)

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

# Fit the second-best configuration and score it on both partitions.
xgb_2best_model.fit(X_train, y_train)

y_predict = xgb_2best_model.predict(X_validation)
y_predict_train = xgb_2best_model.predict(X_train)

print('Mean Absolute error validation set:', mean_absolute_error(y_validation, y_predict))
print('Mean Absolute error train set:', mean_absolute_error(y_train, y_predict_train))
print('R2 validation set:', r2_score(y_validation, y_predict))
print('R2 train set:', r2_score(y_train, y_predict_train))
# mean_squared_error returns the MSE; take the square root so the printed
# value really is the RMSE announced by the label.
print('RMSE validation set:', np.sqrt(mean_squared_error(y_validation, y_predict)))
print('RMSE train set:', np.sqrt(mean_squared_error(y_train, y_predict_train)))

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

# Fit the best configuration and score it on both partitions.
xgb_best_model.fit(X_train, y_train)

y_predict = xgb_best_model.predict(X_validation)
y_predict_train = xgb_best_model.predict(X_train)

print('Mean Absolute error validation set:', mean_absolute_error(y_validation, y_predict))
print('Mean Absolute error train set:', mean_absolute_error(y_train, y_predict_train))
print('R2 validation set:', r2_score(y_validation, y_predict))
print('R2 train set:', r2_score(y_train, y_predict_train))
# mean_squared_error returns the MSE; take the square root so the printed
# value really is the RMSE announced by the label.
print('RMSE validation set:', np.sqrt(mean_squared_error(y_validation, y_predict)))
print('RMSE train set:', np.sqrt(mean_squared_error(y_train, y_predict_train)))

In [None]:
# Feature-importance bar chart for the second-best model; the result is bound
# to `_` so the notebook does not echo the returned object.
_ = xgb.plot_importance(xgb_2best_model, height=0.9)

In [None]:
# Feature-importance bar chart for the best model; the result is bound to `_`
# so the notebook does not echo the returned object.
_ = xgb.plot_importance(xgb_best_model, height=0.9)

## Evaluate on the test set

In [None]:
# Load the test set, replace 'variance' by its square root (the column keeps
# its name) and discard the two sequence columns.
# NOTE(review): confirm the training features in df_features_final_var.pkl
# received the same square-root transform; it is not visible in this notebook.
df_test = (
    pd.read_pickle("df_test_final_var.pkl")
    .assign(variance=lambda frame: np.sqrt(frame['variance']))
    .drop(columns=['product_seq', 'days_since_seq'])
)
df_test
# df_val = df_val.drop(['max_days','min_days','avg_days'], axis = 1)

In [None]:
# Target and model inputs for the test set: 't_0' is the target, and
# 'num_times_purchased' is excluded from the features as well.
y_test = df_test['t_0']
X_test = df_test.drop(columns=['t_0', 'num_times_purchased'])

In [None]:
#pd.DataFrame(xgb_best_model.predict(X_test)).to_pickle('xgb_results_hyperparameter_model.pkl')

In [None]:
# Work on a copy so prediction columns can be added without touching df_test.
test_with_col = df_test.copy()

In [None]:
# Store the best model's test-set predictions next to the true 't_0' values.
test_with_col['t_xgboost'] = xgb_best_model.predict(X_test)

In [None]:
# Mean absolute error of the XGBoost predictions on the test set.
metrics.mean_absolute_error(test_with_col['t_0'],test_with_col['t_xgboost'])

In [None]:
# Mean absolute error when the 'avg_days' column itself is used as the prediction
# (presumably a simple average-based baseline; confirm against the feature pipeline).
metrics.mean_absolute_error(test_with_col['t_0'],test_with_col['avg_days'])

In [None]:
# Per-row absolute error of the XGBoost predictions as a NumPy array, plus the
# index order that sorts those errors ascending.
abs_error_xg = (test_with_col['t_0'] - test_with_col['t_xgboost']).abs().values
idx = abs_error_xg.argsort()

In [None]:
# Per-row absolute error of the 'avg_days' column against the target, as a NumPy array.
abs_error_aaa = (test_with_col['t_0'] - test_with_col['avg_days']).abs().values

In [None]:
# Plot the XGBoost absolute errors in ascending order (idx sorts abs_error_xg).
plt.plot(abs_error_xg[idx])
# Disabled: the 'avg_days' errors in that same row order, for comparison.
# plt.plot(abs_error_aaa[idx])

In [None]:
# Overlay the ascending absolute-error curves of both predictors, truncated to
# the 110000 smallest errors of each.
for column, label in (('t_xgboost', "XG_BOOST"), ('avg_days', "AAA")):
    abs_error = (test_with_col['t_0'] - test_with_col[column]).abs()
    plt.plot(np.sort(abs_error)[:110000], label=label)
plt.legend()

In [None]:
# Mean of the 80000 smallest absolute errors of the XGBoost predictions.
abs_error = (test_with_col['t_0'] - test_with_col['t_xgboost']).abs()
np.sort(abs_error)[:80000].mean()

In [None]:
# Mean of the 80000 smallest absolute errors of the 'avg_days' column.
abs_error = (test_with_col['t_0'] - test_with_col['avg_days']).abs()
np.sort(abs_error)[:80000].mean()