In [14]:
#Download important modules
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from matplotlib import pyplot as plt
import seaborn as sb
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings 
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)
from xgboost import XGBRegressor
import pandas as pd
import pickle

In [2]:
#Retrieve the data
x_train = pd.read_pickle("data/train_x.pkl")
x_val = pd.read_pickle("data/val_x.pkl")
x_test = pd.read_pickle("data/test_x.pkl")
x_nulls = pd.read_pickle("data/x_nulls.pkl")

y_train = pd.read_pickle("data/train_y.pkl")
y_val = pd.read_pickle("data/val_y.pkl")
y_test = pd.read_pickle("data/test_y.pkl")
y_nulls = pd.read_pickle("data/y_nulls.pkl")

In [3]:
#Define performance function
def model_performance(model, x_train, y_train, x_val, y_val, x_test=None, y_test=None, include_test=False):
    prediction_train = model.predict(x_train)
    mae_train = mean_absolute_error(y_train , prediction_train)
    prediction_val = model.predict(x_val)
    mae_val = mean_absolute_error(y_val , prediction_val)
    prediction_test = model.predict(x_test)

    print('Model train MAE = ', mae_train)
    print('Model validation MAE = ', mae_val)
    if include_test:
        mae_test = mean_absolute_error(y_test , prediction_test)
        print('Model test MAE = ', mae_test)

In [4]:
#Fit the model
model = XGBRegressor()
model.fit(x_train,y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=1, verbosity=1)

In [5]:
#Some overfitting, so lets try tuning the hyperparameters
model_performance(model, x_train, y_train, x_val, y_val, x_test, y_test, include_test=False)

Model train MAE =  16.86612792444182
Model validation MAE =  17.479989334905248


In [6]:
#First do a random grid search, and then refine the search when we know best possible values
random_grid = {
     "eta"    : [0.05, 0.10, 0.15, 0.20, 0.30 ] ,
     "max_depth"        : [ 3, 5, 8, 12, 15],
     "min_child_weight" : [ 1, 3, 5, 7 ],
     "gamma"            : [ 0.0, 0.1, 0.2 , 0.4 ],
     "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ]
     }

model_2 = XGBRegressor()
xgboost_random = RandomizedSearchCV(estimator = model_2, param_distributions = random_grid, n_iter = 50, 
                               cv = 3, verbose=2, random_state= 30, n_jobs = -1, scoring="neg_mean_absolute_error")
xgboost_random.fit(x_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  6.4min finished




RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=1, verbosity=1),
          fit_params=None, iid='warn', n_iter=50, n_jobs=-1,
          param_distributions={'eta': [0.05, 0.1, 0.15, 0.2, 0.3], 'max_depth': [3, 5, 8, 12, 15], 'min_child_weight': [1, 3, 5, 7], 'gamma': [0.0, 0.1, 0.2, 0.4], 'colsample_bytree': [0.3, 0.4, 0.5, 0.7]},
          pre_dispatch='2*n_jobs', random_state=30, refit=True,
          return_train_score='warn', scoring='neg_mean_absolute_error',
          verbose=2)

In [7]:
#Get the best parameters so far
print(xgboost_random.best_params_)

#Evaluate the new model
best_model = xgboost_random.best_estimator_
model_performance(best_model, x_train, y_train, x_val, y_val, x_test, y_test, include_test=True)

{'min_child_weight': 3, 'max_depth': 3, 'gamma': 0.0, 'eta': 0.05, 'colsample_bytree': 0.7}
Model train MAE =  16.80219102760765
Model validation MAE =  17.537460270734627
Model test MAE =  18.00076711157307


In [8]:
#Refine the search in a grid search
grid = {
     "eta"    : [0.0, 0.025, 0.05] ,
     "max_depth"        : [ 2,3,4],
     "min_child_weight" : [ 2, 3, 4 ],
     "gamma"            : [ 0.0, 0.03, 0.06 ],
     "colsample_bytree" : [ 0.7 ]
     }

model_3 = XGBRegressor()
xgboost_grid = GridSearchCV(model_3, grid, 
                               cv = 5, verbose=2, n_jobs = -1, scoring="neg_mean_absolute_error")
xgboost_grid.fit(x_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   48.0s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed: 10.0min finished




GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=1, verbosity=1),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'eta': [0.0, 0.025, 0.05], 'max_depth': [2, 3, 4], 'min_child_weight': [2, 3, 4], 'gamma': [0.0, 0.03, 0.06], 'colsample_bytree': [0.7]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_absolute_error', verbose=2)

In [11]:
#Get best parameters so far
print(xgboost_grid.best_params_)

#Evaluate the new model
grid_model = xgboost_grid.best_estimator_
model_performance(grid_model, x_train, y_train, x_val, y_val, x_test, y_test, include_test=False)

{'colsample_bytree': 0.7, 'eta': 0.0, 'gamma': 0.0, 'max_depth': 4, 'min_child_weight': 2}
Model train MAE =  16.088280873013197
Model validation MAE =  17.418176263240774


In [10]:
#Reported mean absolute error of 18.0
test_predictions = model.predict(x_test)
mae_test = mean_absolute_error(y_test,test_predictions)
print('Mean absolute error in test set : ', mae_test)

Mean absolute error in test set :  17.999291074620853


In [12]:
#Dump the predictions
with open("predictions/xgboost.csv", "w") as f:
    predictions = model.predict(x_nulls)
    [f.write("{}, {}\n".format(i,j)) for (i,j) in zip(y_nulls.order_id, predictions)]

In [16]:
#Dump the model
pickle.dump(best_model, open("models_dump/xgboost.sav", 'wb'))