<h1 style="text-align:center">Machine Learning</h1>

In [2]:
# remove warnings
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import RandomizedSearchCV

import xgboost as xgb
from xgboost import XGBRegressor
from xgboost import DMatrix
from xgboost import cv

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

from time import time
from datetime import datetime

<h3 style="text-align:center">Load datasets</h3>

In [3]:
train_df = pd.read_pickle('train_df.p')
test_df = pd.read_pickle('test_df.p')
target = pd.read_pickle('target.p')

In [4]:
train_df.shape, test_df.shape, target.shape

((1458621, 16), (625134, 15), (1458621, 1))

<h3 style="text-align:center">Split-out dataset</h3>

In [5]:
# Test options and evaluation metric
num_folds = 10 
seed = 46
test_size = 0.05
scoring = 'r2'

In [6]:
X = train_df.drop(['id', 'trip_duration'], axis=1).values
Y = target.values.ravel()

X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=test_size, \
                                                 random_state=seed)

In [7]:
X_train.shape, y_train.shape, X_val.shape, y_val.shape

((1385689, 14), (1385689,), (72932, 14), (72932,))

<h3 style="text-align:center">Tuning Algorithm</h3>

In [13]:
# Create DMatrix to make XGBoost more efficient
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(test_df.drop(['id'], axis=1).values)

watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

In [9]:
# FOREVER_COMPUTING_FLAG = False
# xgb_pars = []
# for MCW in [10, 20, 50, 75, 100]:
#     for ETA in [0.05, 0.1, 0.15]:
#         for CS in [0.3, 0.4, 0.5]:
#             for MD in [6, 8, 10, 12, 15]:
#                 for SS in [0.5, 0.6, 0.7, 0.8, 0.9]:
#                     for LAMBDA in [0.5, 1., 1.5,  2., 3.]:
#                         xgb_pars.append({'min_child_weight': MCW, 'eta': ETA, 
#                                          'colsample_bytree': CS, 'max_depth': MD,
#                                          'subsample': SS, 'lambda': LAMBDA, 
#                                          'nthread': -1, 'booster' : 'gbtree', 'eval_metric': 'rmse',
#                                          'silent': 1, 'objective': 'reg:linear'})

# while FOREVER_COMPUTING_FLAG:
#     xgb_par = np.random.choice(xgb_pars, 1)[0]
#     print(xgb_par)
#     model = xgb.train(xgb_par, dtrain, 2000, watchlist, early_stopping_rounds=50,
#                       maximize=False, verbose_eval=100)
#     print('Modeling RMSLE %.5f' % model.best_score)

In [25]:
start = time()

xgb_pars = []
for MCW in [10, 20, 50, 75, 100]:
    for ETA in [0.05, 0.1, 0.15]:
        for CS in [0.3, 0.4, 0.5]:
            for MD in [6, 8, 10, 12, 15]:
                for SS in [0.5, 0.6, 0.7, 0.8, 0.9]:
                    for LAMBDA in [0.5, 1., 1.5,  2., 3.]:
                        xgb_pars.append({'min_child_weight': MCW, 'eta': ETA, 
                                         'colsample_bytree': CS, 'max_depth': MD,
                                         'subsample': SS, 'lambda': LAMBDA, 
                                         'nthread': -1, 'booster' : 'gbtree', 'eval_metric': 'rmse',
                                         'silent': 1, 'objective': 'reg:linear'})

print("elapsed time:", round(time()-start, 3), "s")

elapsed time: 0.005 s


In [26]:
result = []
# Todo: printing the number of iteration at which we are to keep track how far we are
start = time()

num_iteration = 100
for _ in range(num_iteration):
    xgb_par = np.random.choice(xgb_pars, 1)[0]
    print(xgb_par)
    model = xgb.train(xgb_par, dtrain, 2000, watchlist, early_stopping_rounds=50,
                      maximize=False, verbose_eval=100)
    print('Modeling RMSLE %.5f' % model.best_score)    
    
    result.append((xgb_par, model.best_score))

print("elapsed time:", round(time()-start, 3), "s")

{'min_child_weight': 75, 'eta': 0.05, 'colsample_bytree': 0.3, 'max_depth': 6, 'subsample': 0.9, 'lambda': 1.0, 'nthread': -1, 'booster': 'gbtree', 'eval_metric': 'rmse', 'silent': 1, 'objective': 'reg:linear'}
[0]	train-rmse:5.72199	valid-rmse:5.72171
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[100]	train-rmse:0.473914	valid-rmse:0.475264
[200]	train-rmse:0.451814	valid-rmse:0.45389
[300]	train-rmse:0.44603	valid-rmse:0.448625
[400]	train-rmse:0.443414	valid-rmse:0.446582
[500]	train-rmse:0.441818	valid-rmse:0.445349
[600]	train-rmse:0.44033	valid-rmse:0.444329
[700]	train-rmse:0.439371	valid-rmse:0.443825
[800]	train-rmse:0.438532	valid-rmse:0.443356
[900]	train-rmse:0.437655	valid-rmse:0.442964
[1000]	train-rmse:0.43684	valid-rmse:0.442685
[1100]	train-rmse:0.436289	valid-rmse:0.442541
[1200]	train-rmse:0.435748	valid-rmse:0.442362
[1300]	train-rmse:0.435233	valid-rmse:0.442143
[140

[0]	train-rmse:5.13109	valid-rmse:5.13108
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[100]	train-rmse:0.439981	valid-rmse:0.444684
[200]	train-rmse:0.435184	valid-rmse:0.442593
Stopping. Best iteration:
[188]	train-rmse:0.435589	valid-rmse:0.442442

Modeling RMSLE 0.44244
{'min_child_weight': 50, 'eta': 0.1, 'colsample_bytree': 0.3, 'max_depth': 6, 'subsample': 0.8, 'lambda': 2.0, 'nthread': -1, 'booster': 'gbtree', 'eval_metric': 'rmse', 'silent': 1, 'objective': 'reg:linear'}
[0]	train-rmse:5.42647	valid-rmse:5.4263
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[100]	train-rmse:0.450043	valid-rmse:0.452328
[200]	train-rmse:0.443059	valid-rmse:0.446174
[300]	train-rmse:0.440313	valid-rmse:0.444708
[400]	train-rmse:0.438793	valid-rmse:0.444011
[500]	train-rmse:0.43741	valid-rmse:0.443704


[0]	train-rmse:5.42652	valid-rmse:5.42635
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[100]	train-rmse:0.446123	valid-rmse:0.448504
[200]	train-rmse:0.441084	valid-rmse:0.444654
[300]	train-rmse:0.438968	valid-rmse:0.443781
[400]	train-rmse:0.437558	valid-rmse:0.443307
[500]	train-rmse:0.436378	valid-rmse:0.442753
[600]	train-rmse:0.435192	valid-rmse:0.442503
Stopping. Best iteration:
[628]	train-rmse:0.434901	valid-rmse:0.442446

Modeling RMSLE 0.44245
{'min_child_weight': 75, 'eta': 0.1, 'colsample_bytree': 0.5, 'max_depth': 8, 'subsample': 0.5, 'lambda': 0.5, 'nthread': -1, 'booster': 'gbtree', 'eval_metric': 'rmse', 'silent': 1, 'objective': 'reg:linear'}
[0]	train-rmse:5.42052	valid-rmse:5.42026
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[100]	train-rmse:0.436902	valid-rmse:0.44215

Modeling RMSLE 0.43786
{'min_child_weight': 10, 'eta': 0.05, 'colsample_bytree': 0.3, 'max_depth': 12, 'subsample': 0.8, 'lambda': 1.5, 'nthread': -1, 'booster': 'gbtree', 'eval_metric': 'rmse', 'silent': 1, 'objective': 'reg:linear'}
[0]	train-rmse:5.72192	valid-rmse:5.72166
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[100]	train-rmse:0.454375	valid-rmse:0.466473
[200]	train-rmse:0.42519	valid-rmse:0.44628
[300]	train-rmse:0.413858	valid-rmse:0.441766
[400]	train-rmse:0.406824	valid-rmse:0.44027
[500]	train-rmse:0.401052	valid-rmse:0.439465
[600]	train-rmse:0.396812	valid-rmse:0.439181
[700]	train-rmse:0.393313	valid-rmse:0.439005
Stopping. Best iteration:
[741]	train-rmse:0.39194	valid-rmse:0.438887

Modeling RMSLE 0.43889
{'min_child_weight': 10, 'eta': 0.15, 'colsample_bytree': 0.3, 'max_depth': 8, 'subsample': 0.6, 'lambda': 3.0, 'nthread': -1, 'booster': 'gbtree', 'eval_metric': '

[400]	train-rmse:0.423936	valid-rmse:0.440626
[500]	train-rmse:0.420239	valid-rmse:0.439543
[600]	train-rmse:0.417423	valid-rmse:0.43902
[700]	train-rmse:0.415352	valid-rmse:0.438774
[800]	train-rmse:0.413566	valid-rmse:0.438682
Stopping. Best iteration:
[773]	train-rmse:0.413964	valid-rmse:0.43862

Modeling RMSLE 0.43862
{'min_child_weight': 10, 'eta': 0.05, 'colsample_bytree': 0.3, 'max_depth': 8, 'subsample': 0.5, 'lambda': 1.0, 'nthread': -1, 'booster': 'gbtree', 'eval_metric': 'rmse', 'silent': 1, 'objective': 'reg:linear'}
[0]	train-rmse:5.72194	valid-rmse:5.72168
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[100]	train-rmse:0.467814	valid-rmse:0.470439
[200]	train-rmse:0.445288	valid-rmse:0.449797
[300]	train-rmse:0.438619	valid-rmse:0.44482
[400]	train-rmse:0.435199	valid-rmse:0.442901
[500]	train-rmse:0.432778	valid-rmse:0.441788
[600]	train-rmse:0.430753	valid-rmse:0.441245
[70

[500]	train-rmse:0.430206	valid-rmse:0.441943
Stopping. Best iteration:
[512]	train-rmse:0.42994	valid-rmse:0.44181

Modeling RMSLE 0.44181
{'min_child_weight': 50, 'eta': 0.1, 'colsample_bytree': 0.5, 'max_depth': 8, 'subsample': 0.7, 'lambda': 1.5, 'nthread': -1, 'booster': 'gbtree', 'eval_metric': 'rmse', 'silent': 1, 'objective': 'reg:linear'}
[0]	train-rmse:5.42053	valid-rmse:5.42027
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[100]	train-rmse:0.435662	valid-rmse:0.441918
[200]	train-rmse:0.431218	valid-rmse:0.439885
[300]	train-rmse:0.428311	valid-rmse:0.439596
Stopping. Best iteration:
[316]	train-rmse:0.427933	valid-rmse:0.439551

Modeling RMSLE 0.43955
{'min_child_weight': 75, 'eta': 0.1, 'colsample_bytree': 0.3, 'max_depth': 12, 'subsample': 0.8, 'lambda': 3.0, 'nthread': -1, 'booster': 'gbtree', 'eval_metric': 'rmse', 'silent': 1, 'objective': 'reg:linear'}
[0]	train-rmse:5.4

Stopping. Best iteration:
[745]	train-rmse:0.426821	valid-rmse:0.441282

Modeling RMSLE 0.44128
{'min_child_weight': 50, 'eta': 0.1, 'colsample_bytree': 0.3, 'max_depth': 10, 'subsample': 0.7, 'lambda': 3.0, 'nthread': -1, 'booster': 'gbtree', 'eval_metric': 'rmse', 'silent': 1, 'objective': 'reg:linear'}
[0]	train-rmse:5.42633	valid-rmse:5.42621
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[100]	train-rmse:0.437707	valid-rmse:0.445935
[200]	train-rmse:0.428507	valid-rmse:0.440824
[300]	train-rmse:0.424211	valid-rmse:0.44002
[400]	train-rmse:0.420954	valid-rmse:0.439687
[500]	train-rmse:0.418371	valid-rmse:0.439462
Stopping. Best iteration:
[497]	train-rmse:0.418453	valid-rmse:0.439445

Modeling RMSLE 0.43944
{'min_child_weight': 10, 'eta': 0.05, 'colsample_bytree': 0.4, 'max_depth': 6, 'subsample': 0.6, 'lambda': 1.0, 'nthread': -1, 'booster': 'gbtree', 'eval_metric': 'rmse', 'silent': 

KeyboardInterrupt: 

In [23]:
result

[({'booster': 'gbtree',
   'colsample_bytree': 0.5,
   'eta': 0.15,
   'eval_metric': 'rmse',
   'lambda': 0.5,
   'max_depth': 6,
   'min_child_weight': 20,
   'nthread': -1,
   'objective': 'reg:linear',
   'silent': 1,
   'subsample': 0.6},
  0.442697),
 ({'booster': 'gbtree',
   'colsample_bytree': 0.4,
   'eta': 0.1,
   'eval_metric': 'rmse',
   'lambda': 1.0,
   'max_depth': 10,
   'min_child_weight': 75,
   'nthread': -1,
   'objective': 'reg:linear',
   'silent': 1,
   'subsample': 0.5},
  0.438897)]

In [22]:
min(result, key=lambda x:x[1])

({'booster': 'gbtree',
  'colsample_bytree': 0.4,
  'eta': 0.1,
  'eval_metric': 'rmse',
  'lambda': 1.0,
  'max_depth': 10,
  'min_child_weight': 75,
  'nthread': -1,
  'objective': 'reg:linear',
  'silent': 1,
  'subsample': 0.5},
 0.438897)

In [27]:
len(result)

75

In [28]:
result

[({'booster': 'gbtree',
   'colsample_bytree': 0.3,
   'eta': 0.05,
   'eval_metric': 'rmse',
   'lambda': 1.0,
   'max_depth': 6,
   'min_child_weight': 75,
   'nthread': -1,
   'objective': 'reg:linear',
   'silent': 1,
   'subsample': 0.9},
  0.441596),
 ({'booster': 'gbtree',
   'colsample_bytree': 0.3,
   'eta': 0.1,
   'eval_metric': 'rmse',
   'lambda': 2.0,
   'max_depth': 12,
   'min_child_weight': 20,
   'nthread': -1,
   'objective': 'reg:linear',
   'silent': 1,
   'subsample': 0.9},
  0.438612),
 ({'booster': 'gbtree',
   'colsample_bytree': 0.4,
   'eta': 0.1,
   'eval_metric': 'rmse',
   'lambda': 1.0,
   'max_depth': 12,
   'min_child_weight': 75,
   'nthread': -1,
   'objective': 'reg:linear',
   'silent': 1,
   'subsample': 0.9},
  0.436946),
 ({'booster': 'gbtree',
   'colsample_bytree': 0.5,
   'eta': 0.15,
   'eval_metric': 'rmse',
   'lambda': 3.0,
   'max_depth': 6,
   'min_child_weight': 20,
   'nthread': -1,
   'objective': 'reg:linear',
   'silent': 1,
   'sub

Saving the result

In [29]:
np.save('result.npy', result)

Loading

In [30]:
result_from_aws = np.load('result.npy')

In [32]:
len(result_from_aws)

75

In [33]:
type(result_from_aws)

numpy.ndarray

In [34]:
min(result_from_aws, key=lambda x:x[1])

array([ {'min_child_weight': 10, 'eta': 0.05, 'colsample_bytree': 0.5, 'max_depth': 12, 'subsample': 0.8, 'lambda': 2.0, 'nthread': -1, 'booster': 'gbtree', 'eval_metric': 'rmse', 'silent': 1, 'objective': 'reg:linear'},
       0.43526], dtype=object)