### XGBoost Hyperparameter tuning using sklearn's GridSearchCV.

In [1]:
import numpy as np
import pandas as pd
import sklearn.metrics 
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
import xgboost as xgb

In [3]:
# Load the prepared dataset from disk.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run on a machine where this exact file exists -- consider a
# configurable data directory.
data = pd.read_csv(
    'C:/Users/User/Desktop/Mayada Kh/University/Дипломна/materials/py code/readydata.csv'
)

# Columns removed before modelling.
# NOTE(review): these look like payment/recovery fields that are presumably
# only known after the outcome (i.e. dropped to avoid target leakage) -- confirm.
droplist = [
    'total_pymnt',
    'total_pymnt_inv',
    'total_rec_late_fee',
    'recoveries',
    'collection_recovery_fee',
    'last_pymnt_amnt',
]
data = data.drop(columns=droplist)

# Split the frame into the response (`target`) and the predictor matrix.
Y = data['target']
X = data.drop(columns='target')

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1,
)

In [4]:
# Instantiate an XGBoost regressor with library defaults and echo its repr so
# the untuned parameter values are visible before the grid searches below.
model = xgb.XGBRegressor()
model

XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=None,
             enable_categorical=False, gamma=None, gpu_id=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,
             validate_parameters=None, verbosity=None)

#### Tuning max_depth and min_child_weight

In [5]:
# Stage 1: coarse grid over the tree-complexity parameters.
# 4 depths x 3 child weights = 12 candidates, each scored with 5-fold CV on R^2.
param_test1 = {
    'max_depth': range(3, 10, 2),        # 3, 5, 7, 9
    'min_child_weight': range(1, 6, 2),  # 1, 3, 5
}
gsearch1 = GridSearchCV(
    estimator=XGBRegressor(
        learning_rate=0.1,
        n_estimators=42,
        max_depth=5,          # overridden by the grid
        min_child_weight=1,   # overridden by the grid
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        # 'reg:linear' is the deprecated alias of the squared-error objective;
        # 'reg:squarederror' is its current name (same loss).
        objective='reg:squarederror',
        # `n_jobs` / `random_state` are the scikit-learn-style names that
        # replace the deprecated `nthread` / `seed` keywords.
        # NOTE(review): with n_jobs=4 here and n_jobs=4 on GridSearchCV, up to
        # 16 threads may be requested at once -- consider lowering one of them.
        n_jobs=4,
        scale_pos_weight=1,
        random_state=27,
    ),
    param_grid=param_test1,
    scoring='r2',
    n_jobs=4,
    cv=5,
)

gsearch1.fit(x_train, y_train)
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_



({'mean_fit_time': array([ 3.66703882,  3.25089293,  3.08754988,  5.47200327,  5.51952982,
          5.64353428,  8.75744185,  9.12306833,  8.68455896, 12.73080015,
         12.14052835, 11.12432356]),
  'std_fit_time': array([0.08835962, 0.15768085, 0.05469101, 0.22768322, 0.0541693 ,
         0.22336452, 0.35822078, 0.59486806, 0.86026479, 0.67254483,
         0.57575599, 0.81527504]),
  'mean_score_time': array([0.0345479 , 0.04153738, 0.03534622, 0.02995486, 0.03614526,
         0.03335061, 0.03993764, 0.03414688, 0.03614273, 0.03554325,
         0.03674264, 0.05631309]),
  'std_score_time': array([0.00101772, 0.00981636, 0.00534352, 0.00282416, 0.00203349,
         0.00300314, 0.00575285, 0.00270896, 0.00590221, 0.00457952,
         0.00278208, 0.04285977]),
  'param_max_depth': masked_array(data=[3, 3, 3, 5, 5, 5, 7, 7, 7, 9, 9, 9],
               mask=[False, False, False, False, False, False, False, False,
                     False, False, False, False],
         fill_value='?

In [8]:
# Stage 1b: tuning with fixed lists -- a finer grid of explicit values for the
# same two parameters (3 x 3 = 9 candidates, 5-fold CV on R^2).
param_test2 = {
    'max_depth': [2, 3, 4],
    'min_child_weight': [4, 5, 6],
}
gsearch2 = GridSearchCV(
    estimator=XGBRegressor(
        learning_rate=0.1,
        n_estimators=42,
        max_depth=5,          # overridden by the grid
        min_child_weight=1,   # overridden by the grid
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        # 'reg:linear' is the deprecated alias of the squared-error objective;
        # 'reg:squarederror' is its current name (same loss).
        objective='reg:squarederror',
        # `n_jobs` / `random_state` are the scikit-learn-style names that
        # replace the deprecated `nthread` / `seed` keywords.
        n_jobs=4,
        scale_pos_weight=1,
        random_state=27,
    ),
    param_grid=param_test2,
    scoring='r2',
    n_jobs=4,
    cv=5,
)

gsearch2.fit(x_train, y_train)
gsearch2.cv_results_, gsearch2.best_params_, gsearch2.best_score_



({'mean_fit_time': array([2.44775448, 3.40025611, 2.72551847, 4.57940602, 3.38468113,
         3.63449011, 4.16186147, 4.280474  , 4.25770974]),
  'std_fit_time': array([0.47115364, 0.50244143, 0.25660557, 0.68734004, 0.32796803,
         0.1872243 , 0.15510868, 0.27504942, 0.59745067]),
  'mean_score_time': array([0.03534527, 0.03694348, 0.07149   , 0.03135228, 0.03554683,
         0.03754206, 0.03055582, 0.04612999, 0.03075404]),
  'std_score_time': array([0.00545455, 0.00595849, 0.0374385 , 0.00232821, 0.00325584,
         0.0105977 , 0.00205703, 0.02506418, 0.00330555]),
  'param_max_depth': masked_array(data=[2, 2, 2, 3, 3, 3, 4, 4, 4],
               mask=[False, False, False, False, False, False, False, False,
                     False],
         fill_value='?',
              dtype=object),
  'param_min_child_weight': masked_array(data=[4, 5, 6, 4, 5, 6, 4, 5, 6],
               mask=[False, False, False, False, False, False, False, False,
                     False],
         

#### Tuning gamma

In [9]:
# Stage 2: tune gamma over 0.0, 0.1, 0.2, 0.3, 0.4 with the tree-complexity
# parameters held fixed (5 candidates, 5-fold CV on R^2).
param_test3 = {
    'gamma': [i / 10.0 for i in range(0, 5)],
}
gsearch3 = GridSearchCV(
    estimator=XGBRegressor(
        learning_rate=0.1,
        n_estimators=42,
        # NOTE(review): max_depth / min_child_weight are hard-coded here,
        # presumably copied by hand from an earlier search's best_params_ --
        # confirm they still match after any re-run.
        max_depth=4,
        min_child_weight=5,
        gamma=0,              # overridden by the grid
        subsample=0.8,
        colsample_bytree=0.8,
        # 'reg:linear' is the deprecated alias of the squared-error objective;
        # 'reg:squarederror' is its current name (same loss).
        objective='reg:squarederror',
        # `n_jobs` / `random_state` are the scikit-learn-style names that
        # replace the deprecated `nthread` / `seed` keywords.
        n_jobs=4,
        scale_pos_weight=1,
        random_state=27,
    ),
    param_grid=param_test3,
    scoring='r2',
    n_jobs=4,
    cv=5,
)

gsearch3.fit(x_train, y_train)
gsearch3.cv_results_, gsearch3.best_params_, gsearch3.best_score_



({'mean_fit_time': array([4.49114428, 4.25451703, 4.37912169, 4.1912149 , 3.42701516]),
  'std_fit_time': array([0.15265541, 0.06599222, 0.11420652, 0.14319583, 0.76723046]),
  'mean_score_time': array([0.03614497, 0.03674431, 0.03594489, 0.03714375, 0.02975388]),
  'std_score_time': array([0.00698478, 0.00750882, 0.00550559, 0.00353744, 0.00381023]),
  'param_gamma': masked_array(data=[0.0, 0.1, 0.2, 0.3, 0.4],
               mask=[False, False, False, False, False],
         fill_value='?',
              dtype=object),
  'params': [{'gamma': 0.0},
   {'gamma': 0.1},
   {'gamma': 0.2},
   {'gamma': 0.3},
   {'gamma': 0.4}],
  'split0_test_score': array([0.05163807, 0.05166072, 0.05137318, 0.05137049, 0.05161482]),
  'split1_test_score': array([0.07114378, 0.07114378, 0.0711643 , 0.07199824, 0.07118522]),
  'split2_test_score': array([0.07422491, 0.07417422, 0.07417246, 0.07452226, 0.07449525]),
  'split3_test_score': array([0.06112169, 0.06117883, 0.06117756, 0.06122749, 0.06131906]),

#### Tuning subsample and colsample_bytree

In [11]:
# Stage 3: tune the row / column subsampling ratios over 0.6, 0.7, 0.8, 0.9
# each (4 x 4 = 16 candidates, 5-fold CV on R^2).
param_test4 = {
    'subsample': [i / 10.0 for i in range(6, 10)],
    'colsample_bytree': [i / 10.0 for i in range(6, 10)],
}

gsearch4 = GridSearchCV(
    estimator=XGBRegressor(
        learning_rate=0.1,
        n_estimators=42,
        # NOTE(review): max_depth=2 here, whereas the gamma-tuning cell fixes
        # max_depth=4 -- confirm which value reflects the selected best_params_.
        max_depth=2,
        min_child_weight=5,
        gamma=0.2,
        subsample=0.8,          # overridden by the grid
        colsample_bytree=0.8,   # overridden by the grid
        # 'reg:linear' is the deprecated alias of the squared-error objective;
        # 'reg:squarederror' is its current name (same loss).
        objective='reg:squarederror',
        # `n_jobs` / `random_state` are the scikit-learn-style names that
        # replace the deprecated `nthread` / `seed` keywords.
        n_jobs=4,
        scale_pos_weight=1,
        random_state=27,
    ),
    param_grid=param_test4,
    scoring='r2',
    n_jobs=4,
    cv=5,
)

gsearch4.fit(x_train, y_train)
gsearch4.cv_results_, gsearch4.best_params_, gsearch4.best_score_




({'mean_fit_time': array([1.8447    , 1.9419487 , 1.91898432, 1.89981265, 2.29659004,
         3.13986764, 2.63366137, 2.93742023, 2.2402802 , 2.51205292,
         2.94017792, 2.43217893, 2.44336128, 2.86290331, 2.77164507,
         2.33293262]),
  'std_fit_time': array([0.02069771, 0.08444026, 0.04758266, 0.04484923, 0.53349865,
         0.17350562, 0.20904738, 0.2719504 , 0.03699858, 0.26522136,
         0.10054839, 0.25202097, 0.14446934, 0.20835834, 0.20281915,
         0.15242621]),
  'mean_score_time': array([0.04952393, 0.03734388, 0.03754287, 0.03734164, 0.05271873,
         0.08147173, 0.05930848, 0.04153838, 0.03194885, 0.05032487,
         0.0445292 , 0.03894076, 0.05930748, 0.04413404, 0.03035512,
         0.02955494]),
  'std_score_time': array([0.01886556, 0.01231769, 0.00349234, 0.00534275, 0.02199755,
         0.03373975, 0.01313247, 0.00699172, 0.00468067, 0.01366778,
         0.00839124, 0.01424492, 0.03454283, 0.01785473, 0.00162246,
         0.00286741]),
  'param_c

#### Tuning Regularization Parameters

In [12]:
# Stage 4: tune the L2 regularization strength (reg_lambda) on a roughly
# logarithmic grid (5 candidates, 5-fold CV on R^2).
param_test5 = {
    'reg_lambda': [1e-5, 1e-2, 0.1, 1, 100],
}

gsearch5 = GridSearchCV(
    estimator=XGBRegressor(
        learning_rate=0.1,
        n_estimators=42,
        max_depth=2,
        min_child_weight=5,
        gamma=0.2,
        subsample=0.8,
        colsample_bytree=0.8,
        # 'reg:linear' is the deprecated alias of the squared-error objective;
        # 'reg:squarederror' is its current name (same loss).
        objective='reg:squarederror',
        # `n_jobs` / `random_state` are the scikit-learn-style names that
        # replace the deprecated `nthread` / `seed` keywords.
        n_jobs=4,
        scale_pos_weight=1,
        random_state=27,
    ),
    param_grid=param_test5,
    scoring='r2',
    n_jobs=4,
    cv=5,
)

gsearch5.fit(x_train, y_train)
gsearch5.cv_results_, gsearch5.best_params_, gsearch5.best_score_



({'mean_fit_time': array([2.04378824, 2.08133149, 2.50913615, 2.85447664, 2.21372089]),
  'std_fit_time': array([0.03224362, 0.10157883, 0.26707109, 0.10997136, 0.58649177]),
  'mean_score_time': array([0.04253607, 0.04173765, 0.04153805, 0.04452896, 0.02695956]),
  'std_score_time': array([0.0187286 , 0.01106916, 0.00366312, 0.01121873, 0.00227841]),
  'param_reg_lambda': masked_array(data=[1e-05, 0.01, 0.1, 1, 100],
               mask=[False, False, False, False, False],
         fill_value='?',
              dtype=object),
  'params': [{'reg_lambda': 1e-05},
   {'reg_lambda': 0.01},
   {'reg_lambda': 0.1},
   {'reg_lambda': 1},
   {'reg_lambda': 100}],
  'split0_test_score': array([0.0468742 , 0.04687428, 0.04687483, 0.04744039, 0.04784036]),
  'split1_test_score': array([0.06334441, 0.06334456, 0.06357285, 0.06360387, 0.06466804]),
  'split2_test_score': array([0.06195353, 0.06195301, 0.06104648, 0.06184383, 0.061045  ]),
  'split3_test_score': array([0.05806282, 0.05806265, 0.058