# seasonal XGB TUNING DS4A Project - Team 18 - Vaccine Acceptance

To tune XGB, first choose a wide range of hyperparameters run the CV then choose smaller range.

see https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

---
Authorship: Marie-anne

---

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

import xgboost as xgb

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

from sklearn.inspection import permutation_importance

In [3]:
#Import dfs
features = pd.read_csv(os.path.join(os.getcwd(), 'Data/training_set_features.csv'))
labels = pd.read_csv(os.path.join(os.getcwd(), 'Data/training_set_labels.csv'))
imp_feat = pd.read_csv(os.path.join(os.getcwd(), 'Data/imputed_train_hot_encoded.csv'))
imp_feat_not_hot = pd.read_csv(os.path.join(os.getcwd(), 'Data/imputed_train.csv'))

In [4]:
# set label index
labels.set_index('respondent_id', inplace=True)

In [5]:
#IMPUTED 
imp_feat.set_index('Unnamed: 0', inplace=True)
imp_feat.sort_index(inplace=True)


In [6]:
#merge_df options

merged_df = imp_feat.join(labels)
#merged_df = imp_feat_small.join(labels)


df_h1n1 = merged_df.reset_index(drop=True).drop(['h1n1_vaccine'], axis=1)
df_h1n1.shape

(24036, 45)

### Train test split

In [7]:
X = df_h1n1.iloc[:, :-1]
y= df_h1n1.iloc[:,-1]

X_train, X_val, y_train, y_val = train_test_split(X, y,
                                                    test_size=0.1,
                                                    stratify=y,
                                                    random_state=42
                                                 )
# get feature names
feature_names=list(X_train)

#check shape
print(X.shape)
X_train.shape

(24036, 44)


(21632, 44)

In [8]:
# IMPUTED Scaling and 
X_train = StandardScaler().fit_transform(X_train)
print(X_train.shape)
X_train

(21632, 44)


array([[ 0.41723886,  1.19336245, -0.22541347, ...,  3.29047534,
        -0.34584792, -0.36604693],
       [-1.78281257, -2.04502637, -0.22541347, ..., -0.30390746,
        -0.34584792, -0.36604693],
       [ 0.41723886, -0.42583196, -0.22541347, ..., -0.30390746,
        -0.34584792, -0.36604693],
       ...,
       [ 0.41723886, -0.42583196,  4.43629218, ..., -0.30390746,
        -0.34584792, -0.36604693],
       [ 0.41723886, -0.42583196, -0.22541347, ...,  3.29047534,
        -0.34584792, -0.36604693],
       [-0.68278686, -0.42583196, -0.22541347, ...,  3.29047534,
        -0.34584792, -0.36604693]])

### xgb CV

In [9]:
def XGB_CV(model, boost=200, early_stopping_rounds=30):
    '''set up xgb.cv
    Args:
        - model: initiated xgb model
        - boost : number of round (default 200)
        - early_stopping_rounds: stop if model did not improve (default 50) 
    return result table
    '''
    params = model.get_xgb_params()
    
    df_matrix = xgb.DMatrix(data=X_train,label=y_train, 
                        feature_names=feature_names
                       )
    tuned_xgb = xgb.cv(dtrain=df_matrix,
                   params=params, 
                   num_boost_round=boost, 
                   stratified=True,
                   nfold = 5,
                    folds = StratifiedKFold(n_splits=10),
                   metrics=['auc', 'aucpr'],
                    early_stopping_rounds=early_stopping_rounds,
                    verbose_eval=True,
                   as_pandas=True, seed=42)
    
    model.set_params(num_parallel_tree=boost) 
    
    return tuned_xgb

### GridSearch CV

In [10]:
def grid_csv(model, params):
    '''set up sklearn gridsearchCV
    Args:
        - model: initiated xgb model
        - params : dict of parameters
    return best estimator and table of results
    '''
    
    GSCV = GridSearchCV(model, param_grid = params, scoring = 'average_precision', cv = 5, n_jobs=-1, verbose=True)
    best_clf = GSCV.fit(X_train, y_train)
    best_hyperparams = best_clf.best_params_
    best_score = best_clf.best_score_
    estimator = best_clf.best_estimator_
    print(best_score, best_hyperparams, estimator)
    table = best_clf.cv_results_
    return best_hyperparams, table

In [37]:
clf = xgb.XGBClassifier(objective= 'binary:logistic',
                        eval_metric = 'aucpr',
                        learning_rate = 0.01, 
                        max_depth=3,
                        n_estimators=1072,
                        min_child_weight=2,
                        gamma=0.1,
                        alpha=1e-04,
                        subsample=0.45,
                        colsample_bytree=0.9,
                        nthread=4,
                        scale_pos_weight=1,
                        seed=42, metrics='auc')

In [38]:
#Save at the end
xgbCV = XGB_CV(clf, boost=2000)
xgbCV.to_csv('Results/XGB_CV_seasonal.csv')


Parameters: { metrics } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { metrics } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { metrics } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { metrics } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to X

In [14]:
param_test1 = {
 'max_depth':range(3,15,2),
 'min_child_weight':range(1,6,2)
}
grid_csv(clf, param_test1)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   52.3s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:  3.4min finished


Parameters: { metrics } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


0.825898684122938 {'max_depth': 3, 'min_child_weight': 5} XGBClassifier(alpha=1e-06, base_score=0.5, booster='gbtree',
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.85,
              eval_metric='aucpr', gamma=0.7, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.3, max_delta_step=0,
              max_depth=3, metrics='auc', min_child_weight=5, missing=nan,
              monotone_constraints='()', n_estimators=66, n_jobs=4, nthread=4,
              num_parallel_tree=1, random_state=42, reg_alpha=9.99999997e-07,
              reg_lambda=1, scale_pos_weight=1, seed=42, subsample=0.5,
              tree_method='exact', validate_parameters=1

({'max_depth': 3, 'min_child_weight': 5},
 {'mean_fit_time': array([ 6.2733799 ,  6.29701738,  6.15886493,  9.97648606,  9.86585326,
         10.2609704 , 14.29248753, 14.3766017 , 13.80725951, 20.9032207 ,
         19.33689251, 18.72871256, 28.90741367, 24.23439012, 22.40218692,
         29.8229352 , 27.1539289 , 22.47274866]),
  'std_fit_time': array([0.21374153, 0.49609108, 0.26157737, 0.47713922, 0.27804189,
         0.44023844, 0.7079428 , 0.63140894, 0.27757431, 1.58210321,
         1.31892955, 0.87482922, 0.8310562 , 1.05086181, 0.51042156,
         2.68789745, 1.5247198 , 0.80807758]),
  'mean_score_time': array([0.05905056, 0.10701551, 0.06411991, 0.07907872, 0.0764535 ,
         0.06267033, 0.08844137, 0.07924275, 0.08670154, 0.10130687,
         0.10981989, 0.09779983, 0.11010108, 0.11344838, 0.13102503,
         0.12855611, 0.11337309, 0.06065588]),
  'std_score_time': array([0.00988561, 0.08026847, 0.00325369, 0.00905706, 0.01681203,
         0.00554127, 0.00346893, 0.0097

In [29]:
param_test2 = {
 'max_depth':[3, 4, 5],
 'min_child_weight':[1,2]
}
grid_csv(clf, param_test2)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   33.2s finished


Parameters: { metrics } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


0.8274690971638142 {'max_depth': 3, 'min_child_weight': 2} XGBClassifier(alpha=0.0001, base_score=0.5, booster='gbtree',
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.9,
              eval_metric='aucpr', gamma=0.3, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.3, max_delta_step=0,
              max_depth=3, metrics='auc', min_child_weight=2, missing=nan,
              monotone_constraints='()', n_estimators=66, n_jobs=4, nthread=4,
              num_parallel_tree=1, random_state=42, reg_alpha=9.99999975e-05,
              reg_lambda=1, scale_pos_weight=1, seed=42, subsample=0.45,
              tree_method='exact', validate_parameters

({'max_depth': 3, 'min_child_weight': 2},
 {'mean_fit_time': array([ 6.34048061,  6.51199694,  8.5795229 ,  8.15668402, 10.30940151,
          8.12835083]),
  'std_fit_time': array([0.37316728, 0.52009569, 0.36362074, 0.13191676, 0.55667283,
         0.92062418]),
  'mean_score_time': array([0.08837337, 0.07804112, 0.06571374, 0.06954184, 0.07618999,
         0.04614172]),
  'std_score_time': array([0.03238458, 0.01711655, 0.0070522 , 0.00950644, 0.01479505,
         0.01518002]),
  'param_max_depth': masked_array(data=[3, 3, 4, 4, 5, 5],
               mask=[False, False, False, False, False, False],
         fill_value='?',
              dtype=object),
  'param_min_child_weight': masked_array(data=[1, 2, 1, 2, 1, 2],
               mask=[False, False, False, False, False, False],
         fill_value='?',
              dtype=object),
  'params': [{'max_depth': 3, 'min_child_weight': 1},
   {'max_depth': 3, 'min_child_weight': 2},
   {'max_depth': 4, 'min_child_weight': 1},
   {'max_de

In [31]:
param_test3 = {
 'gamma': np.arange(0.1, 1, 0.2)
}
grid_csv(clf, param_test3)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:   18.3s finished


Parameters: { metrics } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


0.8278078211061732 {'gamma': 0.1} XGBClassifier(alpha=0.0001, base_score=0.5, booster='gbtree',
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.9,
              eval_metric='aucpr', gamma=0.1, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.3, max_delta_step=0,
              max_depth=3, metrics='auc', min_child_weight=2, missing=nan,
              monotone_constraints='()', n_estimators=66, n_jobs=4, nthread=4,
              num_parallel_tree=1, random_state=42, reg_alpha=9.99999975e-05,
              reg_lambda=1, scale_pos_weight=1, seed=42, subsample=0.45,
              tree_method='exact', validate_parameters=1, ...)


({'gamma': 0.1},
 {'mean_fit_time': array([6.01310372, 5.75127487, 5.94467201, 5.26562576, 4.38402028]),
  'std_fit_time': array([0.20326156, 0.42465592, 0.29469751, 0.35002496, 1.23172176]),
  'mean_score_time': array([0.0646596 , 0.06241488, 0.06237698, 0.0567028 , 0.02634406]),
  'std_score_time': array([0.0046107 , 0.01032423, 0.00491264, 0.0107589 , 0.00991273]),
  'param_gamma': masked_array(data=[0.1, 0.30000000000000004, 0.5000000000000001,
                     0.7000000000000001, 0.9000000000000001],
               mask=[False, False, False, False, False],
         fill_value='?',
              dtype=object),
  'params': [{'gamma': 0.1},
   {'gamma': 0.30000000000000004},
   {'gamma': 0.5000000000000001},
   {'gamma': 0.7000000000000001},
   {'gamma': 0.9000000000000001}],
  'split0_test_score': array([0.83111962, 0.83111962, 0.83111962, 0.83105415, 0.83105415]),
  'split1_test_score': array([0.82765002, 0.82765002, 0.82765002, 0.82687247, 0.82627616]),
  'split2_test_score': 

In [32]:
param_test4 = {
 'subsample': np.arange(0.3, 0.5, 0.05),
   'colsample_bytree': np.arange(0.7, 0.9, 0.05) 
}
grid_csv(clf, param_test4)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   14.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   40.3s finished


Parameters: { metrics } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


0.8278078211061732 {'colsample_bytree': 0.9000000000000001, 'subsample': 0.44999999999999996} XGBClassifier(alpha=0.0001, base_score=0.5, booster='gbtree',
              colsample_bylevel=1, colsample_bynode=1,
              colsample_bytree=0.9000000000000001, eval_metric='aucpr',
              gamma=0.1, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.3, max_delta_step=0,
              max_depth=3, metrics='auc', min_child_weight=2, missing=nan,
              monotone_constraints='()', n_estimators=66, n_jobs=4, nthread=4,
              num_parallel_tree=1, random_state=42, reg_alpha=9.99999975e-05,
              reg_lambda=1, scale_pos_weight=1, seed=42,
      

({'colsample_bytree': 0.9000000000000001, 'subsample': 0.44999999999999996},
 {'mean_fit_time': array([2.52944956, 2.65374942, 2.9682097 , 3.06448417, 2.87111421,
         2.91877046, 3.14542327, 3.3208344 , 2.78790779, 2.9086782 ,
         3.13402419, 3.16281905, 2.79290347, 2.92097883, 3.08726997,
         3.48099113, 3.28929996, 3.7613008 , 4.20551481, 3.18904986]),
  'std_fit_time': array([0.13797326, 0.23678346, 0.15426682, 0.19504489, 0.18418659,
         0.12061633, 0.21299031, 0.12950162, 0.11751584, 0.14700759,
         0.25601734, 0.16431784, 0.12413744, 0.15957324, 0.1307735 ,
         0.28266033, 0.35391724, 0.32933154, 0.18535474, 0.73973648]),
  'mean_score_time': array([0.04089427, 0.04248405, 0.04017735, 0.04185677, 0.03689475,
         0.04090509, 0.04661112, 0.0407867 , 0.04031072, 0.03952241,
         0.0405694 , 0.03633394, 0.03576703, 0.04375257, 0.03966269,
         0.04639182, 0.05187764, 0.05251951, 0.03665171, 0.01694713]),
  'std_score_time': array([0.00408328

In [33]:
param_test5 = {
 'alpha':[1e-8, 1e-7, 1e-6, 1e-5, 1e-4]
}
grid_csv(clf, param_test5)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:   13.4s finished


Parameters: { metrics } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


0.8278078211061732 {'alpha': 0.0001} XGBClassifier(alpha=0.0001, base_score=0.5, booster='gbtree',
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.9,
              eval_metric='aucpr', gamma=0.1, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.3, max_delta_step=0,
              max_depth=3, metrics='auc', min_child_weight=2, missing=nan,
              monotone_constraints='()', n_estimators=66, n_jobs=4, nthread=4,
              num_parallel_tree=1, random_state=42, reg_alpha=9.99999975e-05,
              reg_lambda=1, scale_pos_weight=1, seed=42, subsample=0.45,
              tree_method='exact', validate_parameters=1, ...)


({'alpha': 0.0001},
 {'mean_fit_time': array([4.14300923, 4.17253451, 4.09733181, 4.18840442, 3.63575916]),
  'std_fit_time': array([0.17280327, 0.10927404, 0.23727542, 0.15651377, 1.11989082]),
  'mean_score_time': array([0.04836097, 0.05021639, 0.04770832, 0.04686337, 0.02766008]),
  'std_score_time': array([0.00854684, 0.00499473, 0.00290752, 0.0054284 , 0.01349622]),
  'param_alpha': masked_array(data=[1e-08, 1e-07, 1e-06, 1e-05, 0.0001],
               mask=[False, False, False, False, False],
         fill_value='?',
              dtype=object),
  'params': [{'alpha': 1e-08},
   {'alpha': 1e-07},
   {'alpha': 1e-06},
   {'alpha': 1e-05},
   {'alpha': 0.0001}],
  'split0_test_score': array([0.83111922, 0.83111922, 0.83111929, 0.83111929, 0.83111962]),
  'split1_test_score': array([0.82765015, 0.82765015, 0.82765015, 0.82765015, 0.82765002]),
  'split2_test_score': array([0.82604148, 0.82604148, 0.82604148, 0.82604134, 0.82604134]),
  'split3_test_score': array([0.82711594, 0.82711

**params** <p>
- max depth:3
- min_child_weight : 1
- gamma: 0.7 
- subsample: 0.5
- colsample_bytree: 0.85
- alpha:1e-56
- n_estimator: 1060
    

