# H1N1 XGB TUNING DS4A Project - Team 18 - Vaccine Acceptance

To tune XGB, first run the CV over a wide range of hyperparameter values, then repeat the search over a smaller range around the best values.

See https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

---
Authorship: Marie-anne

---

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

import xgboost as xgb

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

from sklearn.inspection import permutation_importance

In [None]:
# Load the input tables (raw features, labels and the two imputed variants)
def _load(rel_path):
    '''Read a CSV whose path is given relative to the current working directory.'''
    return pd.read_csv(os.path.join(os.getcwd(), rel_path))


features = _load('Data/training_set_features.csv')
labels = _load('Data/training_set_labels.csv')
imp_feat = _load('Data/imputed_train_hot_encoded.csv')
imp_feat_not_hot = _load('Data/imputed_train.csv')

In [None]:
# Index the labels by respondent id so they can be joined onto the features
labels = labels.set_index('respondent_id')

In [None]:
# IMPUTED features: use the saved index column as index and order rows by it
imp_feat = imp_feat.set_index('Unnamed: 0').sort_index()


In [None]:
# Merge options: join the labels onto the chosen feature table (index-based)

merged_df = imp_feat.join(labels)
#merged_df = imp_feat_small.join(labels)


# Drop the index and the seasonal_vaccine column for the h1n1 data set
df_h1n1 = merged_df.reset_index(drop=True)
df_h1n1 = df_h1n1.drop(columns=['seasonal_vaccine'])
df_h1n1.shape

### Train test split

In [None]:
# Every column but the last is a feature; the last column is the target
X = df_h1n1.iloc[:, :-1]
y = df_h1n1.iloc[:, -1]

# Stratified split holding out 10% of the rows, with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=42
)

# Keep the column names (taken from the training frame) for later use
feature_names = X_train.columns.tolist()

# Check shapes
print(X.shape)
X_train.shape

In [None]:
# IMPUTED: standardise the training features (scaler fitted on X_train only)
X_train = StandardScaler().fit(X_train).transform(X_train)
print(X_train.shape)
X_train

### xgb CV

In [None]:
def XGB_CV(model, boost=200, early_stopping_rounds=30):
    '''Run xgb.cv with the model's parameters and store the resulting
    number of boosting rounds on the model.

    Uses the notebook-level X_train, y_train and feature_names.

    Args:
        - model: initiated xgb model (sklearn wrapper); its n_estimators is
          overwritten with the number of rounds kept by the CV
        - boost : maximum number of boosting rounds (default 200)
        - early_stopping_rounds: stop if the CV metric did not improve for
          this many rounds (default 30)
    return result table (one row per boosting round kept)
    '''
    params = model.get_xgb_params()

    df_matrix = xgb.DMatrix(data=X_train, label=y_train,
                            feature_names=feature_names)

    # An explicit splitter passed as `folds` takes precedence over `nfold`,
    # so only the 10-fold stratified splitter is given (the former nfold=5
    # was ignored and contradicted it).
    tuned_xgb = xgb.cv(dtrain=df_matrix,
                       params=params,
                       num_boost_round=boost,
                       stratified=True,
                       folds=StratifiedKFold(n_splits=10),
                       metrics=['auc', 'aucpr'],
                       early_stopping_rounds=early_stopping_rounds,
                       verbose_eval=True,
                       as_pandas=True, seed=42)

    # With early stopping the history ends at the best iteration, so its
    # length is the tuned number of trees. This belongs in n_estimators;
    # num_parallel_tree would instead grow that many trees *per round*.
    model.set_params(n_estimators=tuned_xgb.shape[0])

    return tuned_xgb

### GridSearch CV

In [None]:
def grid_csv(model, params):
    '''Grid-search `params` for `model` with sklearn's GridSearchCV.

    Fits on the notebook-level X_train / y_train using 5-fold CV scored by
    average precision, and prints the best score, the best parameters and
    the best estimator.

    Args:
        - model: initiated xgb model
        - params : dict of parameter names mapped to candidate values
    return best hyperparameters and table of results (cv_results_)
    '''
    search = GridSearchCV(
        model,
        param_grid=params,
        scoring='average_precision',
        cv=5,
        n_jobs=-1,
        verbose=True,
    )
    # fit() returns the search object itself, so it is used directly below
    search.fit(X_train, y_train)

    print(search.best_score_, search.best_params_, search.best_estimator_)
    return search.best_params_, search.cv_results_

In [None]:
# Base classifier used by the CV and grid-search cells below.
# The evaluation metric is set through eval_metric only: `metrics` is an
# argument of xgb.cv, not an estimator/booster parameter, so passing it here
# was just forwarded as an unknown keyword and contradicted eval_metric.
clf = xgb.XGBClassifier(objective='binary:logistic',
                        scale_pos_weight=4,
                        eval_metric='aucpr',
                        learning_rate=0.01,
                        max_depth=3,
                        n_estimators=1475,
                        min_child_weight=1,
                        gamma=0.7,
                        alpha=1e-06,
                        subsample=0.5,
                        colsample_bytree=0.85,
                        nthread=4,
                        seed=42)

In [None]:
# Run the CV, then save the result table at the end.
xgbCV = XGB_CV(clf, boost=2000)
# Create the output folder if needed so the (long) CV run is not lost to a
# missing-directory error when writing the CSV.
os.makedirs('Results/h1n1', exist_ok=True)
xgbCV.to_csv('Results/h1n1/XGB_CV_h1n1.csv')


In [None]:
# Step 1 (coarse): tree depth and minimum child weight over wide ranges
param_test1 = dict(
    max_depth=range(3, 15, 2),
    min_child_weight=range(1, 6, 2),
)
grid_csv(clf, param_test1)

In [None]:
# Step 2 (fine): narrower grid for tree depth and minimum child weight
param_test2 = dict(
    max_depth=[3, 4, 5],
    min_child_weight=[1, 2],
)
grid_csv(clf, param_test2)

In [None]:
# Step 3: gamma from 0.1 up to (but excluding) 1 in steps of 0.2
param_test3 = dict(gamma=np.arange(0.1, 1, 0.2))
grid_csv(clf, param_test3)

In [None]:
# Step 4: row and column subsampling ratios.
# np.linspace (rounded) is used instead of np.arange: with a float step,
# arange's handling of the end point depends on rounding error, so the two
# grids were not built consistently. Both ranges now include their end
# points (0.3..0.5 and 0.7..0.9 in steps of 0.05).
param_test4 = {
    'subsample': np.round(np.linspace(0.3, 0.5, 5), 2),
    'colsample_bytree': np.round(np.linspace(0.7, 0.9, 5), 2),
}
grid_csv(clf, param_test4)

In [None]:
# Step 5: regularisation term alpha over several orders of magnitude
param_test5 = dict(alpha=[1e-8, 1e-7, 1e-6, 1e-5, 1e-4])
grid_csv(clf, param_test5)

**params** <p>
- max depth:3
- min_child_weight : 1
- gamma: 0.7 
- subsample: 0.5
- colsample_bytree: 0.85
- alpha: 1e-06
- n_estimators: 1060
    

