In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set()

plt.rcParams['figure.figsize'] = [8.0, 8.0]
plt.rcParams['figure.dpi'] = 140

# Loading data

In [2]:
X_train_full = pd.read_csv('intermediate_data\\preprocessed_train_features.csv', index_col = 'respondent_id')
y_train_full = pd.read_csv('input_data\\training_set_labels.csv', index_col = 'respondent_id')

In [3]:
train_df = X_train_full.join(y_train_full)

# Loading models

In [4]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.multioutput import MultiOutputClassifier

# Hyperparameter Tuning

## Importing Optuna

In [5]:
import optuna
from optuna import Trial, visualization

## Making validation folds

In [6]:
from sklearn.model_selection import KFold

kf = KFold(n_splits = 5, random_state = 42, shuffle=True)

for i, (trn, val) in enumerate(kf.split(train_df)):
    train_df.loc[val, 'kfold'] = i
    
train_df['kfold'] = train_df['kfold'].astype(int)

## XGBoost Tuning

### XGBoost - H1N1 Vaccine

In [7]:
feature_cols = [col for col in X_train_full.columns.tolist() if col not in ['h1n1_vaccine', 'seasonal_vaccine']]
target_cols = ['h1n1_vaccine']

In [8]:
def objective_xgb(trial):
    roc = 0
    for fold in range(5):
        trn_idx = train_df['kfold'] != fold
        val_idx = train_df['kfold'] == fold
        trn = train_df.loc[trn_idx, :]
        val = train_df.loc[val_idx, :]

        x_tr, y_tr = trn[feature_cols].values, trn[target_cols].values
        x_val, y_val = val[feature_cols].values, val[target_cols].values
        
        model, log = fit_xgb(trial, x_tr, y_tr, x_val, y_val)
        roc += log['valid roc']/5
        
    return roc

In [9]:
from sklearn.metrics import roc_auc_score

def fit_xgb(trial, x_train, y_train, x_val, y_val):
    params = {
        "n_estimators": trial.suggest_categorical("n_estimators", [70, 75, 80, 85, 90, 95, 100]),
        "learning_rate": trial.suggest_categorical("learning_rate", [0.025, 0.05, 0.75, 0.1]),
        "subsample": trial.suggest_discrete_uniform("subsample", 0.4,1,0.1),
        "colsample_bytree": trial.suggest_discrete_uniform("colsample_bytree", 0.8,1,0.1),
        "max_depth": trial.suggest_categorical("max_depth",[5,6,7,8,9,10]),
        "min_child_weight": trial.suggest_int("min_child_weight",1,5),
        "gamma": 0,
        "base_score": 0.5,
        "random_state": 42,
        "use_label_encoder": False,
        "objective": 'reg:logistic',
        "tree_method": 'exact'
    }
    
    model = XGBClassifier(**params)
    model.fit(x_train, y_train.reshape(-1,))

    y_pred_tr = model.predict_proba(x_train)[:, 1]
    y_pred_val = model.predict_proba(x_val)[:, 1]

    log = {
        "train roc": roc_auc_score(y_train, y_pred_tr),
        "valid roc": roc_auc_score(y_val, y_pred_val)
    }
    
    return model, log

In [10]:
XGB_study_H1N1 = optuna.create_study(direction="maximize", study_name='XGBoost H1N1 Vaccine optimization')
XGB_study_H1N1.optimize(objective_xgb, n_trials=10)

[32m[I 2021-02-01 17:11:04,234][0m A new study created in memory with name: XGBoost H1N1 Vaccine optimization[0m
[32m[I 2021-02-01 17:11:10,674][0m Trial 0 finished with value: 0.8349731477330505 and parameters: {'n_estimators': 80, 'learning_rate': 0.05, 'subsample': 0.9, 'colsample_bytree': 0.9, 'max_depth': 5, 'min_child_weight': 2}. Best is trial 0 with value: 0.8349731477330505.[0m
[32m[I 2021-02-01 17:11:24,903][0m Trial 1 finished with value: 0.8321322743536183 and parameters: {'n_estimators': 85, 'learning_rate': 0.025, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.9, 'max_depth': 9, 'min_child_weight': 2}. Best is trial 0 with value: 0.8349731477330505.[0m
[32m[I 2021-02-01 17:11:31,326][0m Trial 2 finished with value: 0.8302641899620713 and parameters: {'n_estimators': 75, 'learning_rate': 0.025, 'subsample': 0.6000000000000001, 'colsample_bytree': 0.8, 'max_depth': 5, 'min_child_weight': 5}. Best is trial 0 with value: 0.8349731477330505.[0m
[32m[I 2021

In [11]:
XGB_study_H1N1.best_params

{'n_estimators': 85,
 'learning_rate': 0.05,
 'subsample': 0.9,
 'colsample_bytree': 0.9,
 'max_depth': 6,
 'min_child_weight': 3}

In [47]:
import pickle
from pathlib import Path

root = Path(".")

with open(root / "interim_data\\XGB_H1N1_best_params.pkl","wb") as f:
    pickle.dump(XGB_study_H1N1.best_params, f)

### XGBoost - Seasonal Vaccine

In [12]:
feature_cols = [col for col in X_train_full.columns.tolist() if col not in ['h1n1_vaccine', 'seasonal_vaccine']]
target_cols = ['seasonal_vaccine']

In [13]:
XGB_study_SEAS = optuna.create_study(direction="maximize", study_name='XGBoost Seasonal Vaccine optimization')
XGB_study_SEAS.optimize(objective_xgb, n_trials=10)

[32m[I 2021-02-01 17:12:42,571][0m A new study created in memory with name: XGBoost Seasonal Vaccine optimization[0m
[32m[I 2021-02-01 17:12:53,640][0m Trial 0 finished with value: 0.8529467779115617 and parameters: {'n_estimators': 80, 'learning_rate': 0.1, 'subsample': 0.9, 'colsample_bytree': 0.9, 'max_depth': 8, 'min_child_weight': 5}. Best is trial 0 with value: 0.8529467779115617.[0m
[32m[I 2021-02-01 17:13:02,436][0m Trial 1 finished with value: 0.8558622319982334 and parameters: {'n_estimators': 95, 'learning_rate': 0.05, 'subsample': 0.5, 'colsample_bytree': 0.9, 'max_depth': 5, 'min_child_weight': 1}. Best is trial 1 with value: 0.8558622319982334.[0m
[32m[I 2021-02-01 17:13:09,888][0m Trial 2 finished with value: 0.8566159415258077 and parameters: {'n_estimators': 95, 'learning_rate': 0.1, 'subsample': 1.0, 'colsample_bytree': 1.0, 'max_depth': 5, 'min_child_weight': 2}. Best is trial 2 with value: 0.8566159415258077.[0m
[32m[I 2021-02-01 17:13:22,558][0m Trial

In [14]:
XGB_study_SEAS.best_params

{'n_estimators': 95,
 'learning_rate': 0.1,
 'subsample': 1.0,
 'colsample_bytree': 1.0,
 'max_depth': 5,
 'min_child_weight': 2}

In [48]:
with open(root / "interim_data\\XGB_SEAS_best_params.pkl","wb") as f:
    pickle.dump(XGB_study_SEAS.best_params, f)

## LightGBM

### LightGBM - H1N1 Vaccine

In [15]:
feature_cols = [col for col in X_train_full.columns.tolist() if col not in ['h1n1_vaccine', 'seasonal_vaccine']]
target_cols = ['h1n1_vaccine']

In [16]:
def objective_lgbm(trial):
    roc = 0
    for fold in range(5):
        trn_idx = train_df['kfold'] != fold
        val_idx = train_df['kfold'] == fold
        trn = train_df.loc[trn_idx, :]
        val = train_df.loc[val_idx, :]

        x_tr, y_tr = trn[feature_cols].values, trn[target_cols].values
        x_val, y_val = val[feature_cols].values, val[target_cols].values
        
        model, log = fit_lgbm(trial, x_tr, y_tr, x_val, y_val)
        roc += log['valid roc']/5
        
    return roc

In [17]:
def fit_lgbm(trial, x_train, y_train, x_val, y_val):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 60, 150, 10),
        "learning_rate": trial.suggest_categorical("learning_rate", [0.025, 0.05, 0.75, 0.1]),
        "subsample": trial.suggest_discrete_uniform("subsample", 0.8,1,0.05),
        "colsample_bytree": trial.suggest_discrete_uniform("colsample_bytree", 0.6,0.9,0.05),
        "min_child_samples": trial.suggest_int("min_child_samples",18,30),
        "min_child_weight": trial.suggest_discrete_uniform("min_child_weight",0.0005,0.0015,0.0005),
        "max_depth": -1,
        "random_state": 42,
        "silent":True
    }
    
    model = LGBMClassifier(**params)
    model.fit(x_train, y_train.reshape(-1,))

    y_pred_tr = model.predict_proba(x_train)[:, 1]
    y_pred_val = model.predict_proba(x_val)[:, 1]

    log = {
        "train roc": roc_auc_score(y_train, y_pred_tr),
        "valid roc": roc_auc_score(y_val, y_pred_val)
    }
    
    return model, log

In [18]:
LGBM_study_H1N1 = optuna.create_study(direction="maximize", study_name='LightGBM H1N1 Vaccine optimization')
LGBM_study_H1N1.optimize(objective_lgbm, n_trials=10)

[32m[I 2021-02-01 17:14:25,876][0m A new study created in memory with name: LightGBM H1N1 Vaccine optimization[0m
[32m[I 2021-02-01 17:14:29,000][0m Trial 0 finished with value: 0.8353687987493683 and parameters: {'n_estimators': 140, 'learning_rate': 0.025, 'subsample': 0.8500000000000001, 'colsample_bytree': 0.75, 'min_child_samples': 22, 'min_child_weight': 0.0015}. Best is trial 0 with value: 0.8353687987493683.[0m
[32m[I 2021-02-01 17:14:30,923][0m Trial 1 finished with value: 0.7954170112453353 and parameters: {'n_estimators': 60, 'learning_rate': 0.75, 'subsample': 0.9, 'colsample_bytree': 0.6, 'min_child_samples': 22, 'min_child_weight': 0.001}. Best is trial 0 with value: 0.8353687987493683.[0m
[32m[I 2021-02-01 17:14:33,876][0m Trial 2 finished with value: 0.8364088239299313 and parameters: {'n_estimators': 150, 'learning_rate': 0.05, 'subsample': 0.9500000000000001, 'colsample_bytree': 0.85, 'min_child_samples': 21, 'min_child_weight': 0.0015}. Best is trial 2 wit

In [19]:
LGBM_study_H1N1.best_params

{'n_estimators': 120,
 'learning_rate': 0.05,
 'subsample': 0.8,
 'colsample_bytree': 0.7,
 'min_child_samples': 24,
 'min_child_weight': 0.001}

In [49]:
with open(root / "interim_data\\LGBM_H1N1_best_params.pkl","wb") as f:
    pickle.dump(LGBM_study_H1N1.best_params, f)

### LightGBM - Seasonal Vaccine

In [20]:
feature_cols = [col for col in X_train_full.columns.tolist() if col not in ['h1n1_vaccine', 'seasonal_vaccine']]
target_cols = ['seasonal_vaccine']

In [21]:
LGBM_study_SEAS = optuna.create_study(direction="maximize", study_name='LightGBM Seasonal Vaccine optimization')
LGBM_study_SEAS.optimize(objective_lgbm, n_trials=10)

[32m[I 2021-02-01 17:14:52,538][0m A new study created in memory with name: LightGBM Seasonal Vaccine optimization[0m
[32m[I 2021-02-01 17:14:55,852][0m Trial 0 finished with value: 0.8555168655183526 and parameters: {'n_estimators': 150, 'learning_rate': 0.025, 'subsample': 0.9, 'colsample_bytree': 0.8, 'min_child_samples': 23, 'min_child_weight': 0.001}. Best is trial 0 with value: 0.8555168655183526.[0m
[32m[I 2021-02-01 17:14:58,503][0m Trial 1 finished with value: 0.8564571757102681 and parameters: {'n_estimators': 100, 'learning_rate': 0.1, 'subsample': 0.8500000000000001, 'colsample_bytree': 0.7, 'min_child_samples': 29, 'min_child_weight': 0.0015}. Best is trial 1 with value: 0.8564571757102681.[0m
[32m[I 2021-02-01 17:15:01,421][0m Trial 2 finished with value: 0.8553481861652152 and parameters: {'n_estimators': 140, 'learning_rate': 0.025, 'subsample': 0.8, 'colsample_bytree': 0.7, 'min_child_samples': 19, 'min_child_weight': 0.0015}. Best is trial 1 with value: 0.8

In [22]:
LGBM_study_SEAS.best_params

{'n_estimators': 110,
 'learning_rate': 0.05,
 'subsample': 0.9500000000000001,
 'colsample_bytree': 0.7,
 'min_child_samples': 20,
 'min_child_weight': 0.001}

In [50]:
with open(root / "interim_data\\LGBM_SEAS_best_params.pkl","wb") as f:
    pickle.dump(LGBM_study_SEAS.best_params, f)