
# H1N1 MODEL SELECTION DS4A Project - Team 18 - Vaccine Acceptance


---
Authorship: Marie-anne

---

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import pickle

import xgboost as xgb

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer, KNNImputer

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import CategoricalNB, GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

from sklearn.inspection import permutation_importance

In [2]:
#Import dfs
features = pd.read_csv(os.path.join(os.getcwd(), 'Data/training_set_features.csv'))
labels = pd.read_csv(os.path.join(os.getcwd(), 'Data/training_set_labels.csv'))
imp_feat = pd.read_csv(os.path.join(os.getcwd(), 'Data/imputed_train_hot_encoded.csv'))
imp_feat_not_hot = pd.read_csv(os.path.join(os.getcwd(), 'Data/imputed_train.csv'))

In [3]:
# set label index
labels.set_index('respondent_id', inplace=True)

In [4]:
#IMPUTED 
imp_feat.set_index('Unnamed: 0', inplace=True)
imp_feat.sort_index(inplace=True)


In [5]:
#merge_df options

merged_df = imp_feat.join(labels)

df_h1n1 = merged_df.reset_index(drop=True).drop(['h1n1_vaccine'], axis=1)
df_h1n1.shape

(24036, 45)

### Train test split

In [6]:
X = df_h1n1.iloc[:, :-1]
y= df_h1n1.iloc[:,-1]

X_train, X_val, y_train, y_val = train_test_split(X, y,
                                                    test_size=0.1,
                                                    stratify=y,
                                                    random_state=42
                                                 )
# get feature names
feature_names=list(X_train)

#check shape
print(X.shape)
X_train.shape

(24036, 44)


(21632, 44)

In [7]:
# # IMPUTED Scaling 
X_train = StandardScaler().fit_transform(X_train)
print(X_train.shape)
X_train

(21632, 44)


array([[ 0.41723886,  1.19336245, -0.22541347, ...,  3.29047534,
        -0.34584792, -0.36604693],
       [-1.78281257, -2.04502637, -0.22541347, ..., -0.30390746,
        -0.34584792, -0.36604693],
       [ 0.41723886, -0.42583196, -0.22541347, ..., -0.30390746,
        -0.34584792, -0.36604693],
       ...,
       [ 0.41723886, -0.42583196,  4.43629218, ..., -0.30390746,
        -0.34584792, -0.36604693],
       [ 0.41723886, -0.42583196, -0.22541347, ...,  3.29047534,
        -0.34584792, -0.36604693],
       [-0.68278686, -0.42583196, -0.22541347, ...,  3.29047534,
        -0.34584792, -0.36604693]])

### GridSearch CV

In [8]:
# Create model_dict

model_GSCV = dict()

model_GSCV ['LR'] = LogisticRegression()
model_GSCV['SVM'] = SVC()
model_GSCV['RF'] = RandomForestClassifier()
#for XGB tuning without gridsearch (see other nb)


In [9]:
# Define best_model:
def best_model(name, model):
    '''run standard scaler and gridsearch CV pipeline on models
    Args:
        -model: initiated model 
        -name : name of model as str
    return list of best estimator and table of results
    '''
    pipe = Pipeline([('scaler', StandardScaler()), ('classifier',model)])
    best_model_stack = list()
    results_cv = dict()
    def grid_csv(params):
        
        GSCV = GridSearchCV(pipe, param_grid = params, scoring = ['accuracy', 'roc_auc', 'average_precision'], refit='average_precision', cv = 5, n_jobs=-1, verbose=True)
        best_clf = GSCV.fit(X_train, y_train)
        best_hyperparams = best_clf.best_params_
        best_score = best_clf.best_score_
        estimator = best_clf.best_estimator_
        print(best_score, best_hyperparams, estimator)
        table = best_clf.cv_results_
        results_cv[name] = table
        return name, best_hyperparams
    
    if name == 'LR':
        params = {'classifier__penalty' : ['l1', 'l2', 'elasticnet', 'none'], 
                  'classifier__C' : [0.2, 0.5, 1]} 
        best_model_stack.append(grid_csv(params))
        
    
    if name == 'SVM':
        params = {'classifier__kernel' : ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'],
                 'classifier__C' : [0.2, 0.5, 1]} 
        best_model_stack.append(grid_csv(params))


    if name == 'RF': 
        params = {'classifier__n_estimators' : np.arange(100, 200, 50),
                  'classifier__criterion' : ['gini', 'entropy'],
                  'classifier__max_depth' : np.arange(5, 15, 1),
                 } 
        best_model_stack.append(grid_csv(params))
    
    if name == 'XGB':
            pass #for XGB fine tuning in XGB_tuning
        
    return best_model_stack, results_cv

In [10]:
results_best_model = list()
scoring = dict()
for name, model in model_GSCV.items():
    
    if name == 'XGB':
        pass
    scores = best_model(name, model)
    results_best_model.append(scores[0])
    scoring[name] = pd.DataFrame(scores[1][name])
    

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    2.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.8197028216990656 {'classifier__C': 0.2, 'classifier__penalty': 'none'} Pipeline(steps=[('scaler', StandardScaler()),
                ('classifier', LogisticRegression(C=0.2, penalty='none'))])
Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:  8.7min finished


0.8205887174503467 {'classifier__C': 0.5, 'classifier__kernel': 'rbf'} Pipeline(steps=[('scaler', StandardScaler()), ('classifier', SVC(C=0.5))])
Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  1.4min finished


0.8246807832008176 {'classifier__criterion': 'entropy', 'classifier__max_depth': 11, 'classifier__n_estimators': 150} Pipeline(steps=[('scaler', StandardScaler()),
                ('classifier',
                 RandomForestClassifier(criterion='entropy', max_depth=11,
                                        n_estimators=150))])


In [11]:
#save params
with open('Results/h1n1/gridsearch_h1n1.txt', 'w') as file:
    file.write(str(results_best_model))

In [12]:
with pd.ExcelWriter('Results/h1n1/results_h1n1_CV.xls') as writer:
    for df_name, df in scoring.items():
        df.to_excel(writer, sheet_name=df_name)

### KFold CV

In [None]:
# Create model_dict with gridsearchCV params

models = dict()

models['LR'] = LogisticRegression(C=0.5, penalty='l2')
models['SVM'] = SVC(kernel='linear', C=0.2)
models['RF'] = RandomForestClassifier(criterion= 'entropy', n_estimators=150, max_depth=10, n_jobs=-1)


In [40]:
def evaluate_model(scoremodel):
    '''Calculate choosen score for different models using repeated stratified Kfold
    Args:
    - model : model_name(params)
    - score : metrics as string
    
    return scores
    '''
    cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats=3, random_state=42)

  # Calculate accuracy using `cross_val_score()
    scores = cross_val_score(model, X_train, y_train, scoring=score, cv=cv, n_jobs=-1, error_score='raise', verbose=2)
    print(X_train.shape)
    return scores

In [None]:
#quick test
np.mean(evaluate_model(LogisticRegression(C=1)))

In [None]:
# Evaluate the models and store results
results = list()

for name, model in models.items():   
    scores = evaluate_model(model)
    results.append(scores)
    print('>%s %.3f (%.3f)' %(name, np.mean(scores), np.std(scores)))