# Improved models

In [10]:
import pandas as pd
import numpy as np
import os

from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, balanced_accuracy_score
from sklearn.model_selection import cross_validate, cross_val_score, StratifiedKFold
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

import optuna
from functools import partial

## Data pre-processing

In [4]:
DATA_PATH = '../../data/raw'
X_train = pd.read_csv(
    os.path.join(DATA_PATH, 'training_set_features.csv')
).drop('respondent_id',axis =1)

X_test = pd.read_csv(
    os.path.join(DATA_PATH, 'test_set_features.csv')
).drop('respondent_id',axis =1)

y_train = pd.read_csv(
    os.path.join(DATA_PATH, 'training_set_labels.csv')
).drop('respondent_id',axis =1)

sub = pd.read_csv(os.path.join(DATA_PATH, 'submission_format.csv'))

## Models

Common preprocessor

In [5]:
num_features = X_train.columns[X_train.dtypes != "object"].values
cat_features = X_train.columns[X_train.dtypes == "object"].values

num_transformer = Pipeline([
    ('scale', StandardScaler()),
    ('impute', KNNImputer(n_neighbors = 10))
])

cat_transformer = Pipeline([
    ('impute', SimpleImputer(strategy = 'constant', fill_value = 'missing')),
    ('encode', OneHotEncoder(drop = 'first'))
])

preprocessor = ColumnTransformer([
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features)
])

### Random forest

As the hyperparameter space is more complex for the random forest 
(at least 4 hyperparameters to examine, vs just $C$ for logistic regression)
we'll use Bayesian optimization with `optuna` to find suitable hyperparameters.

In [13]:
def objective_func(trial, X, y):
    
    params = {
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1500),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'max_features': trial.suggest_uniform('max_features', 0.1, 1.0)
    }

    rf_model = Pipeline([
        ('pre', preprocessor),
        ('rf', MultiOutputClassifier(RandomForestClassifier(**params)))
    ])
    cv_score = cross_val_score(
        X = X,
        y = y,
        scoring = 'roc_auc',
        estimator = rf_model,
        cv = 5 # cannot use stratified k-fold in multilabel problems 
    )
    
    return np.mean(cv_score)

In [14]:
objective = partial(objective_func,X = X_train, y = y_train)
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 20)

[I 2020-07-20 15:10:24,895] Finished trial#0 with value: 0.8519623655058026 with parameters: {'criterion': 'entropy', 'n_estimators': 1254, 'max_depth': 10, 'max_features': 0.5478300585833582}. Best is trial#0 with value: 0.8519623655058026.
[I 2020-07-20 15:15:00,329] Finished trial#1 with value: 0.8523749236382809 with parameters: {'criterion': 'gini', 'n_estimators': 306, 'max_depth': 13, 'max_features': 0.30897436493668373}. Best is trial#1 with value: 0.8523749236382809.


In [15]:
params = study.best_params

{'criterion': 'gini',
 'n_estimators': 306,
 'max_depth': 13,
 'max_features': 0.30897436493668373}

In [17]:
study.best_value

0.8523749236382809

This score is better than our previous CV score with logistic regression, 
build a submission using these parameters.

In [None]:
rf_model = Pipeline([
    ('pre', preprocessor),
    ('rf', MultiOutputClassifier(RandomForestClassifier(**params)))
])

rf_model.fit(X_train, y_train)
preds = rf_model.predict_proba(X_test)

In [None]:
sub['h1n1_vaccine'] = preds[0][:,1]
sub['seasonal_vaccine'] = preds[1][:,1]
sub.to_csv('../../output/baseline_pred.csv', index = False)

### Results