# Hyperparameter Optimization

V1: 04/14/19 | Justin Campbell | justin.campbell@hsc.utah.edu

Here, we evaluate a broad hyperparameter space for two models, a support vector classifier (SVC) and an extra-trees classifier (ET), using nested cross validation (aka "double cross validation"). *Nested cross validation* wraps an inner cross validation (for hyperparameter tuning) in an outer cross validation (for model evaluation). Our pipeline uses a 100x5 approach: the full dataset is partitioned by 100 stratified random outer splits (80/20 optimization/validation, class proportions preserved), and the optimization portion of each outer split is further partitioned by 5 stratified random inner splits (80/20, class proportions preserved).  

Within the inner cross validation loop, we use an automated hyperparameter optimization tool (_Hyperopt_) to iteratively evaluate different hyperparameter combinations (50 evaluations per outer split). Because each tuned model is then scored on an outer validation set that was not used for training or optimization, the nested approach gives an estimate of performance that is not biased by the tuning itself.  



## Import Helper Libraries

This script uses a handful of common Python libraries (e.g., _numpy_, _pandas_, _sklearn_) in addition to the _Hyperopt_ library (https://github.com/hyperopt/hyperopt).

In [None]:
import numpy as np
import pandas as pd
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, cross_val_score

## Import & Prepare Data

In [None]:
# Location of the dataset on disk
data_path = '/Users/justincampbell/Library/CloudStorage/iCloud Drive/Research/CCS/ML-AnesDOC/Data/data_Anes.csv'

# Read the CSV (first column becomes the row index), randomly permute all
# rows, then replace the index with a fresh 0..n-1 range
data = (
    pd.read_csv(data_path, index_col=0)
    .sample(frac=1, axis=0)
    .reset_index(drop=True)
)

# Show the first few rows
data.head()

In [None]:
# Columns excluded from the predictors: the target ('State') plus
# 'Dataset' and 'Cond'
non_feature_cols = ['State', 'Dataset', 'Cond']

# Every remaining column is used as a feature
features = data.columns.drop(non_feature_cols)
# Alternative, hand-picked feature subset (currently disabled):
# features = ['DMN_ALFF', 'FPTC_ALFF', 'SAL_ALFF', 'AUDI', 'COTC', 'DA', 'DMN', 'FPTC', 'SS', 'VA', 'VIS', 'AUDI_COTC', 'COTC_SS', 'AUDI_DA', 'COTC_DA', 'DA_SS', 'AUDI_SAL', 'AUDI_SS', 'DMN_SS', 'AUDI_SUB', 'SAL_SUB', 'COTC_VA', 'SAL_VA', 'SS_VA', 'SUB_VA', 'AUDI_VIS', 'COTC_VIS', 'DA_VIS', 'DMN_VIS', 'SS_VIS', 'VA_VIS', 'Global']

# The target is kept as a one-element list, so y is a single-column DataFrame
label = ['State']

# Feature matrix and target
X, y = data[features], data[label]

## Prepare for Hyperopt

To run _Hyperopt_, you must first define the hyperparameters to optimize and the range of possible values for each (stored below in `hparam_space`).  

_hp.uniform_ for continuous hyperparameters  
_hp.choice_ for nominal hyperparameters

In [31]:
# Candidate values for the categorical (hp.choice) hyperparameters.
# The outer-loop cells index these lists with the values returned by fmin.
kernels = ['linear', 'sigmoid', 'poly', 'rbf']
max_depths = list(range(1, 6))
max_features = list(range(1, 6))
n_estimators = [50, 100] + list(range(250, 2001, 250))
criterions = ['gini', 'entropy']

# Support Vector Classifier: C and gamma are sampled uniformly from
# [2**-5, 2**5]; the kernel is chosen from `kernels`
svc_space = {
    'C': hp.uniform('C', 2**-5, 2**5),
    'kernel': hp.choice('kernel', kernels),
    'gamma': hp.uniform('gamma', 2**-5, 2**5),
}

# Extra Trees: every hyperparameter is a choice among the lists above
et_options = (
    ('max_depth', max_depths),
    ('max_features', max_features),
    ('n_estimators', n_estimators),
    ('criterion', criterions),
)
et_space = {name: hp.choice(name, options) for name, options in et_options}

# Search space per model, keyed by the model name used throughout the notebook
hparam_space = {'SVC': svc_space, 'ET': et_space}

# Filled in later with the selected hyperparameters for each model
opt_hparams = dict()

### Define Cross-Validation Folds

In [32]:
# Number of inner (tuning) and outer (evaluation) splits
inner_folds = 5
outer_folds = 100

# Fixed seed for both splitters. Without it, every call to split() draws new
# random partitions; in particular each hyperopt trial would be scored on
# different inner splits, so trial losses would not be directly comparable.
cv_seed = 42

# Split into stratified 80/20 sub-samples
inner_cv = StratifiedShuffleSplit(n_splits=inner_folds, train_size=0.80,
                                  random_state=cv_seed)
outer_cv = StratifiedShuffleSplit(n_splits=outer_folds, train_size=0.80,
                                  random_state=cv_seed)

# Materialise the outer splits once: each item is a pair of positional index
# arrays (optimization rows, validation rows) into the full dataset
outer_splits = list(outer_cv.split(X, y))

# Per-split index arrays, in matching order
opt_idxs = [opt_rows for opt_rows, _ in outer_splits]
val_idxs = [val_rows for _, val_rows in outer_splits]

### Define Functions Used in Hyperparameter Optimization

In [33]:
def hyperparam_opt(params):
    '''Score one hyperparameter combination with the inner cross validation.

    Builds the classifier selected by the global ``model`` ('SVC' or 'ET')
    from ``params`` and evaluates it on the global optimization data
    (``X_opt``, ``y_opt``) using the global ``inner_cv`` splitter.

    Parameters
    ----------
    params : dict
        Keyword arguments passed unchanged to the classifier constructor.

    Returns
    -------
    float
        Mean ROC-AUC across the inner cross-validation splits.

    Raises
    ------
    ValueError
        If the global ``model`` is neither 'SVC' nor 'ET'.
    '''

    # model, X_opt, y_opt and inner_cv are only read here, so no `global`
    # declarations are needed
    if model == 'SVC':
        # construct Support Vector Classifier
        clf = SVC(**params)
    elif model == 'ET':
        # construct Extra Trees
        clf = ExtraTreesClassifier(**params)
    else:
        # Fail with a clear message instead of an unbound `clf` further down
        raise ValueError(
            "Unknown model {!r}; expected 'SVC' or 'ET'".format(model))

    # Inner cross validation on the splits produced by `inner_cv`
    # (stratified shuffle splits), scored by ROC-AUC
    inner_cv_aucs = cross_val_score(clf, X_opt, y_opt, cv=inner_cv,
                                    scoring='roc_auc')

    # return the average ROC-AUC across all inner splits
    return inner_cv_aucs.mean()

In [34]:
def f(params):
    '''Objective handed to hyperopt's fmin.

    hyperopt minimises the returned 'loss', so the mean inner-CV ROC-AUC
    from hyperparam_opt is negated to turn maximising AUC into minimising loss.
    '''

    return {'loss': -hyperparam_opt(params), 'status': STATUS_OK}

## Run Hyperparameter Optimization

### Support Vector Machine Classifier (SVC)

In [35]:
# Select which classifier hyperparam_opt() builds (it reads this global)
model = 'SVC'

# Create empty lists to store validation AUC and associated hyperparameters
outer_cv_aucs = []
fold_hparams = []

# Loop through outer folds
for opt_idx, val_idx in zip(opt_idxs, val_idxs):
    # Partition into optimization dataset and validation dataset.
    # The splitter yields positional indices, so rows are selected with .iloc;
    # labels are flattened to 1-D arrays. X_opt / y_opt are read as globals
    # by hyperparam_opt().
    X_opt, y_opt = X.iloc[opt_idx], y.iloc[opt_idx].values.ravel()
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx].values.ravel()

    # Run Hyperopt
    trials = Trials()
    best = fmin(fn=f, space=hparam_space[model], algo=tpe.suggest, max_evals=50, trials=trials)

    # Define model using best inner loop hyperparameters.
    # For hp.choice parameters fmin returns the index of the chosen option,
    # hence the lookup in `kernels`.
    opt_model = SVC(C=best['C'],
                    gamma=best['gamma'],
                    kernel=kernels[best['kernel']])

    # Train on optimization dataset, test validation dataset AUC.
    # ROC-AUC needs continuous scores: hard predict() labels would reduce the
    # ROC curve to a single operating point and would not match the inner
    # loop's scoring='roc_auc'.
    opt_model.fit(X_opt, y_opt)
    val_scores = opt_model.decision_function(X_val)
    opt_auc = roc_auc_score(y_val, val_scores)

    # Save performance and hyperparameters
    # (`best` is stored as returned by fmin, i.e. 'kernel' is an index into `kernels`)
    outer_cv_aucs.append(opt_auc)
    fold_hparams.append(best)

# Store in dataframe, sort outer loops by validation AUC, grab best performance
# NOTE(review): choosing the hyperparameters of the single best-scoring outer
# split is optimistic; the average of 'Validation AUC' over all splits is the
# quantity to report as the performance estimate - confirm intent.
SVC_report = pd.DataFrame(data={'Validation AUC': outer_cv_aucs,
                                'Hyperparameters': fold_hparams})
SVC_report = SVC_report.sort_values('Validation AUC', ascending=False)
opt_hparams['SVC'] = SVC_report.iloc[0]['Hyperparameters']
SVC_report.head()

100%|██████████| 50/50 [00:01<00:00, 37.96it/s, best loss: -0.9836734693877551]
100%|██████████| 50/50 [00:01<00:00, 40.27it/s, best loss: -0.9877551020408163]
100%|██████████| 50/50 [00:01<00:00, 41.29it/s, best loss: -0.9959183673469388]
100%|██████████| 50/50 [00:01<00:00, 42.40it/s, best loss: -0.9959183673469388]
100%|██████████| 50/50 [00:01<00:00, 42.20it/s, best loss: -1.0]               
100%|██████████| 50/50 [00:01<00:00, 40.75it/s, best loss: -1.0]               
100%|██████████| 50/50 [00:01<00:00, 41.81it/s, best loss: -1.0]              
100%|██████████| 50/50 [00:01<00:00, 40.53it/s, best loss: -0.9959183673469388]
100%|██████████| 50/50 [00:01<00:00, 41.49it/s, best loss: -0.9918367346938777]
100%|██████████| 50/50 [00:01<00:00, 41.23it/s, best loss: -0.9877551020408164]
100%|██████████| 50/50 [00:01<00:00, 44.14it/s, best loss: -0.9959183673469388]
100%|██████████| 50/50 [00:01<00:00, 37.96it/s, best loss: -1.0]              
100%|██████████| 50/50 [00:01<00:00, 41.97

Unnamed: 0,Validation AUC,Hyperparameters
0,1.0,"{'C': 15.919266430236618, 'gamma': 18.75015649..."
44,1.0,"{'C': 29.31312993307266, 'gamma': 18.821992483..."
81,1.0,"{'C': 17.221755701343483, 'gamma': 13.95877904..."
9,1.0,"{'C': 12.82016439466881, 'gamma': 3.7851097454..."
20,1.0,"{'C': 23.037618032467222, 'gamma': 28.68632285..."


### Extra Trees Classifier (ET)

In [None]:
# Select which classifier hyperparam_opt() builds (it reads this global)
model = 'ET'

# Create empty lists to store validation AUC and associated hyperparameters
outer_cv_aucs = []
fold_hparams = []

# Loop through outer folds
for opt_idx, val_idx in zip(opt_idxs, val_idxs):
    # Partition into optimization dataset and validation dataset.
    # The splitter yields positional indices, so rows are selected with .iloc;
    # labels are flattened to 1-D arrays. X_opt / y_opt are read as globals
    # by hyperparam_opt().
    X_opt, y_opt = X.iloc[opt_idx], y.iloc[opt_idx].values.ravel()
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx].values.ravel()

    # Run Hyperopt
    trials = Trials()
    best = fmin(fn=f, space=hparam_space[model], algo=tpe.suggest, max_evals=50, trials=trials)

    # Define model using best inner loop hyperparameters.
    # For hp.choice parameters fmin returns the index of the chosen option,
    # hence the lookups in the candidate lists.
    opt_model = ExtraTreesClassifier(max_depth=max_depths[best['max_depth']],
                                     max_features=max_features[best['max_features']],
                                     n_estimators=n_estimators[best['n_estimators']],
                                     criterion=criterions[best['criterion']])

    # Train on optimization dataset, test validation dataset AUC.
    # ROC-AUC needs continuous scores: hard predict() labels would reduce the
    # ROC curve to a single operating point and would not match the inner
    # loop's scoring='roc_auc'. Column 1 of predict_proba is the probability
    # of the second entry of opt_model.classes_ (binary target assumed, as
    # already required by the 'roc_auc' scorer used in the inner loop).
    opt_model.fit(X_opt, y_opt)
    val_scores = opt_model.predict_proba(X_val)[:, 1]
    opt_auc = roc_auc_score(y_val, val_scores)

    # Save performance and hyperparameters
    # (`best` is stored as returned by fmin, i.e. every value is an index
    # into the corresponding candidate list)
    outer_cv_aucs.append(opt_auc)
    fold_hparams.append(best)

# Store in dataframe, sort outer loops by validation AUC, grab best performance
# NOTE(review): choosing the hyperparameters of the single best-scoring outer
# split is optimistic; the average of 'Validation AUC' over all splits is the
# quantity to report as the performance estimate - confirm intent.
ET_report = pd.DataFrame(data={'Validation AUC': outer_cv_aucs,
                               'Hyperparameters': fold_hparams})
ET_report = ET_report.sort_values('Validation AUC', ascending=False)
opt_hparams['ET'] = ET_report.iloc[0]['Hyperparameters']
ET_report.head()

### Optimized Hyperparameters:

In [None]:
# Print the top row of each report (the reports are sorted by validation AUC,
# highest first)
for tag, report in (('SVC:', SVC_report), ('ET:', ET_report)):
    print(tag, report.iloc[0])