## Setup
### Libraries

In [52]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tabulate import tabulate

from types import SimpleNamespace

# math
from bisect import bisect, bisect_left
from datetime import datetime
import scipy.stats as st
import random
import itertools as itertools

#pipeline models
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# scaling data
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# dimensionality reduction
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2,mutual_info_classif, f_classif
from sklearn.feature_selection import RFE, RFECV
from sklearn.svm import LinearSVC

# classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC

import xgboost as xgb
from xgboost import XGBClassifier

# evaluating models
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from sklearn.metrics import roc_auc_score, f1_score , precision_score, accuracy_score, recall_score

### Config

In [53]:
%matplotlib inline

### Load Data
Load the datasets produced by the different feature-selection methods, so that each can be benchmarked and the best feature set chosen for final modeling.

In [90]:
# Each feature set is held in a SimpleNamespace with train / val / test frames.
# Where present, `production` is the train rows followed by the validation rows.

# Full features (no production frame is built for this set)
full = SimpleNamespace(
    train=pd.read_csv('../datasets/preprocessed_datasets/x_train.csv'),
    val=pd.read_csv('../datasets/preprocessed_datasets/x_val.csv'),
    test=pd.read_csv('../datasets/preprocessed_datasets/x_test.csv'),
)

# Manually selected features
man = SimpleNamespace(
    train=pd.read_csv('../datasets/preprocessed_datasets/manual/x_train_manual.csv'),
    val=pd.read_csv('../datasets/preprocessed_datasets/manual/x_val_manual.csv'),
    test=pd.read_csv('../datasets/preprocessed_datasets/manual/x_test_manual.csv'),
)
man.production = pd.concat([man.train, man.val])

# PCA selected features
pca = SimpleNamespace(
    train=pd.read_csv('../datasets/preprocessed_datasets/pca/x_train_pca.csv'),
    val=pd.read_csv('../datasets/preprocessed_datasets/pca/x_val_pca.csv'),
    test=pd.read_csv('../datasets/preprocessed_datasets/pca/x_test_pca.csv'),
)
pca.production = pd.concat([pca.train, pca.val])

# SelectKBest selected features
skb = SimpleNamespace(
    train=pd.read_csv('../datasets/preprocessed_datasets/skb/x_train_skb.csv'),
    val=pd.read_csv('../datasets/preprocessed_datasets/skb/x_val_skb.csv'),
    test=pd.read_csv('../datasets/preprocessed_datasets/skb/x_test_skb.csv'),
)
skb.production = pd.concat([skb.train, skb.val])

# RecursiveFeatureElimination selected features, one child namespace per selector estimator
rfe = SimpleNamespace()

# ... selected with a decision tree
rfe.dtc = SimpleNamespace(
    train=pd.read_csv('../datasets/preprocessed_datasets/rfe_dtc/x_train_rfe_dtc.csv'),
    val=pd.read_csv('../datasets/preprocessed_datasets/rfe_dtc/x_val_rfe_dtc.csv'),
    test=pd.read_csv('../datasets/preprocessed_datasets/rfe_dtc/x_test_rfe_dtc.csv'),
)
rfe.dtc.production = pd.concat([rfe.dtc.train, rfe.dtc.val])

# ... selected with an SVM
rfe.svc = SimpleNamespace(
    train=pd.read_csv('../datasets/preprocessed_datasets/rfe_svc/x_train_rfe_svc.csv'),
    val=pd.read_csv('../datasets/preprocessed_datasets/rfe_svc/x_val_rfe_svc.csv'),
    test=pd.read_csv('../datasets/preprocessed_datasets/rfe_svc/x_test_rfe_svc.csv'),
)
rfe.svc.production = pd.concat([rfe.svc.train, rfe.svc.val])

# Cross validated RecursiveFeatureElimination selected features, same layout as rfe
rfecv = SimpleNamespace()

# ... selected with a decision tree
rfecv.dtc = SimpleNamespace(
    train=pd.read_csv('../datasets/preprocessed_datasets/rfecv_dtc/x_train_rfecv_dtc.csv'),
    val=pd.read_csv('../datasets/preprocessed_datasets/rfecv_dtc/x_val_rfecv_dtc.csv'),
    test=pd.read_csv('../datasets/preprocessed_datasets/rfecv_dtc/x_test_rfecv_dtc.csv'),
)
rfecv.dtc.production = pd.concat([rfecv.dtc.train, rfecv.dtc.val])

# ... selected with an SVM
rfecv.svc = SimpleNamespace(
    train=pd.read_csv('../datasets/preprocessed_datasets/rfecv_svc/x_train_rfecv_svc.csv'),
    val=pd.read_csv('../datasets/preprocessed_datasets/rfecv_svc/x_val_rfecv_svc.csv'),
    test=pd.read_csv('../datasets/preprocessed_datasets/rfecv_svc/x_test_rfecv_svc.csv'),
)
rfecv.svc.production = pd.concat([rfecv.svc.train, rfecv.svc.val])

# target values: only the 'WnvPresent' column of each file is kept
y = SimpleNamespace(
    train=pd.read_csv('../datasets/preprocessed_datasets/y_train.csv')['WnvPresent'],
    val=pd.read_csv('../datasets/preprocessed_datasets/y_val.csv')['WnvPresent'],
)
y.production = pd.concat([y.train, y.val])

## Baseline score

In [58]:
def performance_metrics(true, pred):
    '''
    Description:
    Takes true and predicted values and calculates classification performance metrics
    ---
    Params:

    - true: (int list) list of Y values 1 or 0

    - pred: (int list) list of Y hat values 1 or 0
    ---
    Returns:

    dict with the keys:

    - tn:int true negative count
    - fp:int false positive count
    - fn:int false negative count
    - tp:int true positive count
    - baseline_accuracy:float share of 1s in `true`, i.e. accuracy if all y hat = 1
    - accuracy:float
    - misclassification_rate:float 1 - accuracy
    - sensitivity:float recall of class 1, 0 when undefined
    - specificity:float tn / (tn + fp), 0 when there are no negatives
    - precision:float 0 when undefined
    - f1:float 0 when undefined
    - auc:float roc auc computed from the hard 0/1 predictions in `pred`
      (not from probabilities)
    ---
    Notes:

    - roc_auc_score is undefined when `true` holds a single class; depending on the
      installed scikit-learn version this raises or warns (not handled here)
    '''
    # confusion matrix; labels are pinned so the matrix is always 2x2 and the
    # four-way unpacking still works when only one class occurs in true/pred
    tn, fp, fn, tp = confusion_matrix(true, pred, labels=[0, 1]).ravel()

    # baseline accuracy: share of positive labels
    baseline_accuracy = np.mean(true)

    # accuracy score
    accuracy = accuracy_score(true, pred)

    # misclassification score
    misclassification_rate = 1 - accuracy

    # recall score
    sensitivity = recall_score(true, pred, zero_division=0)

    # specificity score; guarded so an input without negatives yields 0
    # instead of a division-by-zero warning
    negatives = tn + fp
    specificity = tn / negatives if negatives else 0.0

    # precision score
    precision = precision_score(true, pred, zero_division=0)

    # roc auc score (on hard predictions)
    auc = roc_auc_score(true, pred)

    # f1 score
    f1 = f1_score(true, pred, zero_division=0)

    return {
        'tn': tn,
        'fp': fp,
        'fn': fn,
        'tp': tp,
        'baseline_accuracy': baseline_accuracy,
        'accuracy': accuracy,
        'misclassification_rate': misclassification_rate,
        'sensitivity': sensitivity,
        'specificity': specificity,
        'precision': precision,
        'f1': f1,
        'auc': auc,
    }

In [59]:
# calculate baseline score: predict 1 (WNV present) for every validation row.
# The validation targets live in y.val; no standalone y_val variable is defined
# in this notebook.
baseline = performance_metrics(list(y.val), [1] * len(y.val))
# format for pandas: one single-element list per metric, rounded to 2 decimals
baseline = {k: [round(v, 2)] for k, v in baseline.items()}
# output dataframe, transposed so each metric is a row
pd.DataFrame(baseline, index=['validation baseline']).T

Unnamed: 0,validation baseline
tn,0.0
fp,1631.0
fn,0.0
tp,91.0
baseline_accuracy,0.05
accuracy,0.05
misclassification_rate,0.95
sensitivity,1.0
specificity,0.0
precision,0.05


This baseline score assumes we predict every observation in the validation set as 1 (WNV present). We get a specificity of 0, meaning we have not been able to identify any area that is not affected by WNV. We also get a ROC AUC score of 0.5, the chance-level score, meaning the baseline has no ability to distinguish between a WNV-present and a WNV-absent location.

## Modeling

In [60]:
# Lookup of train / validation data per feature set.
# Every entry shares the same targets (y.train / y.val); only the X frames differ.
data_dict = {
    name: {
        'X_train': features.train,
        'X_val': features.val,
        'y_train': y.train,
        'y_val': y.val,
    }
    for name, features in (
        ('full', full),
        ('manual', man),
        ('pca', pca),
        ('skb', skb),
        ('rfe_dtc', rfe.dtc),
        ('rfe_svc', rfe.svc),
        ('rfecv_dtc', rfecv.dtc),
        ('rfecv_svc', rfecv.svc),
    )
}

In [61]:
def evaluate_esitmator(
    pipe,
    param_grid,
    scoring='roc_auc',
    cv=5,
    verbose = False,
    fs = 'full', # feature selector
    d = data_dict, # data_dict
):
    '''
    Description:
    Grid searches a pipeline on the train split of one feature set, scores the refit
    model on the train and validation splits and prints out a short/long report
    ---
    Params:

    - pipe: (pipeline) pipeline object; must contain a step named 'clf'
    - param_grid: (dict) dictionary of pipeline params
    - scoring: (str | scorer object) scoring param for gridsearchCV. default 'roc_auc'
    - cv: (int) n Kfold for cross validation. default 5
    - verbose: (boolean) Print long or short report. default False
    - fs: (str) which feature set to select from data_dict. default 'full'
    - d: (dict) data dictionary to supply data. default data_dict
      (the default is bound when this function is defined)

    ---
    Returns:

    - best_estimator_ of the fitted GridSearchCV

    ---
    Notes:

    - the train / validation metrics (including roc auc) are computed from the hard
      predictions of model.predict, whereas "Best score" is the cross validated
      score of `scoring`
    - the "Classifier" row prints pipe['clf'], the step as passed in, not the
      refit estimator
    '''
    # pull the splits of the requested feature set once
    X_train, y_train = d[fs]['X_train'], d[fs]['y_train']
    X_val, y_val = d[fs]['X_val'], d[fs]['y_val']

    # modeling
    gs = GridSearchCV(pipe, param_grid=param_grid, cv=cv, n_jobs=-1, scoring=scoring)
    model = gs.fit(X_train, y_train)

    # predict train
    y_pred_train = model.predict(X_train)

    # predict validation
    y_pred_val = model.predict(X_val)

    # performance matrix of train set
    train_pm = performance_metrics(y_train, y_pred_train)

    # performance matrix of validation set
    val_pm = performance_metrics(y_val, y_pred_val)

    if verbose:
        # Long report
        # Model report
        table = [
            ["Classifier", pipe['clf']],
            ["Scoring optimizer", scoring],
            ["Best score", model.best_score_],
            ["Refit time", model.refit_time_],
            ["n Components", X_train.shape[1]],
            ["Feature Selector", fs],
            ["Best params", str(model.best_params_).replace(", '",",\n'") ],
        ]
        print(tabulate(table,headers=['model result', 'value'], tablefmt="grid"))
        print('')

        # Confusion matrix report
        metrics = [
            ['tn', train_pm['tn'], val_pm['tn']],
            ['fp', train_pm['fp'], val_pm['fp']],
            ['fn', train_pm['fn'], val_pm['fn']],
            ['tp', train_pm['tp'], val_pm['tp']],
            ['accuracy', train_pm['accuracy'], val_pm['accuracy']],
            ['misclassification_rate', train_pm['misclassification_rate'], val_pm['misclassification_rate']],
            ['sensitivity', train_pm['sensitivity'], val_pm['sensitivity']],
            ['specificity', train_pm['specificity'], val_pm['specificity']],
            ['precision', train_pm['precision'], val_pm['precision']],
            ['f1', train_pm['f1'], val_pm['f1']],
            ['roc_auc', train_pm['auc'], val_pm['auc']],
        ]
        print(tabulate(metrics, headers=['metric', 'train', 'validation'], numalign='right', tablefmt="github"))
        print('')

        # Classification report from sklearn.
        # Uses the validation targets of the selected feature set (not a global),
        # and names the classes in sorted label order: 0 = not present, 1 = WNV present.
        print(classification_report(y_val, y_pred_val, target_names=['not present', 'WNV present']))

    else:
        # Short summarised report
        table = [
            ["Classifier", pipe['clf']],
            ["Scoring optimizer", model.scorer_],
            ["Best score", model.best_score_],
            ["n Components", X_train.shape[1]],
            ["Feature Selector", fs],
            ["roc auc: train", train_pm['auc']],
            ["roc auc: val", val_pm['auc']],

        ]
        print(tabulate(table))

    # returns the optimised estimator from pipeline
    return model.best_estimator_

In [62]:
def display_xgb_feature_importance(model, x=full.train):
    '''
    Description:
    Prints a table of features ranked by importance score, with a running total
    ---
    Params:

    - model: (pipeline) fitted pipeline object with a 'clf' step that exposes
      feature_importances_
    - x: (dataframe) corresponding X dataframe to get feature names. default full.train
      (pass the frame the model was trained on)

    ---
    Returns:

    - list of rows [rank, feature_name, cumulative_score, importance_score],
      ordered from most to least important

    ---
    Raises:

    - ValueError if the number of columns in x differs from the number of
      importance scores
    '''
    names = list(x.columns)
    importances = model['clf'].feature_importances_

    # zip would silently truncate and pair scores with the wrong names
    # when x is not the frame the model was fitted on
    if len(names) != len(importances):
        raise ValueError(
            f"x has {len(names)} columns but the model reports "
            f"{len(importances)} feature importances"
        )

    # get a sorted list of features with their importance score
    features = sorted(zip(names, importances), key=lambda a: a[1], reverse=True)

    tbl = []
    # cumulative score
    cum = 0

    for rank, (name, score) in enumerate(features, start=1):
        cum += score

        # example result : [1, feature_name, cumulative_score, importance_score],...
        tbl.append([rank, name, round(cum, 4), round(score, 4)])

    # prints a table of results
    print(tabulate(tbl, headers=['rank', 'feature', 'cumulative', 'importance']))

    # returns list of results
    return tbl

### Find the best features for each estimator
Run each estimator with its default settings on every feature set to find the best combination of features and estimator.

In [63]:
# names of all feature sets, in data_dict insertion order
datasets_keys = list(data_dict)

#### Log Reg

In [65]:
%%time
# Logistic regression behind a scaler that does not centre the data (with_mean=False),
# capped at 1000 solver iterations; benchmarked on every feature set.
pipe = Pipeline(steps=[
    ('sc', StandardScaler()),
    ('clf', LogisticRegression()),
])
params = {
    'sc__with_mean': [False],
    'clf__max_iter': [1000],
}
for fs in datasets_keys:
    evaluate_esitmator(pipe, params, fs=fs, scoring='roc_auc', verbose=False)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


-----------------  ------------------------------------------------
Classifier         LogisticRegression()
Scoring optimizer  make_scorer(roc_auc_score, needs_threshold=True)
Best score         0.9834572608168797
n Components       177
Feature Selector   full
roc auc: train     0.9684145967494634
roc auc: val       0.5262462859029382
-----------------  ------------------------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


-----------------  ------------------------------------------------
Classifier         LogisticRegression()
Scoring optimizer  make_scorer(roc_auc_score, needs_threshold=True)
Best score         0.9166400208543966
n Components       38
Feature Selector   manual
roc auc: train     0.8197638761116222
roc auc: val       0.6446965052115265
-----------------  ------------------------------------------------
-----------------  ------------------------------------------------
Classifier         LogisticRegression()
Scoring optimizer  make_scorer(roc_auc_score, needs_threshold=True)
Best score         0.9838957583370359
n Components       177
Feature Selector   pca
roc auc: train     0.9708678319533884
roc auc: val       0.5146441541291328
-----------------  ------------------------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


-----------------  ------------------------------------------------
Classifier         LogisticRegression()
Scoring optimizer  make_scorer(roc_auc_score, needs_threshold=True)
Best score         0.9386475702533519
n Components       77
Feature Selector   skb
roc auc: train     0.8777215578043545
roc auc: val       0.6333301891241806
-----------------  ------------------------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


-----------------  ------------------------------------------------
Classifier         LogisticRegression()
Scoring optimizer  make_scorer(roc_auc_score, needs_threshold=True)
Best score         0.9444899568592383
n Components       77
Feature Selector   rfe_dtc
roc auc: train     0.8756516406010427
roc auc: val       0.6318209687308399
-----------------  ------------------------------------------------
-----------------  ------------------------------------------------
Classifier         LogisticRegression()
Scoring optimizer  make_scorer(roc_auc_score, needs_threshold=True)
Best score         0.9702969481148557
n Components       77
Feature Selector   rfe_svc
roc auc: train     0.921343146274149
roc auc: val       0.597486204782342
-----------------  ------------------------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


-----------------  ------------------------------------------------
Classifier         LogisticRegression()
Scoring optimizer  make_scorer(roc_auc_score, needs_threshold=True)
Best score         0.9804603717256694
n Components       136
Feature Selector   rfecv_dtc
roc auc: train     0.9513186139221099
roc auc: val       0.5157760694241381
-----------------  ------------------------------------------------
-----------------  ------------------------------------------------
Classifier         LogisticRegression()
Scoring optimizer  make_scorer(roc_auc_score, needs_threshold=True)
Best score         0.9838307678849361
n Components       151
Feature Selector   rfecv_svc
roc auc: train     0.9681846059490954
roc auc: val       0.52105834080083
-----------------  ------------------------------------------------
Wall time: 33.2 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### MultinomialNB

In [66]:
%%time
# Default MultinomialNB behind a MinMaxScaler, benchmarked on every feature set.
# The empty grid means the search only cross validates the default settings.
pipe = Pipeline(steps=[
    ('sc', MinMaxScaler()),
    ('clf', MultinomialNB()),
])
params = {}
for fs in datasets_keys:
    evaluate_esitmator(pipe, params, fs=fs, scoring='roc_auc', verbose=False)

-----------------  ------------------------------------------------
Classifier         MultinomialNB()
Scoring optimizer  make_scorer(roc_auc_score, needs_threshold=True)
Best score         0.9792678817454847
n Components       177
Feature Selector   full
roc auc: train     0.9375958295001532
roc auc: val       0.5603923973022685
-----------------  ------------------------------------------------
-----------------  ------------------------------------------------
Classifier         MultinomialNB()
Scoring optimizer  make_scorer(roc_auc_score, needs_threshold=True)
Best score         0.8453585826155823
n Components       38
Feature Selector   manual
roc auc: train     0.7483900643974242
roc auc: val       0.6637268311088055
-----------------  ------------------------------------------------
-----------------  ------------------------------------------------
Classifier         MultinomialNB()
Scoring optimizer  make_scorer(roc_auc_score, needs_threshold=True)
Best score         0.9687058

#### KNeighborsClassifier

In [67]:
%%time
# Default KNeighborsClassifier behind a StandardScaler, benchmarked on every feature set.
# The empty grid means the search only cross validates the default settings.
pipe = Pipeline(steps=[
    ('sc', StandardScaler()),
    ('clf', KNeighborsClassifier()),
])
params = {}
for fs in datasets_keys:
    evaluate_esitmator(pipe, params, fs=fs, scoring='roc_auc', verbose=False)

-----------------  ------------------------------------------------
Classifier         KNeighborsClassifier()
Scoring optimizer  make_scorer(roc_auc_score, needs_threshold=True)
Best score         0.9770523933273431
n Components       177
Feature Selector   full
roc auc: train     0.9704845139527751
roc auc: val       0.5396406168938358
-----------------  ------------------------------------------------
-----------------  ------------------------------------------------
Classifier         KNeighborsClassifier()
Scoring optimizer  make_scorer(roc_auc_score, needs_threshold=True)
Best score         0.961236624999333
n Components       38
Feature Selector   manual
roc auc: train     0.9372891750996627
roc auc: val       0.6731830401358299
-----------------  ------------------------------------------------
-----------------  ------------------------------------------------
Classifier         KNeighborsClassifier()
Scoring optimizer  make_scorer(roc_auc_score, needs_threshold=True)
Best sco

#### SVC

In [68]:
%%time
# Default SVC behind a StandardScaler, benchmarked on every feature set.
# The empty grid means the search only cross validates the default settings.
pipe = Pipeline(steps=[
    ('sc', StandardScaler()),
    ('clf', SVC()),
])
params = {}
for fs in datasets_keys:
    evaluate_esitmator(pipe, params, fs=fs, scoring='roc_auc', verbose=False)

-----------------  ------------------------------------------------
Classifier         SVC()
Scoring optimizer  make_scorer(roc_auc_score, needs_threshold=True)
Best score         0.9842555886977724
n Components       177
Feature Selector   full
roc auc: train     0.9674946335479915
roc auc: val       0.5195255388388437
-----------------  ------------------------------------------------
-----------------  ------------------------------------------------
Classifier         SVC()
Scoring optimizer  make_scorer(roc_auc_score, needs_threshold=True)
Best score         0.954396278723555
n Components       38
Feature Selector   manual
roc auc: train     0.88508126341613
roc auc: val       0.7200867801726172
-----------------  ------------------------------------------------
-----------------  ------------------------------------------------
Classifier         SVC()
Scoring optimizer  make_scorer(roc_auc_score, needs_threshold=True)
Best score         0.9860674367860796
n Components       177


#### DecisionTreeClassifier

In [69]:
%%time
# Default DecisionTreeClassifier (no scaling step), benchmarked on every feature set.
# The empty grid means the search only cross validates the default settings.
pipe = Pipeline(steps=[
    ('clf', DecisionTreeClassifier()),
])
params = {}
for fs in datasets_keys:
    evaluate_esitmator(pipe, params, fs=fs, scoring='roc_auc', verbose=False)

-----------------  ------------------------------------------------
Classifier         DecisionTreeClassifier()
Scoring optimizer  make_scorer(roc_auc_score, needs_threshold=True)
Best score         0.9146507403678473
n Components       177
Feature Selector   full
roc auc: train     0.9932536031892057
roc auc: val       0.590482478894496
-----------------  ------------------------------------------------
-----------------  ------------------------------------------------
Classifier         DecisionTreeClassifier()
Scoring optimizer  make_scorer(roc_auc_score, needs_threshold=True)
Best score         0.8981028446372339
n Components       38
Feature Selector   manual
roc auc: train     0.9932536031892057
roc auc: val       0.638400226383059
-----------------  ------------------------------------------------
-----------------  ------------------------------------------------
Classifier         DecisionTreeClassifier()
Scoring optimizer  make_scorer(roc_auc_score, needs_threshold=True)
Bes

#### RandomForestClassifier

In [70]:
%%time
# Default RandomForestClassifier behind a StandardScaler, benchmarked on every feature set.
# The empty grid means the search only cross validates the default settings.
pipe = Pipeline(steps=[
    ('sc', StandardScaler()),
    ('clf', RandomForestClassifier()),
])
params = {}
for fs in datasets_keys:
    evaluate_esitmator(pipe, params, fs=fs, scoring='roc_auc', verbose=False)

-----------------  ------------------------------------------------
Classifier         RandomForestClassifier()
Scoring optimizer  make_scorer(roc_auc_score, needs_threshold=True)
Best score         0.9807644946455474
n Components       177
Feature Selector   full
roc auc: train     0.9932536031892059
roc auc: val       0.663915483657973
-----------------  ------------------------------------------------
-----------------  ------------------------------------------------
Classifier         RandomForestClassifier()
Scoring optimizer  make_scorer(roc_auc_score, needs_threshold=True)
Best score         0.9633920102977009
n Components       38
Feature Selector   manual
roc auc: train     0.9932536031892059
roc auc: val       0.6463236334480971
-----------------  ------------------------------------------------
-----------------  ------------------------------------------------
Classifier         RandomForestClassifier()
Scoring optimizer  make_scorer(roc_auc_score, needs_threshold=True)
Be

#### GradientBoostingClassifier

In [71]:
%%time
# Default GradientBoostingClassifier behind a StandardScaler, benchmarked on every feature set.
# The empty grid means the search only cross validates the default settings.
pipe = Pipeline(steps=[
    ('sc', StandardScaler()),
    ('clf', GradientBoostingClassifier()),
])
params = {}
for fs in datasets_keys:
    evaluate_esitmator(pipe, params, fs=fs, scoring='roc_auc', verbose=False)

-----------------  ------------------------------------------------
Classifier         GradientBoostingClassifier()
Scoring optimizer  make_scorer(roc_auc_score, needs_threshold=True)
Best score         0.9612314055418343
n Components       177
Feature Selector   full
roc auc: train     0.8972707758356332
roc auc: val       0.8131632316181672
-----------------  ------------------------------------------------
-----------------  ------------------------------------------------
Classifier         GradientBoostingClassifier()
Scoring optimizer  make_scorer(roc_auc_score, needs_threshold=True)
Best score         0.9573513133283843
n Components       38
Feature Selector   manual
roc auc: train     0.8875344986200553
roc auc: val       0.7731688911946424
-----------------  ------------------------------------------------
-----------------  ------------------------------------------------
Classifier         GradientBoostingClassifier()
Scoring optimizer  make_scorer(roc_auc_score, needs_thres

#### ExtraTreesClassifier

In [72]:
%%time
# Default ExtraTreesClassifier behind a StandardScaler, benchmarked on every feature set.
# The empty grid means the search only cross validates the default settings.
pipe = Pipeline(steps=[
    ('sc', StandardScaler()),
    ('clf', ExtraTreesClassifier()),
])
params = {}
for fs in datasets_keys:
    evaluate_esitmator(pipe, params, fs=fs, scoring='roc_auc', verbose=False)

-----------------  ------------------------------------------------
Classifier         ExtraTreesClassifier()
Scoring optimizer  make_scorer(roc_auc_score, needs_threshold=True)
Best score         0.9725886753481753
n Components       177
Feature Selector   full
roc auc: train     0.9932536031892057
roc auc: val       0.6654011224826676
-----------------  ------------------------------------------------
-----------------  ------------------------------------------------
Classifier         ExtraTreesClassifier()
Scoring optimizer  make_scorer(roc_auc_score, needs_threshold=True)
Best score         0.9315510826000801
n Components       38
Feature Selector   manual
roc auc: train     0.9932536031892057
roc auc: val       0.6472433146252888
-----------------  ------------------------------------------------
-----------------  ------------------------------------------------
Classifier         ExtraTreesClassifier()
Scoring optimizer  make_scorer(roc_auc_score, needs_threshold=True)
Best sc

#### XGBClassifier

In [73]:
%%time
# Default XGBClassifier (no scaling step), benchmarked on every feature set.
# The empty grid means the search only cross validates the default settings.
pipe = Pipeline(steps=[
    ('clf', xgb.XGBClassifier()),
])
params = {}
for fs in datasets_keys:
    evaluate_esitmator(pipe, params, fs=fs, scoring='roc_auc', verbose=False)

-----------------  --------------------------------------------------------------------------------
Classifier         XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
                                 colsample_bynode=None, colsample_bytree=None, gamma=None,
                                 gpu_id=None, importance_type='gain', interaction_constraints=None,
                                 learning_rate=None, max_delta_step=None, max_depth=None,
                                 min_child_weight=None, missing=nan, monotone_constraints=None,
                                 n_estimators=100, n_jobs=None, num_parallel_tree=None,
                                 random_state=None, reg_alpha=None, reg_lambda=None,
                                 scale_pos_weight=None, subsample=None, tree_method=None,
                                 validate_parameters=None, verbosity=None)
Scoring optimizer  make_scorer(roc_auc_score, needs_threshold=True)
Best score         0.98871177

-----------------  --------------------------------------------------------------------------------
Classifier         XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
                                 colsample_bynode=None, colsample_bytree=None, gamma=None,
                                 gpu_id=None, importance_type='gain', interaction_constraints=None,
                                 learning_rate=None, max_delta_step=None, max_depth=None,
                                 min_child_weight=None, missing=nan, monotone_constraints=None,
                                 n_estimators=100, n_jobs=None, num_parallel_tree=None,
                                 random_state=None, reg_alpha=None, reg_lambda=None,
                                 scale_pos_weight=None, subsample=None, tree_method=None,
                                 validate_parameters=None, verbosity=None)
Scoring optimizer  make_scorer(roc_auc_score, needs_threshold=True)
Best score         0.98861225

## Tuning Model

### XGBClassifier - Manually selected features
Despite promising ROC AUC scores in other classifiers and feature sets, they have yielded lower than acceptable scores on Kaggle. This is our final model that has produced one of the better scores.

In [81]:
# column names of the manually selected feature set
data_dict['manual']['X_train'].columns.tolist()

['Week',
 'Year',
 'Month',
 'Day',
 'Tmax',
 'Tmin',
 'Tavg',
 'DewPoint',
 'WetBulb',
 'Heat',
 'Cool',
 'Sunrise',
 'Sunset',
 'PrecipTotal',
 'StnPressure',
 'SeaLevel',
 'ResultSpeed',
 'ResultDir',
 'AvgSpeed',
 'IntLoc',
 'code_dz',
 'code_vcts',
 'code_fg',
 'code_tsra',
 'code_bcfg',
 'code_ts',
 'code_ra',
 'code_br',
 'code_hz',
 'code_fg+',
 'species_CULEX ERRATICUS',
 'species_CULEX PIPIENS',
 'species_CULEX PIPIENS/RESTUANS',
 'species_CULEX RESTUANS',
 'species_CULEX SALINARIUS',
 'species_CULEX TARSALIS',
 'species_CULEX TERRITANS',
 'species_UNSPECIFIED CULEX']

In [104]:
%%time
# Every hyper parameter has a single candidate value, so the grid search evaluates
# exactly one configuration on the manually selected features.
pipe = Pipeline(steps=[
    ('clf', xgb.XGBClassifier()),
])
params = {
    'clf__booster': ['gbtree'],
    'clf__colsample_bytree': [0.15],
    'clf__eval_metric': ['auc'],
    'clf__min_split_loss': [0.01],
    'clf__learning_rate': [0.05],
    'clf__max_depth': [3],
    'clf__n_estimators': [500],
    'clf__reg_alpha': [0.9],
    'clf__reg_lambda': [5],
    'clf__scale_pos_weight': [19],
    'clf__subsample': [1],
}

# long report for the tuned model
best_model = evaluate_esitmator(
    pipe,
    params,
    fs='manual',
    scoring='roc_auc',
    verbose=True,
)

# feature ranking, labelled with the columns of the frame the model was trained on
xgb_feature_table = display_xgb_feature_importance(best_model, x=man.train)

+-------------------+----------------------------------------------------------------------------------+
| model result      | value                                                                            |
| Classifier        | XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,             |
|                   |               colsample_bynode=None, colsample_bytree=None, gamma=None,          |
|                   |               gpu_id=None, importance_type='gain', interaction_constraints=None, |
|                   |               learning_rate=None, max_delta_step=None, max_depth=None,           |
|                   |               min_child_weight=None, missing=nan, monotone_constraints=None,     |
|                   |               n_estimators=100, n_jobs=None, num_parallel_tree=None,             |
|                   |               random_state=None, reg_alpha=None, reg_lambda=None,                |
|                   |               scale_pos_weight=No

## Production Model
Retrain model with complete X, using the same hyper parameters and features.

In [105]:
%%time
# Train + validation rows of the manually selected features, with matching targets
X, Y = man.production, y.production

# Same hyper parameter values as the tuning cell, passed directly to the classifier
production_model = xgb.XGBClassifier(
    colsample_bytree=0.15,
    eval_metric='auc',
    gamma=0.01,
    learning_rate=0.05,
    max_depth=3,
    n_estimators=500,
    reg_alpha=0.9,
    reg_lambda=5,
    scale_pos_weight=19,
    subsample=1,
)

production_model.fit(X, Y);

Wall time: 568 ms


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.15, eval_metric='auc',
              gamma=0.01, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.05, max_delta_step=0,
              max_depth=3, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=500, n_jobs=0,
              num_parallel_tree=1, random_state=0, reg_alpha=0.9, reg_lambda=5,
              scale_pos_weight=19, subsample=1, tree_method='exact',
              validate_parameters=1, verbosity=None)

## Kaggle Submission

In [92]:
# Toggle for saving for submission: passed as `should_submit` to
# format_and_save_kaggle, so a csv file is only written when this is True
SAVE_FOR_SUBMISSION = True

In [93]:
def format_and_save_kaggle(Y_pred, suffix:str='', should_submit:bool=False) -> None:
    '''
    Description:
    Formats predictions into a Kaggle compatible dataframe (columns Id, WnvPresent),
    prints the count of each predicted value and optionally saves the dataframe as
    ../datasets/submission_datasets/submission_<suffix>_<timestamp>.csv
    ---
    Params:

    - Y_pred: (array-like) predicted values, one per test row, in test row order.
      Lists, numpy arrays (1-D or shaped (n, 1)) and pandas objects are accepted;
      any index or name they carry is ignored
    - suffix: (str) suffix for file name. default ''
    - should_submit: (bool) True to write the csv file, False to only print the
      counts. default False
    ---
    Returns:

    - None
    '''
    # flatten to a plain 1-D array so the result does not depend on the
    # container type, its column name or its index
    predictions = np.asarray(Y_pred).ravel()

    # format prediction list to labeled dataframe; Id is the 1-based row position
    submission = pd.DataFrame({
        'Id': np.arange(1, len(predictions) + 1),
        'WnvPresent': predictions,
    })

    # print sum of each prediction
    print(submission['WnvPresent'].value_counts())

    if should_submit:
        # save csv for submission; the timestamp keeps earlier files from being overwritten
        submission.to_csv(
        f"../datasets/submission_datasets/submission_{suffix}_{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}.csv" 
        ,index=False)

In [102]:
# test split of the manually selected feature set, i.e. the same feature set
# the production model was trained on (man.production)
X_test = man.test

In [106]:
# Predict the test set and hand the result to the submission formatter.
# NOTE(review): predict() is used here rather than predict_proba(); if the
# competition is scored on ROC AUC, submitting probabilities may score better. Confirm.
Y_pred = production_model.predict(X_test)
format_and_save_kaggle(
    Y_pred,
    suffix='xgb_manual_selection',
    should_submit=SAVE_FOR_SUBMISSION,
)

0    64788
1    51505
Name: WnvPresent, dtype: int64


submission file: submission_xgb_manual_selection_2020-10-15-20-31-45.csv
![kaggle score](../assets/kaggle_score.png "kaggle score")