# 03 Build Models

## Imports

* We want AUC ROC

In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier





from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.utils import to_categorical
from keras.optimizers import Adam
from keras.layers import Dropout
from keras import regularizers



import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

Using TensorFlow backend.


# Functions

In [2]:
def get_metrics(y_true, y_predict, print_scores = True):
    matrix_def = [['tn','fp'], ['fn','tp']]
    matrix = confusion_matrix(y_true, y_predict)
    tn, fp, fn, tp = matrix.ravel()
    accuracy = (tp+tn)/(tn+fp+fn+tp)
    misclass = 1-accuracy
    sensitivity = tp/(tp+fn)
    specificity = tn/(tn+fp)
    precision = tp/(tp+fp)
    if print_scores:
        print('Matrix Definition')
        print(np.array(matrix_def))
        print('')
        print('Confusion Matrix')
        print(matrix)
        print('')
        print('METRICS')
        print('accuracy:', accuracy)
        print('misclass:', misclass)
        print('sensitivity:', sensitivity)
        print('specificity:', specificity)
    else:
        return accuracy, misclass, sensitivity, specificity, precision

## Read in Train and Test Data

In [3]:
with open('../Cleansed_Data/X_train_ss.pkl','rb') as f:
    X_train_ss = pickle.load(f)
    
with open('../Cleansed_Data/X_test_ss.pkl','rb') as f:
    X_test_ss = pickle.load(f)
    
with open('../Cleansed_Data/y_train.pkl','rb') as f:
    y_train = pickle.load(f)
    
with open('../Cleansed_Data/y_test.pkl','rb') as f:
    y_test = pickle.load(f)

## Import Kaggle Test Data

In [4]:
kaggle_X = pd.read_csv('../Cleansed_Data/test_final.csv', index_col=0)

In [5]:
X_test_ss = pd.DataFrame(X_test_ss, columns=X_train_ss.columns)

In [6]:
X_train_ss.drop('NumMosquitos', 1, inplace=True)

In [7]:
X_test_ss.drop('NumMosquitos', 1, inplace=True)

In [8]:
kaggle_X.drop('NumMosquitos', 1, inplace=True)

In [9]:
rf_tf_pipe = Pipeline([
    ('pca', PCA()),
    ('rf', RandomForestClassifier(n_estimators=100, max_depth=15, min_samples_leaf=3))
])

In [10]:
num_components = len(X_train_ss.columns)

In [11]:
rf_tf_params = {
    'pca__n_components': [81],
}

In [12]:
grid = GridSearchCV(rf_tf_pipe, 
                    rf_tf_params,
                        n_jobs=3,)

In [13]:
grid.fit(X_train_ss, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='auto', max_leaf_nodes=None,
            ...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid=True, n_jobs=3,
       param_grid={'pca__n_components': [81]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring=None, verbose=0)

In [14]:
roc_auc_score(y_train, grid.predict(X_train_ss))

0.6207417941399077

In [15]:
roc_auc_score(y_test, grid.predict(X_test_ss))

0.5284573653585335

In [16]:
grid.best_params_

{'pca__n_components': 81}

## Make prediction for Kaggle

In [17]:
kaggle_X.head()

Unnamed: 0_level_0,Latitude,Longitude,BR,HZ,RA,TSRA,VCTS,FU,TS,DZ,...,Month_09,Month_10,Species_CULEX ERRATICUS,Species_CULEX PIPIENS,Species_CULEX PIPIENS/RESTUANS,Species_CULEX RESTUANS,Species_CULEX SALINARIUS,Species_CULEX TARSALIS,Species_CULEX TERRITANS,Month_05
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.033507,-1.257471,-0.204456,-0.725293,-0.496999,-0.751752,-0.484906,-0.241767,0.0,-0.379729,...,-0.572506,-0.696702,-0.545837,-0.17978,79.718254,-0.549235,-0.909952,-0.625148,-0.093428,-0.021731
2,1.033507,-1.257471,-0.204456,-0.725293,-0.496999,-0.751752,-0.484906,-0.241767,0.0,-0.379729,...,-0.572506,-0.696702,-0.545837,-0.17978,-0.012544,1.820715,-0.909952,-0.625148,-0.093428,-0.021731
3,1.033507,-1.257471,-0.204456,-0.725293,-0.496999,-0.751752,-0.484906,-0.241767,0.0,-0.379729,...,-0.572506,-0.696702,-0.545837,5.562346,-0.012544,-0.549235,-0.909952,-0.625148,-0.093428,-0.021731
4,1.033507,-1.257471,-0.204456,-0.725293,-0.496999,-0.751752,-0.484906,-0.241767,0.0,-0.379729,...,-0.572506,-0.696702,-0.545837,-0.17978,-0.012544,-0.549235,1.098959,-0.625148,-0.093428,-0.021731
5,1.033507,-1.257471,-0.204456,-0.725293,-0.496999,-0.751752,-0.484906,-0.241767,0.0,-0.379729,...,-0.572506,-0.696702,-0.545837,-0.17978,-0.012544,-0.549235,-0.909952,-0.625148,10.70344,-0.021731


In [18]:
rf_pred_proba = grid.predict_proba(kaggle_X)

In [19]:
rf_pred_proba

array([[0.53756999, 0.46243001],
       [0.62062951, 0.37937049],
       [0.62272475, 0.37727525],
       ...,
       [0.57692316, 0.42307684],
       [0.57422475, 0.42577525],
       [0.56218418, 0.43781582]])

In [20]:
submission_df = pd.DataFrame([prob[1] for prob in rf_pred_proba], columns=['WnvPresent'], index=kaggle_X.index)
submission_df.head()

Unnamed: 0_level_0,WnvPresent
Id,Unnamed: 1_level_1
1,0.46243
2,0.37937
3,0.377275
4,0.397347
5,0.362308


In [21]:
submission_df.to_csv('../Cleansed_Data/tired_submission.csv')