# 03 Build Models

## Imports

* We want AUC ROC

In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier





from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.utils import to_categorical
from keras.optimizers import Adam
from keras.layers import Dropout
from keras import regularizers



import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

Using TensorFlow backend.


# Functions

In [2]:
def get_metrics(y_true, y_predict, print_scores = True):
    matrix_def = [['tn','fp'], ['fn','tp']]
    matrix = confusion_matrix(y_true, y_predict)
    tn, fp, fn, tp = matrix.ravel()
    accuracy = (tp+tn)/(tn+fp+fn+tp)
    misclass = 1-accuracy
    sensitivity = tp/(tp+fn)
    specificity = tn/(tn+fp)
    precision = tp/(tp+fp)
    if print_scores:
        print('Matrix Definition')
        print(np.array(matrix_def))
        print('')
        print('Confusion Matrix')
        print(matrix)
        print('')
        print('METRICS')
        print('accuracy:', accuracy)
        print('misclass:', misclass)
        print('sensitivity:', sensitivity)
        print('specificity:', specificity)
    else:
        return accuracy, misclass, sensitivity, specificity, precision

## Read in Train and Test Data

In [3]:
with open('../Cleansed_Data/X_train_ss.pkl','rb') as f:
    X_train_ss = pickle.load(f)
    
with open('../Cleansed_Data/X_test_ss.pkl','rb') as f:
    X_test_ss = pickle.load(f)
    
with open('../Cleansed_Data/y_train.pkl','rb') as f:
    y_train = pickle.load(f)
    
with open('../Cleansed_Data/y_test.pkl','rb') as f:
    y_test = pickle.load(f)

## Import Kaggle Test Data

In [4]:
kaggle_X = pd.read_csv('../Cleansed_Data/test_final.csv')

In [5]:
X_test_ss = pd.DataFrame(X_test_ss, columns=X_train_ss.columns)

In [6]:
X_train_ss.drop('NumMosquitos', 1, inplace=True)

In [7]:
X_test_ss.drop('NumMosquitos', 1, inplace=True)

In [8]:
kaggle_X.drop('NumMosquitos', 1, inplace=True)

In [9]:
rf_tf_pipe = Pipeline([
    ('pca', PCA()),
    ('rf', RandomForestClassifier())
])

In [10]:
num_components = len(X_train_ss.columns)

In [11]:
rf_tf_params = {
    'pca__n_components': list(range(int(num_components/2), num_components, 10)),
    'rf__max_depth': list(np.linspace(5, 500, 10)),
    'rf__min_samples_leaf': list(range(1,10)),
    'rf__n_estimators': [10, 50, 100],
}

In [12]:
grid = GridSearchCV(rf_tf_pipe, 
                    {},
                        n_jobs=3,)

In [13]:
grid.fit(X_train_ss, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
          ...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid=True, n_jobs=3, param_grid={},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [17]:
roc_auc_score(y_train, grid.predict(X_train_ss))

0.8978760305053337

In [18]:
roc_auc_score(y_test, grid.predict(X_test_ss))

0.5495362471015444