# Importing packages and csv files

In [36]:
# importing essential libraries for data cleaning and EDA
import numpy as np
import pandas as pd

# for handling date time
import datetime as dt

# stats and other libraries
import scipy.stats as stats

# plotting libraries
import matplotlib.pyplot as plt
from matplotlib.ticker import (MultipleLocator, FormatStrFormatter,
                               AutoMinorLocator)

import seaborn as sns
%matplotlib inline
sns.set(style="whitegrid")

# for handling geographic data
import geopandas as gpd
from geopandas import GeoSeries, GeoDataFrame
from geopy.distance import geodesic

from shapely.geometry import Point, LineString, Polygon
from shapely.ops import nearest_points

# modeling libraries
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

from sklearn.neighbors import KNeighborsClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn import svm

from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB





In [2]:
# changing pandas' option to display all columns

pd.set_option('display.max_columns', 999)
pd.set_option('display.max_rows', 4000)



In [3]:
# using IPython to display output more easily (without having to call print)

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [4]:
combined = pd.read_csv('./assets/train_processed.csv', index_col=0)
combined.shape
combined.head()

(8610, 46)

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,WnvPresent,NumMosquitos,geometry,Coord,YearMonth,Year,Month,Day,Week,n3_intersect_dates,n3_spray_sets,n3_spray_geom,n3_spray_coord,n3_spray_dist_deg,n3_spray_dist_m,n7_intersect_dates,n7_spray_sets,n7_spray_geom,n7_spray_coord,n7_spray_dist_deg,n7_spray_dist_m,n14_intersect_dates,n14_spray_sets,n14_spray_geom,n14_spray_coord,n14_spray_dist_deg,n14_spray_dist_m,Tavg,PrecipTotal,StnPressure,AvgSpeed,Fort_Date,Fort_Tavg,Fort_PrecipTotal,Fort_StnPressure,Fort_AvgSpeed
0,2007-05-29,"1100 Roosevelt Road, Chicago, IL 60608, USA",CULEX PIPIENS/RESTUANS,11,W ROOSEVELT,T048,"1100 W ROOSEVELT, Chicago, IL",41.867108,-87.654224,8,0,1,POINT (-87.654224 41.867108),"(41.867108, -87.654224)",2007-05,2007,5,29,22,,,,,,,,,,,,,,,,,,,75.5,0.0,29.415,6.95,2007-05-29,73.5,0.0,29.465,9.4
1,2007-05-29,"1100 Roosevelt Road, Chicago, IL 60608, USA",CULEX RESTUANS,11,W ROOSEVELT,T048,"1100 W ROOSEVELT, Chicago, IL",41.867108,-87.654224,8,0,2,POINT (-87.654224 41.867108),"(41.867108, -87.654224)",2007-05,2007,5,29,22,,,,,,,,,,,,,,,,,,,75.5,0.0,29.415,6.95,2007-05-29,73.5,0.0,29.465,9.4
2,2007-05-29,"1100 South Peoria Street, Chicago, IL 60608, USA",CULEX RESTUANS,11,S PEORIA ST,T091,"1100 S PEORIA ST, Chicago, IL",41.862292,-87.64886,8,0,1,POINT (-87.64886 41.862292),"(41.862292, -87.64886)",2007-05,2007,5,29,22,,,,,,,,,,,,,,,,,,,75.5,0.0,29.415,6.95,2007-05-29,73.5,0.0,29.465,9.4
3,2007-05-29,"1100 West Chicago Avenue, Chicago, IL 60642, USA",CULEX RESTUANS,11,W CHICAGO,T049,"1100 W CHICAGO, Chicago, IL",41.896282,-87.655232,8,0,1,POINT (-87.655232 41.896282),"(41.896282, -87.655232)",2007-05,2007,5,29,22,,,,,,,,,,,,,,,,,,,75.5,0.0,29.415,6.95,2007-05-29,73.5,0.0,29.465,9.4
4,2007-05-29,"1500 North Long Avenue, Chicago, IL 60651, USA",CULEX RESTUANS,15,N LONG AVE,T153,"1500 N LONG AVE, Chicago, IL",41.907645,-87.760886,8,0,1,POINT (-87.760886 41.907645),"(41.907645, -87.760886)",2007-05,2007,5,29,22,,,,,,,,,,,,,,,,,,,75.5,0.0,29.415,6.95,2007-05-29,73.5,0.0,29.465,9.4


# Pre-processing our data

In [5]:
combined.columns

Index(['Date', 'Address', 'Species', 'Block', 'Street', 'Trap',
       'AddressNumberAndStreet', 'Latitude', 'Longitude', 'AddressAccuracy',
       'WnvPresent', 'NumMosquitos', 'geometry', 'Coord', 'YearMonth', 'Year',
       'Month', 'Day', 'Week', 'n3_intersect_dates', 'n3_spray_sets',
       'n3_spray_geom', 'n3_spray_coord', 'n3_spray_dist_deg',
       'n3_spray_dist_m', 'n7_intersect_dates', 'n7_spray_sets',
       'n7_spray_geom', 'n7_spray_coord', 'n7_spray_dist_deg',
       'n7_spray_dist_m', 'n14_intersect_dates', 'n14_spray_sets',
       'n14_spray_geom', 'n14_spray_coord', 'n14_spray_dist_deg',
       'n14_spray_dist_m', 'Tavg', 'PrecipTotal', 'StnPressure', 'AvgSpeed',
       'Fort_Date', 'Fort_Tavg', 'Fort_PrecipTotal', 'Fort_StnPressure',
       'Fort_AvgSpeed'],
      dtype='object')

We will drop the columns that we wouldn't be using for modeling

- `Address, Block, Street, AddressNumberAndStreet, AddressAccuracy, geometry, Coord` - these are all captured under Longitude, Latitude
- all the columns starting with 'n' - since we've decided that spray data isn't very helpful.


In [6]:
n3_cols = [ col for col in combined.columns if "n3" in col]
n7_cols = [ col for col in combined.columns if "n7" in col]
n14_cols = [ col for col in combined.columns if "n14" in col]



In [7]:
cols_to_drop = ['Address', 'Block', 'Street', 'AddressNumberAndStreet', 'AddressAccuracy', 'geometry', 'Coord']
cols_to_drop.extend(n3_cols)
cols_to_drop.extend(n7_cols)
cols_to_drop.extend(n14_cols)
cols_to_drop


['Address',
 'Block',
 'Street',
 'AddressNumberAndStreet',
 'AddressAccuracy',
 'geometry',
 'Coord',
 'n3_intersect_dates',
 'n3_spray_sets',
 'n3_spray_geom',
 'n3_spray_coord',
 'n3_spray_dist_deg',
 'n3_spray_dist_m',
 'n7_intersect_dates',
 'n7_spray_sets',
 'n7_spray_geom',
 'n7_spray_coord',
 'n7_spray_dist_deg',
 'n7_spray_dist_m',
 'n14_intersect_dates',
 'n14_spray_sets',
 'n14_spray_geom',
 'n14_spray_coord',
 'n14_spray_dist_deg',
 'n14_spray_dist_m']

In [8]:
df = combined.drop(columns= cols_to_drop)
df.head()

Unnamed: 0,Date,Species,Trap,Latitude,Longitude,WnvPresent,NumMosquitos,YearMonth,Year,Month,Day,Week,Tavg,PrecipTotal,StnPressure,AvgSpeed,Fort_Date,Fort_Tavg,Fort_PrecipTotal,Fort_StnPressure,Fort_AvgSpeed
0,2007-05-29,CULEX PIPIENS/RESTUANS,T048,41.867108,-87.654224,0,1,2007-05,2007,5,29,22,75.5,0.0,29.415,6.95,2007-05-29,73.5,0.0,29.465,9.4
1,2007-05-29,CULEX RESTUANS,T048,41.867108,-87.654224,0,2,2007-05,2007,5,29,22,75.5,0.0,29.415,6.95,2007-05-29,73.5,0.0,29.465,9.4
2,2007-05-29,CULEX RESTUANS,T091,41.862292,-87.64886,0,1,2007-05,2007,5,29,22,75.5,0.0,29.415,6.95,2007-05-29,73.5,0.0,29.465,9.4
3,2007-05-29,CULEX RESTUANS,T049,41.896282,-87.655232,0,1,2007-05,2007,5,29,22,75.5,0.0,29.415,6.95,2007-05-29,73.5,0.0,29.465,9.4
4,2007-05-29,CULEX RESTUANS,T153,41.907645,-87.760886,0,1,2007-05,2007,5,29,22,75.5,0.0,29.415,6.95,2007-05-29,73.5,0.0,29.465,9.4


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8610 entries, 0 to 8609
Data columns (total 21 columns):
Date                8610 non-null object
Species             8610 non-null object
Trap                8610 non-null object
Latitude            8610 non-null float64
Longitude           8610 non-null float64
WnvPresent          8610 non-null int64
NumMosquitos        8610 non-null int64
YearMonth           8610 non-null object
Year                8610 non-null int64
Month               8610 non-null int64
Day                 8610 non-null int64
Week                8610 non-null int64
Tavg                8610 non-null float64
PrecipTotal         8610 non-null float64
StnPressure         8610 non-null float64
AvgSpeed            8610 non-null float64
Fort_Date           8610 non-null object
Fort_Tavg           8610 non-null float64
Fort_PrecipTotal    8610 non-null float64
Fort_StnPressure    8610 non-null float64
Fort_AvgSpeed       8610 non-null float64
dtypes: float64(10), int64(6

In [10]:
dummies = pd.get_dummies(df["Species"])
dummies.head()

Unnamed: 0,CULEX ERRATICUS,CULEX PIPIENS,CULEX PIPIENS/RESTUANS,CULEX RESTUANS,CULEX SALINARIUS,CULEX TARSALIS,CULEX TERRITANS
0,0,0,1,0,0,0,0
1,0,0,0,1,0,0,0
2,0,0,0,1,0,0,0
3,0,0,0,1,0,0,0
4,0,0,0,1,0,0,0


In [11]:
# formatting the names into something shorter
species_names = [ name.lower().replace("culex ", "") for name in dummies.columns ]
species_names

['erraticus',
 'pipiens',
 'pipiens/restuans',
 'restuans',
 'salinarius',
 'tarsalis',
 'territans']

In [12]:
# taking out the slash character 
species_names[2] = "pipiens_restuans"
species_names

['erraticus',
 'pipiens',
 'pipiens_restuans',
 'restuans',
 'salinarius',
 'tarsalis',
 'territans']

In [13]:
# renaming columns for dummies df
dummies.columns=species_names
dummies.head()

Unnamed: 0,erraticus,pipiens,pipiens_restuans,restuans,salinarius,tarsalis,territans
0,0,0,1,0,0,0,0
1,0,0,0,1,0,0,0
2,0,0,0,1,0,0,0
3,0,0,0,1,0,0,0
4,0,0,0,1,0,0,0


In [14]:
df = pd.concat([df,dummies],axis=1)


In [15]:
# besides Species, we will also drop the Trap column since we can't meaningfully turn that into numeric, and it's also not a unique identifier
# Date and Fort_Date also need to be dropped
# NumMosquitos need to be dropped since it's not in test
df.drop(columns=["Species", "Trap"], inplace=True)
df.head(2)

Unnamed: 0,Date,Latitude,Longitude,WnvPresent,NumMosquitos,YearMonth,Year,Month,Day,Week,Tavg,PrecipTotal,StnPressure,AvgSpeed,Fort_Date,Fort_Tavg,Fort_PrecipTotal,Fort_StnPressure,Fort_AvgSpeed,erraticus,pipiens,pipiens_restuans,restuans,salinarius,tarsalis,territans
0,2007-05-29,41.867108,-87.654224,0,1,2007-05,2007,5,29,22,75.5,0.0,29.415,6.95,2007-05-29,73.5,0.0,29.465,9.4,0,0,1,0,0,0,0
1,2007-05-29,41.867108,-87.654224,0,2,2007-05,2007,5,29,22,75.5,0.0,29.415,6.95,2007-05-29,73.5,0.0,29.465,9.4,0,0,0,1,0,0,0


# Modeling

In [16]:
# dropping all datetime columns
# NumMosquitos need to be dropped since it's not in test

X = df.drop(columns=["WnvPresent", "Date", "NumMosquitos", "YearMonth", "Fort_Date"])
X.head(2)

Unnamed: 0,Latitude,Longitude,Year,Month,Day,Week,Tavg,PrecipTotal,StnPressure,AvgSpeed,Fort_Tavg,Fort_PrecipTotal,Fort_StnPressure,Fort_AvgSpeed,erraticus,pipiens,pipiens_restuans,restuans,salinarius,tarsalis,territans
0,41.867108,-87.654224,2007,5,29,22,75.5,0.0,29.415,6.95,73.5,0.0,29.465,9.4,0,0,1,0,0,0,0
1,41.867108,-87.654224,2007,5,29,22,75.5,0.0,29.415,6.95,73.5,0.0,29.465,9.4,0,0,0,1,0,0,0


In [17]:
y = df["WnvPresent"]
y.head(2)

0    0
1    0
Name: WnvPresent, dtype: int64

In [18]:

# doing train, test, split
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, random_state = 142)

ss = StandardScaler()

X_train_sc = ss.fit_transform(X_train)
X_val_sc = ss.transform(X_val)

## Modeling without tuning hyper-parameters

We will try to model using these classifiers:

* Logistic Regression
* K Neighbors Classifier
* Decision Tree Classifier
* Random Forest Classifier
* Extra Trees Classifier

In [19]:
# Decision Tree Classifier
dt = DecisionTreeClassifier()

dt.fit(X_train_sc, y_train)

dt_train = dt.score(X_train_sc, y_train)
dt_val = dt.score(X_val_sc, y_val)
print("Train score:", dt_train)
print("Val score:", dt_val)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

Train score: 0.9891590521914202
Val score: 0.8964235949837436


In [20]:
knn = KNeighborsClassifier()

knn.fit(X_train_sc, y_train)

knn_train = knn.score(X_train_sc, y_train)
knn_val = knn.score(X_val_sc, y_val)

print("Train core:", knn_train)
print("Val score:", knn_val)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

Train core: 0.9501316400805327
Val score: 0.9391546679052485


In [21]:
lr = LogisticRegression()

lr.fit(X_train_sc, y_train)

lr_train = lr.score(X_train_sc, y_train)
lr_val = lr.score(X_val_sc, y_val)
print("Train score:", lr_train)
print("Val score:", lr_val)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

Train score: 0.9468793557379588
Val score: 0.9470506270320483


In [22]:
# random forest

rf = RandomForestClassifier()

rf.fit(X_train_sc, y_train)

rf_train = rf.score(X_train_sc, y_train)
rf_val = rf.score(X_val_sc, y_val)

print("Train score:", rf_train)
print("Val score:", rf_val)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

Train score: 0.9797119405296577
Val score: 0.9284718996748723


In [23]:
et = ExtraTreesClassifier()

et.fit(X_train_sc, y_train)

et_train = et.score(X_train_sc, y_train)
et_val = et.score(X_val_sc, y_val)

print("Train score:", et_train)
print("Val score:", et_val)



ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
                     max_depth=None, max_features='auto', max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
                     oob_score=False, random_state=None, verbose=0,
                     warm_start=False)

Train score: 0.9891590521914202
Val score: 0.913144449605202


Based on accuracy scores, the best model was Logistic Regression, which returned a relatively high validation score of 0.9471 that was in fact, marginally higher than the train score of 0.9469.

But we'll try everything again with GridSearchCV.

## Running GridSearchCV

In [38]:
models = {
        "lr" : LogisticRegression(),
        "knn" : KNeighborsClassifier(),
        "dt" : DecisionTreeClassifier(),
        "rf" : RandomForestClassifier(),
        "et" : ExtraTreesClassifier()
        }

gs_parameters = {
            "lr" : {'penalty' : ['l1','l2'],
                    'solver' : ['liblinear'],
                    'C' : np.logspace(-5,0,100)
                    },
            "knn" : {'n_neighbors' : [1, 3, 5, 7, 15],
                     'weights':['uniform','distance'],
                     'metric':['euclidean','manhattan']
                    },
            "dt" : {'max_depth': [2, 3, 5, 7, 10],
                    'min_samples_split': [5, 10, 15, 20],
                    'min_samples_leaf': [2, 3, 4, 5, 6, 7]
                    },
            "rf" : {'n_estimators': [10, 20, 50, 100, 150, 200],
                    'max_depth': [1, 2, 3, 4, 5, 10],
                    'min_samples_leaf': [10, 20, 50]
                    },
            "et" : {'n_estimators': [10, 20, 50, 100, 150, 200],
                    'max_depth': [2, 3, 4, 5, 10, 20],
                    'min_samples_leaf': [10, 20, 50]
                    },
                }

In [72]:
Grid_Summary = []

for key in models.keys():
    model = models[key]
#     print(model)
    params = gs_parameters[key]
#     print(params)
    
    print("---------------------------------Running GridSearchCV for %s ---------------" % key)
    gs = GridSearchCV(model, params, cv=5, verbose=1, n_jobs=-1, scoring="roc_auc")
    
    gs.fit(X_train_sc, y_train)
    
    
    best_estimator = gs.best_estimator_
    best_params = gs.best_params_
    best_train_score = gs.best_score_
    
    val_score = gs.score(X_val_sc, y_val)
    
    y_val_proba = gs.predict_proba(X_val_sc)
    
    df_val_proba = pd.DataFrame(y_val_proba)
    y_val_proba_0 = df_val_proba[1]
    roc_auc = roc_auc_score(y_val, y_val_proba_0)
    
    # GridResults[key] = gs
    
    print(f"Best estimator is: {gs.best_estimator_}")
    print(f"Best params are: {gs.best_params_}")
    print(f"Best train score is: {gs.best_score_}")
    print(f"Validation score: {gs.score(X_val_sc, y_val)}")

    print(f"ROC/AUC : {roc_auc}")

    Grid_Summary.append([key, best_estimator, best_params, best_train_score, val_score, roc_auc])

    print("---")


---------------------------------Running GridSearchCV for lr ---------------
Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   12.2s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': array([1.00000000e...
       1.74752840e-01, 1.96304065e-01, 2.20513074e-01, 2.47707636e-01,
       2.78255940e-01, 3.12571585e-01, 3.51119173e-01, 3.94420606e-01,
       4.43062146e-01, 4.97702356e-01, 5.59081018e-01, 6.28029144e-01,
       7.05480231e-01, 7.92482898e-01, 8.90215085e-01, 1.00000000e+00]),
       

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Best estimator is: LogisticRegression(C=0.49770235643321137, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)
Best params are: {'C': 0.49770235643321137, 'penalty': 'l2', 'solver': 'liblinear'}
Best train score is: 0.7813263952337094
Validation score: 0.7797187303717853
ROC/AUC : 0.7797187303717853
---
---------------------------------Running GridSearchCV for knn ---------------
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   10.4s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=-1,
             param_grid={'metric': ['euclidean', 'manhattan'],
                         'n_neighbors': [1, 3, 5, 7, 15],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='roc_auc', verbose=1)

Best estimator is: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='manhattan',
                     metric_params=None, n_jobs=None, n_neighbors=15, p=2,
                     weights='uniform')
Best params are: {'metric': 'manhattan', 'n_neighbors': 15, 'weights': 'uniform'}
Best train score is: 0.77196993723735
Validation score: 0.7807684365400996
ROC/AUC : 0.7807684365400996
---
---------------------------------Running GridSearchCV for dt ---------------
Fitting 5 folds for each of 120 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:    4.2s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=-1,
             param_grid={'max_depth': [2, 3, 5, 7, 10],
                         'min_samples_leaf': [2, 3, 4, 5, 6, 7],
                 

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Best estimator is: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=20,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')
Best params are: {'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 20}
Best train score is: 0.7898853317133172
Validation score: 0.7486663569173055
ROC/AUC : 0.7486663569173055
---
---------------------------------Running GridSearchCV for rf ---------------
Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Done 108 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   29.4s
[Parallel(n_jobs=-1)]: Done 522 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed:  1.1min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid

Best estimator is: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=10, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=20, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
Best params are: {'max_depth': 10, 'min_samples_leaf': 20, 'n_estimators': 200}
Best train score is: 0.8391260323983277
Validation score: 0.8378741729261849
ROC/AUC : 0.8378741729261849
---
---------------------------------Running GridSearchCV for et ---------------
Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:   12.2s
[Parallel(n_jobs=-1)]: Done 444 tasks      | elapsed:   40.5s
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed:   53.1s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None,
                                            criterion='gini', max_depth=None,
                                            max_features='auto',
                                            max_leaf_nodes=None,
                                            min_impurity_decrease=0.0,
                                            min_impurity_split=None,
                                            min_samples_leaf=1,
                                            min_samples_split=2,
                                            min_weight_fraction_leaf=0.0,
                                            n_estimators='warn', n_jobs=None,
                                            oob_score=False, random_state=None,
                                            verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'max_depth': [2, 3, 4, 5

Best estimator is: ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
                     max_depth=10, max_features='auto', max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=10, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=150,
                     n_jobs=None, oob_score=False, random_state=None, verbose=0,
                     warm_start=False)
Best params are: {'max_depth': 10, 'min_samples_leaf': 10, 'n_estimators': 150}
Best train score is: 0.836226918876731
Validation score: 0.8452264181788458
ROC/AUC : 0.8452264181788458
---


In [74]:
# new grid search

grid_df = pd.DataFrame(Grid_Summary, columns=["model", "estimator", "params", "train score", "val score", "roc_auc" ])
grid_df

Unnamed: 0,model,estimator,params,train score,val score,roc_auc
0,lr,"LogisticRegression(C=0.49770235643321137, clas...","{'C': 0.49770235643321137, 'penalty': 'l2', 's...",0.781326,0.779719,0.779719
1,knn,"KNeighborsClassifier(algorithm='auto', leaf_si...","{'metric': 'manhattan', 'n_neighbors': 15, 'we...",0.77197,0.780768,0.780768
2,dt,"DecisionTreeClassifier(class_weight=None, crit...","{'max_depth': 5, 'min_samples_leaf': 2, 'min_s...",0.789885,0.748666,0.748666
3,rf,"(DecisionTreeClassifier(class_weight=None, cri...","{'max_depth': 10, 'min_samples_leaf': 20, 'n_e...",0.839126,0.837874,0.837874
4,et,"(ExtraTreeClassifier(class_weight=None, criter...","{'max_depth': 10, 'min_samples_leaf': 10, 'n_e...",0.836227,0.845226,0.845226


*Note*: When scoring the GridSearchResults using accuracy (i.e. default scorer), somehow all the models were returning the exact same train and validation scores. 

Therefore, this version of the GridSearch is based on roc_auc - which also explains the significant drop compared to the earlier results for the un-tuned models.

Based on roc_auc, Extra Tress Classifier offers the best results.

## Voting Classifier

In [81]:
# testing if we get the full model returned
grid_df["estimator"][0]

LogisticRegression(C=0.49770235643321137, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [82]:
# initialising our hyper-tuned models

b_lr = grid_df["estimator"][0]
b_knn = grid_df["estimator"][1]
b_dt = grid_df["estimator"][2]
b_rf = grid_df["estimator"][3]
b_et = grid_df["estimator"][4]

In [87]:
vote = VotingClassifier(estimators = [
                                ('lr', b_lr),
                                ('knn', b_knn),
                                ('dt', b_dt),
                                ('rf', b_rf),
                                ('et', b_et),
                                     ],
                                    voting = "soft")


In [88]:
vote.fit(X_train_sc, y_train)
print(f'Accuracy score on training set: {vote.score(X_train_sc, y_train)}')
print(f'Accuracy score on evaluation set: {vote.score(X_val_sc, y_val)}')

VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=0.49770235643321137,
                                                 class_weight=None, dual=False,
                                                 fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='liblinear', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('knn',
                              KNeighborsClassifier(algorithm='auto',
                                                   leaf_size=30,...
                                               

Accuracy score on training set: 0.9468793557379588
Accuracy score on evaluation set: 0.9470506270320483


In [90]:
# calculating roc_auc 
pred_proba = [i[1] for i in vote.predict_proba(X_val_sc)]

print("ROC/AUC score for Voting Classifier is:", roc_auc_score(y_val, pred_proba))

ROC/AUC score for Voting Classifier is: 0.8360931140996188


Since the Extra Trees Classifier's ROC/AUC score of 0.845 is a little better than the Voting Classifier's 0.837, we will use `b_et` to predict our test data set.


# Prepping test

In [375]:
test = pd.read_csv('./assets/test.csv')
test.shape
test.head()

(116293, 11)

Unnamed: 0,Id,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy
0,1,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
1,2,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
2,3,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
3,4,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX SALINARIUS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
4,5,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX TERRITANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9


In [376]:
test.tail(3)

Unnamed: 0,Id,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy
116290,116291,2014-10-02,"2100 North Cannon Drive, Chicago, IL 60614, USA",CULEX TARSALIS,21,N CANNON DR,T054C,"2100 N CANNON DR, Chicago, IL",41.925652,-87.63359,8
116291,116292,2014-10-02,"2100 North Cannon Drive, Chicago, IL 60614, USA",UNSPECIFIED CULEX,21,N CANNON DR,T054C,"2100 N CANNON DR, Chicago, IL",41.925652,-87.63359,8
116292,116293,2014-10-02,"2100 North Cannon Drive, Chicago, IL 60614, USA",CULEX ERRATICUS,21,N CANNON DR,T054C,"2100 N CANNON DR, Chicago, IL",41.925652,-87.63359,8


In [377]:
# creating a month column
test["Year"] = pd.to_datetime(test['Date']).apply(lambda x: '{}'.format(x.year)).astype(int)

# creating a month column
test["Month"] = pd.to_datetime(test['Date']).apply(lambda x: '{}'.format(x.month)).astype(int)

# creating a day column
test["Day"] = pd.to_datetime(test['Date']).apply(lambda x: '{}'.format(x.day)).astype(int)

# creating a week column
test["Week"] = pd.to_datetime(test['Date']).apply(lambda x: '{}'.format(x.week)).astype(int)

In [378]:
test.head(2)

Unnamed: 0,Id,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,Year,Month,Day,Week
0,1,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,2008,6,11,24
1,2,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,2008,6,11,24


In [379]:
test.isnull().sum()

Id                        0
Date                      0
Address                   0
Species                   0
Block                     0
Street                    0
Trap                      0
AddressNumberAndStreet    0
Latitude                  0
Longitude                 0
AddressAccuracy           0
Year                      0
Month                     0
Day                       0
Week                      0
dtype: int64

In [380]:
# getting weather records for the same day
weather_comb = pd.read_csv('./assets/weather_combined_processed.csv', index_col=0)
weather_comb.shape
weather_comb.head(2)

(1471, 5)

Unnamed: 0,Date,Tavg,PrecipTotal,StnPressure,AvgSpeed
0,2007-05-01,67.5,0.0,29.14,9.4
1,2007-05-02,51.5,0.0,29.41,13.4


In [381]:
weather_comb.loc[weather_comb["Date"] == "2014-10-02"]

Unnamed: 0,Date,Tavg,PrecipTotal,StnPressure,AvgSpeed
1441,2014-10-02,69.5,0.52,29.065,7.9


In [382]:
# merging test with weather_comb
test_combined = pd.merge(test, weather_comb, how = 'left', on = 'Date')
test_combined.head(2)

Unnamed: 0,Id,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,Year,Month,Day,Week,Tavg,PrecipTotal,StnPressure,AvgSpeed
0,1,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,2008,6,11,24,75.0,0.0,29.31,10.2
1,2,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,2008,6,11,24,75.0,0.0,29.31,10.2


In [383]:
test_combined.shape
test_combined.tail(2)

(116293, 19)

Unnamed: 0,Id,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,Year,Month,Day,Week,Tavg,PrecipTotal,StnPressure,AvgSpeed
116291,116292,2014-10-02,"2100 North Cannon Drive, Chicago, IL 60614, USA",UNSPECIFIED CULEX,21,N CANNON DR,T054C,"2100 N CANNON DR, Chicago, IL",41.925652,-87.63359,8,2014,10,2,40,69.5,0.52,29.065,7.9
116292,116293,2014-10-02,"2100 North Cannon Drive, Chicago, IL 60614, USA",CULEX ERRATICUS,21,N CANNON DR,T054C,"2100 N CANNON DR, Chicago, IL",41.925652,-87.63359,8,2014,10,2,40,69.5,0.52,29.065,7.9


In [384]:
# getting weather records from 14 days before
weather_fort = pd.read_csv('./assets/weather_fort_processed.csv', index_col=0)
weather_fort.shape
weather_fort.head(2)

(1471, 5)

Unnamed: 0,Fort_Date,Fort_Tavg,Fort_PrecipTotal,Fort_StnPressure,Fort_AvgSpeed
0,2007-04-17,67.5,0.0,29.14,9.4
1,2007-04-18,51.5,0.0,29.41,13.4


In [385]:
# merging test_combined with weather_fort
test_combined = pd.merge(test_combined, weather_fort, how = 'left', left_on = 'Date', right_on = 'Fort_Date')
test_combined.head(2)

Unnamed: 0,Id,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,Year,Month,Day,Week,Tavg,PrecipTotal,StnPressure,AvgSpeed,Fort_Date,Fort_Tavg,Fort_PrecipTotal,Fort_StnPressure,Fort_AvgSpeed
0,1,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,2008,6,11,24,75.0,0.0,29.31,10.2,2008-06-11,73.0,0.095,29.31,8.9
1,2,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,2008,6,11,24,75.0,0.0,29.31,10.2,2008-06-11,73.0,0.095,29.31,8.9


In [386]:
test_combined.shape

(116293, 24)

In [387]:
test_combined.tail(2)

Unnamed: 0,Id,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,Year,Month,Day,Week,Tavg,PrecipTotal,StnPressure,AvgSpeed,Fort_Date,Fort_Tavg,Fort_PrecipTotal,Fort_StnPressure,Fort_AvgSpeed
116291,116292,2014-10-02,"2100 North Cannon Drive, Chicago, IL 60614, USA",UNSPECIFIED CULEX,21,N CANNON DR,T054C,"2100 N CANNON DR, Chicago, IL",41.925652,-87.63359,8,2014,10,2,40,69.5,0.52,29.065,7.9,2014-10-02,54.5,0.0,29.055,7.9
116292,116293,2014-10-02,"2100 North Cannon Drive, Chicago, IL 60614, USA",CULEX ERRATICUS,21,N CANNON DR,T054C,"2100 N CANNON DR, Chicago, IL",41.925652,-87.63359,8,2014,10,2,40,69.5,0.52,29.065,7.9,2014-10-02,54.5,0.0,29.055,7.9


In [388]:
test_cols_drop = ['Address', 'Block', 'Street', 'AddressNumberAndStreet', 'AddressAccuracy']
test_df = test_combined.drop(columns= test_cols_drop)
test_df.head(2)

Unnamed: 0,Id,Date,Species,Trap,Latitude,Longitude,Year,Month,Day,Week,Tavg,PrecipTotal,StnPressure,AvgSpeed,Fort_Date,Fort_Tavg,Fort_PrecipTotal,Fort_StnPressure,Fort_AvgSpeed
0,1,2008-06-11,CULEX PIPIENS/RESTUANS,T002,41.95469,-87.800991,2008,6,11,24,75.0,0.0,29.31,10.2,2008-06-11,73.0,0.095,29.31,8.9
1,2,2008-06-11,CULEX RESTUANS,T002,41.95469,-87.800991,2008,6,11,24,75.0,0.0,29.31,10.2,2008-06-11,73.0,0.095,29.31,8.9


In [389]:
test_dummies = pd.get_dummies(test_df["Species"])
test_dummies.head()

Unnamed: 0,CULEX ERRATICUS,CULEX PIPIENS,CULEX PIPIENS/RESTUANS,CULEX RESTUANS,CULEX SALINARIUS,CULEX TARSALIS,CULEX TERRITANS,UNSPECIFIED CULEX
0,0,0,1,0,0,0,0,0
1,0,0,0,1,0,0,0,0
2,0,1,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,1,0


In [390]:
test_dummies.isnull().sum()

CULEX ERRATICUS           0
CULEX PIPIENS             0
CULEX PIPIENS/RESTUANS    0
CULEX RESTUANS            0
CULEX SALINARIUS          0
CULEX TARSALIS            0
CULEX TERRITANS           0
UNSPECIFIED CULEX         0
dtype: int64

In [391]:
test_dummies.drop(columns=["UNSPECIFIED CULEX"], inplace=True)
test_dummies.head(2)

Unnamed: 0,CULEX ERRATICUS,CULEX PIPIENS,CULEX PIPIENS/RESTUANS,CULEX RESTUANS,CULEX SALINARIUS,CULEX TARSALIS,CULEX TERRITANS
0,0,0,1,0,0,0,0
1,0,0,0,1,0,0,0


In [392]:
test_dummies.columns=species_names
test_dummies.head(2)

Unnamed: 0,erraticus,pipiens,pipiens_restuans,restuans,salinarius,tarsalis,territans
0,0,0,1,0,0,0,0
1,0,0,0,1,0,0,0


In [393]:
test_df = pd.concat([test_df,test_dummies],axis=1)
test_df.head(2)

Unnamed: 0,Id,Date,Species,Trap,Latitude,Longitude,Year,Month,Day,Week,Tavg,PrecipTotal,StnPressure,AvgSpeed,Fort_Date,Fort_Tavg,Fort_PrecipTotal,Fort_StnPressure,Fort_AvgSpeed,erraticus,pipiens,pipiens_restuans,restuans,salinarius,tarsalis,territans
0,1,2008-06-11,CULEX PIPIENS/RESTUANS,T002,41.95469,-87.800991,2008,6,11,24,75.0,0.0,29.31,10.2,2008-06-11,73.0,0.095,29.31,8.9,0,0,1,0,0,0,0
1,2,2008-06-11,CULEX RESTUANS,T002,41.95469,-87.800991,2008,6,11,24,75.0,0.0,29.31,10.2,2008-06-11,73.0,0.095,29.31,8.9,0,0,0,1,0,0,0


In [394]:
test_df.isnull().sum()

Id                  0
Date                0
Species             0
Trap                0
Latitude            0
Longitude           0
Year                0
Month               0
Day                 0
Week                0
Tavg                0
PrecipTotal         0
StnPressure         0
AvgSpeed            0
Fort_Date           0
Fort_Tavg           0
Fort_PrecipTotal    0
Fort_StnPressure    0
Fort_AvgSpeed       0
erraticus           0
pipiens             0
pipiens_restuans    0
restuans            0
salinarius          0
tarsalis            0
territans           0
dtype: int64

In [395]:
test_df.drop(columns=["Species", "Trap"], inplace=True)
test_df.head(2)

Unnamed: 0,Id,Date,Latitude,Longitude,Year,Month,Day,Week,Tavg,PrecipTotal,StnPressure,AvgSpeed,Fort_Date,Fort_Tavg,Fort_PrecipTotal,Fort_StnPressure,Fort_AvgSpeed,erraticus,pipiens,pipiens_restuans,restuans,salinarius,tarsalis,territans
0,1,2008-06-11,41.95469,-87.800991,2008,6,11,24,75.0,0.0,29.31,10.2,2008-06-11,73.0,0.095,29.31,8.9,0,0,1,0,0,0,0
1,2,2008-06-11,41.95469,-87.800991,2008,6,11,24,75.0,0.0,29.31,10.2,2008-06-11,73.0,0.095,29.31,8.9,0,0,0,1,0,0,0


In [396]:
test_df.tail(2)

Unnamed: 0,Id,Date,Latitude,Longitude,Year,Month,Day,Week,Tavg,PrecipTotal,StnPressure,AvgSpeed,Fort_Date,Fort_Tavg,Fort_PrecipTotal,Fort_StnPressure,Fort_AvgSpeed,erraticus,pipiens,pipiens_restuans,restuans,salinarius,tarsalis,territans
116291,116292,2014-10-02,41.925652,-87.63359,2014,10,2,40,69.5,0.52,29.065,7.9,2014-10-02,54.5,0.0,29.055,7.9,0,0,0,0,0,0,0
116292,116293,2014-10-02,41.925652,-87.63359,2014,10,2,40,69.5,0.52,29.065,7.9,2014-10-02,54.5,0.0,29.055,7.9,1,0,0,0,0,0,0


In [397]:
test_keepid = test_df.copy()

In [398]:
test_df.set_index("Id", inplace=True)
test_df.tail(2)

Unnamed: 0_level_0,Date,Latitude,Longitude,Year,Month,Day,Week,Tavg,PrecipTotal,StnPressure,AvgSpeed,Fort_Date,Fort_Tavg,Fort_PrecipTotal,Fort_StnPressure,Fort_AvgSpeed,erraticus,pipiens,pipiens_restuans,restuans,salinarius,tarsalis,territans
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
116292,2014-10-02,41.925652,-87.63359,2014,10,2,40,69.5,0.52,29.065,7.9,2014-10-02,54.5,0.0,29.055,7.9,0,0,0,0,0,0,0
116293,2014-10-02,41.925652,-87.63359,2014,10,2,40,69.5,0.52,29.065,7.9,2014-10-02,54.5,0.0,29.055,7.9,1,0,0,0,0,0,0


## Predicting for test

In [399]:
X_test = test_df.drop(columns=["Date", "Fort_Date"])
X_test.head(2)

Unnamed: 0_level_0,Latitude,Longitude,Year,Month,Day,Week,Tavg,PrecipTotal,StnPressure,AvgSpeed,Fort_Tavg,Fort_PrecipTotal,Fort_StnPressure,Fort_AvgSpeed,erraticus,pipiens,pipiens_restuans,restuans,salinarius,tarsalis,territans
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,41.95469,-87.800991,2008,6,11,24,75.0,0.0,29.31,10.2,73.0,0.095,29.31,8.9,0,0,1,0,0,0,0
2,41.95469,-87.800991,2008,6,11,24,75.0,0.0,29.31,10.2,73.0,0.095,29.31,8.9,0,0,0,1,0,0,0


In [400]:
X_test.shape

(116293, 21)

In [401]:
X.shape

(8610, 21)

In [402]:
ss = StandardScaler()

# fitting 
X_sc = ss.fit_transform(X)
X_test_sc = ss.transform(X_test)

In [403]:
b_et.fit(X_sc, y)



  """Entry point for launching an IPython kernel.


ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
                     max_depth=10, max_features='auto', max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=10, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=150,
                     n_jobs=None, oob_score=False, random_state=None, verbose=0,
                     warm_start=False)

In [404]:
pred_proba = b_et.predict_proba(X_test_sc)
pred_proba

array([[0.98481771, 0.01518229],
       [0.99715028, 0.00284972],
       [0.97095297, 0.02904703],
       ...,
       [0.94041931, 0.05958069],
       [0.94041931, 0.05958069],
       [0.94041931, 0.05958069]])

In [405]:
WnvPresent_proba = [prob[1] for prob in pred_proba]
WnvPresent_proba

[0.015182290986491234,
 0.0028497211387189685,
 0.02904702697258914,
 0.009056590487028802,
 0.006254965150088627,
 0.009679540875093552,
 0.009679540875093552,
 0.009679540875093552,
 0.010393780395282756,
 0.002530133156627293,
 0.023033723983720163,
 0.0026799325114008703,
 0.0018461682383067131,
 0.0033028828994656196,
 0.0033028828994656196,
 0.0033028828994656196,
 0.0168631486703757,
 0.0043602561583343915,
 0.029523217448779622,
 0.009180773493564751,
 0.007490259267735688,
 0.0098037238816295,
 0.0098037238816295,
 0.0098037238816295,
 0.007484131754090755,
 0.0031827086500013538,
 0.02578233633842739,
 0.0017052838778036476,
 0.0008967184512411636,
 0.0023282342658683974,
 0.0023282342658683974,
 0.0023282342658683974,
 0.00788337402821885,
 0.0030720585808450608,
 0.021620203487771435,
 0.0021196162774301643,
 0.000837194641717354,
 0.0027425666654949145,
 0.0027425666654949145,
 0.0027425666654949145,
 0.00788337402821885,
 0.0030720585808450608,
 0.021620203487771435,
 0.0

In [406]:
test_df["WnvPresent"] = WnvPresent_proba
test_df["WnvPresent"].head()

Id
1    0.015182
2    0.002850
3    0.029047
4    0.009057
5    0.006255
Name: WnvPresent, dtype: float64

In [334]:
submission = test_df[["WnvPresent"]]
submission.index.name = "Id"
submission.head()

Unnamed: 0_level_0,WnvPresent
Id,Unnamed: 1_level_1
1,0.016814
2,0.003286
3,0.029004
4,0.005925
5,0.004223


In [335]:
submission.to_csv("./assets/submission_b_et_proba.csv", index = True)

In [336]:
submission.shape

(116293, 1)

**Kaggle**

Private: 0.71901

Public: 0.72772

# Additional feature selection

## Dropping `Year`, `Month`, `Day` 

We're leaving only 'Week`

In [407]:
X.head()

Unnamed: 0,Latitude,Longitude,Year,Month,Day,Week,Tavg,PrecipTotal,StnPressure,AvgSpeed,Fort_Tavg,Fort_PrecipTotal,Fort_StnPressure,Fort_AvgSpeed,erraticus,pipiens,pipiens_restuans,restuans,salinarius,tarsalis,territans
0,41.867108,-87.654224,2007,5,29,22,75.5,0.0,29.415,6.95,73.5,0.0,29.465,9.4,0,0,1,0,0,0,0
1,41.867108,-87.654224,2007,5,29,22,75.5,0.0,29.415,6.95,73.5,0.0,29.465,9.4,0,0,0,1,0,0,0
2,41.862292,-87.64886,2007,5,29,22,75.5,0.0,29.415,6.95,73.5,0.0,29.465,9.4,0,0,0,1,0,0,0
3,41.896282,-87.655232,2007,5,29,22,75.5,0.0,29.415,6.95,73.5,0.0,29.465,9.4,0,0,0,1,0,0,0
4,41.907645,-87.760886,2007,5,29,22,75.5,0.0,29.415,6.95,73.5,0.0,29.465,9.4,0,0,0,1,0,0,0


In [408]:
X_no_ym = X.drop(columns=["Year", "Month", "Day"])
X_no_ym

Unnamed: 0,Latitude,Longitude,Week,Tavg,PrecipTotal,StnPressure,AvgSpeed,Fort_Tavg,Fort_PrecipTotal,Fort_StnPressure,Fort_AvgSpeed,erraticus,pipiens,pipiens_restuans,restuans,salinarius,tarsalis,territans
0,41.867108,-87.654224,22,75.5,0.0,29.415,6.95,73.5,0.0,29.465,9.40,0,0,1,0,0,0,0
1,41.867108,-87.654224,22,75.5,0.0,29.415,6.95,73.5,0.0,29.465,9.40,0,0,0,1,0,0,0
2,41.862292,-87.648860,22,75.5,0.0,29.415,6.95,73.5,0.0,29.465,9.40,0,0,0,1,0,0,0
3,41.896282,-87.655232,22,75.5,0.0,29.415,6.95,73.5,0.0,29.465,9.40,0,0,0,1,0,0,0
4,41.907645,-87.760886,22,75.5,0.0,29.415,6.95,73.5,0.0,29.465,9.40,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8605,41.726465,-87.585413,39,64.0,0.0,29.370,4.40,61.5,0.0,29.405,3.85,0,1,0,0,0,0,0
8606,41.726465,-87.585413,39,64.0,0.0,29.370,4.40,61.5,0.0,29.405,3.85,0,0,1,0,0,0,0
8607,41.723195,-87.649970,39,64.0,0.0,29.370,4.40,61.5,0.0,29.405,3.85,0,0,1,0,0,0,0
8608,41.868077,-87.666901,39,64.0,0.0,29.370,4.40,61.5,0.0,29.405,3.85,0,0,1,0,0,0,0


In [409]:
X_no_ym.head()

Unnamed: 0,Latitude,Longitude,Week,Tavg,PrecipTotal,StnPressure,AvgSpeed,Fort_Tavg,Fort_PrecipTotal,Fort_StnPressure,Fort_AvgSpeed,erraticus,pipiens,pipiens_restuans,restuans,salinarius,tarsalis,territans
0,41.867108,-87.654224,22,75.5,0.0,29.415,6.95,73.5,0.0,29.465,9.4,0,0,1,0,0,0,0
1,41.867108,-87.654224,22,75.5,0.0,29.415,6.95,73.5,0.0,29.465,9.4,0,0,0,1,0,0,0
2,41.862292,-87.64886,22,75.5,0.0,29.415,6.95,73.5,0.0,29.465,9.4,0,0,0,1,0,0,0
3,41.896282,-87.655232,22,75.5,0.0,29.415,6.95,73.5,0.0,29.465,9.4,0,0,0,1,0,0,0
4,41.907645,-87.760886,22,75.5,0.0,29.415,6.95,73.5,0.0,29.465,9.4,0,0,0,1,0,0,0


In [410]:
# doing train, test, split
X_train, X_val, y_train, y_val = train_test_split(X_no_ym, y, stratify=y, random_state = 142)

ss = StandardScaler()

X_train_sc = ss.fit_transform(X_train)
X_val_sc = ss.transform(X_val)

### Predicting using Voting Classifier

In [411]:
vote = VotingClassifier(estimators = [
                                ('lr', b_lr),
                                ('knn', b_knn),
                                ('dt', b_dt),
                                ('rf', b_rf),
                                ('et', b_et),
                                     ],
                                    voting = "soft")


vote.fit(X_train_sc, y_train)
print(f'Accuracy score on training set: {vote.score(X_train_sc, y_train)}')
print(f'Accuracy score on evaluation set: {vote.score(X_val_sc, y_val)}')

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=0.49770235643321137,
                                                 class_weight=None, dual=False,
                                                 fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='liblinear', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('knn',
                              KNeighborsClassifier(algorithm='auto',
                                                   leaf_size=30,...
                                               

Accuracy score on training set: 0.9468793557379588
Accuracy score on evaluation set: 0.9470506270320483


In [412]:
X_test.head()

Unnamed: 0_level_0,Latitude,Longitude,Year,Month,Day,Week,Tavg,PrecipTotal,StnPressure,AvgSpeed,Fort_Tavg,Fort_PrecipTotal,Fort_StnPressure,Fort_AvgSpeed,erraticus,pipiens,pipiens_restuans,restuans,salinarius,tarsalis,territans
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,41.95469,-87.800991,2008,6,11,24,75.0,0.0,29.31,10.2,73.0,0.095,29.31,8.9,0,0,1,0,0,0,0
2,41.95469,-87.800991,2008,6,11,24,75.0,0.0,29.31,10.2,73.0,0.095,29.31,8.9,0,0,0,1,0,0,0
3,41.95469,-87.800991,2008,6,11,24,75.0,0.0,29.31,10.2,73.0,0.095,29.31,8.9,0,1,0,0,0,0,0
4,41.95469,-87.800991,2008,6,11,24,75.0,0.0,29.31,10.2,73.0,0.095,29.31,8.9,0,0,0,0,1,0,0
5,41.95469,-87.800991,2008,6,11,24,75.0,0.0,29.31,10.2,73.0,0.095,29.31,8.9,0,0,0,0,0,0,1


In [413]:
X_test_no_ym = X_test.drop(columns=["Year", "Month", "Day"])
X_test_no_ym.head(2)

Unnamed: 0_level_0,Latitude,Longitude,Week,Tavg,PrecipTotal,StnPressure,AvgSpeed,Fort_Tavg,Fort_PrecipTotal,Fort_StnPressure,Fort_AvgSpeed,erraticus,pipiens,pipiens_restuans,restuans,salinarius,tarsalis,territans
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,41.95469,-87.800991,24,75.0,0.0,29.31,10.2,73.0,0.095,29.31,8.9,0,0,1,0,0,0,0
2,41.95469,-87.800991,24,75.0,0.0,29.31,10.2,73.0,0.095,29.31,8.9,0,0,0,1,0,0,0


In [414]:
ss = StandardScaler()

X_no_ym_sc = ss.fit_transform(X_no_ym)
X_test_no_ym_sc = ss.transform(X_test_no_ym)

In [415]:
vote.fit(X_no_ym_sc, y)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=0.49770235643321137,
                                                 class_weight=None, dual=False,
                                                 fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='liblinear', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('knn',
                              KNeighborsClassifier(algorithm='auto',
                                                   leaf_size=30,...
                                               

In [416]:
pred_proba = vote.predict_proba(X_test_no_ym_sc)
pred_proba

array([[0.98108911, 0.01891089],
       [0.99366916, 0.00633084],
       [0.9460988 , 0.0539012 ],
       ...,
       [0.97847558, 0.02152442],
       [0.97609873, 0.02390127],
       [0.97859326, 0.02140674]])

In [417]:
WnvPresent_proba = [prob[1] for prob in pred_proba]
WnvPresent_proba

[0.018910887428591972,
 0.006330836823385126,
 0.053901197657079424,
 0.007105051276047786,
 0.006690660830768557,
 0.007794771926495331,
 0.011085178882094707,
 0.007631061864846489,
 0.01343473227724834,
 0.005404426106347791,
 0.07604327301505096,
 0.005028734817383675,
 0.0044916713894630825,
 0.005709593785779943,
 0.008388452995960796,
 0.005576745007997421,
 0.022817339063010377,
 0.00976875184992893,
 0.08581817446742243,
 0.010232523218165621,
 0.009780122140207653,
 0.010933634614132682,
 0.014574235872555486,
 0.010752161080378572,
 0.008272394004459546,
 0.004367563808558229,
 0.015466145544859167,
 0.0028091762725097995,
 0.002378196983636264,
 0.0033387030072761755,
 0.005045135444832654,
 0.0032545145314037153,
 0.008121960765925482,
 0.004374620098894107,
 0.04086014249491449,
 0.0024096774991767224,
 0.0018755601319240874,
 0.0029510957555999575,
 0.004608725202691462,
 0.0028693361621455,
 0.008189493219990162,
 0.004319213691732299,
 0.04070816623891874,
 0.002429204

In [418]:
test_df["WnvPresent"] = WnvPresent_proba
test_df["WnvPresent"].head()


Id
1    0.018911
2    0.006331
3    0.053901
4    0.007105
5    0.006691
Name: WnvPresent, dtype: float64

In [183]:

submission = test_df[["WnvPresent"]]
submission.index.name = "Id"
submission.head()

submission.to_csv("./assets/submission_no_ym_vote_proba_redo.csv", index = True)

Unnamed: 0_level_0,WnvPresent
Id,Unnamed: 1_level_1
1,0.019205
2,0.007199
3,0.052898
4,0.009066
5,0.008393


**Kaggle**

Private: 0.68836

Public: 0.69945


### Predicting using Extra Trees Classifier

In [419]:
b_et.fit(X_no_ym_sc, y)

  """Entry point for launching an IPython kernel.


ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
                     max_depth=10, max_features='auto', max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=10, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=150,
                     n_jobs=None, oob_score=False, random_state=None, verbose=0,
                     warm_start=False)

In [420]:
pred_proba = b_et.predict_proba(X_test_no_ym_sc)
pred_proba

array([[0.96259965, 0.03740035],
       [0.98979   , 0.01021   ],
       [0.94553612, 0.05446388],
       ...,
       [0.97470513, 0.02529487],
       [0.97470513, 0.02529487],
       [0.97470513, 0.02529487]])

In [421]:
WnvPresent_proba = [prob[1] for prob in pred_proba]
WnvPresent_proba

test_df["WnvPresent"] = WnvPresent_proba
test_df["WnvPresent"].head()


[0.03740035034104584,
 0.010210003084355738,
 0.054463879350433946,
 0.013529988223310757,
 0.009278308169087703,
 0.018293527246047936,
 0.018293527246047936,
 0.018293527246047936,
 0.03556207946463543,
 0.009889330585162986,
 0.0572935406393214,
 0.011715392790883676,
 0.006554418528453368,
 0.016138183672992867,
 0.016138183672992867,
 0.016138183672992867,
 0.04123123399287671,
 0.012658388364338639,
 0.059690261462915506,
 0.016708735891236733,
 0.012493485709509124,
 0.02134271072080548,
 0.02134271072080548,
 0.02134271072080548,
 0.02599964829305693,
 0.008690988012294886,
 0.036218751107161655,
 0.0073806546344594385,
 0.006056374371979095,
 0.013667982384064054,
 0.013667982384064054,
 0.013667982384064054,
 0.0242318610173906,
 0.008356576744682029,
 0.034387008115800005,
 0.006775865098143597,
 0.005451584835663252,
 0.012780361663789718,
 0.012780361663789718,
 0.012780361663789718,
 0.02489246800265057,
 0.00853771553429736,
 0.03520026484805531,
 0.007886435871347719,
 

Id
1    0.037400
2    0.010210
3    0.054464
4    0.013530
5    0.009278
Name: WnvPresent, dtype: float64

In [187]:

submission = test_df[["WnvPresent"]]
submission.index.name = "Id"
submission.head()

submission.to_csv("./assets/submission_no_ym_et_proba.csv", index = True)

Unnamed: 0_level_0,WnvPresent
Id,Unnamed: 1_level_1
1,0.033654
2,0.009243
3,0.055521
4,0.015471
5,0.007852


**Kaggle**

Private: 0.70180

Public: 0.71639

So far, it seems like Extra Trees Classifier is doing better than Voting Classifier, but the model with years, month and days seem to be doing better.


## Exploring other features from weather

In [422]:
weather_comb = pd.read_csv('./assets/weather_combined_processed.csv', index_col=0)
weather_comb.shape
weather_comb.head()

(1471, 5)

Unnamed: 0,Date,Tavg,PrecipTotal,StnPressure,AvgSpeed
0,2007-05-01,67.5,0.0,29.14,9.4
1,2007-05-02,51.5,0.0,29.41,13.4
2,2007-05-03,57.0,0.0,29.425,12.55
3,2007-05-04,58.0,0.0,29.31,10.8
4,2007-05-05,60.0,0.0,29.43,11.75


In [423]:
weather_comb["Date"] = pd.to_datetime(weather_comb["Date"])
# weather_comb.set_index("Date", inplace=True)
# weather_comb.sort_index(inplace=True)

In [424]:
weather_comb.head(3)

Unnamed: 0,Date,Tavg,PrecipTotal,StnPressure,AvgSpeed
0,2007-05-01,67.5,0.0,29.14,9.4
1,2007-05-02,51.5,0.0,29.41,13.4
2,2007-05-03,57.0,0.0,29.425,12.55


In [425]:
weather_comb.tail(3)

Unnamed: 0,Date,Tavg,PrecipTotal,StnPressure,AvgSpeed
1468,2014-10-29,44.0,0.0,29.39,9.45
1469,2014-10-30,43.5,0.0,29.375,6.0
1470,2014-10-31,41.0,0.035,29.515,22.75


### Adding weather records from 10 days ago for predicting

In [426]:
# creating a new df to denote it as having weather from a fortnight ago
weather_n14_n10 = weather_comb.copy()

# `after_fort` to store the expected date that's 14 days later - this will be used to merge with combined ddf
weather_n14_n10["Date"] = weather_n14_n10["Date"] - dt.timedelta(days=10)

weather_n14_n10.head()

Unnamed: 0,Date,Tavg,PrecipTotal,StnPressure,AvgSpeed
0,2007-04-21,67.5,0.0,29.14,9.4
1,2007-04-22,51.5,0.0,29.41,13.4
2,2007-04-23,57.0,0.0,29.425,12.55
3,2007-04-24,58.0,0.0,29.31,10.8
4,2007-04-25,60.0,0.0,29.43,11.75


In [427]:
# adding a prefix to make these columns distinct post-merger
weather_n14_n10 = weather_n14_n10.add_prefix("n10_")
weather_n14_n10.shape
weather_n14_n10.head()

(1471, 5)

Unnamed: 0,n10_Date,n10_Tavg,n10_PrecipTotal,n10_StnPressure,n10_AvgSpeed
0,2007-04-21,67.5,0.0,29.14,9.4
1,2007-04-22,51.5,0.0,29.41,13.4
2,2007-04-23,57.0,0.0,29.425,12.55
3,2007-04-24,58.0,0.0,29.31,10.8
4,2007-04-25,60.0,0.0,29.43,11.75


In [428]:
df.head()

Unnamed: 0,Date,Latitude,Longitude,WnvPresent,NumMosquitos,YearMonth,Year,Month,Day,Week,Tavg,PrecipTotal,StnPressure,AvgSpeed,Fort_Date,Fort_Tavg,Fort_PrecipTotal,Fort_StnPressure,Fort_AvgSpeed,erraticus,pipiens,pipiens_restuans,restuans,salinarius,tarsalis,territans
0,2007-05-29,41.867108,-87.654224,0,1,2007-05,2007,5,29,22,75.5,0.0,29.415,6.95,2007-05-29,73.5,0.0,29.465,9.4,0,0,1,0,0,0,0
1,2007-05-29,41.867108,-87.654224,0,2,2007-05,2007,5,29,22,75.5,0.0,29.415,6.95,2007-05-29,73.5,0.0,29.465,9.4,0,0,0,1,0,0,0
2,2007-05-29,41.862292,-87.64886,0,1,2007-05,2007,5,29,22,75.5,0.0,29.415,6.95,2007-05-29,73.5,0.0,29.465,9.4,0,0,0,1,0,0,0
3,2007-05-29,41.896282,-87.655232,0,1,2007-05,2007,5,29,22,75.5,0.0,29.415,6.95,2007-05-29,73.5,0.0,29.465,9.4,0,0,0,1,0,0,0
4,2007-05-29,41.907645,-87.760886,0,1,2007-05,2007,5,29,22,75.5,0.0,29.415,6.95,2007-05-29,73.5,0.0,29.465,9.4,0,0,0,1,0,0,0


In [429]:
weather_n14_n10["n10_Date"] = weather_n14_n10["n10_Date"].astype(str)

In [430]:
df_n14_n10 = pd.merge(df, weather_n14_n10, how = 'left', left_on = 'Date', right_on = 'n10_Date')
df_n14_n10.head()

Unnamed: 0,Date,Latitude,Longitude,WnvPresent,NumMosquitos,YearMonth,Year,Month,Day,Week,Tavg,PrecipTotal,StnPressure,AvgSpeed,Fort_Date,Fort_Tavg,Fort_PrecipTotal,Fort_StnPressure,Fort_AvgSpeed,erraticus,pipiens,pipiens_restuans,restuans,salinarius,tarsalis,territans,n10_Date,n10_Tavg,n10_PrecipTotal,n10_StnPressure,n10_AvgSpeed
0,2007-05-29,41.867108,-87.654224,0,1,2007-05,2007,5,29,22,75.5,0.0,29.415,6.95,2007-05-29,73.5,0.0,29.465,9.4,0,0,1,0,0,0,0,2007-05-29,70.5,0.0,29.21,13.1
1,2007-05-29,41.867108,-87.654224,0,2,2007-05,2007,5,29,22,75.5,0.0,29.415,6.95,2007-05-29,73.5,0.0,29.465,9.4,0,0,0,1,0,0,0,2007-05-29,70.5,0.0,29.21,13.1
2,2007-05-29,41.862292,-87.64886,0,1,2007-05,2007,5,29,22,75.5,0.0,29.415,6.95,2007-05-29,73.5,0.0,29.465,9.4,0,0,0,1,0,0,0,2007-05-29,70.5,0.0,29.21,13.1
3,2007-05-29,41.896282,-87.655232,0,1,2007-05,2007,5,29,22,75.5,0.0,29.415,6.95,2007-05-29,73.5,0.0,29.465,9.4,0,0,0,1,0,0,0,2007-05-29,70.5,0.0,29.21,13.1
4,2007-05-29,41.907645,-87.760886,0,1,2007-05,2007,5,29,22,75.5,0.0,29.415,6.95,2007-05-29,73.5,0.0,29.465,9.4,0,0,0,1,0,0,0,2007-05-29,70.5,0.0,29.21,13.1


In [431]:
# dropping all datetime columns except week
# NumMosquitos need to be dropped since it's not in test

Xtr_n14_n10 = df_n14_n10.drop(columns=["WnvPresent", "Date", "NumMosquitos", "YearMonth", "Year", "Month", "Day", "Fort_Date", "n10_Date"])
Xtr_n14_n10.head(2)

y = df[["WnvPresent"]]
y.head(2)

Unnamed: 0,Latitude,Longitude,Week,Tavg,PrecipTotal,StnPressure,AvgSpeed,Fort_Tavg,Fort_PrecipTotal,Fort_StnPressure,Fort_AvgSpeed,erraticus,pipiens,pipiens_restuans,restuans,salinarius,tarsalis,territans,n10_Tavg,n10_PrecipTotal,n10_StnPressure,n10_AvgSpeed
0,41.867108,-87.654224,22,75.5,0.0,29.415,6.95,73.5,0.0,29.465,9.4,0,0,1,0,0,0,0,70.5,0.0,29.21,13.1
1,41.867108,-87.654224,22,75.5,0.0,29.415,6.95,73.5,0.0,29.465,9.4,0,0,0,1,0,0,0,70.5,0.0,29.21,13.1


Unnamed: 0,WnvPresent
0,0
1,0


In [432]:
# doing train, test, split
X_train, X_val, y_train, y_val = train_test_split(Xtr_n14_n10, y, stratify=y, random_state = 142)

ss = StandardScaler()

X_train_sc = ss.fit_transform(X_train)
X_val_sc = ss.transform(X_val)

In [433]:
test_df.head()

Unnamed: 0_level_0,Date,Latitude,Longitude,Year,Month,Day,Week,Tavg,PrecipTotal,StnPressure,AvgSpeed,Fort_Date,Fort_Tavg,Fort_PrecipTotal,Fort_StnPressure,Fort_AvgSpeed,erraticus,pipiens,pipiens_restuans,restuans,salinarius,tarsalis,territans,WnvPresent
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
1,2008-06-11,41.95469,-87.800991,2008,6,11,24,75.0,0.0,29.31,10.2,2008-06-11,73.0,0.095,29.31,8.9,0,0,1,0,0,0,0,0.0374
2,2008-06-11,41.95469,-87.800991,2008,6,11,24,75.0,0.0,29.31,10.2,2008-06-11,73.0,0.095,29.31,8.9,0,0,0,1,0,0,0,0.01021
3,2008-06-11,41.95469,-87.800991,2008,6,11,24,75.0,0.0,29.31,10.2,2008-06-11,73.0,0.095,29.31,8.9,0,1,0,0,0,0,0,0.054464
4,2008-06-11,41.95469,-87.800991,2008,6,11,24,75.0,0.0,29.31,10.2,2008-06-11,73.0,0.095,29.31,8.9,0,0,0,0,1,0,0,0.01353
5,2008-06-11,41.95469,-87.800991,2008,6,11,24,75.0,0.0,29.31,10.2,2008-06-11,73.0,0.095,29.31,8.9,0,0,0,0,0,0,1,0.009278


In [436]:
test_keepid.tail()

Unnamed: 0,Id,Date,Latitude,Longitude,Year,Month,Day,Week,Tavg,PrecipTotal,StnPressure,AvgSpeed,Fort_Date,Fort_Tavg,Fort_PrecipTotal,Fort_StnPressure,Fort_AvgSpeed,erraticus,pipiens,pipiens_restuans,restuans,salinarius,tarsalis,territans
116288,116289,2014-10-02,41.925652,-87.63359,2014,10,2,40,69.5,0.52,29.065,7.9,2014-10-02,54.5,0.0,29.055,7.9,0,0,0,0,1,0,0
116289,116290,2014-10-02,41.925652,-87.63359,2014,10,2,40,69.5,0.52,29.065,7.9,2014-10-02,54.5,0.0,29.055,7.9,0,0,0,0,0,0,1
116290,116291,2014-10-02,41.925652,-87.63359,2014,10,2,40,69.5,0.52,29.065,7.9,2014-10-02,54.5,0.0,29.055,7.9,0,0,0,0,0,1,0
116291,116292,2014-10-02,41.925652,-87.63359,2014,10,2,40,69.5,0.52,29.065,7.9,2014-10-02,54.5,0.0,29.055,7.9,0,0,0,0,0,0,0
116292,116293,2014-10-02,41.925652,-87.63359,2014,10,2,40,69.5,0.52,29.065,7.9,2014-10-02,54.5,0.0,29.055,7.9,1,0,0,0,0,0,0


In [437]:
test_df_n14_n10  = pd.merge(test_keepid, weather_n14_n10, how = 'left', left_on = 'Date', right_on = 'n10_Date')
test_df_n14_n10.tail()

Unnamed: 0,Id,Date,Latitude,Longitude,Year,Month,Day,Week,Tavg,PrecipTotal,StnPressure,AvgSpeed,Fort_Date,Fort_Tavg,Fort_PrecipTotal,Fort_StnPressure,Fort_AvgSpeed,erraticus,pipiens,pipiens_restuans,restuans,salinarius,tarsalis,territans,n10_Date,n10_Tavg,n10_PrecipTotal,n10_StnPressure,n10_AvgSpeed
116288,116289,2014-10-02,41.925652,-87.63359,2014,10,2,40,69.5,0.52,29.065,7.9,2014-10-02,54.5,0.0,29.055,7.9,0,0,0,0,1,0,0,2014-10-02,53.0,0.01,29.275,9.5
116289,116290,2014-10-02,41.925652,-87.63359,2014,10,2,40,69.5,0.52,29.065,7.9,2014-10-02,54.5,0.0,29.055,7.9,0,0,0,0,0,0,1,2014-10-02,53.0,0.01,29.275,9.5
116290,116291,2014-10-02,41.925652,-87.63359,2014,10,2,40,69.5,0.52,29.065,7.9,2014-10-02,54.5,0.0,29.055,7.9,0,0,0,0,0,1,0,2014-10-02,53.0,0.01,29.275,9.5
116291,116292,2014-10-02,41.925652,-87.63359,2014,10,2,40,69.5,0.52,29.065,7.9,2014-10-02,54.5,0.0,29.055,7.9,0,0,0,0,0,0,0,2014-10-02,53.0,0.01,29.275,9.5
116292,116293,2014-10-02,41.925652,-87.63359,2014,10,2,40,69.5,0.52,29.065,7.9,2014-10-02,54.5,0.0,29.055,7.9,1,0,0,0,0,0,0,2014-10-02,53.0,0.01,29.275,9.5


In [439]:
# dropping all datetime columns except week
# NumMosquitos need to be dropped since it's not in test

test_df_n14_n10 = test_df_n14_n10.drop(columns=["Date", "Year", "Month", "Day", "Fort_Date", "n10_Date"])
test_df_n14_n10.head(2)


Unnamed: 0,Id,Latitude,Longitude,Week,Tavg,PrecipTotal,StnPressure,AvgSpeed,Fort_Tavg,Fort_PrecipTotal,Fort_StnPressure,Fort_AvgSpeed,erraticus,pipiens,pipiens_restuans,restuans,salinarius,tarsalis,territans,n10_Tavg,n10_PrecipTotal,n10_StnPressure,n10_AvgSpeed
0,1,41.95469,-87.800991,24,75.0,0.0,29.31,10.2,73.0,0.095,29.31,8.9,0,0,1,0,0,0,0,72.5,0.01,29.265,7.35
1,2,41.95469,-87.800991,24,75.0,0.0,29.31,10.2,73.0,0.095,29.31,8.9,0,0,0,1,0,0,0,72.5,0.01,29.265,7.35


In [442]:
test_df_n14_n10.set_index("Id", inplace=True)
test_df_n14_n10.tail(2)

Unnamed: 0_level_0,Latitude,Longitude,Week,Tavg,PrecipTotal,StnPressure,AvgSpeed,Fort_Tavg,Fort_PrecipTotal,Fort_StnPressure,Fort_AvgSpeed,erraticus,pipiens,pipiens_restuans,restuans,salinarius,tarsalis,territans,n10_Tavg,n10_PrecipTotal,n10_StnPressure,n10_AvgSpeed
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
116292,41.925652,-87.63359,40,69.5,0.52,29.065,7.9,54.5,0.0,29.055,7.9,0,0,0,0,0,0,0,53.0,0.01,29.275,9.5
116293,41.925652,-87.63359,40,69.5,0.52,29.065,7.9,54.5,0.0,29.055,7.9,1,0,0,0,0,0,0,53.0,0.01,29.275,9.5


In [443]:
Xtr_n14_n10

Unnamed: 0,Latitude,Longitude,Week,Tavg,PrecipTotal,StnPressure,AvgSpeed,Fort_Tavg,Fort_PrecipTotal,Fort_StnPressure,Fort_AvgSpeed,erraticus,pipiens,pipiens_restuans,restuans,salinarius,tarsalis,territans,n10_Tavg,n10_PrecipTotal,n10_StnPressure,n10_AvgSpeed
0,41.867108,-87.654224,22,75.5,0.0,29.415,6.95,73.5,0.0,29.465,9.40,0,0,1,0,0,0,0,70.5,0.00,29.21,13.1
1,41.867108,-87.654224,22,75.5,0.0,29.415,6.95,73.5,0.0,29.465,9.40,0,0,0,1,0,0,0,70.5,0.00,29.21,13.1
2,41.862292,-87.648860,22,75.5,0.0,29.415,6.95,73.5,0.0,29.465,9.40,0,0,0,1,0,0,0,70.5,0.00,29.21,13.1
3,41.896282,-87.655232,22,75.5,0.0,29.415,6.95,73.5,0.0,29.465,9.40,0,0,0,1,0,0,0,70.5,0.00,29.21,13.1
4,41.907645,-87.760886,22,75.5,0.0,29.415,6.95,73.5,0.0,29.465,9.40,0,0,0,1,0,0,0,70.5,0.00,29.21,13.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8605,41.726465,-87.585413,39,64.0,0.0,29.370,4.40,61.5,0.0,29.405,3.85,0,1,0,0,0,0,0,55.0,0.09,29.02,9.5
8606,41.726465,-87.585413,39,64.0,0.0,29.370,4.40,61.5,0.0,29.405,3.85,0,0,1,0,0,0,0,55.0,0.09,29.02,9.5
8607,41.723195,-87.649970,39,64.0,0.0,29.370,4.40,61.5,0.0,29.405,3.85,0,0,1,0,0,0,0,55.0,0.09,29.02,9.5
8608,41.868077,-87.666901,39,64.0,0.0,29.370,4.40,61.5,0.0,29.405,3.85,0,0,1,0,0,0,0,55.0,0.09,29.02,9.5


In [444]:
ss = StandardScaler()

X_tr_sc = ss.fit_transform(Xtr_n14_n10)

X_test_sc = ss.transform(test_df_n14_n10)

In [445]:
b_et.fit(Xtr_n14_n10, y)

Wnv_proba = [i[1] for i in b_et.predict_proba(X_test_sc)]


  """Entry point for launching an IPython kernel.


ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
                     max_depth=10, max_features='auto', max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=10, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=150,
                     n_jobs=None, oob_score=False, random_state=None, verbose=0,
                     warm_start=False)

In [446]:
test_df_n14_n10["WnvPresent"] = Wnv_proba
test_df_n14_n10["WnvPresent"].head()


Id
1    0.012929
2    0.007285
3    0.014406
4    0.006937
5    0.005882
Name: WnvPresent, dtype: float64

In [225]:
submission = test_df[["WnvPresent"]]
submission.index.name = "Id"
submission.head()

submission.to_csv("./assets/submission_et_n14_n10.csv", index = True)

Unnamed: 0_level_0,WnvPresent
Id,Unnamed: 1_level_1
1,0.033654
2,0.009243
3,0.055521
4,0.015471
5,0.007852


### Trying this again with imblearn for predicting

In [227]:
!pip install imblearn

Collecting imblearn
  Downloading https://files.pythonhosted.org/packages/81/a7/4179e6ebfd654bd0eac0b9c06125b8b4c96a9d0a8ff9e9507eb2a26d2d7e/imblearn-0.0-py2.py3-none-any.whl
Collecting imbalanced-learn
  Downloading https://files.pythonhosted.org/packages/e6/62/08c14224a7e242df2cef7b312d2ef821c3931ec9b015ff93bb52ec8a10a3/imbalanced_learn-0.5.0-py3-none-any.whl (173kB)
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.5.0 imblearn-0.0


In [228]:
# using SMOTE to oversample WNV
from imblearn.over_sampling import SMOTE



In [447]:
# fit smote instance to train data
smote = SMOTE(random_state = 42)
xtrain_balanced, ytrain_balanced = smote.fit_resample(Xtr_n14_n10, df[["WnvPresent"]]) 

  y = column_or_1d(y, warn=True)


In [448]:

# put train data into df
xtrain_balanced = pd.DataFrame(xtrain_balanced, columns = Xtr_n14_n10.columns)
ytrain_balanced = pd.DataFrame(ytrain_balanced, columns = df[["WnvPresent"]].columns)


In [449]:
# check balance of dataset
# ytrain_balanced.WnvPresent.value_counts()

ytrain_balanced["WnvPresent"].value_counts()

1    8153
0    8153
Name: WnvPresent, dtype: int64

In [450]:
xtrain_balanced

Unnamed: 0,Latitude,Longitude,Week,Tavg,PrecipTotal,StnPressure,AvgSpeed,Fort_Tavg,Fort_PrecipTotal,Fort_StnPressure,Fort_AvgSpeed,erraticus,pipiens,pipiens_restuans,restuans,salinarius,tarsalis,territans,n10_Tavg,n10_PrecipTotal,n10_StnPressure,n10_AvgSpeed
0,41.867108,-87.654224,22.000000,75.500000,0.000000,29.415000,6.950000,73.500000,0.000000,29.465000,9.400000,0.0,0.000000,1.000000,0.0,0.0,0.0,0.0,70.500000,0.000000,29.210000,13.100000
1,41.867108,-87.654224,22.000000,75.500000,0.000000,29.415000,6.950000,73.500000,0.000000,29.465000,9.400000,0.0,0.000000,0.000000,1.0,0.0,0.0,0.0,70.500000,0.000000,29.210000,13.100000
2,41.862292,-87.648860,22.000000,75.500000,0.000000,29.415000,6.950000,73.500000,0.000000,29.465000,9.400000,0.0,0.000000,0.000000,1.0,0.0,0.0,0.0,70.500000,0.000000,29.210000,13.100000
3,41.896282,-87.655232,22.000000,75.500000,0.000000,29.415000,6.950000,73.500000,0.000000,29.465000,9.400000,0.0,0.000000,0.000000,1.0,0.0,0.0,0.0,70.500000,0.000000,29.210000,13.100000
4,41.907645,-87.760886,22.000000,75.500000,0.000000,29.415000,6.950000,73.500000,0.000000,29.465000,9.400000,0.0,0.000000,0.000000,1.0,0.0,0.0,0.0,70.500000,0.000000,29.210000,13.100000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16301,41.735593,-87.652243,33.989207,79.962223,0.618254,29.259406,8.912763,80.059364,0.000000,29.274298,6.604317,0.0,1.000000,0.000000,0.0,0.0,0.0,0.0,69.500000,0.000000,29.513057,4.690826
16302,41.770788,-87.668193,33.000000,76.500000,0.530000,29.200000,6.450000,77.500000,0.000000,29.365000,8.500000,0.0,0.000000,0.000000,1.0,0.0,0.0,0.0,71.500000,0.295000,29.260000,6.000000
16303,41.704173,-87.702173,30.981835,74.522706,0.035990,29.169773,9.854541,78.518165,0.000000,29.275363,8.012308,0.0,0.000000,1.000000,0.0,0.0,0.0,0.0,71.522706,0.000000,29.413547,5.495413
16304,41.955783,-87.915678,37.000000,53.545742,0.000000,29.590374,3.858630,53.481299,0.127661,29.344220,14.293442,0.0,0.844074,0.155926,0.0,0.0,0.0,0.0,58.513521,0.890498,29.027963,12.863090


In [451]:
X_test = test_df_n14_n10.drop(columns=["WnvPresent"])
X_test.head()

Unnamed: 0_level_0,Latitude,Longitude,Week,Tavg,PrecipTotal,StnPressure,AvgSpeed,Fort_Tavg,Fort_PrecipTotal,Fort_StnPressure,Fort_AvgSpeed,erraticus,pipiens,pipiens_restuans,restuans,salinarius,tarsalis,territans,n10_Tavg,n10_PrecipTotal,n10_StnPressure,n10_AvgSpeed
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,41.95469,-87.800991,24,75.0,0.0,29.31,10.2,73.0,0.095,29.31,8.9,0,0,1,0,0,0,0,72.5,0.01,29.265,7.35
2,41.95469,-87.800991,24,75.0,0.0,29.31,10.2,73.0,0.095,29.31,8.9,0,0,0,1,0,0,0,72.5,0.01,29.265,7.35
3,41.95469,-87.800991,24,75.0,0.0,29.31,10.2,73.0,0.095,29.31,8.9,0,1,0,0,0,0,0,72.5,0.01,29.265,7.35
4,41.95469,-87.800991,24,75.0,0.0,29.31,10.2,73.0,0.095,29.31,8.9,0,0,0,0,1,0,0,72.5,0.01,29.265,7.35
5,41.95469,-87.800991,24,75.0,0.0,29.31,10.2,73.0,0.095,29.31,8.9,0,0,0,0,0,0,1,72.5,0.01,29.265,7.35


In [452]:

X_tr_sc = ss.fit_transform(xtrain_balanced)
X_test_sc = ss.transform(X_test)

In [453]:
b_et.fit(xtrain_balanced, ytrain_balanced)


Wnv_proba = [i[1] for i in b_et.predict_proba(X_test_sc)]


  """Entry point for launching an IPython kernel.


ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
                     max_depth=10, max_features='auto', max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=10, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=150,
                     n_jobs=None, oob_score=False, random_state=None, verbose=0,
                     warm_start=False)

In [454]:
test_df_n14_n10["WnvPresent"] = Wnv_proba
test_df_n14_n10["WnvPresent"].head()


Id
1    0.088250
2    0.038073
3    0.083778
4    0.076383
5    0.037579
Name: WnvPresent, dtype: float64

In [455]:
test_df_n14_n10[["WnvPresent"]].tail()

Unnamed: 0_level_0,WnvPresent
Id,Unnamed: 1_level_1
116289,0.072344
116290,0.030244
116291,0.081438
116292,0.081438
116293,0.081438


In [456]:
submission = test_df_n14_n10[["WnvPresent"]]
submission.index.name = "Id"
submission.head()

submission.to_csv("./assets/submission_n14_n10_balanced.csv", index = True)

Unnamed: 0_level_0,WnvPresent
Id,Unnamed: 1_level_1
1,0.08825
2,0.038073
3,0.083778
4,0.076383
5,0.037579


In [457]:
submission.shape

(116293, 1)

**Kaggle**

This seems to have backfired completely as after training on a balanced data set, my overall score

Private: 0.57445

Public: 0.57301