In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


In [2]:
# Load the raw input tables. Spray records are not provided for the test-set
# years (2008, 2010, 2012, 2014), so spray information is not used for the
# analysis.
train, test, weather, spray = map(pd.read_csv, (
    './assets/train.csv',
    './assets/test.csv',
    './assets/weather.csv',
    './assets/spray.csv',
))
# Numeric map data (OpenStreetMap-derived, per the file name) as a NumPy array.
mapdata = np.loadtxt("./assets/mapdata_copyright_openstreetmap_contributors.txt")

In [3]:
from datetime import datetime
# Convert the 'Date' column of every table to datetime64 using the explicit
# YYYY-MM-DD format, so the .dt accessor and date-based joins work later on.
for frame in (train, test, weather, spray):
    frame['Date'] = pd.to_datetime(frame['Date'], format='%Y-%m-%d')

In [4]:
# Derive calendar features from 'Date': week-of-year and year for the
# train/test frames, year only for the spray frame.
# NOTE(review): `.dt.weekofyear` is deprecated in newer pandas releases in
# favour of `.dt.isocalendar().week`; revisit when upgrading pandas.
for frame in (train, test):
    date_parts = frame['Date'].dt
    frame['week'] = date_parts.weekofyear
    frame['year'] = date_parts.year
spray['year'] = spray['Date'].dt.year

In [5]:
# List the available training columns before deciding which ones to drop.
train.columns


Index([u'Date', u'Address', u'Species', u'Block', u'Street', u'Trap',
       u'AddressNumberAndStreet', u'Latitude', u'Longitude',
       u'AddressAccuracy', u'NumMosquitos', u'WnvPresent', u'week', u'year'],
      dtype='object')

In [6]:
# Drop the address/trap bookkeeping columns and the mosquito count by name.
# The previous form, train[[1, 3, 4, 5, 6, 9, 10]], depended on integer-position
# lookup into string-labelled columns, which is fragile (it silently targets
# different columns if the layout changes or the cell is re-run) and is not
# supported by current pandas. These names are the columns at those positions.
train = train.drop(['Address', 'Block', 'Street', 'Trap',
                    'AddressNumberAndStreet', 'AddressAccuracy',
                    'NumMosquitos'], axis=1)

In [7]:
# Quick look at the first training row to confirm which columns remain.
train.head(1)

Unnamed: 0,Date,Species,Latitude,Longitude,WnvPresent,week,year
0,2007-05-29,CULEX PIPIENS/RESTUANS,41.95469,-87.800991,0,22,2007


In [8]:
# Keep only the columns used downstream, selected by name. The previous form,
# test[[0, 2, 4, 5, 6, 7, 10]], depended on integer-position lookup into
# string-labelled columns: it is not supported by current pandas and would drop
# different columns if the cell were run a second time. Selecting the surviving
# columns explicitly yields the same frame and is safe to re-run.
test = test[['Date', 'Species', 'Latitude', 'Longitude', 'week', 'year']]

In [9]:
# Quick look at the first test rows to confirm which columns remain.
test.head()

Unnamed: 0,Date,Species,Latitude,Longitude,week,year
0,2008-06-11,CULEX PIPIENS/RESTUANS,41.95469,-87.800991,24,2008
1,2008-06-11,CULEX RESTUANS,41.95469,-87.800991,24,2008
2,2008-06-11,CULEX PIPIENS,41.95469,-87.800991,24,2008
3,2008-06-11,CULEX SALINARIUS,41.95469,-87.800991,24,2008
4,2008-06-11,CULEX TERRITANS,41.95469,-87.800991,24,2008


In [10]:
# One-hot encode the mosquito species and append the indicator columns
# to the right of the existing training columns.
train_species_dummies = pd.get_dummies(train['Species'])
train = pd.concat([train, train_species_dummies], axis=1)

In [11]:
# One-hot encode the mosquito species and append the indicator columns
# to the right of the existing test columns.
test_species_dummies = pd.get_dummies(test['Species'])
test = pd.concat([test, test_species_dummies], axis=1)

In [12]:
# Confirm the one-hot species columns were appended to the test frame.
test.head()

Unnamed: 0,Date,Species,Latitude,Longitude,week,year,CULEX ERRATICUS,CULEX PIPIENS,CULEX PIPIENS/RESTUANS,CULEX RESTUANS,CULEX SALINARIUS,CULEX TARSALIS,CULEX TERRITANS,UNSPECIFIED CULEX
0,2008-06-11,CULEX PIPIENS/RESTUANS,41.95469,-87.800991,24,2008,0,0,1,0,0,0,0,0
1,2008-06-11,CULEX RESTUANS,41.95469,-87.800991,24,2008,0,0,0,1,0,0,0,0
2,2008-06-11,CULEX PIPIENS,41.95469,-87.800991,24,2008,0,1,0,0,0,0,0,0
3,2008-06-11,CULEX SALINARIUS,41.95469,-87.800991,24,2008,0,0,0,0,1,0,0,0
4,2008-06-11,CULEX TERRITANS,41.95469,-87.800991,24,2008,0,0,0,0,0,0,1,0


In [13]:
# The one-hot indicators now carry the species information, so remove the raw
# 'Species' text column. It is dropped by name rather than via frame[[1]],
# which relied on integer-position lookup into string-labelled columns and
# would remove a different column if the layout changed or the cell was re-run.
test = test.drop('Species', axis=1)
train = train.drop('Species', axis=1)

In [14]:
# Load the weather table that is joined onto train/test below.
# NOTE(review): judging by the file name this holds 30-day averaged weather
# features prepared elsewhere - confirm. The path is relative to the working
# directory, unlike the other inputs, which live under ./assets/.
w30 = pd.read_csv('weather_ave_30.csv')

In [15]:
# Remove the 'Datetime_Date' column; 'Date' is the column used as the join key.
# NOTE(review): presumably 'Datetime_Date' duplicates 'Date' - confirm.
w30 = w30.drop('Datetime_Date', axis=1)


In [16]:
# Parse 'Date' to datetime64 so it has the same dtype as the train/test 'Date'
# columns it is joined against.
w30['Date'] = pd.to_datetime(w30['Date'], format='%Y-%m-%d')

In [17]:
# Attach the weather columns to each observation by matching its 'Date'
# against the weather table indexed by date.
weather_by_date = w30.set_index('Date')
train_add = train.join(weather_by_date, on='Date')
test_add = test.join(weather_by_date, on='Date')

In [18]:

# Count how many test rows are flagged as 'UNSPECIFIED CULEX'.
test_add['UNSPECIFIED CULEX'].value_counts()



0    101948
1     14345
Name: UNSPECIFIED CULEX, dtype: int64

In [19]:
# The training species dummies contain no 'UNSPECIFIED CULEX' column while the
# test dummies do; add an all-zero column so both frames expose the same set of
# species indicators.
train_add['UNSPECIFIED CULEX']= 0


In [20]:
# Target vector: the 'WnvPresent' column of the training frame.
y =train_add['WnvPresent']


In [21]:
# Remove the target from the feature frame so it cannot end up among the model inputs.
train_add = train_add.drop('WnvPresent', axis=1)


In [22]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.cross_validation import cross_val_score, StratifiedKFold ,train_test_split, KFold
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.model_selection import GridSearchCV




In [23]:
# List the columns available for modelling after the weather join.
train_add.columns

Index([u'Date', u'Latitude', u'Longitude', u'week', u'year',
       u'CULEX ERRATICUS', u'CULEX PIPIENS', u'CULEX PIPIENS/RESTUANS',
       u'CULEX RESTUANS', u'CULEX SALINARIUS', u'CULEX TARSALIS',
       u'CULEX TERRITANS', u'Tmax', u'Tmin', u'Tavg', u'Depart', u'DewPoint',
       u'WetBulb', u'Heat', u'Cool', u'PrecipTotal', u'StnPressure',
       u'SeaLevel', u'ResultSpeed', u'ResultDir', u'AvgSpeed', u'HZ', u'VC',
       u'FU', u'BC', u'SQ', u'FG+', u'MI', u'TS', u'DZ', u'RA', u'BR', u'FG',
       u'SN', u'UNSPECIFIED CULEX'],
      dtype='object')

In [24]:
# Model inputs: every column of train_add except 'Date', in their original
# order, grouped here by where they come from.
location_time_cols = [u'Latitude', u'Longitude', u'week', u'year']
species_cols = [
    u'CULEX ERRATICUS', u'CULEX PIPIENS', u'CULEX PIPIENS/RESTUANS',
    u'CULEX RESTUANS', u'CULEX SALINARIUS', u'CULEX TARSALIS',
    u'CULEX TERRITANS',
]
# Columns contributed by the joined weather table.
weather_cols = [
    u'Tmax', u'Tmin', u'Tavg', u'Depart', u'DewPoint', u'WetBulb',
    u'Heat', u'Cool', u'PrecipTotal', u'StnPressure', u'SeaLevel',
    u'ResultSpeed', u'ResultDir', u'AvgSpeed',
    u'HZ', u'VC', u'FU', u'BC', u'SQ', u'FG+', u'MI', u'TS', u'DZ',
    u'RA', u'BR', u'FG', u'SN',
]
feature_columns = (location_time_cols + species_cols + weather_cols
                   + [u'UNSPECIFIED CULEX'])
X = train_add[feature_columns]

In [25]:
# Check that every selected feature is numeric before scaling.
X.dtypes

Latitude                  float64
Longitude                 float64
week                        int64
year                        int64
CULEX ERRATICUS             uint8
CULEX PIPIENS               uint8
CULEX PIPIENS/RESTUANS      uint8
CULEX RESTUANS              uint8
CULEX SALINARIUS            uint8
CULEX TARSALIS              uint8
CULEX TERRITANS             uint8
Tmax                      float64
Tmin                      float64
Tavg                      float64
Depart                    float64
DewPoint                  float64
WetBulb                   float64
Heat                      float64
Cool                      float64
PrecipTotal               float64
StnPressure               float64
SeaLevel                  float64
ResultSpeed               float64
ResultDir                   int64
AvgSpeed                  float64
HZ                        float64
VC                        float64
FU                        float64
BC                        float64
SQ            

In [26]:
# Rescale each feature with min-max scaling. The transform returns a NumPy
# array, so X no longer carries DataFrame column labels after this cell.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# that follows, so the held-out rows influence the scaling range; consider
# fitting on the training split only.
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

In [30]:
# Hold out 33% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


In [31]:
##Grid search over Random Forest parameters
# model evaluation function
def evaluate_model(model):
    """Fit ``model`` on the training split and report hold-out metrics.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. Prints the confusion matrix (rows are actual classes, columns
    are predicted classes, class 1 listed first), the classification report
    and the accuracy on the hold-out split.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``fit``, ``predict`` and ``predict_proba``.

    Returns
    -------
    The value of ``model.predict_proba(X_test)``.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    probabilities = model.predict_proba(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    # labels=[1, 0] orders the matrix with class 1 in the first row/column,
    # matching the index/column labels below.
    confusion = pd.DataFrame(
        confusion_matrix(y_test, y_pred, labels=[1, 0]),
        index=['1', '0'],
        columns=['predicted_1', 'predicted_0'])
    report = classification_report(y_test, y_pred)

    # Single-argument print(...) produces the same text under the Python 2
    # print statement and the Python 3 print function.
    print('confusion matrix:')
    print(confusion)
    print('classification_report: {}'.format(report))
    print('Accuracy of the model on test: {}'.format(accuracy))
    return probabilities
# Hyper-parameter grid for the random forest.
# max_depth is a tree depth and must be an integer (or None); the earlier
# candidates [0.5, 1.0] were fractions, which are only meaningful for
# max_features and are rejected (or collapse to depth-1 stumps) as a depth.
max_depths = [5, 10]
max_features = [0.5, 1.0]  # fraction of features considered at each split
n_estimators = [1000, 2000]
rf = RandomForestClassifier(n_jobs=-1, random_state=33)
# NOTE(review): 8 candidates x 3 folds with 1000-2000 trees each is expensive,
# and both the forest and the search request all cores (n_jobs=-1).
gsrf = GridSearchCV(estimator=rf,
                    param_grid=dict(max_depth=max_depths,
                                    max_features=max_features,
                                    n_estimators=n_estimators),
                    n_jobs=-1, cv=3)
gsrf.fit(X_train, y_train)
# Single-argument print(...) gives identical output on Python 2 and Python 3.
print('best parameters for the model: {}'.format(gsrf.best_params_))
# best_score_ is the mean cross-validated score of the best candidate.
print('best score on train: {}'.format(gsrf.best_score_))
probability = evaluate_model(gsrf.best_estimator_)


KeyboardInterrupt: 

In [29]:
##Grid search over Knn parameters
# model evaluation function
# Same helper as the one defined in the Random Forest grid-search cell; it is
# repeated here so this cell can be run on its own.
def evaluate_model(model):
    """Fit ``model`` on the training split and report hold-out metrics.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. Prints the confusion matrix (rows are actual classes, columns
    are predicted classes, class 1 listed first), the classification report
    and the accuracy on the hold-out split.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``fit``, ``predict`` and ``predict_proba``.

    Returns
    -------
    The value of ``model.predict_proba(X_test)``.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    probabilities = model.predict_proba(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    # labels=[1, 0] orders the matrix with class 1 in the first row/column,
    # matching the index/column labels below.
    confusion = pd.DataFrame(
        confusion_matrix(y_test, y_pred, labels=[1, 0]),
        index=['1', '0'],
        columns=['predicted_1', 'predicted_0'])
    report = classification_report(y_test, y_pred)

    # Single-argument print(...) produces the same text under the Python 2
    # print statement and the Python 3 print function.
    print('confusion matrix:')
    print(confusion)
    print('classification_report: {}'.format(report))
    print('Accuracy of the model on test: {}'.format(accuracy))
    return probabilities

# Candidate neighbourhood sizes 5..49, materialised as a list so the grid is a
# plain sequence on both Python 2 and Python 3.
n_neighbors = list(range(5, 50))
knn = KNeighborsClassifier()
# NOTE: the name `gsrf` is kept from the random-forest search for
# compatibility, although here it holds the KNN search.
gsrf = GridSearchCV(estimator=knn,
                    param_grid=dict(n_neighbors=n_neighbors),
                    n_jobs=-1, cv=3)
gsrf.fit(X_train, y_train)
# Single-argument print(...) gives identical output on Python 2 and Python 3.
print('best parameters for the model: {}'.format(gsrf.best_params_))
# best_score_ is the mean cross-validated score of the best candidate.
print('best score on train: {}'.format(gsrf.best_score_))
probability = evaluate_model(gsrf.best_estimator_)


KeyboardInterrupt: 