In [1]:
import pandas as pd
test_w = pd.read_csv("clean_test.csv")
train_w = pd.read_csv("clean_train.csv")

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from matplotlib import cm
import re
from sklearn import preprocessing, tree, ensemble, linear_model, metrics, model_selection, svm
import xgboost

  from numpy.core.umath_tests import inner1d


## Preprocessing

In [3]:
# Have to pre-process label-encoder on test set because test set has more traps and species.
le_trap = preprocessing.LabelEncoder()
test_w['Trap_num'] = le_trap.fit_transform(test_w.Trap)

# le_species = preprocessing.LabelEncoder()
# test_w['Species_num'] = le_species.fit_transform(test_w.Species)

le_zip = preprocessing.LabelEncoder()
test_w['zip_code_num'] = le_zip.fit_transform(test_w.zip_code)

In [4]:
#repeat with Train
train_w['Trap_num'] = le_trap.transform(train_w.Trap)

# train_w['Species_num'] = le_species.transform(train_w.Species)

train_w['zip_code_num'] = le_zip.transform(train_w.zip_code)

## Model Selection


In [5]:
X = train_w.drop('Date Trap Species CodeSum NumMosquitos WnvPresent zip_code'.split(' '), axis=1)
y = train_w.WnvPresent


In [6]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=.3)

In [7]:
#XGBoost
xgb = xgboost.XGBClassifier(n_estimators=100)


xgb.fit(X_train, y_train)



predict = xgb.predict_proba(X_test)
metrics.roc_auc_score(y_test, predict[:,1])

0.8728255136857288

In [8]:
#Random Forrest
clf = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=100)
clf.fit(X_train, y_train)
predict = clf.predict_proba(X_test)
metrics.roc_auc_score(y_test, predict[:,1])

0.8089724132734886

In [9]:
#AdaBoost
ada = ensemble.AdaBoostClassifier(n_estimators=100)
ada.fit(X_train, y_train)

predict = ada.predict_proba(X_test)
metrics.roc_auc_score(y_test, predict[:,1])

0.864609771061384

In [10]:
pd.DataFrame(xgb.feature_importances_, index=X.columns).sort_values(by=0, ascending=False)

Unnamed: 0,0
Unnamed: 0,0.311189
day_of_year,0.157343
trap_percent_with_wnv2,0.129371
trap_percent_with_wnv,0.085664
Longitude,0.06993
DewPoint,0.026224
Tavg,0.022727
Tmax,0.022727
trap_percent_of_all_mosquitos,0.020979
zip_code_num,0.020979


In [11]:
#Cross-validated scores
cv = model_selection.StratifiedKFold(shuffle=True, n_splits=5)
for model, name in zip([clf, ada, xgb], ['clf', 'ada', 'xgb']):
    score = model_selection.cross_val_score(model, X, y, cv=cv, scoring='roc_auc')
    print(name, score.mean(), '+/-', score.std())

clf 0.7618385108088775 +/- 0.01830684599333593
ada 0.8408733156699002 +/- 0.018309584548588935
xgb 0.8576802326274955 +/- 0.008431637958924026


In [12]:
#Grid-Search to find best parameters
cv = model_selection.StratifiedKFold(shuffle=True, n_splits=5)
xg_best = model_selection.GridSearchCV(xgboost.XGBClassifier(), param_grid={
        'max_depth':[3,4,5,6, 7],
        'learning_rate':[.04, .06, .08, .1, .12, .14, .16],
    },scoring='roc_auc', cv=cv)
xg_best.fit(X, y).best_params_

# cv = model_selection.StratifiedKFold(shuffle=True, n_splits=5)
xg_best = xgboost.XGBRegressor(learning_rate=0.14, max_depth=3)
score = model_selection.cross_val_score(xg_best, X, y, cv=cv, scoring='roc_auc')
print('score:',score.mean(),"+/-",score.std())

score: 0.8420300689713045 +/- 0.007260741141150592


In [13]:
cv = model_selection.StratifiedKFold(shuffle=True, n_splits=5)
predict = model_selection.cross_val_predict(xgboost.XGBClassifier(), X, y, cv=cv, method='predict_proba')
predictions = model_selection.cross_val_predict(xgboost.XGBClassifier(), X, y, cv=cv)
metrics.roc_auc_score(y, predict[:,1])

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


0.8529678471451843

## Kaggle Submission

In [14]:
X = train_w.drop('Date Trap Species CodeSum NumMosquitos WnvPresent zip_code'.split(' '), axis=1)
y = train_w.WnvPresent
X_testset = test_w.drop('Date Trap Species CodeSum zip_code Id'.split(' '), axis=1)
xgb.fit(X, y)
FINAL = xgb.predict_proba(X_testset)[:,1]

In [15]:
sample = pd.read_csv('sampleSubmission.csv', index_col='Id')

In [16]:
sample.WnvPresent = FINAL

In [17]:
sample.to_csv('testSubmission.csv')

In [19]:
sample.head()

Unnamed: 0_level_0,WnvPresent
Id,Unnamed: 1_level_1
1,0.00146
2,0.00146
3,0.00146
4,0.00146
5,0.00146
