In [1]:
%load_ext autoreload
import pandas as pd
import numpy as np

# Loading the Data

In [2]:
X_train_1 = pd.read_csv('data/dengue_features_train.csv')
y_train = pd.read_csv('data/dengue_labels_train.csv')['total_cases']
attr = list(X_train_1)
attr

['city',
 'year',
 'weekofyear',
 'week_start_date',
 'ndvi_ne',
 'ndvi_nw',
 'ndvi_se',
 'ndvi_sw',
 'precipitation_amt_mm',
 'reanalysis_air_temp_k',
 'reanalysis_avg_temp_k',
 'reanalysis_dew_point_temp_k',
 'reanalysis_max_air_temp_k',
 'reanalysis_min_air_temp_k',
 'reanalysis_precip_amt_kg_per_m2',
 'reanalysis_relative_humidity_percent',
 'reanalysis_sat_precip_amt_mm',
 'reanalysis_specific_humidity_g_per_kg',
 'reanalysis_tdtr_k',
 'station_avg_temp_c',
 'station_diur_temp_rng_c',
 'station_max_temp_c',
 'station_min_temp_c',
 'station_precip_mm']

## Cleaning the noisy training data

In [3]:
def bools_to_indexes(booleans):
    r = []
    for idx, x in enumerate(booleans):
        if x:
            r.append(idx)
    return r

idx = bools_to_indexes(X_train_1['weekofyear'] == 53)
y_train.drop(idx, inplace=True)
y_train.reset_index(drop=True, inplace=True)
X_train_1.drop(idx, inplace=True)
X_train_1.reset_index(drop=True, inplace=True)
X_train_1.shape

(1451, 24)

# Data Pipeline

In [20]:
%autoreload
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from utils.ContinuityImputer import ContinuityImputer
from utils.DataFrameDropper import DataFrameDropper
from utils.LastWeeks import LastWeeks
lw = LastWeeks(attributes=['ndvi_ne', 'precipitation_amt_mm', 'reanalysis_relative_humidity_percent'], weeks=3)

pipeline = Pipeline([
    ('imputer', ContinuityImputer(attributes=attr[4:])),
    ('lw', LastWeeks(attributes=attr[4:], weeks=3)),
    ('dataframe_dropper', DataFrameDropper(attribute_names=attr[:4])),
    ('scaler', StandardScaler()),
    #('pca', PCA(n_components=0.95))
])

In [21]:
X_train = pipeline.fit_transform(X_train_1)
X_train.shape

(1451, 80)

# Model Selection

In [6]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
score_metric='neg_mean_absolute_error'
jobs=-1 #-1 to make it execute in parallel
verbose_level = 1
random_n = 42

## SVR
* The results with the kernel *sigmoid* and *poly* were too bad, so we removed them.

In [7]:
k_folds=4
n_iter_search = 20
C = sp_randint(0, 10000)
params = {'kernel':['rbf', 'linear'], 'gamma':['scale'], 'C': C}

In [8]:
SVR_optimizer = RandomizedSearchCV(estimator=SVR(), param_distributions=params, n_iter=n_iter_search, scoring=score_metric, n_jobs=jobs, cv=k_folds, verbose=verbose_level, random_state=random_n, return_train_score=True, iid=True)
SVR_optimizer.fit(X_train, y_train)
SVR_optimizer.best_score_

Fitting 4 folds for each of 20 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 24.6min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed: 52.6min finished


-19.17685248872835

In [15]:
SVR_optimizer.best_estimator_

SVR(C=769, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

## Regression Trees
* 18.01 - with 2 previous weeks & without PCA & with (max_depth=6, min_samples_leaf=0.1611807565247405, min_samples_split=0.11193019906931466)
* 18.29 - With PCA at 0.9
* 18.27 - With PCA at 0.95
* 18.36 - With PCA at 0.65. PCA appears to be only making the model worse.
* 18.38 - Without PCA and with previous weeks. Clearly the previous weeks are useful
* 17.87 - Without PCA and with 3 previous weeks
* 17.86 - Without PCA and with 4 previous weeks
* 18.28 - Withou PCA 0.95 and 3 previous weeks fixed

In [16]:
k_folds=10
n_iter_search = 100
min_samples = sp_uniform(0.01, 0.35)
params = {'criterion':['mae'], 'max_depth': sp_randint(2, 10), 'min_samples_split': min_samples, 'min_samples_leaf': min_samples}

In [17]:
Tree_optimizer = RandomizedSearchCV(estimator=DecisionTreeRegressor(), param_distributions=params, n_iter=n_iter_search, scoring=score_metric, n_jobs=jobs, cv=k_folds, verbose=verbose_level, random_state=random_n, return_train_score=True, iid=True)
Tree_optimizer.fit(X_train, y_train)
Tree_optimizer.best_score_

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   29.8s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:   56.1s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  1.2min finished


-18.274293590627153

In [18]:
Tree_optimizer.best_estimator_

DecisionTreeRegressor(criterion='mae', max_depth=2, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=0.2320229706454773,
           min_samples_split=0.24824690804416838,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best')

## Random Forests
* 18.34 With 4 previous weeks and without PCA
* **17.79** With fixed 3 previous weeks and PCA at 0.95 (n_estimators= ?, max_depth = 2, min_samples_leaf=0.112, min_samples_split=0.224)
* **17.79** With fixed 3 previous weeks and without PCA (n_estimators= ?, max_depth = 5, min_samples_leaf=0.07, min_samples_split=0.27)

In [48]:
k_folds=10
n_iter_search = 30
min_samples = sp_uniform(0.01, 0.35)
params = {'n_estimators': sp_randint(2,30), 'criterion':['mae'], 'max_depth': sp_randint(2, 10), 'min_samples_split': min_samples, 'min_samples_leaf': min_samples}

In [49]:
Forest_optimizer = RandomizedSearchCV(estimator=RandomForestRegressor(n_jobs=-1), param_distributions=params, n_iter=n_iter_search, scoring=score_metric, n_jobs=jobs, cv=k_folds, verbose=verbose_level, random_state=random_n, return_train_score=True, iid=True)
Forest_optimizer.fit(X_train, y_train)
Forest_optimizer.best_score_

Fitting 10 folds for each of 30 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   23.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  3.1min finished


-17.740020145257915

In [50]:
Forest_optimizer.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mae', max_depth=5,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=0.09435891310910409,
           min_samples_split=0.24914223158891036,
           min_weight_fraction_leaf=0.0, n_estimators=13, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

## KNN
* -21.349 - with PCA at 0.65 & 2 previous weeks
* -20.36  - without PCA

In [35]:
k_folds=10
n_iter_search = 100
params = {'n_neighbors': sp_randint(3,150), 'weights': ['uniform', 'distance']}

In [36]:
KNN_optimizer = RandomizedSearchCV(estimator=KNeighborsRegressor(n_jobs=-1), param_distributions=params, n_iter=n_iter_search, scoring=score_metric, n_jobs=jobs, cv=k_folds, verbose=verbose_level, random_state=random_n, return_train_score=True, iid=True)
KNN_optimizer.fit(X_train, y_train)
KNN_optimizer.best_score_

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   19.7s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   45.0s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  1.6min finished


-20.359505759574677

In [37]:
%autoreload
from utils.LastInfected import LastInfected
tmp = pd.concat([LastInfected(weeks=2).fit_transform(X_train_1, y=y_train), y_train], axis=1)

In [58]:
tmp[900:950]

Unnamed: 0,city,year,weekofyear,week_start_date,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,...,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm,last_infected_0,last_infected_1,total_cases
900,sj,2007,37,2007-09-10,,0.0569,0.238543,0.187486,10.37,301.117143,...,17.72,3.157143,28.871429,6.514286,33.9,25.0,10.4,71.0,92.0,112
901,sj,2007,38,2007-09-17,-0.01345,0.0749,0.152571,0.131929,70.39,301.217143,...,18.037143,2.814286,28.3,6.285714,32.8,24.4,26.9,112.0,71.0,106
902,sj,2007,39,2007-09-24,-0.0307,-0.00294,0.152729,0.144629,94.37,301.052857,...,17.981429,3.585714,28.171429,6.028571,32.2,24.4,21.3,106.0,112.0,101
903,sj,2007,40,2007-10-01,0.096,0.024767,0.1853,0.117729,74.5,301.022857,...,18.118571,2.685714,27.985714,7.242857,32.8,22.2,86.6,101.0,106.0,170
904,sj,2007,41,2007-10-08,0.009,0.104,0.118129,0.126343,108.26,300.79,...,18.375714,3.0,28.128571,6.914286,33.3,23.9,14.5,170.0,101.0,135
905,sj,2007,42,2007-10-15,0.021,0.132667,0.245943,0.189757,17.56,301.492857,...,17.845714,3.185714,29.1,7.542857,33.9,24.4,10.2,135.0,170.0,106
906,sj,2007,43,2007-10-22,,-0.00915,0.191186,0.1764,16.48,301.007143,...,17.275714,2.471429,27.957143,6.442857,32.2,24.4,8.6,106.0,135.0,68
907,sj,2007,44,2007-10-29,0.1243,0.0543,0.156814,0.123529,137.55,299.458571,...,17.502857,2.6,26.2,5.4,30.6,22.2,89.2,68.0,106.0,48
908,sj,2007,45,2007-11-05,-0.2517,-0.0486,0.205171,0.172883,15.25,300.604286,...,17.295714,2.257143,27.442857,6.857143,32.2,22.8,4.1,48.0,68.0,48
909,sj,2007,46,2007-11-12,-0.0589,-0.06255,0.205743,0.202543,42.0,299.934286,...,17.082857,3.542857,26.814286,6.685714,31.1,22.8,65.7,48.0,48.0,26
