In [1]:
%load_ext autoreload
import pandas as pd
import numpy as np

# Loading the Data

In [2]:
X_train_1 = pd.read_csv('data/dengue_features_train.csv')
y_train = pd.read_csv('data/dengue_labels_train.csv')['total_cases']
attr = list(X_train_1)
attr

['city',
 'year',
 'weekofyear',
 'week_start_date',
 'ndvi_ne',
 'ndvi_nw',
 'ndvi_se',
 'ndvi_sw',
 'precipitation_amt_mm',
 'reanalysis_air_temp_k',
 'reanalysis_avg_temp_k',
 'reanalysis_dew_point_temp_k',
 'reanalysis_max_air_temp_k',
 'reanalysis_min_air_temp_k',
 'reanalysis_precip_amt_kg_per_m2',
 'reanalysis_relative_humidity_percent',
 'reanalysis_sat_precip_amt_mm',
 'reanalysis_specific_humidity_g_per_kg',
 'reanalysis_tdtr_k',
 'station_avg_temp_c',
 'station_diur_temp_rng_c',
 'station_max_temp_c',
 'station_min_temp_c',
 'station_precip_mm']

## Cleaning the noisy training data

In [3]:
def bools_to_indexes(booleans):
    r = []
    for idx, x in enumerate(booleans):
        if x:
            r.append(idx)
    return r

idx = bools_to_indexes(X_train_1['weekofyear'] == 53)
y_train.drop(idx, inplace=True)
y_train.reset_index(drop=True, inplace=True)
X_train_1.drop(idx, inplace=True)
X_train_1.reset_index(drop=True, inplace=True)
X_train_1.shape

(1451, 24)

# Data Pipeline

In [4]:
%autoreload
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from utils.ContinuityImputer import ContinuityImputer
from utils.DataFrameDropper import DataFrameDropper
from utils.LastWeeks import LastWeeks
lw = LastWeeks(attributes=['ndvi_ne', 'precipitation_amt_mm', 'reanalysis_relative_humidity_percent'], weeks=3)

pipeline = Pipeline([
    ('imputer', ContinuityImputer(attributes=attr[4:])),
    ('lw', LastWeeks(attributes=attr[4:], weeks=2)),
    ('dataframe_dropper', DataFrameDropper(attribute_names=attr[:4])),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95))
])

In [5]:
X_train = pipeline.fit_transform(X_train_1)
X_train.shape

(1451, 23)

# Model Selection

In [6]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
score_metric='neg_mean_absolute_error'
jobs=-1 #-1 to make it execute in parallel
verbose_level = 1
random_n = 42

## SVR
* The results with the kernel *sigmoid* were too bad, so we removed them.

In [7]:
k_folds=4
n_iter_search = 20
C = sp_randint(0, 10000)
params = {'kernel':['rbf', 'linear'], 'gamma':['scale'], 'C': C}

In [8]:
SVR_optimizer = RandomizedSearchCV(estimator=SVR(), param_distributions=params, n_iter=n_iter_search, scoring=score_metric, n_jobs=jobs, cv=k_folds, verbose=verbose_level, random_state=random_n, return_train_score=True, iid=True)
SVR_optimizer.fit(X_train, y_train)
SVR_optimizer.best_score_

Fitting 4 folds for each of 20 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 24.6min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed: 52.6min finished


-19.17685248872835

In [15]:
SVR_optimizer.best_estimator_

SVR(C=769, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [9]:
params = {'kernel':['poly'], 'degree':sp_randint(2,8), 'gamma':['scale'], 'C': C}

In [10]:
SVR_poly_optimizer = RandomizedSearchCV(estimator=SVR(), param_distributions=params, n_iter=n_iter_search, scoring=score_metric, n_jobs=jobs, cv=k_folds, verbose=verbose_level, random_state=random_n, return_train_score=True, iid=True)
SVR_poly_optimizer.fit(X_train, y_train)
SVR_poly_optimizer.best_score_

Fitting 4 folds for each of 20 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  2.8min finished


-25.45358085803704

In [None]:
SVR_poly_optimizer.best_estimator_

## Regression Trees

In [11]:
k_folds=10
n_iter_search = 1000
min_samples = sp_uniform(0.01, 0.35)
params = {'criterion':['mae'], 'max_depth': sp_randint(2, 10), 'min_samples_split': min_samples, 'min_samples_leaf': min_samples}

In [12]:
Tree_optimizer = RandomizedSearchCV(estimator=DecisionTreeRegressor(), param_distributions=params, n_iter=n_iter_search, scoring=score_metric, n_jobs=jobs, cv=k_folds, verbose=verbose_level, random_state=random_n, return_train_score=True, iid=True)
Tree_optimizer.fit(X_train, y_train)
Tree_optimizer.best_score_

Fitting 10 folds for each of 1000 candidates, totalling 10000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   17.6s
[Parallel(n_jobs=-1)]: Done 876 tasks      | elapsed:   43.4s
[Parallel(n_jobs=-1)]: Done 1576 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 2476 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 3576 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 4876 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 6376 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done 8076 tasks      | elapsed:  6.6min
[Parallel(n_jobs=-1)]: Done 9976 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done 10000 out of 10000 | elapsed:  8.2min finished


-18.271881461061337

## Random Forests

In [13]:
k_folds=5
n_iter_search = 30
min_samples = sp_uniform(0.01, 0.35)
params = {'n_estimators': sp_randint(2,30), 'criterion':['mae'], 'max_depth': sp_randint(2, 10), 'min_samples_split': min_samples, 'min_samples_leaf': min_samples}

In [14]:
Forest_optimizer = RandomizedSearchCV(estimator=RandomForestRegressor(), param_distributions=params, n_iter=n_iter_search, scoring=score_metric, n_jobs=jobs, cv=k_folds, verbose=verbose_level, random_state=random_n, return_train_score=True, iid=True)
Forest_optimizer.fit(X_train, y_train)
Forest_optimizer.best_score_

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   24.6s finished


-18.8661842407535