In [1]:
%load_ext autoreload
import pandas as pd
import numpy as np

# Loading the Data

In [2]:
X_train_1 = pd.read_csv('data/dengue_features_train.csv')
y_train = pd.read_csv('data/dengue_labels_train.csv')['total_cases']
attr = list(X_train_1)
attr

['city',
 'year',
 'weekofyear',
 'week_start_date',
 'ndvi_ne',
 'ndvi_nw',
 'ndvi_se',
 'ndvi_sw',
 'precipitation_amt_mm',
 'reanalysis_air_temp_k',
 'reanalysis_avg_temp_k',
 'reanalysis_dew_point_temp_k',
 'reanalysis_max_air_temp_k',
 'reanalysis_min_air_temp_k',
 'reanalysis_precip_amt_kg_per_m2',
 'reanalysis_relative_humidity_percent',
 'reanalysis_sat_precip_amt_mm',
 'reanalysis_specific_humidity_g_per_kg',
 'reanalysis_tdtr_k',
 'station_avg_temp_c',
 'station_diur_temp_rng_c',
 'station_max_temp_c',
 'station_min_temp_c',
 'station_precip_mm']

## Cleaning the noisy training data

In [3]:
def bools_to_indexes(booleans):
    r = []
    for idx, x in enumerate(booleans):
        if x:
            r.append(idx)
    return r

idx = bools_to_indexes(X_train_1['weekofyear'] == 53)
y_train.drop(idx, inplace=True)
y_train.reset_index(drop=True, inplace=True)
X_train_1.drop(idx, inplace=True)
X_train_1.reset_index(drop=True, inplace=True)
X_train_1.shape

(1451, 24)

# Model Selection

In [4]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
score_metric='neg_mean_absolute_error'
jobs=-1 #-1 to make it execute in parallel
verbose_level = 0
random_n = 42
base_args = {'estimator': None, 'param_distributions': None, 'n_iter': None, 'scoring': score_metric, 'n_jobs': jobs, 'cv': None, 'verbose': verbose_level, 'random_state': random_n, 'return_train_score': True, 'iid': True}

## SVR
* The results with the kernel *sigmoid* and *poly* were too bad, so we removed them.

In [34]:
k_folds=10
n_iter_search = 5
C = sp_randint(0, 10000)
params = {'kernel':['linear'], 'gamma':['scale'], 'C': C}
SVR_optimizer = RandomizedSearchCV(estimator=SVR(), param_distributions=params, n_iter=n_iter_search, scoring=score_metric, n_jobs=jobs, cv=k_folds, verbose=verbose_level, random_state=random_n, return_train_score=True, iid=True)

## Regression Trees
* 18.01 - with 2 previous weeks & without PCA & with (max_depth=6, min_samples_leaf=0.1611807565247405, min_samples_split=0.11193019906931466)
* 18.29 - With PCA at 0.9
* 18.27 - With PCA at 0.95
* 18.36 - With PCA at 0.65. PCA appears to be only making the model worse.
* 18.38 - Without PCA and with previous weeks. Clearly the previous weeks are useful
* 17.87 - Without PCA and with 3 previous weeks
* 17.86 - Without PCA and with 4 previous weeks
* 18.28 - With PCA 0.95 and 3 previous weeks fixed
* 9.16 - Without PCA, with 3 weeks and 1 last infection (max_depth=5, min_samples_leaf=0.03, min_samples_split=0.108)
* **9.04** - Without PCA, with 3 weeks and 1 last infection (max_depth=5, min_samples_leaf=0.03, min_samples_split=0.108)

In [27]:
k_folds=10
n_iter_search = 100
min_samples = sp_uniform(0.01, 0.35)
params = {'criterion':['mae'], 'max_depth': sp_randint(2, 10), 'min_samples_split': min_samples, 'min_samples_leaf': min_samples}
Tree_optimizer = RandomizedSearchCV(estimator=DecisionTreeRegressor(), param_distributions=params, n_iter=n_iter_search, scoring=score_metric, n_jobs=jobs, cv=k_folds, verbose=verbose_level, random_state=random_n, return_train_score=True, iid=True)

## Random Forests
* 18.34 With 4 previous weeks and without PCA
* 17.79 With fixed 3 previous weeks and PCA at 0.95 (n_estimators= ?, max_depth = 2, min_samples_leaf=0.112, min_samples_split=0.224)
* 17.74 With fixed 3 previous weeks and without PCA (n_estimators= 13 max_depth = 5, min_samples_leaf=0.09, min_samples_split=0.24)
* **9.13** with 3 previous weeks and 1 last infected (n_estimators=9 max_depth = 9, min_samples_leaf=0.014, min_samples_split=0.07)
* 9.22 with 3 previous weeks and 3 last infected (n_estimators=9 max_depth = 9, min_samples_leaf=0.014, min_samples_split=0.08)

In [28]:
k_folds=10
n_iter_search = 40
params = {'n_estimators': sp_randint(2,50), 'criterion':['mae'], 'max_depth': sp_randint(2, 10)}
Forest_optimizer = RandomizedSearchCV(estimator=RandomForestRegressor(n_jobs=-1), param_distributions=params, n_iter=n_iter_search, scoring=score_metric, n_jobs=jobs, cv=k_folds, verbose=verbose_level, random_state=random_n, return_train_score=True, iid=True)

## Adaboost of Trees
* 10.78 - With 3 last weeks a 3 last infected 
* **8.49** - With 3 last weeks a 3 last infected and only max_depth tuned.

In [29]:
k_folds=10
n_iter_search = 20
params = {'n_estimators': sp_randint(40, 100), 'base_estimator__criterion':['mae'], 'base_estimator__max_depth': sp_randint(2,7)}
AdaTree_optimizer = RandomizedSearchCV(estimator=AdaBoostRegressor(base_estimator=DecisionTreeRegressor()), param_distributions=params, n_iter=n_iter_search, scoring=score_metric, n_jobs=jobs, cv=k_folds, verbose=verbose_level, random_state=random_n, return_train_score=True, iid=True)

## KNN
* 21.349 - with PCA at 0.65 & 2 previous weeks
* 20.36  - without PCA

In [30]:
k_folds=10
n_iter_search = 100
params = {'n_neighbors': sp_randint(3,150), 'weights': ['uniform', 'distance']}
KNN_optimizer = RandomizedSearchCV(estimator=KNeighborsRegressor(n_jobs=-1), param_distributions=params, n_iter=n_iter_search, scoring=score_metric, n_jobs=jobs, cv=k_folds, verbose=verbose_level, random_state=random_n, return_train_score=True, iid=True)

# Optimization
* Interestingly, PCA mas all the models worst in this case.

In [36]:
%autoreload
from OurPipeline import create_pipeline
from sklearn.decomposition import PCA

optimizers=[SVR_optimizer]#[Tree_optimizer, Forest_optimizer, AdaTree_optimizer, KNN_optimizer, 
weeks = [1,2]
weeks_infected = [3,4]
pca = [PCA(0.95), None]

n_total = len(optimizers) * len(weeks) * len(weeks_infected) * len(pca)

results=[]
best_attempt = None
best_score = np.inf
idx=0
for opt in optimizers:
    for w in weeks:
        for wi in weeks_infected:
            for p in pca:
                pipeline = create_pipeline(attr, n_weeks=w, n_weeks_infected=wi, estimator_optimizer=opt, pca=p)
                pipeline.fit(X_train_1, y_train)
                score = pipeline.named_steps['est_opt'].best_score_
                best_estimator = pipeline.named_steps['est_opt'].best_estimator_
                attempt = [best_estimator, w, wi, p, score]
                if abs(score) < best_score:
                    best_score = abs(score)
                    best_attempt = attempt
                    print('\nBest score of {} with the estimator {}'.format(best_score, best_estimator))
                idx+=1
                print(str(idx) + '/' + str(n_total), end='\t')
                results.append(attempt)


Best score of 7.332691708334687 with the estimator SVR(C=5734, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)
1/8	
Best score of 6.522347109745663 with the estimator SVR(C=5191, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)
2/8	3/8	4/8	

KeyboardInterrupt: 

In [19]:
pd.DataFrame(results, columns=['estimator', 'weeks', 'weeks_infected', 'PCA', 'score'])

Unnamed: 0,estimator,weeks,weeks_infected,PCA,score
0,"DecisionTreeRegressor(criterion='mae', max_dep...",1,3,"PCA(copy=True, iterated_power='auto', n_compon...",-12.752929
1,"DecisionTreeRegressor(criterion='mae', max_dep...",1,3,,-8.98346
2,"DecisionTreeRegressor(criterion='mae', max_dep...",1,4,"PCA(copy=True, iterated_power='auto', n_compon...",-13.05031
3,"DecisionTreeRegressor(criterion='mae', max_dep...",1,4,,-8.971744
4,"DecisionTreeRegressor(criterion='mae', max_dep...",2,3,"PCA(copy=True, iterated_power='auto', n_compon...",-13.2357
5,"DecisionTreeRegressor(criterion='mae', max_dep...",2,3,,-9.039628
6,"DecisionTreeRegressor(criterion='mae', max_dep...",2,4,"PCA(copy=True, iterated_power='auto', n_compon...",-12.913163
7,"DecisionTreeRegressor(criterion='mae', max_dep...",2,4,,-9.025844
8,"(DecisionTreeRegressor(criterion='mae', max_de...",1,3,"PCA(copy=True, iterated_power='auto', n_compon...",-9.806561
9,"(DecisionTreeRegressor(criterion='mae', max_de...",1,3,,-7.17419


In [21]:
best_attempt

[RandomForestRegressor(bootstrap=True, criterion='mae', max_depth=6,
            max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=8, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0, warm_start=False),
 1,
 3,
 None,
 -7.174190213645762]

In [25]:
best_estimator

RandomForestRegressor(bootstrap=True, criterion='mae', max_depth=5,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=41, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [32]:
best_attempt

[AdaBoostRegressor(base_estimator=DecisionTreeRegressor(criterion='mae', max_depth=6, max_features=None,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
          learning_rate=1.0, loss='linear', n_estimators=41,
          random_state=None), 2, 4, None, -7.412474155754652]

In [37]:
best_attempt

[SVR(C=5191, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
   kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
 1,
 3,
 None,
 -6.522347109745663]

In [38]:
results

[[SVR(C=5734, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
  1,
  3,
  PCA(copy=True, iterated_power='auto', n_components=0.95, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False),
  -7.332691708334687],
 [SVR(C=5191, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
  1,
  3,
  None,
  -6.522347109745663],
 [SVR(C=860, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
  1,
  4,
  PCA(copy=True, iterated_power='auto', n_components=0.95, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False),
  -7.8651335759783985],
 [SVR(C=860, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
  1,
  4,
  None,
  

# Predict

In [None]:
model = SVR()