In [1]:
%load_ext autoreload
import pandas as pd
import numpy as np

# Loading the Data

In [19]:
X_train_1 = pd.read_csv('data/dengue_features_train.csv')
y_train = pd.read_csv('data/dengue_labels_train.csv')['total_cases']
attr = list(X_train_1)
attr

['city',
 'year',
 'weekofyear',
 'week_start_date',
 'ndvi_ne',
 'ndvi_nw',
 'ndvi_se',
 'ndvi_sw',
 'precipitation_amt_mm',
 'reanalysis_air_temp_k',
 'reanalysis_avg_temp_k',
 'reanalysis_dew_point_temp_k',
 'reanalysis_max_air_temp_k',
 'reanalysis_min_air_temp_k',
 'reanalysis_precip_amt_kg_per_m2',
 'reanalysis_relative_humidity_percent',
 'reanalysis_sat_precip_amt_mm',
 'reanalysis_specific_humidity_g_per_kg',
 'reanalysis_tdtr_k',
 'station_avg_temp_c',
 'station_diur_temp_rng_c',
 'station_max_temp_c',
 'station_min_temp_c',
 'station_precip_mm']

## Cleaning the noisy training data

In [20]:
def bools_to_indexes(booleans):
    r = []
    for idx, x in enumerate(booleans):
        if x:
            r.append(idx)
    return r

idx = bools_to_indexes(X_train_1['weekofyear'] == 53)
y_train.drop(idx, inplace=True)
y_train.reset_index(drop=True, inplace=True)
X_train_1.drop(idx, inplace=True)
X_train_1.reset_index(drop=True, inplace=True)
X_train_1.shape

(1451, 24)

# Data Pipeline

In [21]:
%autoreload
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from utils.ContinuityImputer import ContinuityImputer
from utils.DataFrameDropper import DataFrameDropper
from utils.LastWeeks import LastWeeks
lw = LastWeeks(attributes=['ndvi_ne', 'precipitation_amt_mm', 'reanalysis_relative_humidity_percent'], weeks=3)

pipeline = Pipeline([
    ('imputer', ContinuityImputer(attributes=attr[4:])),
    ('lw', LastWeeks(attributes=attr[4:], weeks=2)),
    #('dataframe_dropper', DataFrameDropper(attribute_names=attr[:4])),
    #('scaler', StandardScaler()),
    #('pca', PCA(n_components=0.9))
])

In [22]:
X_train = pipeline.fit_transform(X_train_1)
X_train.shape

X:  (1451, 24)
R:  (1451, 40)


(1456, 64)

# Model Selection

In [6]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
score_metric='neg_mean_absolute_error'
jobs=-1 #-1 to make it execute in parallel
verbose_level = 1
random_n = 42

## SVR
* The results with the kernel *sigmoid* were too bad, so we removed them.

In [14]:
k_folds=4
n_iter_search = 20
C = sp_randint(0, 10000)
params = {'kernel':['rbf', 'linear'], 'gamma':['scale'], 'C': C}

In [None]:
SVR_optimizer = RandomizedSearchCV(estimator=SVR(), param_distributions=params, n_iter=n_iter_search, scoring=score_metric, n_jobs=jobs, cv=k_folds, verbose=verbose_level, random_state=random_n, return_train_score=True, iid=True)
SVR_optimizer.fit(X_train, y_train)
SVR_optimizer.best_score_

In [17]:
params = {'kernel':['poly'], 'degree':sp_randint(2,8), 'gamma':['scale'], 'C': C}

In [None]:
SVR_poly_optimizer = RandomizedSearchCV(estimator=SVR(), param_distributions=params, n_iter=n_iter_search, scoring=score_metric, n_jobs=jobs, cv=k_folds, verbose=verbose_level, random_state=random_n, return_train_score=True, iid=True)
SVR_poly_optimizer.fit(X_train, y_train)
SVR_poly_optimizer.best_score_

## Regression Trees

In [61]:
k_folds=10
n_iter_search = 100
min_samples = sp_uniform(0.03, 0.35)
params = {'criterion':['mae'], 'max_depth': sp_randint(2, 5), 'min_samples_split': min_samples, 'min_samples_leaf': min_samples}

In [62]:
Tree_optimizer = RandomizedSearchCV(estimator=DecisionTreeRegressor(), param_distributions=params, n_iter=n_iter_search, scoring=score_metric, n_jobs=jobs, cv=k_folds, verbose=verbose_level, random_state=random_n, return_train_score=True, iid=True)
Tree_optimizer.fit(X_train, y_train)
Tree_optimizer.best_score_

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 319 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   22.8s finished


-18.391109579600275

## Random Forests

In [66]:
k_folds=5
n_iter_search = 100
min_samples = sp_uniform(0.01, 0.35)
params = {'n_estimators': sp_randint(5,30), 'criterion':['mae'], 'max_depth': sp_randint(2, 8), 'min_samples_split': min_samples, 'min_samples_leaf': min_samples}

In [67]:
Forest_optimizer = RandomizedSearchCV(estimator=RandomForestRegressor(), param_distributions=params, n_iter=n_iter_search, scoring=score_metric, n_jobs=jobs, cv=k_folds, verbose=verbose_level, random_state=random_n, return_train_score=True, iid=True)
Forest_optimizer.fit(X_train, y_train)
Forest_optimizer.best_score_

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 261 tasks      | elapsed:   27.3s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   53.8s finished


-18.976912474155757

In [34]:
np.sum(np.isnan(X_train), axis=0)

array([5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5])

In [35]:
5*60

300

In [37]:
np.nan

nan

In [12]:
X_train.loc[idx,:]

Unnamed: 0,city,year,weekofyear,week_start_date,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,...,last_weeks_1_reanalysis_precip_amt_kg_per_m2,last_weeks_1_reanalysis_relative_humidity_percent,last_weeks_1_reanalysis_sat_precip_amt_mm,last_weeks_1_reanalysis_specific_humidity_g_per_kg,last_weeks_1_reanalysis_tdtr_k,last_weeks_1_station_avg_temp_c,last_weeks_1_station_diur_temp_rng_c,last_weeks_1_station_max_temp_c,last_weeks_1_station_min_temp_c,last_weeks_1_station_precip_mm
139,,,,,,,,,,,...,24.94,76.661429,0.0,15.251429,2.642857,26.685714,8.385714,32.2,21.7,47.5
451,,,,,,,,,,,...,12.8,78.418571,0.0,16.564286,2.128571,25.928571,6.042857,30.0,22.2,29.7
763,,,,,,,,,,,...,21.68,74.778571,0.0,14.261429,1.957143,24.985714,4.9,28.3,21.1,23.8
1170,,,,,,,,,,,...,96.21,95.53,93.73,18.408571,7.228571,28.4,10.4,34.2,22.5,232.1
1430,,,,,,,,,,,...,31.2,87.641429,19.04,18.001429,8.628571,28.966667,11.266667,35.2,22.5,0.8


In [28]:
df = pd.DataFrame([[1,2],[3,4],[5,6]], columns=['a','b'])
df

Unnamed: 0,a,b
0,1,2
1,3,4
2,5,6


In [29]:
df.drop([1],inplace=True)
df

Unnamed: 0,a,b
0,1,2
2,5,6


In [30]:
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,a,b
0,1,2
1,5,6


In [18]:
for idx, (_, x) in enumerate(df.iterrows()):
    print(idx)

0
1
