In [1]:
%load_ext autoreload
import pandas as pd
import numpy as np

# Loading the Data

In [156]:
X_train_1 = pd.read_csv('data/dengue_features_train.csv')
y_train = pd.read_csv('data/dengue_labels_train.csv')['total_cases']
attr = list(X_train_1)
attr

['city',
 'year',
 'weekofyear',
 'week_start_date',
 'ndvi_ne',
 'ndvi_nw',
 'ndvi_se',
 'ndvi_sw',
 'precipitation_amt_mm',
 'reanalysis_air_temp_k',
 'reanalysis_avg_temp_k',
 'reanalysis_dew_point_temp_k',
 'reanalysis_max_air_temp_k',
 'reanalysis_min_air_temp_k',
 'reanalysis_precip_amt_kg_per_m2',
 'reanalysis_relative_humidity_percent',
 'reanalysis_sat_precip_amt_mm',
 'reanalysis_specific_humidity_g_per_kg',
 'reanalysis_tdtr_k',
 'station_avg_temp_c',
 'station_diur_temp_rng_c',
 'station_max_temp_c',
 'station_min_temp_c',
 'station_precip_mm']

## Cleaning the noisy training data

In [3]:
def bools_to_indexes(booleans):
    r = []
    for idx, x in enumerate(booleans):
        if x:
            r.append(idx)
    return r

idx = bools_to_indexes(X_train_1['weekofyear'] == 53)
y_train.drop(idx, inplace=True)
y_train.reset_index(drop=True, inplace=True)
X_train_1.drop(idx, inplace=True)
X_train_1.reset_index(drop=True, inplace=True)
X_train_1.shape

(1451, 24)

# Model Selection

In [4]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
score_metric='neg_mean_absolute_error'
jobs=-1 #-1 to make it execute in parallel
verbose_level = 0
random_n = 42
base_args = {'estimator': None, 'param_distributions': None, 'n_iter': None, 'scoring': score_metric, 'n_jobs': jobs, 'cv': None, 'verbose': verbose_level, 'random_state': random_n, 'return_train_score': True, 'iid': True}

## SVR
* The results with the kernel *sigmoid* and *poly* were too bad, so we removed them.

In [5]:
k_folds=10
n_iter_search = 20
C = sp_randint(0, 10000)
params = {'kernel':['linear'], 'gamma':['scale'], 'C': C}
SVR_optimizer = RandomizedSearchCV(estimator=SVR(), param_distributions=params, n_iter=n_iter_search, scoring=score_metric, n_jobs=jobs, cv=k_folds, verbose=verbose_level, random_state=random_n, return_train_score=True, iid=True)

## Regression Trees
* 18.01 - with 2 previous weeks & without PCA & with (max_depth=6, min_samples_leaf=0.1611807565247405, min_samples_split=0.11193019906931466)
* 18.29 - With PCA at 0.9
* 18.27 - With PCA at 0.95
* 18.36 - With PCA at 0.65. PCA appears to be only making the model worse.
* 18.38 - Without PCA and with previous weeks. Clearly the previous weeks are useful
* 17.87 - Without PCA and with 3 previous weeks
* 17.86 - Without PCA and with 4 previous weeks
* 18.28 - With PCA 0.95 and 3 previous weeks fixed
* 9.16 - Without PCA, with 3 weeks and 1 last infection (max_depth=5, min_samples_leaf=0.03, min_samples_split=0.108)
* **9.04** - Without PCA, with 3 weeks and 1 last infection (max_depth=5, min_samples_leaf=0.03, min_samples_split=0.108)

In [6]:
k_folds=10
n_iter_search = 100
min_samples = sp_uniform(0.01, 0.35)
params = {'criterion':['mae'], 'max_depth': sp_randint(2, 10), 'min_samples_split': min_samples, 'min_samples_leaf': min_samples}
Tree_optimizer = RandomizedSearchCV(estimator=DecisionTreeRegressor(), param_distributions=params, n_iter=n_iter_search, scoring=score_metric, n_jobs=jobs, cv=k_folds, verbose=verbose_level, random_state=random_n, return_train_score=True, iid=True)

## Random Forests
* 18.34 With 4 previous weeks and without PCA
* 17.79 With fixed 3 previous weeks and PCA at 0.95 (n_estimators= ?, max_depth = 2, min_samples_leaf=0.112, min_samples_split=0.224)
* 17.74 With fixed 3 previous weeks and without PCA (n_estimators= 13 max_depth = 5, min_samples_leaf=0.09, min_samples_split=0.24)
* **9.13** with 3 previous weeks and 1 last infected (n_estimators=9 max_depth = 9, min_samples_leaf=0.014, min_samples_split=0.07)
* 9.22 with 3 previous weeks and 3 last infected (n_estimators=9 max_depth = 9, min_samples_leaf=0.014, min_samples_split=0.08)

In [8]:
k_folds=10
n_iter_search = 40
params = {'n_estimators': sp_randint(2,50), 'criterion':['mae'], 'max_depth': sp_randint(2, 10)}
Forest_optimizer = RandomizedSearchCV(estimator=RandomForestRegressor(n_jobs=-1), param_distributions=params, n_iter=n_iter_search, scoring=score_metric, n_jobs=jobs, cv=k_folds, verbose=verbose_level, random_state=random_n, return_train_score=True, iid=True)

## Adaboost of Trees
* 10.78 - With 3 last weeks a 3 last infected 
* **8.49** - With 3 last weeks a 3 last infected and only max_depth tuned.

In [9]:
k_folds=10
n_iter_search = 20
params = {'n_estimators': sp_randint(40, 100), 'base_estimator__criterion':['mae'], 'base_estimator__max_depth': sp_randint(2,7)}
AdaTree_optimizer = RandomizedSearchCV(estimator=AdaBoostRegressor(base_estimator=DecisionTreeRegressor()), param_distributions=params, n_iter=n_iter_search, scoring=score_metric, n_jobs=jobs, cv=k_folds, verbose=verbose_level, random_state=random_n, return_train_score=True, iid=True)

## KNN
* 21.349 - with PCA at 0.65 & 2 previous weeks
* 20.36  - without PCA

In [10]:
k_folds=10
n_iter_search = 100
params = {'n_neighbors': sp_randint(3,150), 'weights': ['uniform', 'distance']}
KNN_optimizer = RandomizedSearchCV(estimator=KNeighborsRegressor(n_jobs=-1), param_distributions=params, n_iter=n_iter_search, scoring=score_metric, n_jobs=jobs, cv=k_folds, verbose=verbose_level, random_state=random_n, return_train_score=True, iid=True)

# Optimization
* Interestingly, PCA makes all the models worst in this case.
* After the exaustive search, the best model was the SVR which obtained an MAE of 6.52.

In [15]:
%autoreload
from OurPipeline import create_pipeline
from sklearn.decomposition import PCA

optimizers=[Tree_optimizer, Forest_optimizer, AdaTree_optimizer, KNN_optimizer]#, SVR_optimizer]
weeks = [1]
weeks_infected = [3]
pca = [None]

n_total = len(optimizers) * len(weeks) * len(weeks_infected) * len(pca)

results=[]
best_attempt = None
best_score = np.inf
idx=0
for opt in optimizers:
    for w in weeks:
        for wi in weeks_infected:
            for p in pca:
                pipeline = create_pipeline(attr, n_weeks=w, n_weeks_infected=wi, estimator_optimizer=opt, add_noise=True, noise_mean=6.5, noise_std=6.5, pca=None)
                pipeline.fit(X_train_1, y_train)
                score = pipeline.named_steps['est_opt'].best_score_
                best_estimator = pipeline.named_steps['est_opt'].best_estimator_
                attempt = [best_estimator, w, wi, p, score]
                if abs(score) < best_score:
                    best_score = abs(score)
                    best_attempt = attempt
                    print('\nBest score of {} with the estimator {}'.format(best_score, best_estimator))
                idx+=1
                print(str(idx) + '/' + str(n_total), end='\t')
                results.append(attempt)


Best score of 10.757898351648352 with the estimator DecisionTreeRegressor(criterion='mae', max_depth=7, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=0.048518173584686866,
           min_samples_split=0.08977730688967958,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best')
1/4	
Best score of 8.57545699492815 with the estimator RandomForestRegressor(bootstrap=True, criterion='mae', max_depth=5,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=26, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
2/4	3/4	4/4	

In [None]:
pd.DataFrame(results, columns=['estimator', 'weeks', 'weeks_infected', 'PCA', 'score'])

In [37]:
best_attempt

[SVR(C=5191, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
   kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
 1,
 3,
 None,
 -6.522347109745663]

# Predict

In [171]:
%autoreload
from OurPipeline import create_pipeline

pipeline = create_pipeline(attr, n_weeks=1, n_weeks_infected=3, pca=None)
X_train = pipeline.fit_transform(X_train_1, y_train)

In [56]:
model = SVR(kernel= 'linear', C=5191, gamma='scale')
model.fit(X_train, y_train)

SVR(C=5191, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

### Loading test data

In [157]:
X_test_1 = pd.read_csv('data/dengue_features_test.csv')
print(X_test_1.shape)

(416, 24)


## One by one prediction
* Given that we are making sequential predictions, i.e.: the prediction from a week relies on the prediction from the previous weeks, we must make the transformations and predictions one by one.
* Given that this kind of prediction is very prone to a snowball effect on errors our first solution had an error of 26. To solve this we came up with the idea of adding noise to the train data. However for this solution we need to know both: the mean of the error and its standard deviation (*std*). We already know the mean (MAE), we just need to know the *std*

In [172]:
predictions=[]
for idx in range(X_test_1.shape[0]):
    x = pipeline.transform(X_test_1.loc[idx:idx,:])
    pred = model.predict(x)
    pred = int(np.round(pred))
    pipeline.named_steps['l_infected'].append_y(pred)
    predictions.append(pred)
len(predictions)

416

### Calculating an approximation of the *std*
* It is approximately 10.9. We can see that the MAE is close to the one calculated in the cross-validation.

In [216]:
%autoreload
from OurPipeline import create_pipeline
from sklearn.model_selection import ShuffleSplit

pipeline = create_pipeline(attr, n_weeks=1, n_weeks_infected=3, pca=None)
X_train = pipeline.fit_transform(X_train_1, y_train)

In [217]:
sp = ShuffleSplit(n_splits=1, train_size=1000, test_size=None, random_state=random_n)
for train, test in sp.split(X_train, y_train):
    X_train_std = X_train[train]
    y_train_std = y_train[train]
    X_test_std = X_train[test]
    y_test_std = y_train[test]
X_train_std.shape, y_train_std.shape
X_test_std.shape, y_test_std.shape

In [222]:
model = SVR(kernel= 'linear', C=5191, gamma='scale')
model.fit(X_train_std, y_train_std)

SVR(C=5191, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [234]:
predictions = model.predict(X_test_std)
predictions = list(map(lambda x: int(np.round(x)), predictions))
errors = list(map(abs, predictions - y_test_std))
np.mean(errors), np.std(errors)

(6.7785087719298245, 10.959317651673116)

In [152]:
from random import choice, gauss
r=[]
for _ in range(100000):
    r.append(int(np.round(choice([-1,1]) * gauss(mu=0, sigma=8.2))))
r=np.abs(r)
np.mean(r), np.std(r)

(6.53353, 4.950353092366241)

# One by one prediction with noise
* When dealing with the test data, the noise adding feature of the pipeline must be disabled, otherwise our predictions will be based on 2 layers of noise: our synthetic noise and the one created by the predictive model.
* A very likely guess is that the errors when y is low is much smaller than when y is high.

In [157]:
%autoreload
from OurPipeline import create_pipeline

pipeline = create_pipeline(attr, n_weeks=1, n_weeks_infected=3, add_noise=True, noise_mean=0, noise_std=8.2, pca=None)
X_train = pipeline.fit_transform(X_train_1, y_train)

In [155]:
model = RandomForestRegressor(criterion='mae', n_estimators=100, max_depth=3)

In [156]:
model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mae', max_depth=3,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [161]:
%autoreload
from utils.OurPipeline import create_pipeline
from utils.predict_in_order import predict_in_order

pipeline = create_pipeline(attr, n_weeks=1, n_weeks_infected=3, add_noise=False, pca=None)
pipeline.fit_transform(X_train_1, y_train)

predict_in_order(X_test_1, model, pipeline)

416

## Submission

In [148]:
submit = pd.DataFrame(predictions, columns=['total_cases'])
x_3 = X_test_1.iloc[:,:3].copy()
submit = pd.concat([x_3, submit], axis=1)
submit.to_csv('data/submit.csv', index=False)

In [162]:
pd.concat([X_test_f, pd.DataFrame(predictions, columns=['pred'])], axis=1).head(10)

Unnamed: 0,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,reanalysis_max_air_temp_k,reanalysis_min_air_temp_k,...,last_weeks_0_reanalysis_tdtr_k,last_weeks_0_station_avg_temp_c,last_weeks_0_station_diur_temp_rng_c,last_weeks_0_station_max_temp_c,last_weeks_0_station_min_temp_c,last_weeks_0_station_precip_mm,last_infected_0,last_infected_1,last_infected_2,pred
0,-0.0189,-0.0189,0.102729,0.0912,78.6,298.492857,298.55,294.527143,301.1,296.4,...,3.957143,27.042857,7.514286,31.7,23.3,0.3,5.0,3.0,1.0,6
1,-0.018,-0.0124,0.082043,0.072314,12.56,298.475714,298.557143,294.395714,300.8,296.7,...,3.128571,26.528571,7.057143,33.3,21.7,75.2,6.0,5.0,3.0,6
2,-0.0015,-0.0124,0.151083,0.091529,3.66,299.455714,299.357143,295.308571,302.2,296.4,...,2.571429,26.071429,5.557143,30.0,22.2,34.3,6.0,6.0,5.0,6
3,-0.0015,-0.019867,0.124329,0.125686,0.0,299.69,299.728571,294.402857,303.0,296.9,...,4.428571,27.928571,7.785714,32.8,22.8,3.0,6.0,6.0,6.0,6
4,0.0568,0.039833,0.062267,0.075914,0.76,299.78,299.671429,294.76,302.3,297.3,...,4.342857,28.057143,6.271429,33.3,24.4,0.3,6.0,6.0,6.0,6
5,-0.044,-0.030467,0.132,0.083529,71.17,299.768571,299.728571,295.314286,301.9,297.6,...,3.542857,27.614286,7.085714,33.3,23.3,84.1,6.0,6.0,6.0,6
6,-0.0443,-0.024925,0.132271,0.159157,48.99,300.062857,300.007143,295.65,302.4,297.5,...,2.857143,28.0,5.171429,32.8,25.0,27.7,6.0,6.0,6.0,6
7,-0.0443,0.08215,0.144371,0.116729,30.81,300.484286,300.578571,295.997143,303.5,297.5,...,3.157143,27.4,6.042857,31.1,23.3,91.7,6.0,6.0,6.0,6
8,0.0108,0.0499,0.100571,0.117329,8.02,300.601429,300.621429,296.268571,302.5,298.5,...,3.9,28.757143,6.985714,34.4,24.4,0.3,6.0,6.0,6.0,6
9,0.072667,0.10666,0.155429,0.1649,17.52,300.497143,300.528571,296.411429,302.3,298.7,...,2.785714,28.657143,6.242857,32.8,23.9,28.7,6.0,6.0,6.0,6


In [158]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(model.predict(X_train), y_train)

7.860576923076923

In [160]:
max(model.predict(X_train))

371.265

In [163]:
max(predictions)

6

# Test split of tail
* To simulate what we are doing with the test data, we are going to split the train data, for each city, by sampling N entries from the tail of each city for testing.

In [101]:
idx_sj = X_train_1['city'] == 'sj'
X_sj = X_train_1[idx_sj]
y_sj = y_train[idx_sj]

idx_iq = X_train_1['city'] == 'iq'
X_iq = X_train_1[idx_iq]
y_iq = y_train[idx_iq]

X_sj.shape, y_sj.shape, X_iq.shape, y_iq.shape

((933, 24), (933,), (518, 24), (518,))

In [102]:
from sklearn.model_selection import train_test_split

l = train_test_split(X_sj, y_sj, train_size=0.5, test_size=None, shuffle=False)
X_train_sj = l[0]
X_test_sj = l[1]
y_train_sj = l[2]
y_test_sj = l[3]

l = train_test_split(X_iq, y_iq, train_size=0.5, test_size=None, shuffle=False)
X_train_iq = l[0]
X_test_iq = l[1]
y_train_iq = l[2]
y_test_iq = l[3]

X_train_sj.shape, X_test_sj.shape, y_train_sj.shape, y_test_sj.shape, X_train_iq.shape, X_test_iq.shape, y_train_iq.shape, y_test_iq.shape

((466, 24), (467, 24), (466,), (467,), (259, 24), (259, 24), (259,), (259,))

In [103]:
X_train_2 = pd.concat([X_train_sj, X_train_iq])
y_train_2 = pd.concat([y_train_sj, y_train_iq])
X_test_2 = pd.concat([X_test_sj, X_test_iq])
y_test_2 = pd.concat([y_test_sj, y_test_iq])

X_train_2.reset_index(drop=True, inplace=True)
X_test_2.reset_index(drop=True, inplace=True)
y_train_2.reset_index(drop=True, inplace=True)
y_test_2.reset_index(drop=True, inplace=True)
X_train_2.shape, y_train_2.shape, X_test_2.shape, y_test_2.shape

((725, 24), (725,), (726, 24), (726,))

### Pipeline

In [151]:
%autoreload
from OurPipeline import create_pipeline

pipeline = create_pipeline(attr, n_weeks=1, n_weeks_infected=3, add_noise=False, pca=None)
X_train = pipeline.fit_transform(X_train_2, y_train_2)

### Train

In [152]:
model = RandomForestRegressor(criterion='mae', n_estimators=150, max_depth=3)
model.fit(X_train, y_train_2)

RandomForestRegressor(bootstrap=True, criterion='mae', max_depth=3,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [153]:
%autoreload
from utils.OurPipeline import create_pipeline
from utils.predict_in_order import predict_in_order

pipeline = create_pipeline(attr, n_weeks=1, n_weeks_infected=3, add_noise=False, pca=None)
pipeline.fit_transform(X_train_2, y_train_2)

pred = predict_in_order(X_test_2, model=model, pipeline=pipeline)

In [154]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(pred, y_test_2)

11.414600550964188

### Submit

In [158]:
%autoreload
from utils.OurPipeline import create_pipeline
from utils.predict_in_order import predict_in_order

pipeline = create_pipeline(attr, n_weeks=1, n_weeks_infected=3, add_noise=False, pca=None)
X_train = pipeline.fit_transform(X_train_1, y_train)

model.fit(X_train, y_train)

pred = predict_in_order(X_test_1, model=model, pipeline=pipeline)

In [161]:
submit = pd.DataFrame(pred, columns=['total_cases'])
x_3 = X_test_1.iloc[:,:3].copy()
submit = pd.concat([x_3, submit], axis=1)
submit.to_csv('data/submit.csv', index=False)