In [1]:
%load_ext autoreload
import pandas as pd
import numpy as np

# Loading the Data

In [48]:
# Read the features file in full, and from the labels file keep only the
# 'total_cases' column.
X_train_1 = pd.read_csv('data/dengue_features_train.csv')
y_train = pd.read_csv('data/dengue_labels_train.csv').loc[:, 'total_cases']

# Column names as a plain list; later cells slice this list by position.
attr = list(X_train_1.columns)
attr

['city',
 'year',
 'weekofyear',
 'week_start_date',
 'ndvi_ne',
 'ndvi_nw',
 'ndvi_se',
 'ndvi_sw',
 'precipitation_amt_mm',
 'reanalysis_air_temp_k',
 'reanalysis_avg_temp_k',
 'reanalysis_dew_point_temp_k',
 'reanalysis_max_air_temp_k',
 'reanalysis_min_air_temp_k',
 'reanalysis_precip_amt_kg_per_m2',
 'reanalysis_relative_humidity_percent',
 'reanalysis_sat_precip_amt_mm',
 'reanalysis_specific_humidity_g_per_kg',
 'reanalysis_tdtr_k',
 'station_avg_temp_c',
 'station_diur_temp_rng_c',
 'station_max_temp_c',
 'station_min_temp_c',
 'station_precip_mm']

## Cleaning the noisy training data

In [49]:
# Build the row mask once, from the features as they are before filtering,
# and apply the same mask to labels and features so they stay aligned.
not_week_53 = X_train_1['weekofyear'] != 53
y_train = y_train[not_week_53]
X_train_1 = X_train_1[not_week_53]

# Data Pipeline

In [50]:
%autoreload
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from utils.ContinuityImputer import ContinuityImputer
from utils.DataFrameSelector import DataFrameSelector

# Preprocessing chain applied to the training frame.
# `attr[4:]` skips the first four column names (city, year, weekofyear,
# week_start_date in the listing printed earlier) and keeps the rest.
pipeline = Pipeline(steps=[
    # Project-local imputer; presumably restricted to the listed columns
    # (its implementation is not visible in this notebook).
    ('imputer', ContinuityImputer(attributes=attr[4:])),
    # Project-local selector, given the same column names.
    ('dataframe_selector', DataFrameSelector(attribute_names=attr[4:])),
    # scikit-learn standardisation with default settings.
    ('scaler', StandardScaler()),
])

In [51]:
# Fit every pipeline step on the filtered training frame and transform it in
# the same call; the result is what the searches below are fitted on.
X_train = pipeline.fit_transform(X_train_1)
# Last expression of the cell: show the shape of the transformed data.
X_train.shape

(1451, 20)

# Model Selection

In [127]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from scipy.stats import randint as sp_randint
# Settings shared by the randomized searches in this section.
score_metric = 'neg_mean_absolute_error'  # forwarded as `scoring`
k_folds = 10                              # forwarded as `cv`
n_iter_search = 20                        # forwarded as `n_iter`
random_n = 42                             # forwarded as `random_state`
jobs = -1                                 # -1 to make it execute in parallel
verbose_level = 1                         # forwarded as `verbose`

## SVR

In [128]:
# Search space for the non-polynomial SVR kernels.
# SVR requires a strictly positive C, and scipy's randint includes its lower
# bound, so the range starts at 1 (integers 1..9999) rather than 0; a sampled
# C of 0 would make that candidate's fit fail.
C = sp_randint(1, 10000)
params = {
    'kernel': ['rbf', 'sigmoid', 'linear'],
    'gamma': ['scale'],
    'C': C,
}

In [None]:
# Randomized hyper-parameter search for SVR over the `params` space, using
# the shared search settings defined at the top of this section.
# `iid` is intentionally not passed: it was deprecated in scikit-learn 0.22
# and removed in 0.24, where supplying it raises a TypeError.
SVR_optimizer = RandomizedSearchCV(
    estimator=SVR(),
    param_distributions=params,
    n_iter=n_iter_search,
    scoring=score_metric,
    n_jobs=jobs,
    cv=k_folds,
    verbose=verbose_level,
    random_state=random_n,
    return_train_score=True,  # keep train scores so they can be compared with test scores
)
SVR_optimizer.fit(X_train, y_train)

Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


In [None]:
# Search space for the polynomial SVR kernel: degree is drawn from the
# integers 2..7 (randint's upper bound is exclusive); C reuses the
# distribution defined for the other kernels.
# There must be no trailing comma after the closing brace: with one, `params`
# becomes a 1-tuple wrapping the dict, not the dict that
# `param_distributions` is documented to accept.
params = {
    'kernel': ['poly'],
    'degree': sp_randint(2, 8),
    'gamma': ['scale'],
    'C': C,
}

In [None]:
# Randomized hyper-parameter search for SVR over the current `params` space
# (the polynomial-kernel space when cells are run in order).
# `iid` is intentionally not passed: it was deprecated in scikit-learn 0.22
# and removed in 0.24, where supplying it raises a TypeError.
SVR_poly_optimizer = RandomizedSearchCV(
    estimator=SVR(),
    param_distributions=params,
    n_iter=n_iter_search,
    scoring=score_metric,
    n_jobs=jobs,
    cv=k_folds,
    verbose=verbose_level,
    random_state=random_n,
    return_train_score=True,  # keep train scores so they can be compared with test scores
)
SVR_poly_optimizer.fit(X_train, y_train)

In [None]:
# Tabulate the first SVR search's cross-validation results, limited to fit
# time, the sampled C and kernel, and the mean test/train scores.
pd.DataFrame(SVR_optimizer.cv_results_).loc[
    :,
    ['mean_fit_time', 'param_C', 'param_kernel', 'mean_test_score', 'mean_train_score'],
]