# Fourth iteration 

Trains models on the data set of one city first and then trains them on data from the target city.

No longer working with AdaBoost, and only one submission was tried with a Convolutional Neural Network. The focus is mainly on LSTM and Random Forest, because the number of possible submissions is limited (3 per day) and previous attempts didn't get very good results.

## Load the Data

In [8]:
#Import the necessary libraries
%matplotlib inline

import pandas as pd
import numpy as np
import seaborn as sns

from pandas.plotting import scatter_matrix
from pandas import DataFrame
from pandas import concat
from pandas import read_csv
from pandas import datetime

from matplotlib import pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error,make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.cross_validation import ShuffleSplit
from sklearn.model_selection import train_test_split

# Scorer wrapping the mean absolute error metric.
# NOTE(review): make_scorer defaults to greater_is_better=True; for an error metric a
# model-selection search would normally need greater_is_better=False - confirm before use.
scorer = make_scorer(mean_absolute_error)

# Columns that reference time; they are removed from the feature and label tables below.
time_feature_columns = ['year','weekofyear','week_start_date']
time_label_columns = ['year','weekofyear']

# ---- Iquitos: read the preprocessed tables ----
features_train_iq = pd.read_csv("preprocessed data/dengue_features_train_iq.csv")
labels_train_iq = pd.read_csv("preprocessed data/dengue_labels_train_iq.csv")
features_test_iq = pd.read_csv("preprocessed data/dengue_features_test_iq.csv")

# Iquitos tables without the time columns
stripped_features_train_iq = features_train_iq.drop(time_feature_columns, axis=1)
stripped_labels_train_iq = labels_train_iq.drop(time_label_columns, axis=1)
stripped_features_test_iq = features_test_iq.drop(time_feature_columns, axis=1)

# ---- San Juan: read the preprocessed tables ----
features_train_sj = pd.read_csv("preprocessed data/dengue_features_train_sj.csv")
labels_train_sj = pd.read_csv("preprocessed data/dengue_labels_train_sj.csv")
features_test_sj = pd.read_csv("preprocessed data/dengue_features_test_sj.csv")

# San Juan tables without the time columns
stripped_features_train_sj = features_train_sj.drop(time_feature_columns, axis=1)
stripped_labels_train_sj = labels_train_sj.drop(time_label_columns, axis=1)
stripped_features_test_sj = features_test_sj.drop(time_feature_columns, axis=1)

In [9]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """
    Frame a time series as a supervised learning dataset.

    Every row of the result holds the observations of the n_in previous steps
    (columns 'var<j>(t-<i>)'), followed by the observations of the current step
    ('var<j>(t)') and of the next n_out-1 steps ('var<j>(t+<i>)').

    Arguments:
        data: Sequence of observations: a list, a 1-D or 2-D NumPy array, a Series
            or a DataFrame (one column per variable).
        n_in: Number of lag observations as input (X).
        n_out: Number of observations as output (y).
        dropnan: Boolean whether or not to drop rows with NaN values.
    Returns:
        Pandas DataFrame of series framed for supervised learning.
    """
    df = DataFrame(data)
    # Count the variables on the frame itself: this also works for 1-D arrays / Series
    # (which have no shape[1]) and for lists of rows (which hold more than one variable).
    n_vars = df.shape[1]
    cols, names = list(), list()

    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]

    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]

    # put it all together
    agg = concat(cols, axis=1)
    agg.columns = names

    # drop rows with NaN values (shifting leaves NaN at the start/end of the frame)
    if dropnan:
        agg = agg.dropna()

    return agg


def prepare_data_with_window (data_train, data_labels, data_test, window_size):
    """
    Add lagged observations to the train and test feature tables.

    The train and test features are stacked so that the first test rows receive their
    lag features from the last training rows; the stacked frame is windowed with
    series_to_supervised and split again.

    Arguments:
        data_train: DataFrame with the training features.
        data_labels: DataFrame with the training labels, row-aligned with data_train.
        data_test: DataFrame with the test features (same columns as data_train).
        window_size: Number of lag observations added to every row.
    Returns:
        Tuple (data_train_w, data_labels_w, data_test_w). The first window_size
        training rows and labels are dropped because they have no complete window.
    Raises:
        ValueError: if window_size is negative or larger than the training set.
    """
    if not 0 <= window_size <= len(data_train):
        raise ValueError("window_size must be between 0 and len(data_train)")

    # DataFrame.append was removed in pandas 2.0; concat stacks the frames the same way
    data = concat([data_train, data_test])

    data_w = series_to_supervised(data, n_in=window_size, dropnan=True)

    # split; the windowed frame lost its first window_size rows, all from the training part
    n_train_w = len(data_train) - window_size
    # copies, so callers can add columns without writing to a slice of data_w
    data_train_w = data_w.iloc[ : n_train_w].copy()
    data_test_w = data_w.iloc[n_train_w : ].copy()
    data_labels_w = data_labels.iloc[window_size : ]

    return data_train_w, data_labels_w, data_test_w
 
# number of lag observations added to every sample
window_size = 20

# windowed Iquitos (IQ) tables
(w_stripped_features_train_iq,
 w_stripped_labels_train_iq,
 w_stripped_features_test_iq) = prepare_data_with_window(
    data_train=stripped_features_train_iq,
    data_labels=stripped_labels_train_iq,
    data_test=stripped_features_test_iq,
    window_size=window_size)

# windowed San Juan (SJ) tables
(w_stripped_features_train_sj,
 w_stripped_labels_train_sj,
 w_stripped_features_test_sj) = prepare_data_with_window(
    data_train=stripped_features_train_sj,
    data_labels=stripped_labels_train_sj,
    data_test=stripped_features_test_sj,
    window_size=window_size)

### Initialization

In [10]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import Activation
from keras.callbacks import ModelCheckpoint
from keras.regularizers import L1L2

# Kernel regularizer shared by the network layers: L2 penalty only
reg = L1L2(l1=0.0, l2=1e-5)
# alternative without any penalty:
#reg = L1L2(l1=0.0, l2=0.00)

# Every windowed feature column is scaled; the temporary 'data_set' marker column is excluded
columns_to_scale = w_stripped_features_test_iq.columns.difference(['data_set'])

# Min-max scaler with its default feature range of (0, 1); applied to the features below
scaler = MinMaxScaler()

### Iquitos

In [11]:
# Normalization is done on the train and test feature datasets together, ensuring that the values in both
# datasets remain of the same magnitude: the datasets are joined, min-max normalized, and then split again.
w_stripped_features_train_iq['data_set'] = 'train'
w_stripped_features_test_iq['data_set']  = 'test'

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
dengue_norm_features_iq = pd.concat([w_stripped_features_train_iq, w_stripped_features_test_iq])
dengue_norm_features_iq[columns_to_scale] = scaler.fit_transform(dengue_norm_features_iq[columns_to_scale])

# separate into the original datasets, dropping the temporary column 'data_set'
stripped_norm_dengue_features_train_iq = dengue_norm_features_iq[dengue_norm_features_iq['data_set'] == 'train']
stripped_norm_dengue_features_train_iq = stripped_norm_dengue_features_train_iq.reset_index(drop = True)
stripped_norm_dengue_features_train_iq = stripped_norm_dengue_features_train_iq.drop(['data_set'], axis=1)

stripped_norm_dengue_features_test_iq = dengue_norm_features_iq[dengue_norm_features_iq['data_set'] == 'test']
stripped_norm_dengue_features_test_iq = stripped_norm_dengue_features_test_iq.reset_index(drop = True)
stripped_norm_dengue_features_test_iq = stripped_norm_dengue_features_test_iq.drop(['data_set'], axis=1)

# normalize labels
# NOTE: fit() returns the scaler itself, so scalerLabels_iq is the same object as `scaler`;
# any later scaler.fit / scaler.fit_transform call also changes what scalerLabels_iq inverts.
scalerLabels_iq = scaler.fit(w_stripped_labels_train_iq)
stripped_norm_dengue_labels_train_iq = scalerLabels_iq.transform(w_stripped_labels_train_iq)

# split data into train and test: first 80% of the rows for training, the rest for validation
assert len(stripped_norm_dengue_features_train_iq) == len(stripped_norm_dengue_labels_train_iq), \
    "Iquitos features and labels must have the same number of rows"
split_iq = int(.8*len(stripped_norm_dengue_features_train_iq))
# positional slicing instead of np.split, which is deprecated for DataFrames in recent pandas
X_train_iq = stripped_norm_dengue_features_train_iq.iloc[:split_iq]
X_test_iq = stripped_norm_dengue_features_train_iq.iloc[split_iq:]
y_train_iq = stripped_norm_dengue_labels_train_iq[:split_iq]
y_test_iq = stripped_norm_dengue_labels_train_iq[split_iq:]

# reshape the submission input to be 3D
X_submission_iq = stripped_norm_dengue_features_test_iq.values
X_submission_iq_c = X_submission_iq.reshape((X_submission_iq.shape[0], X_submission_iq.shape[1], 1)) # conv1d: [samples, columns, 1]
X_submission_iq = X_submission_iq.reshape((X_submission_iq.shape[0], 1, X_submission_iq.shape[1])) # LSTM: [samples, timesteps, features]

### San Juan

In [12]:
# Normalization is done on the train and test feature datasets together, ensuring that the values in both
# datasets remain of the same magnitude: the datasets are joined, min-max normalized, and then split again.
w_stripped_features_train_sj['data_set'] = 'train'
w_stripped_features_test_sj['data_set']  = 'test'

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
dengue_norm_features_sj = pd.concat([w_stripped_features_train_sj, w_stripped_features_test_sj])
dengue_norm_features_sj[columns_to_scale] = scaler.fit_transform(dengue_norm_features_sj[columns_to_scale])

# separate into the original datasets, dropping the temporary column 'data_set'
stripped_norm_dengue_features_train_sj = dengue_norm_features_sj[dengue_norm_features_sj['data_set'] == 'train']
stripped_norm_dengue_features_train_sj = stripped_norm_dengue_features_train_sj.reset_index(drop = True)
stripped_norm_dengue_features_train_sj = stripped_norm_dengue_features_train_sj.drop(['data_set'], axis=1)

stripped_norm_dengue_features_test_sj = dengue_norm_features_sj[dengue_norm_features_sj['data_set'] == 'test']
stripped_norm_dengue_features_test_sj = stripped_norm_dengue_features_test_sj.reset_index(drop = True)
stripped_norm_dengue_features_test_sj = stripped_norm_dengue_features_test_sj.drop(['data_set'], axis=1)

# normalize labels
# NOTE: fit() returns the scaler itself, so scalerLabels_sj is the same object as `scaler`;
# any later scaler.fit / scaler.fit_transform call also changes what scalerLabels_sj inverts.
scalerLabels_sj = scaler.fit(w_stripped_labels_train_sj)
stripped_norm_dengue_labels_train_sj = scalerLabels_sj.transform(w_stripped_labels_train_sj)

# split data into train and test: first 80% of the rows for training, the rest for validation
assert len(stripped_norm_dengue_features_train_sj) == len(stripped_norm_dengue_labels_train_sj), \
    "San Juan features and labels must have the same number of rows"
split_sj = int(.8*len(stripped_norm_dengue_features_train_sj))
# positional slicing instead of np.split, which is deprecated for DataFrames in recent pandas
X_train_sj = stripped_norm_dengue_features_train_sj.iloc[:split_sj]
X_test_sj = stripped_norm_dengue_features_train_sj.iloc[split_sj:]
y_train_sj = stripped_norm_dengue_labels_train_sj[:split_sj]
y_test_sj = stripped_norm_dengue_labels_train_sj[split_sj:]

# prepare test dataset: reshape the submission input to be 3D
X_submission_sj = stripped_norm_dengue_features_test_sj.values
X_submission_sj_c = X_submission_sj.reshape((X_submission_sj.shape[0], X_submission_sj.shape[1], 1)) # conv1d: [samples, columns, 1]
X_submission_sj = X_submission_sj.reshape((X_submission_sj.shape[0], 1, X_submission_sj.shape[1])) # LSTM: [samples, timesteps, features]

In [13]:
# prepare training and validation datasets: Iquitos rows first, San Juan rows after them
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
X_train = pd.concat([X_train_iq, X_train_sj])
X_test = pd.concat([X_test_iq, X_test_sj])

y_train = np.append(y_train_iq, y_train_sj, axis=0)
y_test  = np.append(y_test_iq, y_test_sj, axis=0)

# continue with plain NumPy arrays
X_train = X_train.values
X_test = X_test.values

# reshape input for conv1D: [samples, columns, 1]
X_train_c = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test_c = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

# reshape data for LSTM, input to be 3D [samples, timesteps, features] with a single timestep
X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

## Long Short-Term Memory (LSTM)

In [14]:
# Network design: three stacked LSTM layers, dropout, and a single linear output unit.
lstm_input_shape = (X_train.shape[1], X_train.shape[2])  # (timesteps, features)

model = Sequential([
    LSTM(150, input_shape=lstm_input_shape, return_sequences=True, activation='relu', kernel_regularizer=reg),
    LSTM(300, return_sequences=True, activation='relu', kernel_regularizer=reg),
    # last recurrent layer returns only its final output
    LSTM(150, kernel_regularizer=reg),
    Dropout(0.2),
    Dense(1),
    Activation("linear"),
])

# trained with mean squared error on the scaled labels
model.compile(loss='mse', optimizer='adam')

In [15]:
# Train the model; the checkpoint callback only writes the weights when the monitored value improves
lstm_weights_path = 'saved_models/weights.LSTM.it5.hdf5'
checkpointer = ModelCheckpoint(filepath=lstm_weights_path, save_best_only=True, verbose=2)

# fit network (rows are fed in their stored order: shuffle=False)
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=200,
    batch_size=16,
    shuffle=False,
    verbose=0,
    callbacks=[checkpointer])

# load best weights, as saved by the checkpoint callback
model.load_weights(lstm_weights_path)

Epoch 00001: val_loss improved from inf to 0.00862, saving model to saved_models/weights.LSTM.it5.hdf5
Epoch 00002: val_loss improved from 0.00862 to 0.00768, saving model to saved_models/weights.LSTM.it5.hdf5
Epoch 00003: val_loss improved from 0.00768 to 0.00755, saving model to saved_models/weights.LSTM.it5.hdf5
Epoch 00004: val_loss improved from 0.00755 to 0.00751, saving model to saved_models/weights.LSTM.it5.hdf5
Epoch 00005: val_loss improved from 0.00751 to 0.00742, saving model to saved_models/weights.LSTM.it5.hdf5
Epoch 00006: val_loss improved from 0.00742 to 0.00735, saving model to saved_models/weights.LSTM.it5.hdf5
Epoch 00007: val_loss improved from 0.00735 to 0.00730, saving model to saved_models/weights.LSTM.it5.hdf5
Epoch 00008: val_loss improved from 0.00730 to 0.00725, saving model to saved_models/weights.LSTM.it5.hdf5
Epoch 00009: val_loss improved from 0.00725 to 0.00723, saving model to saved_models/weights.LSTM.it5.hdf5
Epoch 00010: val_loss improved from 0.007

Epoch 00139: val_loss did not improve
Epoch 00140: val_loss did not improve
Epoch 00141: val_loss did not improve
Epoch 00142: val_loss did not improve
Epoch 00143: val_loss did not improve
Epoch 00144: val_loss did not improve
Epoch 00145: val_loss did not improve
Epoch 00146: val_loss did not improve
Epoch 00147: val_loss did not improve
Epoch 00148: val_loss did not improve
Epoch 00149: val_loss did not improve
Epoch 00150: val_loss did not improve
Epoch 00151: val_loss did not improve
Epoch 00152: val_loss did not improve
Epoch 00153: val_loss did not improve
Epoch 00154: val_loss did not improve
Epoch 00155: val_loss did not improve
Epoch 00156: val_loss did not improve
Epoch 00157: val_loss did not improve
Epoch 00158: val_loss did not improve
Epoch 00159: val_loss did not improve
Epoch 00160: val_loss did not improve
Epoch 00161: val_loss did not improve
Epoch 00162: val_loss did not improve
Epoch 00163: val_loss did not improve
Epoch 00164: val_loss did not improve
Epoch 00165:

In [16]:
# make a prediction for Iquitos
y_submission = model.predict(X_submission_iq)

# invert scaling for forecast
# scalerLabels_iq is the object returned by scaler.fit(...), i.e. the shared `scaler` itself, and that
# scaler is refit on the San Juan data afterwards. A dedicated scaler fit on the Iquitos labels maps
# the predictions back with the Iquitos label range.
label_scaler_iq = MinMaxScaler(feature_range=(0, 1)).fit(w_stripped_labels_train_iq)
y_submission = label_scaler_iq.inverse_transform(y_submission)

# round to whole case counts
y_submission_iq = np.around(y_submission, decimals=0)
y_submission_iq = y_submission_iq.astype(int)

In [17]:
# make a prediction for San Juan
y_submission = model.predict(X_submission_sj)

# invert scaling for forecast, using the scaler that was fit on the San Juan labels
y_submission = scalerLabels_sj.inverse_transform(y_submission)

# round to whole case counts
y_submission_sj = np.around(y_submission, decimals=0)
y_submission_sj = y_submission_sj.astype(int)

### Create LSTM Submission File

In [18]:
# San Juan - city,year,weekofyear,total_cases
# .copy() gives an independent frame, so insert() does not write to a slice of features_test_sj
submission_sj = features_test_sj[['year','weekofyear']].copy()
submission_sj.insert( 0,'city','sj')

# concat(axis=1) aligns on the index: a length mismatch would silently produce NaN rows
assert len(y_submission_sj) == len(submission_sj), "San Juan: number of predictions differs from test rows"
df_y_submission_sj = pd.DataFrame(y_submission_sj, columns=['total_cases'], index=submission_sj.index)
submission_sj = pd.concat([submission_sj, df_y_submission_sj], axis=1)

# Iquitos - city,year,weekofyear,total_cases
submission_iq = features_test_iq[['year','weekofyear']].copy()
submission_iq.insert( 0,'city','iq')

assert len(y_submission_iq) == len(submission_iq), "Iquitos: number of predictions differs from test rows"
df_y_submission_iq = pd.DataFrame(y_submission_iq, columns=['total_cases'], index=submission_iq.index)
submission_iq = pd.concat([submission_iq, df_y_submission_iq], axis=1)

# join both predictions, San Juan rows first
submission = pd.concat([submission_sj, submission_iq])
submission = submission.reset_index(drop = True)

#write into csv
submission.to_csv("Submission/Submission_it5_lstm_b16_L2_w20.csv", encoding='utf-8', index=False)

### LSTM Submission Score 

#### With kernel regularizer L2 (0.00001),  Batch Size 8 and Window Size 8:


#### With kernel regularizer L2 (0.00001),  Batch Size 16 and Window Size 4:


#### Without kernel regularizer,  Batch Size 32 and window size 8:



#### With kernel regularizer L2 (0.00001),  Batch Size 32 and Window Size 8:


#### With kernel regularizer L2 (0.00001),  Batch Size 32 and Window Size 10:
Bad result

#### With kernel regularizer L2 (0.00001),  Batch Size 32 and Window Size 12:
Bad result

## Sequence classification with 1D convolutions

### Train

In [82]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalAveragePooling1D, MaxPooling1D
from keras.layers import Activation
from keras.callbacks import ModelCheckpoint
from keras.regularizers import L1L2

# 1D convolutional network: two pairs of Conv1D layers, global average pooling,
# dropout and a single sigmoid output unit (outputs lie between 0 and 1).
conv_input_shape = ( X_train_c.shape[1], 1)

model_c = Sequential([
    Conv1D(96, 2, activation='relu', input_shape=conv_input_shape, kernel_regularizer=reg),
    Conv1D(96, 2, activation='relu', kernel_regularizer=reg),
    # NOTE(review): a pool size of 1 should leave the sequence unchanged - confirm this is intended
    MaxPooling1D(1),
    Conv1D(192, 2, activation='relu', kernel_regularizer=reg),
    Conv1D(192, 2, activation='relu', kernel_regularizer=reg),
    GlobalAveragePooling1D(),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

# trained with mean squared error on the scaled labels
model_c.compile(loss='mse', optimizer='adam')

In [83]:
# The checkpoint callback only writes the weights when the monitored value improves
conv_weights_path = 'saved_models/weights.conv1D.b32.it2.iq.hdf5'
checkpointer = ModelCheckpoint(filepath=conv_weights_path, save_best_only=True, verbose=2)

# fit network (rows are fed in their stored order: shuffle=False)
history = model_c.fit(
    X_train_c, y_train,
    validation_data=(X_test_c, y_test),
    epochs=200,
    batch_size=32,
    shuffle=False,
    verbose=0,
    callbacks=[checkpointer])

# load best weights, as saved by the checkpoint callback
model_c.load_weights(conv_weights_path)

Epoch 00001: val_loss improved from inf to 0.01363, saving model to saved_models/weights.conv1D.b32.it2.iq.hdf5
Epoch 00002: val_loss improved from 0.01363 to 0.01297, saving model to saved_models/weights.conv1D.b32.it2.iq.hdf5
Epoch 00003: val_loss improved from 0.01297 to 0.01251, saving model to saved_models/weights.conv1D.b32.it2.iq.hdf5
Epoch 00004: val_loss improved from 0.01251 to 0.01007, saving model to saved_models/weights.conv1D.b32.it2.iq.hdf5
Epoch 00005: val_loss did not improve
Epoch 00006: val_loss improved from 0.01007 to 0.00967, saving model to saved_models/weights.conv1D.b32.it2.iq.hdf5
Epoch 00007: val_loss did not improve
Epoch 00008: val_loss did not improve
Epoch 00009: val_loss improved from 0.00967 to 0.00793, saving model to saved_models/weights.conv1D.b32.it2.iq.hdf5
Epoch 00010: val_loss did not improve
Epoch 00011: val_loss improved from 0.00793 to 0.00783, saving model to saved_models/weights.conv1D.b32.it2.iq.hdf5
Epoch 00012: val_loss did not improve
Ep

Epoch 00147: val_loss did not improve
Epoch 00148: val_loss did not improve
Epoch 00149: val_loss did not improve
Epoch 00150: val_loss improved from 0.00678 to 0.00676, saving model to saved_models/weights.conv1D.b32.it2.iq.hdf5
Epoch 00151: val_loss did not improve
Epoch 00152: val_loss did not improve
Epoch 00153: val_loss did not improve
Epoch 00154: val_loss did not improve
Epoch 00155: val_loss improved from 0.00676 to 0.00676, saving model to saved_models/weights.conv1D.b32.it2.iq.hdf5
Epoch 00156: val_loss did not improve
Epoch 00157: val_loss did not improve
Epoch 00158: val_loss did not improve
Epoch 00159: val_loss did not improve
Epoch 00160: val_loss did not improve
Epoch 00161: val_loss did not improve
Epoch 00162: val_loss did not improve
Epoch 00163: val_loss improved from 0.00676 to 0.00676, saving model to saved_models/weights.conv1D.b32.it2.iq.hdf5
Epoch 00164: val_loss did not improve
Epoch 00165: val_loss did not improve
Epoch 00166: val_loss did not improve
Epoch 

In [85]:
# make a prediction for Iquitos
y_submission = model_c.predict(X_submission_iq_c)

# invert scaling for forecast
# scalerLabels_iq is the object returned by scaler.fit(...), i.e. the shared `scaler` itself, and that
# scaler is refit on the San Juan data afterwards. A dedicated scaler fit on the Iquitos labels maps
# the predictions back with the Iquitos label range.
label_scaler_iq = MinMaxScaler(feature_range=(0, 1)).fit(w_stripped_labels_train_iq)
y_submission = label_scaler_iq.inverse_transform(y_submission)

# round to whole case counts
y_submission_iq = np.around(y_submission, decimals=0)
y_submission_iq = y_submission_iq.astype(int)

In [86]:
# make a prediction for San Juan
y_submission = model_c.predict(X_submission_sj_c)

# invert scaling for forecast, using the scaler that was fit on the San Juan labels
y_submission = scalerLabels_sj.inverse_transform(y_submission)

# round to whole case counts
y_submission_sj = np.around(y_submission, decimals=0)
y_submission_sj = y_submission_sj.astype(int)

### Create Conv1D Submission File

In [87]:
# San Juan - city,year,weekofyear,total_cases
# .copy() gives an independent frame, so insert() does not write to a slice of features_test_sj
submission_sj = features_test_sj[['year','weekofyear']].copy()
submission_sj.insert( 0,'city','sj')

# concat(axis=1) aligns on the index: a length mismatch would silently produce NaN rows
assert len(y_submission_sj) == len(submission_sj), "San Juan: number of predictions differs from test rows"
df_y_submission_sj = pd.DataFrame(y_submission_sj, columns=['total_cases'], index=submission_sj.index)
submission_sj = pd.concat([submission_sj, df_y_submission_sj], axis=1)

# Iquitos - city,year,weekofyear,total_cases
submission_iq = features_test_iq[['year','weekofyear']].copy()
submission_iq.insert( 0,'city','iq')

assert len(y_submission_iq) == len(submission_iq), "Iquitos: number of predictions differs from test rows"
df_y_submission_iq = pd.DataFrame(y_submission_iq, columns=['total_cases'], index=submission_iq.index)
submission_iq = pd.concat([submission_iq, df_y_submission_iq], axis=1)

# join both predictions, San Juan rows first
submission = pd.concat([submission_sj, submission_iq])
submission = submission.reset_index(drop = True)

#write into csv
submission.to_csv("Submission/Submission_5_conv1d_b32_L2_w12.csv", encoding='utf-8', index=False)

#### With kernel regularizer L2 (0.00001), Batch Size 32, window 12:
35.2380

#### With kernel regularizer L2 (0.00001), Batch Size 32, window 4:
