# 1. Data Preprocessing
## Import Dependencies

In [1]:
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from collections import defaultdict
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [2]:
for dirname, _, filenames in os.walk('./data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

./data/test_round2.csv
./data/.DS_Store
./data/train_round2.csv
./data/.graph_round2.csv.icloud
./data/test.csv
./data/graph.csv
./data/submission_round2.csv
./data/submission.csv
./data/train.csv


## Load Data

In [3]:
data_dir = 'data'
temporal_data_path = os.path.join(data_dir, 'train_round2.csv')
mobility_data_path = os.path.join(data_dir, 'graph_round2.csv')

temporal_data = pd.read_csv(temporal_data_path)
temporal_data.describe()
#  training data from 04/12/2020 to 11/22/2020.

Unnamed: 0,ID,Confirmed,Deaths,Recovered,Active,Incident_Rate,People_Tested,People_Hospitalized,Mortality_Rate,Testing_Rate,Hospitalization_Rate
count,11250.0,11250.0,11250.0,9419.0,11250.0,11250.0,10550.0,5047.0,10550.0,11250.0,5047.0
mean,5624.5,95836.25,3057.178311,40276.189086,59072.48,1357.938709,1161268.0,6250.044185,3.300561,20085.845676,12.197144
std,3247.739599,151662.1,5377.615328,81746.843553,124372.8,1199.193935,2031813.0,13880.694435,2.039538,19319.956138,5.255183
min,0.0,270.0,0.0,0.0,-989.0,32.771004,5459.0,10.0,0.0,460.300152,2.302896
25%,2812.25,10546.0,273.0,3030.5,4599.5,374.587118,157964.0,595.0,1.708176,5761.788808,8.387037
50%,5624.5,41758.0,1031.0,10376.0,15791.0,1045.850447,490165.5,2068.0,2.802207,15172.226199,11.351161
75%,8436.75,117495.8,3387.0,48028.0,52896.75,2046.182244,1247765.0,6134.5,4.425834,27209.961598,15.396088
max,11249.0,1153529.0,34319.0,913796.0,1095798.0,9537.675412,19565150.0,89995.0,9.741481,134755.702292,38.50119


In [4]:
states = list(np.unique(temporal_data['Province_State']))
state_cum_temporal_data = dict.fromkeys(states, None)
state_temporal_data = dict.fromkeys(states, None)
# state_cum_temporal_scaler = defaultdict(StandardScaler)
# state_temporal_scaler = defaultdict(StandardScaler)
dropped_attr = ['Date',
                'Active',
                'ID', 
                'Province_State', 
                'Incident_Rate', 
                'Recovered', 
                'People_Tested', 
                'People_Hospitalized', 
                'Mortality_Rate', 
                'Testing_Rate', 
                'Hospitalization_Rate']

for s in states:
    df_filter = temporal_data['Province_State'] == s
    state_df = temporal_data[df_filter]
    # Daily difference data
    state_temporal_data[s] = state_df.drop(dropped_attr, 1)
    for col in state_temporal_data[s]:
        data = state_temporal_data[s][col].tolist()
        diff = [i - j for i, j in zip(data, [*[data[0]], *data[:len(data) - 1]])]
        state_temporal_data[s][col] = diff
    save_columns = state_temporal_data[s].columns
#     state_temporal_data[s] = state_temporal_scaler[s].fit_transform(state_temporal_data[s])
    state_temporal_data[s] = pd.DataFrame(state_temporal_data[s], columns=save_columns)
    # Daily cumulative data
    state_cum_temporal_data[s] = state_df.drop(dropped_attr, 1)
    save_columns = state_cum_temporal_data[s].columns
#     state_cum_temporal_data[s] = state_cum_temporal_scaler[s].fit_transform(state_cum_temporal_data[s])
    state_cum_temporal_data[s] = pd.DataFrame(state_cum_temporal_data[s], columns=save_columns)
    
print(state_temporal_data['California'])
print(state_cum_temporal_data['California'])

       Confirmed  Deaths
4              0       0
54          1136      74
104         1425      53
154         1330      93
204          991      96
...          ...     ...
11004      11755     106
11054      13134      89
11104      12576      98
11154      15685      32
11204       9089      54

[225 rows x 2 columns]
       Confirmed  Deaths
4          22795     640
54         23931     714
104        25356     767
154        26686     860
204        27677     956
...          ...     ...
11004    1064040   18453
11054    1077174   18542
11104    1089750   18640
11154    1105435   18672
11204    1114524   18726

[225 rows x 2 columns]


## Train a RNN Model with LSTM

In [5]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras import Sequential
from tensorflow.keras import Input
from tensorflow.keras.models import Model

# window_size is use n-1 days to predict nth day
window_size = 22
forecast_days = 7

# test model for current_state
current_state = 'Georgia'
data_training = state_temporal_data[current_state]

In [6]:
scaler = MinMaxScaler()
data_training = scaler.fit_transform(data_training)

In [7]:
X_train = []
y_train = []

data_training_np = np.array(data_training)

for i in range(data_training.shape[0] - window_size):
    X_train.append(data_training[i : i + window_size])
    y_train.append(data_training_np[i + window_size])
    
X_train, y_train = np.array(X_train), np.array(y_train)

In [8]:
X_train.shape

(203, 22, 2)

In [9]:
y_train.shape

(203, 2)

In [10]:
model = keras.Sequential()

model.add(LSTM(units=10, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.1))

model.add(LSTM(units=20, return_sequences=False))
model.add(Dropout(0.1))

model.add(Dense(units = 2))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 22, 10)            520       
_________________________________________________________________
dropout (Dropout)            (None, 22, 10)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 20)                2480      
_________________________________________________________________
dropout_1 (Dropout)          (None, 20)                0         
_________________________________________________________________
dense (Dense)                (None, 2)                 42        
Total params: 3,042
Trainable params: 3,042
Non-trainable params: 0
_________________________________________________________________


In [11]:
model.compile(optimizer='adam', loss = 'mean_squared_logarithmic_error')
model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=1)

Train on 203 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f84df4a2d68>

In [12]:
prediction_queue = X_train[-1:]
prediction_queue = np.array(prediction_queue)
# prediction_queue

In [13]:
# prediction_queue should take in this y_pred as the last day and pop the first day in the queue
# use model.predict(prediction_queue) with the new prediction_queue to get second day.
# need to predict for next forecast_days days.
for i in range(forecast_days):  
    y_pred = model.predict(prediction_queue)
    prediction_queue = np.append(prediction_queue, y_pred)
    prediction_queue = np.delete(prediction_queue, 0)
    prediction_queue = np.delete(prediction_queue, 0)
    prediction_queue = prediction_queue.reshape(1,int(prediction_queue.shape[0]/2),2)

In [14]:
y_case_forecast = []
y_death_forecast = []

for i in range(forecast_days):
    y_case_forecast.append(prediction_queue[0][i + window_size - forecast_days][0])
    y_death_forecast.append(prediction_queue[0][i + window_size - forecast_days][1])

In [15]:
case_scale = 1/scaler.scale_[0]
death_scale = 1/scaler.scale_[1]

In [16]:
y_case_forecast = [case_scale * i for i in y_case_forecast]
y_death_forecast = [death_scale * i for i in y_death_forecast]

In [17]:
last_day_case = state_cum_temporal_data[current_state]['Confirmed'].iloc[-1]
last_day_death = state_cum_temporal_data[current_state]['Deaths'].iloc[-1]

y_cum_case_forecast = []
y_cum_death_forecast = []

cum_case = last_day_case
cum_death = last_day_death
for case in y_case_forecast:
    cum_case += case
    y_cum_case_forecast.append(cum_case)
    
for death in y_death_forecast:
    cum_death += death
    y_cum_death_forecast.append(cum_death)

In [18]:
y_cum_case_forecast

[452587.0703679025,
 456063.4661125466,
 459573.43573272973,
 463116.4421086088,
 466302.81951126456,
 469534.4814619869,
 472808.1929969862]

In [19]:
y_cum_death_forecast

[9246.42029929161,
 9294.37745976448,
 9342.326924085617,
 9390.386852502823,
 9435.158864736557,
 9480.6154088974,
 9526.674355745316]

## Train a RNN with LSTM for Every State

In [20]:
num_states = len(states_data.keys())
initialization = np.zeros((forecast_day*num_states, 3))
result_df = pd.DataFrame(initialization, columns=['ForecastID', 'Confirmed', 'Deaths'], dtype='int')

def forcast(window_size, forecast_day, predict_label):
    predict_index = 0
    if predict_label == 'Deaths':
        predict_index = 1
        
    state_count = 0
    for state in states_data.keys():
        current_state = state
        num_state_train = state_train_data[current_state].shape[0]
        num_state_validate = state_validate_data[current_state].shape[0]
        validate_range = np.arange(num_state_train, num_state_train+num_state_validate)
        data_training = np.array(state_train_data[current_state])

        X_train = []
        y_train = []

        for i in range(window_size, data_training.shape[0]-forecast_day):
            X_train.append(data_training[i-window_size:i])
            y_train.append(data_training[i+forecast_day, predict_index])

        X_train, y_train = np.array(X_train), np.array(y_train)
        model = keras.Sequential()

        model.add(LSTM(units = 60, activation = 'relu', return_sequences = True, input_shape = (X_train.shape[1], X_train.shape[2])))
        model.add(Dropout(0.2))

        model.add(LSTM(units = 80, activation = 'relu', return_sequences = True))
        model.add(Dropout(0.2))

        model.add(LSTM(units = 120, activation = 'relu', return_sequences = True))
        model.add(Dropout(0.2))

        model.add(LSTM(units = 200, activation = 'relu'))
        model.add(Dropout(0.2))

        model.add(Dense(units = 1))

        # model.summary()

        model.compile(optimizer='adam', loss = 'mean_squared_error')
        model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=0)

        previous_days = state_train_data[current_state].tail(window_size+forecast_day)
        inputs = np.array(previous_days)
        X_test = []
        y_test = []

        for i in range(window_size, inputs.shape[0]):
            X_test.append(inputs[i-window_size:i])
            y_test.append(inputs[i,predict_index])

        X_test, y_test = np.array(X_test), np.array(y_test)

        y_pred = model.predict(X_test)
        y_result = y_pred.flatten().tolist()
        y_result = np.array(y_result) * state_scaler[current_state].scale_[predict_index] + state_scaler[current_state].mean_[predict_index]

        date_count = 0
        for r in y_result:
            forecast_id = state_count + date_count * num_states
            result_df.loc[forecast_id]['ForecastID'] = forecast_id;
            result_df.loc[forecast_id][predict_label] = r;
            date_count += 1
        state_count += 1

NameError: name 'states_data' is not defined

In [None]:
forcast(7, 7, 'Confirmed')
forcast(7, 7, 'Deaths')

In [None]:
result_df.to_csv('./data/submission_round2.csv')