# 1. Data Preprocessing

## 1.1 Import Dependencies

In [1]:
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from collections import defaultdict
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [2]:
for dirname, _, filenames in os.walk('./data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

./data/test_round2.csv
./data/.DS_Store
./data/train_round2.csv
./data/graph_round2.csv
./data/test.csv
./data/graph.csv
./data/submission_round2.csv
./data/submission.csv
./data/train.csv


## 1.2 Load Data

In [3]:
data_dir = 'data'
temporal_data_path = os.path.join(data_dir, 'train.csv')
mobility_data_path = os.path.join(data_dir, 'graph_round2.csv')

temporal_data = pd.read_csv(temporal_data_path)
temporal_data.describe()
# train_round2.csv 04/12/2020 to 11/22/2020.
# train.csv 04/12/2020 to 08/31/2020

Unnamed: 0,ID,Confirmed,Deaths,Recovered,Active,Incident_Rate,People_Tested,People_Hospitalized,Mortality_Rate,Testing_Rate,Hospitalization_Rate
count,7100.0,7100.0,7100.0,5771.0,7100.0,7100.0,7100.0,5047.0,7100.0,7100.0,5047.0
mean,3549.5,56374.339577,2323.883099,20422.274649,37635.120986,747.27379,642706.9,6250.044185,3.653231,9666.099116,12.197144
std,2049.737788,95058.665054,4742.274729,39262.213465,79067.101832,637.39747,1127555.0,13880.694435,2.074042,7779.560336,5.255183
min,0.0,270.0,0.0,0.0,-989.0,32.771004,5459.0,10.0,0.0,460.300152,2.302896
25%,1774.75,6097.5,153.0,1614.5,2849.0,224.815702,100077.5,595.0,1.928925,3379.288445,8.387037
50%,3549.5,22892.0,681.0,5881.0,10409.0,540.836745,264323.5,2068.0,3.362027,7842.820591,11.351161
75%,5324.25,62959.25,2334.0,26103.0,32106.5,1126.282666,704839.5,6134.5,4.797564,14127.521137,15.396088
max,7099.0,712475.0,32957.0,507499.0,699453.0,3187.773001,11373300.0,89995.0,9.741481,49502.860687,38.50119


In [4]:
states = list(np.unique(temporal_data['Province_State']))
state_cum_temporal_data = dict.fromkeys(states, None)
state_temporal_data = dict.fromkeys(states, None)
# state_cum_temporal_scaler = defaultdict(StandardScaler)
# state_temporal_scaler = defaultdict(StandardScaler)
dropped_attr = ['Date',
                'Active',
                'ID', 
                'Province_State', 
                'Incident_Rate', 
                'Recovered', 
                'People_Tested', 
                'People_Hospitalized', 
                'Mortality_Rate', 
                'Testing_Rate', 
                'Hospitalization_Rate']

for s in states:
    df_filter = temporal_data['Province_State'] == s
    state_df = temporal_data[df_filter]
    # Daily difference data
    state_temporal_data[s] = state_df.drop(dropped_attr, 1)
    for col in state_temporal_data[s]:
        data = state_temporal_data[s][col].tolist()
        diff = [i - j for i, j in zip(data, [*[data[0]], *data[:len(data) - 1]])]
        state_temporal_data[s][col] = diff
    save_columns = state_temporal_data[s].columns
#     state_temporal_data[s] = state_temporal_scaler[s].fit_transform(state_temporal_data[s])
    state_temporal_data[s] = pd.DataFrame(state_temporal_data[s], columns=save_columns)
    # Daily cumulative data
    state_cum_temporal_data[s] = state_df.drop(dropped_attr, 1)
    save_columns = state_cum_temporal_data[s].columns
#     state_cum_temporal_data[s] = state_cum_temporal_scaler[s].fit_transform(state_cum_temporal_data[s])
    state_cum_temporal_data[s] = pd.DataFrame(state_cum_temporal_data[s], columns=save_columns)
    
print(state_temporal_data['California'])
print(state_cum_temporal_data['California'])

      Confirmed  Deaths
4             0       0
54         1136      74
104        1425      53
154        1330      93
204         991      96
...         ...     ...
6854       4817     145
6904       5564     128
6954       4653      89
7004       3913      43
7054       6524      85

[142 rows x 2 columns]
      Confirmed  Deaths
4         22795     640
54        23931     714
104       25356     767
154       26686     860
204       27677     956
...         ...     ...
6854     691821   12677
6904     697385   12805
6954     702038   12894
7004     705951   12937
7054     712475   13022

[142 rows x 2 columns]


## Train a RNN Model with LSTM

In [128]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras import Sequential
from tensorflow.keras import Input
from tensorflow.keras.models import Model

# window_size is use n-1 days to predict nth day
window_size = 22
forecast_days = 26

# test model for current_state
current_state = 'Arkansas'
data_training = state_temporal_data[current_state]

In [129]:
scaler = MinMaxScaler()
data_training = scaler.fit_transform(data_training)

In [130]:
X_train = []
y_train = []

data_training_np = np.array(data_training)

for i in range(data_training.shape[0] - window_size):
    X_train.append(data_training[i : i + window_size])
    y_train.append(data_training_np[i + window_size])
    
X_train, y_train = np.array(X_train), np.array(y_train)

In [131]:
X_train

array([[[0.27378508, 0.04347826],
        [0.36276523, 0.13043478],
        [0.3340178 , 0.17391304],
        ...,
        [0.31211499, 0.17391304],
        [0.29774127, 0.39130435],
        [0.31827515, 0.2173913 ]],

       [[0.36276523, 0.13043478],
        [0.3340178 , 0.17391304],
        [0.32238193, 0.08695652],
        ...,
        [0.29774127, 0.39130435],
        [0.31827515, 0.2173913 ],
        [0.31074606, 0.2173913 ]],

       [[0.3340178 , 0.17391304],
        [0.32238193, 0.08695652],
        [0.30869268, 0.2173913 ],
        ...,
        [0.31827515, 0.2173913 ],
        [0.31074606, 0.2173913 ],
        [0.29705681, 0.17391304]],

       ...,

       [[0.96577687, 0.30434783],
        [0.80219028, 0.65217391],
        [0.66529774, 0.43478261],
        ...,
        [0.71800137, 0.95652174],
        [0.76796715, 0.34782609],
        [0.84736482, 0.7826087 ]],

       [[0.80219028, 0.65217391],
        [0.66529774, 0.43478261],
        [0.71526352, 0.52173913],
        .

In [132]:
y_train

array([[0.31074606, 0.2173913 ],
       [0.29705681, 0.17391304],
       [0.33264887, 0.2173913 ],
       [0.33675565, 0.04347826],
       [0.30390144, 0.08695652],
       [0.27378508, 0.04347826],
       [0.45516769, 0.17391304],
       [0.29500342, 0.17391304],
       [0.35660507, 0.08695652],
       [0.32306639, 0.13043478],
       [0.36276523, 0.08695652],
       [0.34017796, 0.04347826],
       [0.35249829, 0.04347826],
       [0.39767283, 0.04347826],
       [0.31074606, 0.13043478],
       [0.34907598, 0.13043478],
       [0.32854209, 0.26086957],
       [0.58521561, 0.17391304],
       [0.37919233, 0.17391304],
       [0.3853525 , 0.13043478],
       [0.3744011 , 0.08695652],
       [0.34702259, 0.08695652],
       [0.37713895, 0.13043478],
       [0.34017796, 0.08695652],
       [0.45242984, 0.26086957],
       [0.43737166, 0.34782609],
       [0.43531828, 0.08695652],
       [0.43805613, 0.04347826],
       [0.40383299, 0.04347826],
       [0.53045859, 0.17391304],
       [0.

In [133]:
model = keras.Sequential()

model.add(LSTM(units=20, activation = 'relu', return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.2))

model.add(LSTM(units=30, activation = 'relu', return_sequences=True))
model.add(Dropout(0.2))

model.add(LSTM(units=40, activation = 'relu', return_sequences=False))
model.add(Dropout(0.2))

model.add(Dense(units = 2))

model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_19 (LSTM)               (None, 22, 20)            1840      
_________________________________________________________________
dropout_19 (Dropout)         (None, 22, 20)            0         
_________________________________________________________________
lstm_20 (LSTM)               (None, 22, 30)            6120      
_________________________________________________________________
dropout_20 (Dropout)         (None, 22, 30)            0         
_________________________________________________________________
lstm_21 (LSTM)               (None, 40)                11360     
_________________________________________________________________
dropout_21 (Dropout)         (None, 40)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 2)                

In [134]:
model.compile(optimizer='adam', loss = 'mean_squared_logarithmic_error')
model.fit(X_train, y_train, epochs=50, batch_size=16, verbose=1)

Train on 120 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7fd26e31f048>

In [135]:
prediction_queue = X_train[-1:]
prediction_queue = np.array(prediction_queue)
prediction_queue

array([[[0.66529774, 0.43478261],
        [0.71526352, 0.52173913],
        [0.53593429, 0.52173913],
        [0.74264203, 0.30434783],
        [0.73237509, 0.47826087],
        [0.70225873, 0.26086957],
        [0.        , 0.60869565],
        [0.73442847, 0.        ],
        [0.55578371, 0.2173913 ],
        [0.55441478, 0.73913043],
        [0.77275838, 0.56521739],
        [0.6495551 , 0.47826087],
        [0.88090349, 1.        ],
        [0.64818617, 0.52173913],
        [0.53045859, 0.60869565],
        [0.49281314, 0.43478261],
        [0.60232717, 0.69565217],
        [0.71800137, 0.95652174],
        [0.76796715, 0.34782609],
        [0.84736482, 0.7826087 ],
        [0.81793292, 0.73913043],
        [0.60095825, 0.56521739]]])

In [136]:
# prediction_queue should take in this y_pred as the last day and pop the first day in the queue
# use model.predict(prediction_queue) with the new prediction_queue to get second day.
# need to predict for next forecast_days days.
for i in range(forecast_days):  
    y_pred = model.predict(prediction_queue)
    prediction_queue = np.append(prediction_queue, y_pred)
    prediction_queue = np.delete(prediction_queue, 0)
    prediction_queue = np.delete(prediction_queue, 0)
    prediction_queue = prediction_queue.reshape(1,int(prediction_queue.shape[0]/2),2)

prediction_queue

array([[[0.55174327, 0.56135386],
        [0.54735863, 0.56098646],
        [0.54214996, 0.56425303],
        [0.54351187, 0.55921417],
        [0.53842252, 0.57075614],
        [0.53662515, 0.57971072],
        [0.53388244, 0.57318926],
        [0.52781695, 0.56612372],
        [0.5238868 , 0.56303334],
        [0.51557171, 0.54047745],
        [0.51186264, 0.53646958],
        [0.50936419, 0.53180474],
        [0.50722307, 0.53286809],
        [0.50313467, 0.5248214 ],
        [0.49715108, 0.50796896],
        [0.4913041 , 0.50604516],
        [0.48482725, 0.49203867],
        [0.47967643, 0.48025283],
        [0.47718105, 0.47747689],
        [0.47483638, 0.4759523 ],
        [0.47218186, 0.47321808],
        [0.46927962, 0.46955955]]])

In [137]:
y_case_forecast = []
y_death_forecast = []

for i in range(forecast_days):
    y_case_forecast.append(prediction_queue[0][i + window_size - forecast_days][0])
    y_death_forecast.append(prediction_queue[0][i + window_size - forecast_days][1])

In [138]:
y_case_forecast

[0.4771810472011566,
 0.4748363792896271,
 0.47218185663223267,
 0.46927961707115173,
 0.5517432689666748,
 0.5473586320877075,
 0.5421499609947205,
 0.5435118675231934,
 0.5384225249290466,
 0.5366251468658447,
 0.5338824391365051,
 0.5278169512748718,
 0.5238867998123169,
 0.5155717134475708,
 0.5118626356124878,
 0.5093641877174377,
 0.5072230696678162,
 0.5031346678733826,
 0.49715107679367065,
 0.49130409955978394,
 0.4848272502422333,
 0.47967642545700073,
 0.4771810472011566,
 0.4748363792896271,
 0.47218185663223267,
 0.46927961707115173]

In [139]:
y_death_forecast

[0.47747689485549927,
 0.47595229744911194,
 0.47321808338165283,
 0.46955955028533936,
 0.561353862285614,
 0.5609864592552185,
 0.564253032207489,
 0.559214174747467,
 0.5707561373710632,
 0.5797107219696045,
 0.5731892585754395,
 0.5661237239837646,
 0.5630333423614502,
 0.540477454662323,
 0.536469578742981,
 0.5318047404289246,
 0.5328680872917175,
 0.524821400642395,
 0.5079689621925354,
 0.5060451626777649,
 0.49203866720199585,
 0.48025283217430115,
 0.47747689485549927,
 0.47595229744911194,
 0.47321808338165283,
 0.46955955028533936]

In [140]:
case_scale = 1/scaler.scale_[0]
death_scale = 1/scaler.scale_[1]

In [141]:
case_scale

1461.0

In [142]:
death_scale

23.0

In [143]:
y_case_forecast = [case_scale * i for i in y_case_forecast]
y_death_forecast = [death_scale * i for i in y_death_forecast]

In [144]:
last_day_case = state_cum_temporal_data[current_state]['Confirmed'].iloc[-1]
last_day_death = state_cum_temporal_data[current_state]['Deaths'].iloc[-1]

y_cum_case_forecast = []
y_cum_death_forecast = []

cum_case = last_day_case
cum_death = last_day_death
for case in y_case_forecast:
    cum_case += case
    y_cum_case_forecast.append(cum_case)
    
for death in y_death_forecast:
    cum_death += death
    y_cum_death_forecast.append(cum_death)

In [145]:
y_cum_case_forecast

[61921.16150996089,
 62614.897460103035,
 63304.75515264273,
 63990.37267318368,
 64796.46958914399,
 65596.16055062413,
 66388.24164363742,
 67182.3124820888,
 67968.94779101014,
 68752.95713058114,
 69532.95937415957,
 70304.09993997216,
 71069.49855449796,
 71822.74882784486,
 72570.5801384747,
 73314.76121672988,
 74055.81412151456,
 74790.89387127757,
 75517.23159447312,
 76235.02688392997,
 76943.35949653387,
 77644.16675412655,
 78341.32826408744,
 79035.06421422958,
 79724.92190676928,
 80410.53942731023]

In [146]:
y_cum_death_forecast

[807.9819685816765,
 818.9288714230061,
 829.8128873407841,
 840.6127569973469,
 853.523895829916,
 866.426584392786,
 879.4044041335583,
 892.26633015275,
 905.3937213122845,
 918.7270679175854,
 931.9104208648205,
 944.9312665164471,
 957.8810333907604,
 970.3120148479939,
 982.6508151590824,
 994.8823241889477,
 1007.1382901966572,
 1019.2091824114323,
 1030.8924685418606,
 1042.5315072834492,
 1053.848396629095,
 1064.894211769104,
 1075.8761803507805,
 1086.82308319211,
 1097.707099109888,
 1108.5069687664509]

# 2. Train a RNN with LSTM for Every State

## 2.1 Import Dependencies

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras import Sequential
from tensorflow.keras import Input
from tensorflow.keras.models import Model

# window_size is use n-1 days to predict nth day
window_size = 22
forecast_days = 26

## 2.2 Helper Functions

In [None]:
states = state_temporal_data.keys()

def get_training_data(state):
    state_temporal_train = state_temporal_data[state]
    scaler = MinMaxScaler()
    state_temporal_train = scaler.fit_transform(state_temporal_train)
    
    X_train = []
    y_train = []

    state_temporal_train_np = np.array(state_temporal_train)

    for i in range(state_temporal_train.shape[0] - window_size):
        X_train.append(state_temporal_train[i : i + window_size])
        y_train.append(state_temporal_train_np[i + window_size])

    return np.array(X_train), np.array(y_train), scaler
    

def get_model(X_train):
    model = keras.Sequential()

    model.add(LSTM(units=20, activation = 'relu', return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
    model.add(Dropout(0.2))

    model.add(LSTM(units=30, activation = 'relu', return_sequences=True))
    model.add(Dropout(0.2))

    model.add(LSTM(units=40, activation = 'relu', return_sequences=False))
    model.add(Dropout(0.2))

    model.add(Dense(units = 2))
    
    return model


def get_prediction_queue(X_train):
    prediction_queue = X_train[-1:]
    return np.array(prediction_queue)


def predict(model, queue, scaler):
    prediction_queue = queue
    
    for i in range(forecast_days):  
        y_pred = model.predict(prediction_queue)
        prediction_queue = np.append(prediction_queue, y_pred)
        prediction_queue = np.delete(prediction_queue, 0)
        prediction_queue = np.delete(prediction_queue, 0)
        prediction_queue = prediction_queue.reshape(1, prediction_queue.shape[0] // 2, 2)
        
    y_case_forecast = []
    y_death_forecast = []

    for i in range(forecast_days):
        y_case_forecast.append(prediction_queue[0][i + window_size - forecast_days][0])
        y_death_forecast.append(prediction_queue[0][i + window_size - forecast_days][1])
        
    case_scale = 1 / scaler.scale_[0]
    death_scale = 1 / scaler.scale_[1]
    
    return [case_scale * i for i in y_case_forecast], [death_scale * i for i in y_death_forecast]


def get_cum_forecast(state, y_case_forecast, y_death_forecast):
    last_day_case = state_cum_temporal_data[state]['Confirmed'].iloc[-1]
    last_day_death = state_cum_temporal_data[state]['Deaths'].iloc[-1]

    y_cum_case_forecast = []
    y_cum_death_forecast = []

    cum_case = last_day_case
    cum_death = last_day_death
    for case in y_case_forecast:
        cum_case += case
        y_cum_case_forecast.append(cum_case)

    for death in y_death_forecast:
        cum_death += death
        y_cum_death_forecast.append(cum_death)
        
    return y_cum_case_forecast, y_cum_death_forecast

## 2.3 Train and Predict

In [None]:
forecasted = {}

for state in states:
    X_train, y_train, scaler = get_training_data(state)

    model = get_model(X_train)
    model.compile(optimizer='adam', loss = 'mean_squared_logarithmic_error')
    model.fit(X_train, y_train, epochs=50, batch_size=16)
    
    y_case_forecast, y_death_forecast = predict(model, get_prediction_queue(X_train), scaler)
    y_cum_case_forecast, y_cum_death_forecast = get_cum_forecast(state, y_case_forecast, y_death_forecast)
    
    forecasted[state] = (y_cum_case_forecast, y_cum_death_forecast)

## 2.4 Generate Submission File

In [None]:
forecastID = [x for x in range(forecast_days * 50)]
deaths = []
confirmed = []

for i in range(forecast_days):
    for s in states:
        print(f"Day {i+1}/26, {s}")
        confirmed.append(forecasted[s][0][i])
        deaths.append(forecasted[s][1][i])
        
final = pd.DataFrame(list(zip(forecastID, confirmed, deaths)), 
               columns =['ForecastID', 'Confirmed', 'Deaths']) 

final.to_csv("submission.csv", index=False)