# 1. Data Preprocessing

## 1.1 Import Dependencies

In [1]:
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from collections import defaultdict
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [2]:
for dirname, _, filenames in os.walk('./data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

./data/test_round2.csv
./data/.DS_Store
./data/train_round2.csv
./data/graph_round2.csv
./data/test.csv
./data/graph.csv
./data/submission_round2.csv
./data/submission.csv
./data/train.csv
./data/covid-full.csv


## 1.2 Load Data

In [3]:
data_dir = 'data'
temporal_data_path = os.path.join(data_dir, 'train.csv')
mobility_data_path = os.path.join(data_dir, 'graph_round2.csv')

temporal_data = pd.read_csv(temporal_data_path)
temporal_data.describe()

# train.csv 04/12/2020 to 08/31/2020
# train_round2.csv 04/12/2020 to 11/22/2020
# train_round2-1205.csv 04/12/2020 to 12/05/2020
# to predict 12/07/2020 to 12/13/2020, forecast 8 days, take the last 7 days

Unnamed: 0,ID,Confirmed,Deaths,Recovered,Active,Incident_Rate,People_Tested,People_Hospitalized,Mortality_Rate,Testing_Rate,Hospitalization_Rate
count,7100.0,7100.0,7100.0,5771.0,7100.0,7100.0,7100.0,5047.0,7100.0,7100.0,5047.0
mean,3549.5,56374.339577,2323.883099,20422.274649,37635.120986,747.27379,642706.9,6250.044185,3.653231,9666.099116,12.197144
std,2049.737788,95058.665054,4742.274729,39262.213465,79067.101832,637.39747,1127555.0,13880.694435,2.074042,7779.560336,5.255183
min,0.0,270.0,0.0,0.0,-989.0,32.771004,5459.0,10.0,0.0,460.300152,2.302896
25%,1774.75,6097.5,153.0,1614.5,2849.0,224.815702,100077.5,595.0,1.928925,3379.288445,8.387037
50%,3549.5,22892.0,681.0,5881.0,10409.0,540.836745,264323.5,2068.0,3.362027,7842.820591,11.351161
75%,5324.25,62959.25,2334.0,26103.0,32106.5,1126.282666,704839.5,6134.5,4.797564,14127.521137,15.396088
max,7099.0,712475.0,32957.0,507499.0,699453.0,3187.773001,11373300.0,89995.0,9.741481,49502.860687,38.50119


In [4]:
states = list(np.unique(temporal_data['Province_State']))
state_cum_temporal_data = dict.fromkeys(states, None)
state_temporal_data = dict.fromkeys(states, None)
# state_cum_temporal_scaler = defaultdict(StandardScaler)
# state_temporal_scaler = defaultdict(StandardScaler)
dropped_attr = ['Date',
                'Active',
                'ID', 
                'Province_State', 
                'Incident_Rate', 
                'Recovered', 
                'People_Tested', 
                'People_Hospitalized', 
                'Mortality_Rate', 
                'Testing_Rate', 
                'Hospitalization_Rate']

for s in states:
    df_filter = temporal_data['Province_State'] == s
    state_df = temporal_data[df_filter]
    # Daily difference data
    state_temporal_data[s] = state_df.drop(dropped_attr, 1)
    for col in state_temporal_data[s]:
        data = state_temporal_data[s][col].tolist()
        diff = [i - j for i, j in zip(data, [*[data[0]], *data[:len(data) - 1]])]
        state_temporal_data[s][col] = diff
    save_columns = state_temporal_data[s].columns
#     state_temporal_data[s] = state_temporal_scaler[s].fit_transform(state_temporal_data[s])
    state_temporal_data[s] = pd.DataFrame(state_temporal_data[s], columns=save_columns)
    # Daily cumulative data
    state_cum_temporal_data[s] = state_df.drop(dropped_attr, 1)
    save_columns = state_cum_temporal_data[s].columns
#     state_cum_temporal_data[s] = state_cum_temporal_scaler[s].fit_transform(state_cum_temporal_data[s])
    state_cum_temporal_data[s] = pd.DataFrame(state_cum_temporal_data[s], columns=save_columns)

In [5]:
state_temporal_data['South Dakota'].values

array([[  0,   0],
       [138,   0],
       [120,   0],
       [180,   0],
       [143,   1],
       [100,   0],
       [131,   0],
       [ 93,   0],
       [ 50,   0],
       [ 70,   1],
       [103,   1],
       [ 98,   0],
       [ 84,   1],
       [107,   0],
       [ 65,   1],
       [ 32,   0],
       [ 69,   0],
       [ 60,   2],
       [ 76,   4],
       [ 76,   4],
       [ 63,   0],
       [ 43,   0],
       [ 37,   0],
       [ 53,   3],
       [ 59,   5],
       [126,   2],
       [239,   0],
       [248,   3],
       [124,   0],
       [ 97,   0],
       [ 49,   5],
       [ 69,   0],
       [ 60,   4],
       [ 95,   1],
       [ 72,   0],
       [ 28,   0],
       [ 40,   0],
       [ 58,   2],
       [ 92,   0],
       [  0,   2],
       [179,   2],
       [108,   0],
       [ 99,   0],
       [ 23,   0],
       [ 67,   0],
       [ 57,   4],
       [ 83,   0],
       [ 73,   5],
       [ 94,   3],
       [ 33,   0],
       [ 41,   0],
       [ 33,   0],
       [ 95,

## Train a RNN Model with LSTM

In [6]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras import Sequential
from tensorflow.keras import Input
from tensorflow.keras.models import Model

# window_size is use n days to predict (n+1)th day
window_size = 14
forecast_days = 26

# test model for current_state
current_state = 'South Dakota'
data_training = state_temporal_data[current_state]

In [7]:
scaler = MinMaxScaler()
# scaler = StandardScaler()
data_training = scaler.fit_transform(data_training)

In [8]:
X_train = []
y_train = []

data_training = np.array(data_training)
data_training_np = np.array(data_training)

for i in range(data_training.shape[0] - window_size):
    X_train.append(data_training[i : i + window_size])
    y_train.append(data_training_np[i + window_size])
    
X_train, y_train = np.array(X_train), np.array(y_train)

In [9]:
X_train.shape

(128, 14, 2)

In [10]:
y_train.shape

(128, 2)

In [11]:
model = keras.Sequential()

model.add(LSTM(units=20, activation = 'relu', return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.3))

model.add(LSTM(units=10, activation = 'relu', return_sequences=True))
model.add(Dropout(0.3))

model.add(LSTM(units=20, activation = 'relu', return_sequences=False))
model.add(Dropout(0.3))
model.add(Dense(units = 2))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 14, 20)            1840      
_________________________________________________________________
dropout (Dropout)            (None, 14, 20)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 14, 10)            1240      
_________________________________________________________________
dropout_1 (Dropout)          (None, 14, 10)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 20)                2480      
_________________________________________________________________
dropout_2 (Dropout)          (None, 20)                0         
_________________________________________________________________
dense (Dense)                (None, 2)                 4

In [13]:
model.compile(optimizer='adam', loss = 'mean_squared_logarithmic_error')
model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=1)

Train on 128 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7fad5a8d5208>

In [14]:
prediction_queue = X_train[-1:]
prediction_queue = np.array(prediction_queue)
prediction_queue

array([[[0.13804173, 0.        ],
        [0.13001605, 0.16666667],
        [0.20064205, 0.16666667],
        [0.20064205, 0.33333333],
        [0.30979133, 0.33333333],
        [0.40288925, 0.16666667],
        [0.22632424, 0.16666667],
        [0.23916533, 0.        ],
        [0.12841091, 0.        ],
        [0.105939  , 0.16666667],
        [1.        , 0.        ],
        [0.51845907, 0.5       ],
        [0.68218299, 0.33333333],
        [0.60995185, 0.        ]]])

In [15]:
y_case_forecast = []
y_death_forecast = []

for i in range(forecast_days):  
    y_pred = model.predict(prediction_queue)
    y_case_forecast.append(y_pred[0][0])
    y_death_forecast.append(y_pred[0][1])
    prediction_queue = np.append(prediction_queue, y_pred)
    prediction_queue = np.delete(prediction_queue, 0)
    prediction_queue = np.delete(prediction_queue, 0)
    prediction_queue = prediction_queue.reshape(1,int(prediction_queue.shape[0] / 2),2)


In [16]:
y_case_forecast

[0.14655918,
 0.1492577,
 0.15208769,
 0.15374026,
 0.1550808,
 0.15498659,
 0.15324968,
 0.15238926,
 0.15063655,
 0.14949316,
 0.14874186,
 0.14180624,
 0.1393598,
 0.13587993,
 0.13238254,
 0.13237335,
 0.132336,
 0.13227093,
 0.13218659,
 0.13208279,
 0.13196623,
 0.1318525,
 0.13174048,
 0.13163614,
 0.1315398,
 0.13145022]

In [17]:
y_death_forecast

[0.19343343,
 0.1963694,
 0.20019878,
 0.20275058,
 0.2053798,
 0.20604958,
 0.2040247,
 0.2033237,
 0.201071,
 0.19966875,
 0.19907501,
 0.18860173,
 0.18577138,
 0.18075037,
 0.17496932,
 0.1750572,
 0.17514911,
 0.17523506,
 0.17531028,
 0.1753673,
 0.17542052,
 0.17546085,
 0.17548376,
 0.17549466,
 0.17548037,
 0.17545009]

In [18]:
case_scale = 1/scaler.scale_[0]
death_scale = 1/scaler.scale_[1]

In [19]:
y_case_forecast = [case_scale * i for i in y_case_forecast]
y_death_forecast = [death_scale * i for i in y_death_forecast]

In [20]:
last_day_case = state_cum_temporal_data[current_state]['Confirmed'].iloc[-1]
last_day_death = state_cum_temporal_data[current_state]['Deaths'].iloc[-1]

y_cum_case_forecast = []
y_cum_death_forecast = []

cum_case = last_day_case
cum_death = last_day_death
for case in y_case_forecast:
    cum_case += case
    y_cum_case_forecast.append(cum_case)
    
for death in y_death_forecast:
    cum_death += death
    y_cum_death_forecast.append(cum_death)

In [21]:
y_cum_case_forecast

[13600.306368410587,
 13693.293918386102,
 13788.044548287988,
 13883.824728414416,
 13980.440063878894,
 14076.996709540486,
 14172.471260800958,
 14267.409768626094,
 14361.25634162128,
 14454.390578970313,
 14547.056755393744,
 14635.402045935392,
 14722.223202586174,
 14806.876401364803,
 14889.350724965334,
 14971.819320693612,
 15054.26465216279,
 15136.669442921877,
 15219.021689504385,
 15301.309267759323,
 15383.52423107624,
 15465.668334022164,
 15547.742653474212,
 15629.751970395446,
 15711.701269984245,
 15793.594757631421]

In [22]:
y_cum_death_forecast

[168.1606006026268,
 169.33881697058678,
 170.54000967741013,
 171.75651314854622,
 172.98879194259644,
 174.2250894010067,
 175.44923761487007,
 176.6691798567772,
 177.8756058216095,
 179.07361832261086,
 180.2680684030056,
 181.39967879652977,
 182.51430705189705,
 183.59880927205086,
 184.64862516522408,
 185.69896838068962,
 186.74986305832863,
 187.80127343535423,
 188.85313513875008,
 189.90533891320229,
 190.9578620493412,
 192.01062712073326,
 193.06352970004082,
 194.11649763584137,
 195.1693798303604,
 196.22208034992218]

# 2. Train a RNN with LSTM for Every State

## 2.1 Import Dependencies

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras import Sequential
from tensorflow.keras import Input
from tensorflow.keras.models import Model

# window_size is use n-1 days to predict nth day
window_size = 21
forecast_days = 26

## 2.2 Helper Functions

In [None]:
states = state_temporal_data.keys()

def get_training_data(state):
    state_temporal_train = state_temporal_data[state]
    scaler = MinMaxScaler()
    state_temporal_train = scaler.fit_transform(state_temporal_train)
    
    X_train = []
    y_train = []

    state_temporal_train_np = np.array(state_temporal_train)

    for i in range(state_temporal_train.shape[0] - window_size):
        X_train.append(state_temporal_train[i : i + window_size])
        y_train.append(state_temporal_train_np[i + window_size])

    return np.array(X_train), np.array(y_train), scaler
    

def get_model(X_train):
    model = keras.Sequential()

    model.add(LSTM(units=20, activation = 'relu', return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
    model.add(Dropout(0.3))

    model.add(LSTM(units=10, activation = 'relu', return_sequences=True))
    model.add(Dropout(0.3))

    model.add(LSTM(units=20, activation = 'relu', return_sequences=False))
    model.add(Dropout(0.3))
    model.add(Dense(units = 2))
    
    return model


def get_prediction_queue(X_train):
    prediction_queue = X_train[-1:]
    return np.array(prediction_queue)


def predict(model, queue, scaler):
    prediction_queue = queue
    
    for i in range(forecast_days):  
        y_pred = model.predict(prediction_queue)
        prediction_queue = np.append(prediction_queue, y_pred)
        prediction_queue = np.delete(prediction_queue, 0)
        prediction_queue = np.delete(prediction_queue, 0)
        prediction_queue = prediction_queue.reshape(1, prediction_queue.shape[0] // 2, 2)
        
    y_case_forecast = []
    y_death_forecast = []

    for i in range(forecast_days):
        y_case_forecast.append(prediction_queue[0][i + window_size - forecast_days][0])
        y_death_forecast.append(prediction_queue[0][i + window_size - forecast_days][1])
        
    case_scale = 1 / scaler.scale_[0]
    death_scale = 1 / scaler.scale_[1]
    
    return [case_scale * i for i in y_case_forecast], [death_scale * i for i in y_death_forecast]


def get_cum_forecast(state, y_case_forecast, y_death_forecast):
    last_day_case = state_cum_temporal_data[state]['Confirmed'].iloc[-1]
    last_day_death = state_cum_temporal_data[state]['Deaths'].iloc[-1]

    y_cum_case_forecast = []
    y_cum_death_forecast = []

    cum_case = last_day_case
    cum_death = last_day_death
    for case in y_case_forecast:
        cum_case += case
        y_cum_case_forecast.append(cum_case)

    for death in y_death_forecast:
        cum_death += death
        y_cum_death_forecast.append(cum_death)
        
    return y_cum_case_forecast, y_cum_death_forecast

## 2.3 Train and Predict

In [None]:
forecasted = {}

for state in states:
    X_train, y_train, scaler = get_training_data(state)

    model = get_model(X_train)
    model.compile(optimizer='adam', loss = 'mean_squared_logarithmic_error')
    model.fit(X_train, y_train, epochs=50, batch_size=32)
    
    y_case_forecast, y_death_forecast = predict(model, get_prediction_queue(X_train), scaler)
    y_cum_case_forecast, y_cum_death_forecast = get_cum_forecast(state, y_case_forecast, y_death_forecast)
    
    forecasted[state] = (y_cum_case_forecast, y_cum_death_forecast)

## 2.4 Generate Submission File

In [None]:
forecastID = [x for x in range(forecast_days * 50)]
deaths = []
confirmed = []

for i in range(forecast_days):
    for s in states:
        print(f"Day {i+1}/26, {s}")
        confirmed.append(forecasted[s][0][i])
        deaths.append(forecasted[s][1][i])
        
final = pd.DataFrame(list(zip(forecastID, confirmed, deaths)), 
               columns =['ForecastID', 'Confirmed', 'Deaths']) 

final.to_csv("submission.csv", index=False)