In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from os import path
from tqdm import tqdm

from tensorflow import keras
from tensorflow.keras import layers as l

## Preprocess Data

- We removed rows with null values (training examples are abundant, so little is lost) and the rows affected by an anomalous one-day spike in the New York County data, as per https://github.com/CSSEGISandData/COVID-19/issues/3103.
- We used each day of the 14-day COVID history as a separate predictor, along with a few demographic indicators.
- We standardized the features because their ranges differ widely from one feature to another.

In [2]:
def num_to_date(i):
    """Render a day number as text, prefixing "0" when it is not greater than 9.

    Used to build two-character day fields (e.g. 3 -> "03", 12 -> "12").
    """
    text = str(i)
    return text if i > 9 else "0" + text

# Load the combined dataset; the first CSV column is used as the row index.
augmented_df = pd.read_csv("../processed_data/combined.csv", index_col=0)

# drop rows with a null in any of the 15 "<k>_before" columns or in pct_impoverished
required_columns = [f"{k}_before" for k in range(15)]
required_columns.append('pct_impoverished')
augmented_df = augmented_df.dropna(subset=required_columns)

# remove data with spike from NY county https://github.com/CSSEGISandData/COVID-19/issues/3103
# Labels cover 08-31-2020 plus 09-01-2020 through 09-14-2020; `drop` raises
# KeyError if any of these index labels is absent from the frame.
spike_labels = ["36061_08-31-2020"]
spike_labels.extend(f"36061_09-{num_to_date(day)}-2020" for day in range(1, 15))
augmented_df = augmented_df.drop(spike_labels)

In [3]:
# Preview the first five rows of the cleaned frame.
augmented_df.head(n=5)

Unnamed: 0,14_before,13_before,12_before,11_before,10_before,9_before,8_before,7_before,6_before,5_before,...,female_percentage,lat,long,life_expectancy,mortality_risk,all_poverty,median_hh_income,sq_miles,pct_impoverished,pop_density
10003_04-07-2020,27.0,8.0,7.0,19.0,25.0,11.0,15.0,41.0,29.0,19.0,...,51.566562,39.576833,-75.652692,78.985449,49.253442,61319.0,67246.0,426.29,0.110458,1302.242605
10003_04-08-2020,8.0,7.0,19.0,25.0,11.0,15.0,41.0,29.0,19.0,34.0,...,51.566562,39.576833,-75.652692,78.985449,49.253442,61319.0,67246.0,426.29,0.110458,1302.242605
10003_04-09-2020,7.0,19.0,25.0,11.0,15.0,41.0,29.0,19.0,34.0,121.0,...,51.566562,39.576833,-75.652692,78.985449,49.253442,61319.0,67246.0,426.29,0.110458,1302.242605
10003_04-10-2020,19.0,25.0,11.0,15.0,41.0,29.0,19.0,34.0,121.0,36.0,...,51.566562,39.576833,-75.652692,78.985449,49.253442,61319.0,67246.0,426.29,0.110458,1302.242605
10003_04-11-2020,25.0,11.0,15.0,41.0,29.0,19.0,34.0,121.0,36.0,60.0,...,51.566562,39.576833,-75.652692,78.985449,49.253442,61319.0,67246.0,426.29,0.110458,1302.242605


In [4]:
# Feature columns: the counts from 1 to 14 days before, followed by five
# demographic columns. The order matters: later cells slice by position.
predictors = [
    *(f"{day}_before" for day in range(1, 15)),
    'median_age',
    'female_percentage',
    'life_expectancy',
    'pct_impoverished',
    'median_hh_income',
]

# Target is the "0_before" column.
X = augmented_df[predictors]
y = augmented_df['0_before']

# Hold out 10% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=209,
)

In [5]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Prediction



In [6]:
# Window length: the first `window` columns of the scaled matrix are treated as
# the sequence stream and the remaining columns as the demographic stream.
window = 14
dim = len(X_train_scaled)  # size of the first axis of the scaled training matrix

# Demographic stream: every column after the window.
X_train_demo = X_train_scaled[:, window:]

# Sequence stream for the RNN cells, laid out as (rows, 1, window).
# NOTE(review): this gives the recurrent layer one timestep carrying `window`
# features; confirm that is intended rather than (rows, window, 1).
X_train_seq = np.reshape(X_train_scaled[:, :window], (-1, 1, window))

#### Define network architecture

In [10]:
# Sequence input: each sample has shape (1, window).
# NOTE(review): with this shape the LSTM sees a single timestep of `window`
# features; confirm that is the intended layout.
seq_in = keras.Input(shape=(1, window))

# Demographic input: one value per demographic column.
demo_in = keras.Input(shape=X_train_demo.shape[1:])

# Sequence branch: a 16-unit LSTM.
h1_seq = l.LSTM(units=16)(seq_in)

# Demographic branch: a 16-unit ReLU dense layer.
h1_demo = l.Dense(units=16, activation='relu')(demo_in)

# Join the two branches along the last axis.
x = l.Concatenate(axis=-1)([h1_seq, h1_demo])

# Dense layer on top of the joined representation.
x = l.Dense(units=16, activation='relu')(x)

# Single linear output unit.
out = l.Dense(units=1)(x)

#### Compile and summarize model

In [11]:
# Wire the two inputs to the single output.
model = keras.Model(inputs=[seq_in, demo_in], outputs=[out])

# Mean-squared-error loss; no optimizer is passed, so Keras uses its default.
model.compile(loss='mse')

# Print the layer-by-layer summary.
model.summary()

Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 1, 14)]      0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 5)]          0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, 16)           1984        input_3[0][0]                    
__________________________________________________________________________________________________
dense_2 (Dense)                 (None, 16)           96          input_4[0][0]                    
_______________________________________________________________________________________

#### Fit

In [12]:
# Train on the two input streams with Keras's default epoch and batch settings.
model.fit([X_train_seq, X_train_demo], y_train)



<tensorflow.python.keras.callbacks.History at 0x19f047541c8>