In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from os import path
from tqdm import tqdm

# Baseline Model

- In this notebook, we build a basic model that predicts the number of new COVID-19 cases in a county on a given day from that county's case history over the previous 14 days and its demographic information.

## Preprocess Data

- We removed rows with null values (training examples are abundant, so little is lost) and rows affected by a one-day spike in the New York County data, as described in https://github.com/CSSEGISandData/COVID-19/issues/3103.
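- We used each of the 14 previous days of COVID-19 history as a separate predictor, along with a few demographic indicators (median age, female percentage, life expectancy, percent impoverished, and median household income).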
- We standardized the features because their ranges differ widely from one another.

In [2]:
def num_to_date(i):
    """Return ``i`` as a string, prefixed with "0" unless ``i`` is greater than 9.

    Used to build the two-digit day part of date labels (e.g. 5 -> "05").
    """
    text = str(i)
    return text if i > 9 else "0" + text

# Load the combined dataset; the first CSV column is used as the row index.
augmented_df = pd.read_csv("../processed_data/combined.csv", index_col=0)

# Discard rows missing any of the `0_before` ... `14_before` columns or `pct_impoverished`.
required_cols = [f"{k}_before" for k in range(15)]
required_cols.append('pct_impoverished')
augmented_df = augmented_df.dropna(subset=required_cols)

# Remove the NY county (36061) rows labelled 08-31-2020 through 09-14-2020, which
# carry the spike reported in https://github.com/CSSEGISandData/COVID-19/issues/3103
spike_labels = ["36061_08-31-2020"]
spike_labels.extend(f"36061_09-{num_to_date(day)}-2020" for day in range(1, 15))
augmented_df = augmented_df.drop(spike_labels)

In [3]:
# Predictors: the `1_before` ... `14_before` columns followed by the demographic columns.
lag_columns = [f"{k + 1}_before" for k in range(14)]
demographic_columns = ['median_age', 'female_percentage', 'life_expectancy', 'pct_impoverished', 'median_hh_income']
predictors = lag_columns + demographic_columns

# Design matrix and target (`0_before`).
X = augmented_df[predictors]
y = augmented_df['0_before']

# Hold out 10% of the rows as a test set; the fixed random_state makes the split repeatable.
# NOTE(review): the split is random at the row level, so rows for the same county on
# nearby dates may presumably end up in both train and test — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=209,
)

In [4]:
# Fit the scaler on the training split only, then apply that same fitted
# transformation to both splits so the test split does not influence the fit.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Prediction

- We fit a linear regression model to the scaled, cleaned data and report the train and test MSE along with the fitted coefficient values.

In [5]:
# Fit a linear regression baseline on the standardized training features.
baseline = LinearRegression()
baseline.fit(X_train_scaled, y_train)

# Predictions on both splits, used to compare train and test error.
y_train_predict = baseline.predict(X_train_scaled)
y_test_predict = baseline.predict(X_test_scaled)

# scikit-learn metrics expect (y_true, y_pred) in that order. Unweighted MSE is
# symmetric, so the values are unchanged, but following the documented order
# avoids mistakes if this is later swapped for an asymmetric metric.
train_mse = mean_squared_error(y_train, y_train_predict)
test_mse = mean_squared_error(y_test, y_test_predict)

print(f"Train MSE: {train_mse}\n Test MSE: {test_mse}\n")

# Coefficients are listed in the same order as `predictors`; because the model was
# fit on standardized inputs, each one is per unit of the scaled feature.
print("Coefficient values")
for coef, col in zip(baseline.coef_, predictors):
    print(f"{col}: {coef}")

Train MSE: 2288.5313032393974
 Test MSE: 1410.6052237455647

Coefficient values
1_before: 9.186384225905895
2_before: 11.279167231487282
3_before: 11.658264570448987
4_before: 6.720346054784222
5_before: 3.9931847405511505
6_before: 14.491819623990756
7_before: 15.551266860594458
8_before: 1.1444712812665954
9_before: 0.7076124258938843
10_before: 0.244652940910131
11_before: 4.072149213933631
12_before: 0.4449296399997864
13_before: -3.469810690790506
14_before: 1.737584364639329
median_age: -0.9844958463607211
female_percentage: 0.4895378893085517
life_expectancy: 0.7505754563157813
pct_impoverished: 0.41151039481048046
median_hh_income: 0.6003583211846797
