In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from os import path
from tqdm import tqdm

In [2]:
def num_to_date(i):
    """Return ``i`` as a string, prefixing a literal "0" when ``i`` is not above 9.

    Used to build two-character day fields for date-style index labels,
    e.g. ``7 -> "07"`` and ``12 -> "12"``.
    """
    text = str(i)
    return text if i > 9 else "0" + text

# Load the combined dataset (first CSV column becomes the index) and clean it
# in a single chain so the cell can be re-run from scratch at any time.
augmented_df = (
    pd.read_csv("../processed_data/combined.csv", index_col=0)
    # drop rows with a null in any "<k>_before" column (k = 0..14) or in pct_impoverished
    .dropna(subset=[f"{lag}_before" for lag in range(15)] + ['pct_impoverished'])
    # remove data with spike from NY county https://github.com/CSSEGISandData/COVID-19/issues/3103
    # (index labels 36061_08-31-2020 and 36061_09-01-2020 .. 36061_09-14-2020)
    .drop(
        ["36061_08-31-2020"]
        + [f"36061_09-{num_to_date(day)}-2020" for day in range(1, 15)]
    )
)



In [3]:
# Features: the lag columns "1_before" .. "14_before" plus five additional
# per-row columns. Target: the "0_before" column.
predictors = [f"{k + 1}_before" for k in range(14)] + ['median_age', 'female_percentage', 'life_expectancy', 'pct_impoverished', 'median_hh_income']
X = augmented_df[predictors]
y = augmented_df['0_before']

# Seed the shuffle so the 90/10 train/test split -- and therefore every metric
# and coefficient computed from it -- is identical on each Restart & Run All.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=SPLIT_SEED
)

In [4]:
# Learn the scaling statistics from the training split only, then apply that
# same fitted transform to both splits so no test-set information leaks in.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (scaler.transform(split) for split in (X_train, X_test))

In [5]:
# Baseline: ordinary least squares on the standardized predictors.
baseline = LinearRegression()
baseline.fit(X_train_scaled, y_train)

y_train_predict = baseline.predict(X_train_scaled)
y_test_predict = baseline.predict(X_test_scaled)

# scikit-learn metrics take (y_true, y_pred) in that order. Plain MSE is
# symmetric, so the values are unchanged, but keeping the documented order
# avoids silent errors if the metric or its options are swapped later.
train_mse = mean_squared_error(y_train, y_train_predict)
test_mse = mean_squared_error(y_test, y_test_predict)

print(f"Train MSE: {train_mse}\n Test MSE: {test_mse}\n")

# Coefficients are in the same order as `predictors`; because the inputs were
# standardized, each one is the change in the target per one standard
# deviation of that predictor.
print("Coefficient values")
for col, coef in zip(predictors, baseline.coef_):
    print(f"{col}: {coef}")

Train MSE: 2203.771351140823
 Test MSE: 2197.295475036151

Coefficient values
1_before: 10.511588702590272
2_before: 10.73714144725318
3_before: 11.1479598690995
4_before: 5.582722622072973
5_before: 3.58461871230837
6_before: 13.44590015616119
7_before: 16.060866708238702
8_before: 1.5387510764499326
9_before: 0.5425768313559217
10_before: 0.45412442301425154
11_before: 4.102901158513511
12_before: -1.554615534245421
13_before: -2.3425822046774982
14_before: 2.9503481225370036
median_age: -1.0622941521627283
female_percentage: 0.546726933358288
life_expectancy: 0.7676587549301737
pct_impoverished: 0.44584296854219074
median_hh_income: 0.714252398256056
