In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from os import path

In [2]:
# Read the three preprocessed tables in one pass; the tuple order below fixes
# which file ends up bound to which name.
covid_df, demographics_df, protests_df = (
    pd.read_csv(csv_file)
    for csv_file in (
        "../processed_data/new_confirmed.csv",
        "../processed_data/demographic_data.csv",
        "../processed_data/protests.csv",
    )
)

# Index-aligned join: rows are matched by their positional index, not by a key
# column.
# NOTE(review): presumably both CSVs list counties in the same order - confirm,
# otherwise demographic and case rows will be paired incorrectly.
df = demographics_df.join(covid_df)

# Preview of the demographic table only (not of the joined frame).
demographics_df.head()

Unnamed: 0,fips,county,state,state_code,male,female,median_age,population,female_percentage,lat,long,life_expectancy,mortality_risk,all_poverty,median_hh_income,sq_miles,pct_impoverished,pop_density
0,1001,Autauga County,Alabama,AL,26874,28326,37.8,55200,51.315217,32.534923,-86.64273,75.668023,58.373616,7444.0,54487.0,594.44,0.134855,92.860507
1,1011,Bullock County,Alabama,AL,5663,4689,39.6,10352,45.295595,32.100525,-85.715679,73.859057,56.831543,2841.0,34278.0,622.81,0.27444,16.621442
2,1021,Chilton County,Alabama,AL,21672,22258,38.6,43930,50.66697,32.847867,-86.71879,74.527386,59.62526,7960.0,44188.0,692.85,0.181197,63.404777
3,1033,Colbert County,Alabama,AL,26075,28420,42.7,54495,52.151574,34.700471,-87.804928,75.07179,58.774889,8982.0,46572.0,592.62,0.164822,91.95606
4,1035,Conecuh County,Alabama,AL,6037,6477,44.8,12514,51.758031,31.429237,-86.993662,74.050503,56.796792,3464.0,29758.0,850.16,0.27681,14.719582


In [3]:
# Build (or load from cache) the sliding-window table used for modelling.
#
# Each output row describes one (row of `df`, window end date) pair:
#   * "0_before" .. "{WINDOW_SIZE - 1}_before" hold the values of WINDOW_SIZE
#     consecutive date columns; "0_before" is the window's last date and
#     "{WINDOW_SIZE - 1}_before" its first.
#   * every non-date column of `df` is copied alongside unchanged.
# Rows are labelled "<fips>_<last date of the window>".
WINDOW_SIZE = 15
FIRST_DATE_COL = '03-23-2020'  # columns from here to the end are treated as dates
MAX_COUNTIES = 5               # leading rows of `df` to expand; None expands every row
COMBINED_PATH = "../processed_data/combined.csv"

if not path.exists(COMBINED_PATH):
    first_date_pos = int(np.where(df.columns == FIRST_DATE_COL)[0][0])
    nondate_cols = df.columns[:first_date_pos]
    date_cols = df.columns[first_date_pos:]

    # Take the subset into its own name instead of rebinding `df`, so `df` is
    # the same object whether or not the cache file already exists.
    source_rows = df if MAX_COUNTIES is None else df[0:MAX_COUNTIES]

    window_cols = [f"{x}_before" for x in range(WINDOW_SIZE)]

    # Collect plain dicts and build the frame once at the end. Appending rows
    # one at a time to a DataFrame re-allocates on every insert, and doing so
    # through `.at[label, :]` uses `.at` outside its scalar-only contract.
    records = []
    labels = []
    for _, row in source_rows.iterrows():
        static_values = row[nondate_cols].to_dict()
        daily_values = row[date_cols].tolist()
        # n date columns contain n - WINDOW_SIZE + 1 full windows; the "+ 1"
        # keeps the window that ends on the final date column. The range is
        # empty when there are fewer than WINDOW_SIZE date columns.
        for start in range(len(date_cols) - WINDOW_SIZE + 1):
            window = daily_values[start:start + WINDOW_SIZE]
            # Position 0 of the window is the oldest value, so it maps to the
            # largest "<k>_before" key and the last position maps to "0_before".
            record = {f"{WINDOW_SIZE - 1 - k}_before": v for k, v in enumerate(window)}
            record.update(static_values)
            records.append(record)
            labels.append(f"{row['fips']}_{date_cols[start + WINDOW_SIZE - 1]}")

    # No placeholder row is ever created here, so nothing is sliced off the
    # front: every generated window is kept.
    augmented_df = pd.DataFrame(records, index=labels, columns=window_cols + list(nondate_cols))
    augmented_df.to_csv(COMBINED_PATH)
else:
    # The cache was written with the row labels as its first column; restore
    # them as the index so both branches yield the same frame layout.
    augmented_df = pd.read_csv(COMBINED_PATH, index_col=0)

read file


In [4]:
# Features: the 14 earlier window columns ("1_before" .. "14_before") plus a
# handful of demographic columns. Target: "0_before", the newest value in each
# window.
SPLIT_SEED = 42  # fixed so the split, and every metric computed from it, is reproducible

predictors = [f"{k + 1}_before" for k in range(14)] + ['median_age', 'female_percentage', 'life_expectancy', 'all_poverty', 'median_hh_income']
X = augmented_df[predictors]
y = augmented_df['0_before']

# Hold out 10% of the rows for evaluation.
# NOTE(review): rows are shuffled individually, so overlapping windows from the
# same county can land on both sides of the split - confirm this is acceptable
# for the reported test error.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=SPLIT_SEED)

In [5]:
# Baseline model: plain linear regression fitted on the training split.
baseline = LinearRegression()
baseline.fit(X_train, y_train)

# Score both splits. Arguments follow the (y_true, y_pred) order; squared
# error is symmetric, so the values are the same either way round.
train_predictions = baseline.predict(X_train)
test_predictions = baseline.predict(X_test)
train_mse = mean_squared_error(y_train, train_predictions)
test_mse = mean_squared_error(y_test, test_predictions)

print(f"Train MSE: {train_mse}\n Test MSE: {test_mse}")

Train MSE: 1033.9852241883966
 Test MSE: 844.6089782244622
