In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from os import path
from tqdm import tqdm

# Creating the Prediction Dataset

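- In this notebook, we merge the processed COVID-19 case data with the demographic data and build an augmented dataset in which every sliding window of daily case counts from mid-March onward becomes its own example. Each window holds 15 consecutive daily values: the target day (`0_before`) and the 14 days preceding it (`14_before` … `1_before`).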

In [2]:
# Read the two processed inputs: the COVID-19 case table and the demographics table.
covid_df = pd.read_csv("../processed_data/new_confirmed.csv")
demographics_df = pd.read_csv("../processed_data/demographic_data.csv")

# Default (inner) join on the FIPS code. The key columns have different names,
# so both "FIPS" and "fips" are kept in the merged frame.
df = pd.merge(
    covid_df,
    demographics_df,
    left_on="FIPS",
    right_on="fips",
)
df.head()

Unnamed: 0,FIPS,03-23-2020,03-24-2020,03-25-2020,03-26-2020,03-27-2020,03-28-2020,03-29-2020,03-30-2020,03-31-2020,...,female_percentage,lat,long,life_expectancy,mortality_risk,all_poverty,median_hh_income,sq_miles,pct_impoverished,pop_density
0,10003,44.0,27.0,8.0,7.0,19.0,25.0,11.0,15.0,41.0,...,51.566562,39.576833,-75.652692,78.985449,49.253442,61319.0,67246.0,426.29,0.110458,1302.242605
1,11001,120.0,21.0,46.0,44.0,40.0,33.0,38.0,59.0,94.0,...,52.537334,38.904734,-77.016294,76.856195,46.035418,119778.0,74093.0,61.05,0.174987,11212.088452
2,12001,36.0,1.0,8.0,4.0,8.0,10.0,4.0,8.0,8.0,...,51.624941,29.67475,-82.357714,78.641544,49.53956,55990.0,45294.0,875.02,0.21277,300.733698
3,12009,7.0,2.0,2.0,2.0,3.0,10.0,2.0,3.0,6.0,...,51.106261,28.293722,-80.732271,78.522022,47.875878,84028.0,51214.0,1015.66,0.145678,567.91446
4,12011,263.0,48.0,44.0,149.0,110.0,200.0,24.0,314.0,67.0,...,51.302333,26.152322,-80.487126,80.25258,41.503756,256027.0,54055.0,1209.79,0.134105,1578.084626


In [3]:
# Number of consecutive daily values per example: the target day plus the
# (WINDOW_SIZE - 1) days before it.
WINDOW_SIZE = 15

# Everything between the first column and 'fips' is a per-date count column;
# 'fips' and every column after it are the non-date (demographic) features.
fips_pos = np.where(df.columns == 'fips')[0][0]
date_cols = df.columns[1:fips_pos]
nondate_cols = df.columns[fips_pos:]

# Column labels for one window, oldest value first:
# "<WINDOW_SIZE-1>_before", ..., "1_before", "0_before".
# Derived from WINDOW_SIZE so the labels stay correct if the constant changes.
window_keys = [f"{WINDOW_SIZE - 1 - k}_before" for k in range(WINDOW_SIZE)]

# Number of complete windows that fit into the date range. The "+ 1" includes
# the final window ending on the last available date, which slicing with
# date_cols[:-WINDOW_SIZE] would skip. Clamped at 0 for too-short date ranges.
n_windows = max(len(date_cols) - WINDOW_SIZE + 1, 0)

augmented_data = []
augmented_index = []

for index, row in tqdm(df.iterrows(), total=len(df)):
    # Per-row work that does not depend on the window position is done once.
    daily_counts = row[date_cols].tolist()
    static_features = row[nondate_cols].to_dict()

    for i in range(n_windows):
        # Pair the positional labels with this window's slice of daily values.
        series_dict = dict(zip(window_keys, daily_counts[i:i + WINDOW_SIZE]))
        series_dict.update(static_features)
        augmented_data.append(series_dict)
        # Row label: "<fips>_<date of the window's last (target) day>".
        augmented_index.append(f"{row['fips']}_{date_cols[i + WINDOW_SIZE - 1]}")

augmented_df = pd.DataFrame(data=augmented_data, index=augmented_index)
# NOTE(review): this drops only the very first generated row (the first window
# of the first row of df). The reason is not documented here; confirm whether
# it is intentional, and whether it should apply to every FIPS code.
augmented_df = augmented_df.iloc[1:]
augmented_df.to_csv("../processed_data/combined.csv")

1699it [04:07,  6.86it/s]


In [4]:
# Preview the first rows of the windowed dataset to check the column layout.
augmented_df.head()

Unnamed: 0,14_before,13_before,12_before,11_before,10_before,9_before,8_before,7_before,6_before,5_before,...,female_percentage,lat,long,life_expectancy,mortality_risk,all_poverty,median_hh_income,sq_miles,pct_impoverished,pop_density
10003_04-07-2020,27.0,8.0,7.0,19.0,25.0,11.0,15.0,41.0,29.0,19.0,...,51.566562,39.576833,-75.652692,78.985449,49.253442,61319.0,67246.0,426.29,0.110458,1302.242605
10003_04-08-2020,8.0,7.0,19.0,25.0,11.0,15.0,41.0,29.0,19.0,34.0,...,51.566562,39.576833,-75.652692,78.985449,49.253442,61319.0,67246.0,426.29,0.110458,1302.242605
10003_04-09-2020,7.0,19.0,25.0,11.0,15.0,41.0,29.0,19.0,34.0,121.0,...,51.566562,39.576833,-75.652692,78.985449,49.253442,61319.0,67246.0,426.29,0.110458,1302.242605
10003_04-10-2020,19.0,25.0,11.0,15.0,41.0,29.0,19.0,34.0,121.0,36.0,...,51.566562,39.576833,-75.652692,78.985449,49.253442,61319.0,67246.0,426.29,0.110458,1302.242605
10003_04-11-2020,25.0,11.0,15.0,41.0,29.0,19.0,34.0,121.0,36.0,60.0,...,51.566562,39.576833,-75.652692,78.985449,49.253442,61319.0,67246.0,426.29,0.110458,1302.242605
