In [45]:
import pandas as pd
import numpy as np
from tqdm import tqdm

# Creating the Prediction Dataset

- In this notebook, we join the processed COVID-19 case data with the demographic data, then build an augmented dataset in which each 14-day window of COVID-19 cases (starting from mid-March) becomes a distinct example row.

In [47]:
# Demographic table; its key column is renamed from "fips" to "FIPS".
demographics = pd.read_csv("../processed_data/demographic_data.csv")
demographics = demographics.rename(columns={"fips": "FIPS"})

# Election table, read as-is.
elections = pd.read_csv("../processed_data/elections.csv")

# Restriction table: rename the key columns to "date"/"state", then drop the
# first five characters of every date string.
# NOTE(review): presumably this strips a "YYYY-" prefix so the dates line up
# with the 5-character column labels of `covid` — confirm against the CSV.
restrictions = (
    pd.read_csv("../processed_data/restriction.csv")
    .rename(columns={"Date": "date", "RegionName": "state"})
    .assign(date=lambda frame: frame["date"].str[5:])
)

# Protest table, read as-is.
protests = pd.read_csv("../processed_data/protests_fips.csv")

# Case table: every column label is cut to its first five characters.
# NOTE(review): the truncation applies to ALL columns, not only date columns;
# labels longer than five characters that share a prefix would collide — verify.
covid = pd.read_csv("../processed_data/new_confirmed.csv")
covid.columns = covid.columns.str[:5]

In [48]:
# Display the restrictions frame after the column renames and date-prefix trimming.
restrictions

Unnamed: 0,state,date,C4_Restrictions on gatherings,C6_Stay at home requirements,StringencyIndex
0,Alaska,01-01,0.0,0.0,0.00
1,Alaska,01-02,0.0,0.0,0.00
2,Alaska,01-03,0.0,0.0,0.00
3,Alaska,01-04,0.0,0.0,0.00
4,Alaska,01-05,0.0,0.0,0.00
...,...,...,...,...,...
17616,Wyoming,12-02,3.0,0.0,42.59
17617,Wyoming,12-03,3.0,0.0,42.59
17618,Wyoming,12-04,3.0,0.0,42.59
17619,Wyoming,12-05,3.0,0.0,42.59


In [54]:
# Static per-county features: demographics joined with election results on FIPS.
county_features = demographics.merge(elections, on="FIPS")

# Replicate every county row once per date label so that per-date data can be
# joined on afterwards. The labels are taken from `covid.columns[15:]`.
# NOTE(review): the offset 15 is kept from the original cell; presumably it
# skips non-date columns and the earliest dates of `covid` — confirm.
county_by_date = pd.concat(
    [county_features.assign(date=d) for d in covid.columns[15:]],
    ignore_index=True,
)

# Attach the restriction columns for each (date, state) pair.
# The original cell computed this merge but never assigned the result, so
# `combined` silently lacked the restriction columns shown in the cell output.
# This is pandas' default inner join: rows whose (date, state) has no match in
# `restrictions` are dropped.
combined = county_by_date.merge(restrictions, on=["date", "state"])

# Display only: index by (date, state, FIPS) for readability. The index is not
# stored back into `combined`, so writing it out with `index=False` still keeps
# these three columns.
combined.set_index(["date", "state", "FIPS"])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,county,state_code,male,female,median_age,population,female_percentage,lat,long,life_expectancy,...,pct_hispanic,pct_non_hispanic_white,pct_not_proficient_in_english,pct_rural,pct_impoverished,pop_density,r_voteshare,C4_Restrictions on gatherings,C6_Stay at home requirements,StringencyIndex
date,state,FIPS,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
04-06,Alabama,1001,Autauga County,AL,26874,28326,37.8,55200,51.315217,32.534923,-86.642730,75.668023,...,3.0,74.3,1,42.0,0.134855,92.860507,0.727666,4.0,2.0,65.74
04-06,Alabama,1003,Baldwin County,AL,101188,106919,42.8,208107,51.376936,30.727479,-87.722564,78.075681,...,4.6,83.1,1,42.3,,130.903018,0.765457,4.0,2.0,65.74
04-06,Alabama,1005,Barbour County,AL,13697,12085,39.9,25782,46.873788,31.869581,-85.393210,75.421683,...,4.3,45.6,2,67.8,,29.136154,0.520967,4.0,2.0,65.74
04-06,Alabama,1007,Bibb County,AL,12152,10375,39.9,22527,46.055844,32.998628,-87.126475,73.967404,...,2.6,74.6,0,68.4,,36.183302,0.764032,4.0,2.0,65.74
04-06,Alabama,1009,Blount County,AL,28434,29211,40.8,57645,50.673953,33.980869,-86.567380,76.164452,...,9.6,86.9,2,90.0,,89.402587,0.893348,4.0,2.0,65.74
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11-15,Wyoming,56037,Sweetwater County,WY,22882,21235,34.6,44117,48.133373,41.659534,-108.879563,77.866565,...,16.1,79.3,2,10.9,0.096652,4.231177,0.709515,3.0,0.0,42.59
11-15,Wyoming,56039,Teton County,WY,11911,11148,39.3,23059,48.345548,43.934651,-110.589818,83.459206,...,14.9,81.5,4,46.4,0.072857,5.771416,0.310525,3.0,0.0,42.59
11-15,Wyoming,56041,Uinta County,WY,10505,10104,35.5,20609,49.027124,41.287642,-110.547628,77.381026,...,9.2,87.4,1,43.1,0.105925,9.902175,0.726564,3.0,0.0,42.59
11-15,Wyoming,56043,Washakie County,WY,4137,3992,43.5,8129,49.108131,43.904970,-107.682819,78.862800,...,14.1,82.4,0,36.0,0.130397,3.631369,0.763241,3.0,0.0,42.59


In [None]:
# Write `combined` to disk without its row index.
# NOTE(review): a later cell in this notebook writes `augmented_df` to this same
# path, so whichever cell runs last determines the file's content — confirm
# which dataset should own "combined.csv".
combined.to_csv("../processed_data/combined.csv", index = False)

In [None]:

# NOTE(review): `covid_df` and `demographics_df` are not defined in any cell
# visible in this notebook (the loading cell creates `covid` and `demographics`,
# and renames "fips" to "FIPS" in the latter). Confirm where these frames come
# from; as written this cell may fail on Restart & Run All.
# Join the two frames, matching "FIPS" on the left to "fips" on the right.
df = covid_df.merge(demographics_df, left_on="FIPS", right_on="fips")
df.head()

In [None]:
# Number of consecutive date columns per example. Each example gets the keys
# "<WINDOW_SIZE-1>_before" ... "0_before" (oldest to newest column in the window).
WINDOW_SIZE = 15

# Split the columns at 'fips': columns 1 .. (fips - 1) are treated as the
# per-date series, and 'fips' plus everything after it as per-row static features.
# NOTE(review): assumes column 0 is a key column and every column between it
# and 'fips' is a date column — confirm against the `df` merge.
fips_pos = np.where(df.columns == 'fips')[0][0]
date_cols = df.columns[1:fips_pos]
nondate_cols = df.columns[fips_pos:]

# n columns contain n - WINDOW_SIZE + 1 full windows. The original iterated over
# `date_cols[:-WINDOW_SIZE]`, i.e. only n - WINDOW_SIZE starts, which dropped the
# final window (and produced no windows at all when n == WINDOW_SIZE).
n_windows = max(len(date_cols) - WINDOW_SIZE + 1, 0)

augmented_data = []
augmented_index = []

# `total` lets tqdm show a real progress bar instead of a bare counter.
for _, row in tqdm(df.iterrows(), total=len(df)):
    # Hoisted out of the window loop: both are identical for every window of a row.
    daily_values = row[date_cols].tolist()
    static_features = row[nondate_cols].to_dict()

    for start in range(n_windows):
        window = daily_values[start:start + WINDOW_SIZE]
        # Label derived from WINDOW_SIZE instead of a hard-coded 14, so the keys
        # stay correct if the window size is changed.
        example = {f"{WINDOW_SIZE - 1 - k}_before": v for k, v in enumerate(window)}
        example.update(static_features)
        augmented_data.append(example)
        # Row label: "<fips>_<label of the last date column in the window>".
        augmented_index.append(f"{row['fips']}_{date_cols[start + WINDOW_SIZE - 1]}")

augmented_df = pd.DataFrame(data=augmented_data, index=augmented_index)
# NOTE(review): drops the first generated example; the reason is not visible in
# this notebook — kept as-is, confirm whether it is still needed.
augmented_df = augmented_df[1:]
# NOTE(review): this is the same path an earlier cell writes `combined` to, so
# this call overwrites that file — confirm the intended file name.
augmented_df.to_csv("../processed_data/combined.csv")

824it [03:26,  3.96it/s]

In [None]:
# Preview the first rows of the augmented frame.
augmented_df.head()

In [None]:
# (rows, columns) of the augmented frame.
augmented_df.shape