In [1]:
import pandas as pd
import numpy as np

# Creating the Prediction Dataset

- In this notebook, we aggregate the processed COVID-19 data with the demographic data, and create new rows in an augmented dataset including 14-day windows of COVID-19 cases from mid-March as distinct examples.

In [2]:
demographics = pd.read_csv("../processed_data/demographic_data.csv", converters = {"fips": lambda x: str(x)})
demographics["fips"] = demographics.fips.str.rjust(5, fillchar = "0")

elections = pd.read_csv("../processed_data/elections.csv", converters = {"FIPS": lambda x: str(x)}).rename(columns = {"FIPS": "fips"})

restrictions = pd.read_csv("../processed_data/restriction.csv").rename(columns = {"Date": "date", "RegionName": "state", "C4_Restrictions on gatherings": "c4", "C6_Stay at home requirements": "c6", "StringencyIndex": "stringency"})
restrictions["date"] = restrictions.date.str.slice(5)

protests = pd.read_csv("../processed_data/protests.csv", converters = {"fips": lambda x: str(x)}).rename(columns = {"EVENT_DATE": "date"})
protests["date"] = protests.date.str.slice(5)
protests["protest_size"] = protests["99 or less"] + 2 * protests["100 to 499"] + 3 * protests["500 to 999"] + 4 * protests["1000 to 4999"] + 5 * protests["more than 4999"]
protests = protests[["fips", "date", "protest_size"]]
protests = protests.groupby(["fips", "date"]).max().reset_index()

covid = pd.read_csv("../processed_data/new_confirmed.csv", converters = {"FIPS": lambda x: str(x)}).rename(columns = {"FIPS": "fips"})
covid["fips"] = covid.fips.str.rjust(5, fillchar = "0")
covid.columns = covid.columns.str.slice(stop = 5)
date_names = covid.columns[7:]
covid = pd.concat([covid.fips, (covid.iloc[:, 1:].rolling(7, axis = 1).mean().iloc[:, 6:])], axis = 1)
covid = pd.melt(covid, id_vars = ['fips'], value_vars = covid.columns[1:], var_name = "date", value_name = "confirmed_cases")

In [3]:
combined = demographics.merge(elections, on = "fips")
combined = pd.concat([combined.assign(date = d) for d in date_names], ignore_index = True)
combined = combined.merge(restrictions, on = ["date", "state"])
combined = combined.merge(protests, how = "left", on = ["date", "fips"]).fillna(value = {"protest_size": -1})
combined = combined.merge(covid, on = ["date", "fips"])
combined = combined.set_index(["date", "state", "fips"])
combined

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,county,state_code,male,female,median_age,population,female_percentage,lat,long,life_expectancy,...,pct_not_proficient_in_english,pct_rural,pct_impoverished,pop_density,r_voteshare,c4,c6,stringency,protest_size,confirmed_cases
date,state,fips,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
03-29,Delaware,10001,Kent County,DE,84271,90551,37.6,174822,51.796113,39.086169,-75.568422,77.536467,...,1,27.0,,298.239449,0.495871,3.0,2.0,70.37,-1.0,3.571429
03-29,Delaware,10003,New Castle County,DE,268870,286263,38.1,555133,51.566562,39.576833,-75.652692,78.985449,...,2,4.6,0.110458,1302.242605,0.325945,3.0,2.0,70.37,-1.0,20.142857
03-29,Delaware,10005,Sussex County,DE,106429,113111,49.0,219540,51.521818,38.660553,-75.390038,78.763504,...,2,41.3,,234.531237,0.590625,3.0,2.0,70.37,-1.0,9.428571
03-29,Florida,12001,Alachua County,FL,127298,135850,31.3,263148,51.624941,29.674750,-82.357714,78.641544,...,1,21.2,0.212770,300.733698,0.364266,4.0,2.0,71.76,-1.0,10.142857
03-29,Florida,12003,Baker County,FL,14753,13032,37.6,27785,46.903005,30.331098,-82.284629,75.251197,...,1,59.5,,47.477060,0.814785,4.0,2.0,71.76,-1.0,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12-07,Washington,53069,Wahkiakum County,WA,2000,2189,54.3,4189,52.255908,46.291769,-123.424420,78.932648,...,1,100.0,0.124851,15.904776,0.569975,4.0,1.0,55.56,-1.0,1.000000
12-07,Washington,53071,Walla Walla County,WA,30830,29406,37.1,60236,48.817983,46.229773,-118.478440,79.796321,...,4,17.1,0.131250,47.425067,0.537039,4.0,1.0,55.56,-1.0,30.142857
12-07,Washington,53073,Whatcom County,WA,107228,109584,37.0,216812,50.543328,48.825909,-121.719892,80.979505,...,2,25.9,0.147972,102.907645,0.370182,4.0,1.0,55.56,-1.0,45.714286
12-07,Washington,53075,Whitman County,WA,24749,23844,24.6,48593,49.068796,46.901173,-117.523027,81.396547,...,2,27.5,0.226926,22.506241,0.427795,4.0,1.0,55.56,-1.0,15.428571


In [4]:
combined.to_csv("../processed_data/combined.csv")

In [4]:
# WINDOW_SIZE = 15

# date_cols = df.columns[1:np.where(df.columns == 'fips')[0][0]]
# nondate_cols = df.columns[np.where(df.columns == 'fips')[0][0]:]

# augmented_data = []
# augmented_index = []
    
# for index, row in tqdm(df.iterrows()):
#     for i, col in enumerate(date_cols[:-WINDOW_SIZE]):
#         series = row[date_cols[i:i+WINDOW_SIZE]].reset_index(drop=True)
#         series_dict = {f"{14 - k}_before": v for k, v in series.to_dict().items()}
#         series_dict.update(row[nondate_cols].to_dict())
#         augmented_data.append(series_dict)
#         augmented_index.append(f"{row['fips']}_{date_cols[i+WINDOW_SIZE-1]}")

# augmented_df = pd.DataFrame(data=augmented_data, index=augmented_index)
# augmented_df = augmented_df[1:]
# augmented_df.to_csv("../processed_data/combined.csv")