In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from os import path
from tqdm import tqdm

In [2]:
# Load the pre-processed inputs (paths are relative to the notebook's directory).
covid_df = pd.read_csv("../processed_data/new_confirmed.csv")
demographics_df = pd.read_csv("../processed_data/demographic_data.csv")
protests_df = pd.read_csv("../processed_data/protests.csv")

# Join the per-date case counts with the per-county demographics on the FIPS
# code. `merge` defaults to an inner join, so only counties present in both
# tables are kept.
df = covid_df.merge(demographics_df, left_on="FIPS", right_on="fips")

# In the merged frame the demographic columns start at 'fips'; everything
# between column 0 and that position is a date column.
# NOTE(review): assumes column 0 is the 'FIPS' key of covid_df -- confirm.
fips_pos = int(np.flatnonzero(df.columns == 'fips')[0])

# Preview both column groups (date columns first, then demographic columns).
print(df.columns[1:fips_pos])
print(df.columns[fips_pos:])

Index(['03-23-2020', '03-24-2020', '03-25-2020', '03-26-2020', '03-27-2020',
       '03-28-2020', '03-29-2020', '03-30-2020', '03-31-2020', '04-01-2020',
       ...
       '11-06-2020', '11-07-2020', '11-08-2020', '11-09-2020', '11-10-2020',
       '11-11-2020', '11-12-2020', '11-13-2020', '11-14-2020', '11-15-2020'],
      dtype='object', length=238)
Index(['fips', 'county', 'state', 'state_code', 'male', 'female', 'median_age',
       'population', 'female_percentage', 'lat', 'long', 'life_expectancy',
       'mortality_risk', 'all_poverty', 'median_hh_income', 'sq_miles',
       'pct_impoverished', 'pop_density'],
      dtype='object')


In [3]:
# Number of consecutive days in each sliding window. The last day of a window
# is labelled "0_before", the first "<WINDOW_SIZE - 1>_before".
WINDOW_SIZE = 15

# Split the merged frame's columns into date columns (between column 0 and
# 'fips') and demographic columns ('fips' onwards).
# NOTE(review): assumes column 0 is the 'FIPS' merge key -- confirm.
fips_pos = int(np.flatnonzero(df.columns == 'fips')[0])
date_cols = df.columns[1:fips_pos]
nondate_cols = df.columns[fips_pos:]

# Lag column names depend only on WINDOW_SIZE, so build them once. They are
# derived from WINDOW_SIZE (not a hard-coded 14) so the labels stay correct
# if the window length is changed.
lag_names = [f"{WINDOW_SIZE - 1 - k}_before" for k in range(WINDOW_SIZE)]

# A series of N dates holds N - WINDOW_SIZE + 1 full windows; the "+ 1" keeps
# the window that ends on the final date. Clamp at 0 for short series.
n_windows = max(len(date_cols) - WINDOW_SIZE + 1, 0)

augmented_data = []
augmented_index = []

for _, row in tqdm(df.iterrows(), total=len(df)):
    # Per-county values that are identical for every window: compute once
    # per row instead of once per window.
    static_values = row[nondate_cols].to_dict()
    daily_values = row[date_cols].tolist()
    fips = row['fips']

    for start in range(n_windows):
        stop = start + WINDOW_SIZE
        record = dict(zip(lag_names, daily_values[start:stop]))
        record.update(static_values)
        augmented_data.append(record)
        # Each sample is keyed by county and by the date of the window's last day.
        augmented_index.append(f"{fips}_{date_cols[stop - 1]}")

augmented_df = pd.DataFrame(data=augmented_data, index=augmented_index)
# NOTE(review): this drops only the very first sample (first county, first
# window). Kept for compatibility with the original output; the reason is not
# evident from this notebook -- confirm whether it is still wanted.
augmented_df = augmented_df.iloc[1:]
augmented_df.to_csv("../processed_data/combined.csv")

1699it [04:06,  6.88it/s]


In [4]:
# Show the first five windowed samples as a quick sanity check.
augmented_df.head(n=5)

Unnamed: 0,14_before,13_before,12_before,11_before,10_before,9_before,8_before,7_before,6_before,5_before,...,female_percentage,lat,long,life_expectancy,mortality_risk,all_poverty,median_hh_income,sq_miles,pct_impoverished,pop_density
10003_04-07-2020,27.0,8.0,7.0,19.0,25.0,11.0,15.0,41.0,29.0,19.0,...,51.566562,39.576833,-75.652692,78.985449,49.253442,61319.0,67246.0,426.29,0.110458,1302.242605
10003_04-08-2020,8.0,7.0,19.0,25.0,11.0,15.0,41.0,29.0,19.0,34.0,...,51.566562,39.576833,-75.652692,78.985449,49.253442,61319.0,67246.0,426.29,0.110458,1302.242605
10003_04-09-2020,7.0,19.0,25.0,11.0,15.0,41.0,29.0,19.0,34.0,121.0,...,51.566562,39.576833,-75.652692,78.985449,49.253442,61319.0,67246.0,426.29,0.110458,1302.242605
10003_04-10-2020,19.0,25.0,11.0,15.0,41.0,29.0,19.0,34.0,121.0,36.0,...,51.566562,39.576833,-75.652692,78.985449,49.253442,61319.0,67246.0,426.29,0.110458,1302.242605
10003_04-11-2020,25.0,11.0,15.0,41.0,29.0,19.0,34.0,121.0,36.0,60.0,...,51.566562,39.576833,-75.652692,78.985449,49.253442,61319.0,67246.0,426.29,0.110458,1302.242605
