In [None]:
import numpy as np
import pandas as pd
import missingno as msno
from datetime import datetime

In [None]:
## Read the two raw exports (cases and fatalities) into DataFrames.
## The files are loaded as-is; header promotion and row clean-up happen in
## the per-dataset sections below.
covid_fatalities = pd.read_csv('data/covid_fatalities.csv')
covid_cases = pd.read_csv('data/covid_cases.csv')

### Covid Cases

In [None]:
# The real column labels sit in the row at position 1: promote that row to
# the header, then discard it together with the row above it.
cases_header = covid_cases.iloc[1]
covid_cases.columns = cases_header

# Index labels 256 through 261 are trailing rows that (per the original note)
# hold no useful information, so they are removed in the same drop call.
cases_rows_to_discard = [0, 1] + list(range(256, 262))
covid_cases = covid_cases.drop(index=cases_rows_to_discard)

In [None]:
# Trim every label after the first down to its date text by dropping the
# leading 6 characters; the first label (the county column) is kept as-is.
# NOTE(review): assumes each date header carries a 6-character prefix -- confirm against the raw file.
raw_case_labels = covid_cases.columns.tolist()
col_names = raw_case_labels[:1] + [label[6:] for label in raw_case_labels[1:]]
covid_cases.columns = col_names

In [None]:
# Reshape from wide (one column per date) to long (one row per county and
# date), then order the rows by county name.
case_date_columns = covid_cases.columns[1:].to_list()
covid_cases = (
    covid_cases
    .melt(
        id_vars=['County Name'],
        value_vars=case_date_columns,
        var_name='date',
        value_name='CaseCount',
    )
    .sort_values('County Name')
)

In [None]:
## Parse the month-day-year strings once, then derive the calendar month and
## year columns from the parsed values.
case_dates = pd.to_datetime(covid_cases['date'], format='%m-%d-%Y')
covid_cases['date'] = case_dates
covid_cases['month'] = case_dates.dt.month
covid_cases['year'] = case_dates.dt.year

In [None]:
# CaseCount is object dtype after the reshape; cast it to int so it can be
# averaged.
covid_cases = covid_cases.astype({'CaseCount': int})

# Mean case count per (year, month, county), laid out county by county in
# chronological order, with the group keys restored as ordinary columns.
covid_cases_grouped = (
    covid_cases
    .groupby(['year', 'month', 'County Name'])[['CaseCount']]
    .mean()
    .sort_values(['County Name', 'year', 'month'])
    .reset_index()
)

In [None]:
# Preview the first ten rows of the monthly case averages.
covid_cases_grouped.head(n=10)

### Covid Fatalities

In [None]:
# The real column labels sit in the row at position 1: promote that row to
# the header, then discard it together with the row above it.
fatalities_header = covid_fatalities.iloc[1]
covid_fatalities.columns = fatalities_header
covid_fatalities = covid_fatalities.drop(index=[0, 1])


In [None]:
# The row labelled 257 is the trailing row and (per the original note) holds
# no useful information; remove it.
covid_fatalities = covid_fatalities.drop(labels=257, axis='index')

In [None]:
# Trim every label after the first down to its date text by dropping the
# leading 11 characters; the first label (the county column) is kept as-is.
# NOTE(review): assumes each date header carries an 11-character prefix -- confirm against the raw file.
raw_fatality_labels = covid_fatalities.columns.tolist()
col_names = raw_fatality_labels[:1] + [label[11:] for label in raw_fatality_labels[1:]]
covid_fatalities.columns = col_names

In [None]:
# Reshape from wide (one column per date) to long (one row per county and
# date), then order the rows by county name.
fatality_date_columns = covid_fatalities.columns[1:].to_list()
covid_fatalities = (
    covid_fatalities
    .melt(
        id_vars=['County Name'],
        value_vars=fatality_date_columns,
        var_name='date',
        value_name='fatalities',
    )
    .sort_values('County Name')
)

In [None]:
## Parse the month-day-year strings once, then derive the calendar month and
## year columns from the parsed values.
fatality_dates = pd.to_datetime(covid_fatalities['date'], format='%m-%d-%Y')
covid_fatalities['date'] = fatality_dates
covid_fatalities['month'] = fatality_dates.dt.month
covid_fatalities['year'] = fatality_dates.dt.year

In [None]:
# The fatalities column is object dtype after the reshape; cast it to int so
# it can be averaged.
covid_fatalities = covid_fatalities.astype({'fatalities': int})

# Mean fatalities per (year, month, county), laid out county by county in
# chronological order, with the group keys restored as ordinary columns.
covid_fatalities_grouped = (
    covid_fatalities
    .groupby(['year', 'month', 'County Name'])[['fatalities']]
    .mean()
    .sort_values(['County Name', 'year', 'month'])
    .reset_index()
)

In [None]:
# Preview the first ten rows of the monthly fatality averages.
covid_fatalities_grouped.head(n=10)

### Compare Cases and Fatalities county names

In [None]:
# Lower-case the county names in BOTH monthly tables (fatalities and cases)
# so the names can be compared without case mismatches.
for monthly_table in (covid_fatalities_grouped, covid_cases_grouped):
    monthly_table['County Name'] = monthly_table['County Name'].str.lower()

In [None]:
# Compare case_counties and fatality_counties.
# Collect the distinct lower-cased county names from each monthly table.
# Lower-casing goes through the vectorised .str accessor, which tolerates
# missing values instead of raising on them.
case_counties = covid_cases_grouped['County Name'].str.lower().unique().tolist()
fatalities_counties = covid_fatalities_grouped['County Name'].str.lower().unique().tolist()

# Counties that appear in the fatalities data but not in the cases data.
# Only this final expression is displayed by the notebook; the earlier bare
# set(...) expressions produced no output and were removed.
# Original finding: fatalities uses "de witt" instead of "dewitt", and also has an "unknown" county.
set(fatalities_counties) - set(case_counties)

In [None]:
## fatalities includes "de witt" instead of "dewitt".  Also includes "unknown"
# Assign the replaced column back explicitly. Calling
# .replace(..., inplace=True) on a selected column is a chained in-place
# operation: pandas 2.2+ warns about it, and under Copy-on-Write it modifies
# a temporary object and leaves the DataFrame unchanged.
covid_fatalities_grouped['County Name'] = covid_fatalities_grouped['County Name'].replace(
    to_replace='de witt', value='dewitt'
)

# Drop the rows for the "unknown" county. .copy() makes the filtered result
# an independent frame so later column assignments do not target a slice.
covid_fatalities_grouped = covid_fatalities_grouped[
    covid_fatalities_grouped['County Name'] != 'unknown'
].copy()

In [None]:
## Check that sets are the same. Should produce an empty set
# The symmetric difference (^) lists names that occur in only one of the two
# tables, so the check covers both directions: counties missing from the
# cases data as well as counties missing from the fatalities data.
fatalities_counties_grouped = covid_fatalities_grouped['County Name'].unique().tolist()
set(fatalities_counties_grouped) ^ set(case_counties)

In [None]:
## Merge the two datasets
# Name the join key explicitly instead of letting pandas infer it from
# whichever column names the two frames happen to share; an extra shared
# column would otherwise silently become part of the key.
# how='inner' is the pandas default: only (year, month, county) combinations
# present in both tables are kept.
covid_merged = pd.merge(
    covid_fatalities_grouped,
    covid_cases_grouped,
    how='inner',
    on=['year', 'month', 'County Name'],
)

# Print the shapes side by side so any rows lost in the join are visible.
print(f'fatalities shape: {covid_fatalities_grouped.shape}')
print(f'cases shape: {covid_cases_grouped.shape}')
print(f'merged shape: {covid_merged.shape}')

In [None]:
# # Save csv
# covid_merged.to_csv('covid.csv')