In [87]:
import pandas as pd

# Base URL of the project's GitHub repository; the relative paths of the raw
# data files are appended to this prefix when the download links are built.
home = 'https://github.com/Group8-GovAnalyticsProject/GraceYoung/'

In [88]:
# Loading the COVID workbook from the repository (only the 'Census' sheet is read)

covid_link = ''.join((home, 'raw/main/biweekly-counts-rates-by-geography-dec-29.xlsx'))
covid_cases = pd.read_excel(io=covid_link, sheet_name='Census')

In [89]:
# Preprocessing COVID data

# Positives per tract, summed over the weeks that start on or before 2020-04-27
# (the date when park data was collected).
covid_cases_pre = (
    covid_cases.loc[covid_cases['Week_Start'] <= '2020-04-27']
    .groupby(['Census'], as_index=False)['Positives']
    .sum()
    .set_axis(['tract_id', 'precount_positive'], axis='columns')
)

# Positives per tract, summed over every week in the data. Kept alongside the
# pre-count totals because having a lot of records with 0 is not good for the
# later regression analysis.
covid_cases_all = (
    covid_cases
    .groupby(['Census'], as_index=False)['Positives']
    .sum()
    .set_axis(['tract_id', 'all_positive'], axis='columns')
)

In [90]:
#Reading median income data

income_link = home + 'raw/main/median_income_raw.csv'
# Read from the link built on the line above (`income_link`); any other name is
# undefined on a fresh kernel and makes this cell fail with NameError.
income = pd.read_csv(income_link)

In [91]:
#Cleaning median income data

#Selecting only the columns I want (census tract ID and median household income).
#The columns are picked by name instead of by position so the selection does not
#depend on the column order of the raw file. .copy() gives an independent frame
#for the edits made in the following cells.
income = income[['GEO_ID', 'S1901_C01_012E']].copy()
income.head()

Unnamed: 0,GEO_ID,S1901_C01_012E
0,id,Estimate!!Households!!Median income (dollars)
1,1400000US53033000100,55143
2,1400000US53033000200,77463
3,1400000US53033000300,100917
4,1400000US53033000401,32877


In [92]:
#Dropping the first text row (the label row whose first column is the literal 'id').
#The row is removed by its value rather than by index label 0, so re-running this
#cell after the label row is gone does not remove a real tract row.

income = income.loc[income.iloc[:, 0] != 'id'].reset_index(drop=True)
income.head()

Unnamed: 0,GEO_ID,S1901_C01_012E
0,1400000US53033000100,55143
1,1400000US53033000200,77463
2,1400000US53033000300,100917
3,1400000US53033000401,32877
4,1400000US53033000402,72150


In [93]:
#Giving the two remaining columns short, descriptive names
income = income.set_axis(['tract_id', 'median_hhold_inc'], axis='columns')
income.head()

Unnamed: 0,tract_id,median_hhold_inc
0,1400000US53033000100,55143
1,1400000US53033000200,77463
2,1400000US53033000300,100917
3,1400000US53033000401,32877
4,1400000US53033000402,72150


In [94]:
#Fixing geo_id to just show tract ID by dropping the leading "1400000US" prefix
def drop_first(s):
    """Return the numeric tract ID contained in a Census GEO_ID value.

    Everything up to and including the last 'US' marker is discarded and the
    remainder is converted to int, e.g. '1400000US53033000100' -> 53033000100.
    A value without the marker (such as an ID that was already stripped) is
    converted as-is, so applying the function twice does not truncate the ID.

    Raises ValueError if the remaining text is not a valid integer.
    """
    # rpartition returns ('', '', text) when 'US' is absent, so [2] is the whole text.
    return int(str(s).rpartition('US')[2])

# Convert every GEO_ID in the column to its tract ID, element by element
income['tract_id'] = income['tract_id'].map(drop_first)
income.head()

Unnamed: 0,tract_id,median_hhold_inc
0,53033000100,55143
1,53033000200,77463
2,53033000300,100917
3,53033000401,32877
4,53033000402,72150


In [95]:
#Converting median income to a numeric dtype (values that cannot be parsed become NaN),
#then removing every row that contains a NaN
income = income.assign(
    median_hhold_inc=pd.to_numeric(income['median_hhold_inc'], errors='coerce')
).dropna()

In [96]:
#Saving each cleaned table to its own csv file, without the row index
cleaned_outputs = [
    (covid_cases_pre, 'clean_covid_cases_pre.csv'),
    (covid_cases_all, 'clean_covid_cases_all.csv'),
    (income, 'clean_median_income.csv'),
]
for frame, csv_name in cleaned_outputs:
    frame.to_csv(csv_name, index=False)