# Create the Master Data Table

The following data sources were used:

USA Facts (Affiliated with the CDC)
* usafacts_cdc.p created with the notebook usafacts_cdc_data_explore.ipynb 
* has NYC county level time series data, that no other source has
* has FIPS code already on
* comes with its own population attached

Census Data (ACS Survey)
* census_master.p created with the notebook cencus_data_explore.ipynb
* has a wealth of fields to attach to counties
* comes from the 'covid' area of the census FTP site

Google Mobility Data
* Apple data is not automatically downloadable
* simple and easy to use



In [83]:
import pandas as pd, ftplib, io, pickle, boto3, numpy as np
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_rows', 500)

# Data Pipepline

## Helper Functions
* Be sure to set your aws credentials in the virtual environment using 
* pipenv shell
* aws configure
* enter your ID and secret key

In [84]:
def getS3Data(bucket, file_loc, csv=True):
    client = boto3.client('s3')
    obj = client.get_object(Bucket=bucket, Key=file_loc)
    if(csv):
        data = pd.read_csv(obj['Body'] , dtype=str)
    else:
        data = pd.read_json(obj['Body'])
    
    return data

In [85]:
def dataCleanUp(data, date_field, fips_field, case_field, death_field, suffix): 
    print("Shape: ", data.shape)
    data[fips_field] = data[fips_field].astype('str').apply(lambda x: x.zfill(5))
    data[date_field] = data[date_field].astype('datetime64')
    data[case_field] = data[case_field].astype('float64')
    print("Case control total: ", data[case_field].sum())
    data[death_field] = data[death_field].astype('float64')
    print("Death control total: ", data[death_field].sum())
    data = data.add_suffix(suffix)
    data['date'] = data[date_field + suffix]
    data = data.rename(str.lower, axis='columns') 
    return data

In [86]:
def getDataInfo(data, case_field, death_field):
    print("Dataframe Shape: ", data.shape)
    print("Number of States: ", len(data['state_code'].unique()))
    print("Number of Counties: ", len(data['fips_code'].unique()))
    print("Minimum Date: ", data['date'].min())
    print("Maximum Date: ", data['date'].max())
    print("Duplicate State-Fips-Date: ", 
          data.groupby(['state_code','fips_code', 'date']).size().reset_index().rename(columns={0:'count'})['count'].sum() 
          - data.shape[0])
    print("Null State Code: ", data[data.state_code.isnull()].size)
    print("Null County Code: ", data[data.fips_code.isnull()].size)
    print("Null Dates: ", enigma_agg_data[enigma_agg_data.date.isnull()].size)
    
    print("Case Control Total: ", data[case_field].sum())
    print("Death Control Total: ", data[death_field].sum())

In [87]:
def sumDups(data, county_field, date_field, sum_field):
    out = data[
        data.duplicated([county_field, date_field]) & 
        (data[date_field] == data[date_field].max())
    ][sum_field].sum()
    return out

## Static Data

### County reference data

In [88]:
# Get data
county_ref_data = getS3Data('covid19-lake', 'static-datasets/csv/CountyPopulation/County_Population.csv')

# Rename columns
county_ref_data.columns = ['big_fips', 'fips', 'county', 'state', 'population_2018']

# Fill leading zeros on fips code
county_ref_data['fips'] = county_ref_data['fips'].astype('str').apply(lambda x: x.zfill(5))

### State reference data

In [89]:
# Get data
state_ref_data = getS3Data('covid19-lake', 'static-datasets/csv/state-abv/states_abv.csv')

# Rename columns
state_ref_data.columns = ['state_name','state_code']

### Safegraph Open Census Data
* See notebook: safe_graph_census_data_explore.ipynb

In [90]:
with open('safegraph_census.p', 'rb') as f:
    safegraph_census_data = pickle.load(f)

safegraph_census_data.head(2)

Unnamed: 0,fips_code,pop_total,m_total,m_0_5,m_5_9,m_10_14,m_15_17,m_18_19,m_20_20,m_21_21,...,pir_100_124,pir_125_149,pir_150_184,pir_185_199,pir_200_plus,unins_total,unins_0_18,unins_18_34,unins_35_64,unins_65_plus
0,10001,171474,82687,5678,5534,5951,3443,2578,1766,1402,...,8497,7747,12205,4637,111052,168146,2032,5260,5498,270
1,10003,551997,267276,16832,16455,17992,10852,8324,3766,4164,...,16301,19718,28021,10523,399898,543604,4448,15617,16662,971


In [91]:
# Check for duplicates
safegraph_census_data.drop_duplicates(subset='fips_code', keep='first').shape[0]-safegraph_census_data.shape[0]

0

#### FIPS Reference Data

In [92]:
# Get data
with open('safegraph_ref_data_fips.p', 'rb') as f:
    safegraph_ref_data_fips = pickle.load(f)

# Create state_county concatenation for lookup on google mobility data


#### Land area

In [93]:
# Get data
with open('safegraph_land.p', 'rb') as f:
    safegraph_land = pickle.load(f)

In [94]:
safegraph_land = safegraph_land.rename(columns={"fips":"fips_code"})
safegraph_land.head(2)

Unnamed: 0,fips_code,amount_land,amount_water
0,1001,1539609015,25749942
1,1003,4117584019,1133130502


## COVID Cases and Deaths Timeseries Data

### Enigma Aggregation

#### Download data and cleanup

In [95]:
# Get data
enigma_agg_data = getS3Data('covid19-lake', 'enigma-aggregation/csv/us_counties/enigma_covid_19_us_counties.csv')

In [96]:
# Clean up
enigma_agg_data = dataCleanUp(enigma_agg_data, 'date', 'county_fips', 'cases','deaths','_ea')

Shape:  (197596, 10)
Case control total:  67961951.0
Death control total:  3809028.0


In [97]:
# Change a few fips codes
enigma_agg_data['fips_code'] = enigma_agg_data['county_fips_ea']
enigma_agg_data.loc[enigma_agg_data['fips_code'] == '00nan', 'fips_code'] = '00000'

In [98]:
# Attach 2 digit state code from state fips code
tmp_state = safegraph_ref_data_fips.groupby(['state_fips','state']).size().reset_index().rename(columns={0:'count'})
enigma_agg_data['state_fips_ea'] = enigma_agg_data['state_fips_ea'].astype('str').apply(lambda x: x.zfill(2))

lefton = ['state_fips_ea']
righton = ['state_fips']

enigma_agg_data = pd.merge(enigma_agg_data, tmp_state[['state_fips','state']], how='left', left_on=lefton, right_on=righton)

enigma_agg_data['state_code'] = enigma_agg_data['state']
enigma_agg_data['state_name'] = enigma_agg_data['state_name_ea']

enigma_agg_data = enigma_agg_data.drop(columns=['state', 'state_fips'])

#### Checks

In [99]:
getDataInfo(enigma_agg_data, 'cases_ea', 'deaths_ea')

Dataframe Shape:  (197596, 14)
Number of States:  55
Number of Counties:  2967
Minimum Date:  2020-01-21 00:00:00
Maximum Date:  2020-06-01 00:00:00
Duplicate State-Fips-Date:  0
Null State Code:  0
Null County Code:  0
Null Dates:  0
Case Control Total:  67961951.0
Death Control Total:  3809028.0


### Enigma Johns Hopkins University

#### Download data and basic summary

In [100]:
# NEED TO CHANGE FUNCTION TO , dtype=str and then add some typing logic
### THIS SECTION NEEDS FIXING - they just removed the csv!
#enigma_jh_data = getS3Data('covid19-lake', 'enigma-jhu-timeseries/csv/jhu_csse_covid_19_timeseries_merged.csv')

In [101]:
# Filter to only US
# enigma_jh_data = enigma_jh_data[enigma_jh_data['iso2']=='US']

In [102]:
#enigma_jh_data = dataCleanUp(enigma_jh_data, 'date', 'fips', 'confirmed', 'deaths', '_ejhu')

In [103]:
# Create clean fips_code
#enigma_jh_data['fips_code'] = enigma_jh_data['fips_ejhu']
#enigma_jh_data.loc[enigma_jh_data['admin2_ejhu'].isin(jh_unknown_fips_counties), 'fips_code'] = '00000'

In [104]:
# Add "Out of XX" counties to "Unknown"
#jh_unknown_fips_counties = ['Out of AL', 'Out of AK',
#       'Out of AZ', 'Out of AR', 'Out of CA', 'Out of CO', 'Out of CT',
#       'Out of DE', 'Out of DC', 'Out of FL', 'Out of GA', 'Out of HI',
#       'Out of ID', 'Out of IL', 'Out of IN', 'Out of IA', 'Out of KS',
#       'Out of KY', 'Out of LA', 'Out of ME', 'Out of MD', 'Out of MA',
#       'Out of MI', 'Out of MN', 'Out of MS', 'Out of MO', 'Out of MT',
#       'Out of NE', 'Out of NV', 'Out of NH', 'Out of NJ', 'Out of NM',
#       'Out of NY', 'Out of NC', 'Out of ND', 'Out of OH', 'Out of OK',
#       'Out of OR', 'Out of PA', 'Out of RI', 'Out of SC', 'Out of SD',
#       'Out of TN', 'Out of TX', 'Out of UT', 'Out of VT', 'Out of VA',
#       'Out of WA', 'Out of WV', 'Out of WI', 'Out of WY']
#enigma_jh_data.loc[enigma_jh_data['admin2_ejhu'].isin(jh_unknown_fips_counties), 'fips_code'] = '00000'

In [105]:
#enigma_jh_data.loc[enigma_jh_data['admin2_ejhu']=='Dukes and Nantucket', 'fips_code'] = 'n0001'
#enigma_jh_data.loc[enigma_jh_data['admin2_ejhu']=='Kansas City', 'fips_code'] = 'n0002'
#enigma_jh_data.loc[enigma_jh_data['admin2_ejhu']=='Michigan Department of Corrections (MDOC)', 'fips_code'] = 'n0003'
#enigma_jh_data.loc[enigma_jh_data['admin2_ejhu']=='Federal Correctional Institution (FCI)','fips_code'] = 'n0004'
#enigma_jh_data.loc[enigma_jh_data['admin2_ejhu']=='Bear River', 'fips_code'] = 'n0005'
#enigma_jh_data.loc[enigma_jh_data['admin2_ejhu']=='Central Utah', 'fips_code'] = 'n0006'
#enigma_jh_data.loc[enigma_jh_data['admin2_ejhu']=='Southeast Utah', 'fips_code'] = 'n0007'
#enigma_jh_data.loc[enigma_jh_data['admin2_ejhu']=='Southwest Utah', 'fips_code'] = 'n0008'
#enigma_jh_data.loc[enigma_jh_data['admin2_ejhu']=='TriCounty', 'fips_code'] = 'n0009'
#enigma_jh_data.loc[enigma_jh_data['admin2_ejhu']=='Weber-Morgan', 'fips_code'] = 'n0010'

In [106]:
#lefton = ['province_state_ejhu']
#righton = ['state_name']

#enigma_jh_data = pd.merge(
#    enigma_jh_data, state_ref_data, 
#    how='left', left_on=lefton, right_on=righton)

In [107]:
#enigma_jh_data.loc[enigma_jh_data['province_state_ejhu']=='Diamond Princess', 'state_code'] = 'CA'
#enigma_jh_data.loc[enigma_jh_data['province_state_ejhu']=='Grand Princess', 'state_code'] = 'CA'

#enigma_jh_data.loc[enigma_jh_data['province_state_ejhu']=='Diamond Princess', 'state_name'] = 'California'
#enigma_jh_data.loc[enigma_jh_data['province_state_ejhu']=='Grand Princess', 'state_name'] = 'California'

In [108]:
#enigma_jh_data['date'] = enigma_jh_data['date_ejhu']

In [109]:
#enigma_jh_data.head(2)

#### Checks

In [110]:
#getDataInfo(enigma_jh_data, 'confirmed_ejhu', 'deaths_ejhu')

### Enigma New York Times

#### Download data and clean data

In [111]:
# Get data
enigma_nyt_data = getS3Data('covid19-lake', 'enigma-nytimes-data-in-usa/csv/us_county/us_county.csv')

In [112]:
# Clean up
enigma_nyt_data = dataCleanUp(enigma_nyt_data, 'date', 'fips', 'cases','deaths','_enyt')

Shape:  (129747, 6)
Case control total:  31616769.0
Death control total:  1652495.0


In [113]:
# Fix Fips codes
enigma_nyt_data['fips_code'] = enigma_nyt_data['fips_enyt']
enigma_nyt_data.loc[enigma_nyt_data['county_enyt']=='New York City', 'fips_code'] = 'NYC000'
enigma_nyt_data.loc[enigma_nyt_data['county_enyt']=='New York City', 'fips_code'] = 'KC0000'
enigma_nyt_data.loc[enigma_nyt_data['county_enyt']=='Unknown', 'fips_code'] = '00000'

In [114]:
# Add state
lefton = ['state_enyt']
righton = ['state_name']

enigma_nyt_data = pd.merge(
    enigma_nyt_data, state_ref_data, 
    how='left', left_on=lefton, right_on=righton)

In [115]:
# Fix for extra states
enigma_nyt_data.loc[enigma_nyt_data['state_enyt']=='Puerto Rico', 'state_code'] = 'PR'
enigma_nyt_data.loc[enigma_nyt_data['state_enyt']=='Virgin Islands', 'state_code'] = 'VI'
enigma_nyt_data.loc[enigma_nyt_data['state_enyt']=='Guam', 'state_code'] = 'GU'
enigma_nyt_data.loc[enigma_nyt_data['state_enyt']=='Northern Mariana Islands', 'state_code'] = 'MP'

enigma_nyt_data.loc[enigma_nyt_data['state_enyt']=='Puerto Rico', 'state_name'] = 'Puerto Rico'
enigma_nyt_data.loc[enigma_nyt_data['state_enyt']=='Virgin Islands', 'state_name'] = 'Virgin Islands'
enigma_nyt_data.loc[enigma_nyt_data['state_enyt']=='Guam', 'state_name'] = 'Guam'
enigma_nyt_data.loc[enigma_nyt_data['state_enyt']=='Northern Mariana Islands', 'state_name'] = 'Northern Mariana Islands'

#### Checks

In [116]:
getDataInfo(enigma_nyt_data, 'cases_enyt', 'deaths_enyt')

Dataframe Shape:  (129747, 10)
Number of States:  55
Number of Counties:  2885
Minimum Date:  2020-01-21 00:00:00
Maximum Date:  2020-05-09 00:00:00
Duplicate State-Fips-Date:  0
Null State Code:  0
Null County Code:  0
Null Dates:  0
Case Control Total:  31616769.0
Death Control Total:  1652495.0


### Rearc New York Times

#### Download data and basic summary

In [117]:
# Get data
rearc_nyt_data = getS3Data('covid19-lake', 'rearc-covid-19-nyt-data-in-usa/csv/us-counties/us-counties.csv')

In [118]:
# Clean up
rearc_nyt_data = dataCleanUp(rearc_nyt_data, 'date', 'fips', 'cases','deaths','_rnyt')

Shape:  (197596, 6)
Case control total:  67961951.0
Death control total:  3809028.0


In [119]:
# Edit fips codes
rearc_nyt_data['fips_code'] = rearc_nyt_data['fips_rnyt']
rearc_nyt_data.loc[rearc_nyt_data['county_rnyt']=='New York City', 'fips_code'] = 'NYC000'
rearc_nyt_data.loc[rearc_nyt_data['county_rnyt']=='New York City', 'fips_code'] = 'KC0000'
rearc_nyt_data.loc[rearc_nyt_data['county_rnyt']=='Unknown', 'fips_code'] = '00000'

In [120]:
# Attach state code
lefton = ['state_rnyt']
righton = ['state_name']

rearc_nyt_data = pd.merge(
    rearc_nyt_data, state_ref_data, 
    how='left', left_on=lefton, right_on=righton)

In [121]:
# Add some states codes and names

rearc_nyt_data.loc[rearc_nyt_data['state_rnyt']=='Puerto Rico', 'state_code'] = 'PR'
rearc_nyt_data.loc[rearc_nyt_data['state_rnyt']=='Virgin Islands', 'state_code'] = 'VI'
rearc_nyt_data.loc[rearc_nyt_data['state_rnyt']=='Guam', 'state_code'] = 'GU'
rearc_nyt_data.loc[rearc_nyt_data['state_rnyt']=='Northern Mariana Islands', 'state_code'] = 'MP'

rearc_nyt_data.loc[rearc_nyt_data['state_rnyt']=='Puerto Rico', 'state_name'] = 'Puerto Rico'
rearc_nyt_data.loc[rearc_nyt_data['state_rnyt']=='Virgin Islands', 'state_name'] = 'Virgin Islands'
rearc_nyt_data.loc[rearc_nyt_data['state_rnyt']=='Guam', 'state_name'] = 'Guam'
rearc_nyt_data.loc[rearc_nyt_data['state_rnyt']=='Northern Mariana Islands', 'state_name'] = 'Northern Mariana Islands'

#### Checks

In [122]:
getDataInfo(rearc_nyt_data, 'cases_rnyt','deaths_rnyt')

Dataframe Shape:  (197596, 10)
Number of States:  55
Number of Counties:  2969
Minimum Date:  2020-01-21 00:00:00
Maximum Date:  2020-06-01 00:00:00
Duplicate State-Fips-Date:  0
Null State Code:  0
Null County Code:  0
Null Dates:  0
Case Control Total:  67961951.0
Death Control Total:  3809028.0


### USA Facts (CDC Affiliate)

#### Download data and basic summary

In [123]:
# Download the data
url="https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_confirmed_usafacts.csv"
data_confirmed = pd.read_csv(url)

In [124]:
# Download the data
url = "https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_deaths_usafacts.csv"
data_deaths = pd.read_csv(url)

In [125]:
# Download the data
url = "https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_county_population_usafacts.csv"
data_county_pop = pd.read_csv(url)

In [126]:
# Move the date column headers into a single column as rows
data_confirmed = data_confirmed.melt(
    id_vars=['countyFIPS','County Name','State','stateFIPS'], 
    var_name='Date', value_name='Confirmed')

data_deaths = data_deaths.melt(
    id_vars=['countyFIPS','County Name','State','stateFIPS'], 
    var_name='Date', value_name='Deaths')

In [127]:
# Join the raw tables together

# Part 1
lefton = ['countyFIPS', 'County Name', 'State', 'stateFIPS', 'Date']
righton = ['countyFIPS', 'County Name', 'State', 'stateFIPS', 'Date']

data = pd.merge(data_confirmed, data_deaths, how='left', left_on=lefton, right_on=righton)

# Part 2
lefton = ['countyFIPS', 'County Name', 'State']
righton = ['countyFIPS', 'County Name', 'State']

usafacts_cdc_data = pd.merge(data, data_county_pop, how='left', left_on=lefton, right_on=righton)

In [128]:
# Clean up
usafacts_cdc_data = dataCleanUp(usafacts_cdc_data, 'Date', 'countyFIPS', 'Confirmed','Deaths','_cdc')

Shape:  (421740, 8)
Case control total:  67241604.0
Death control total:  3754779.0


In [129]:
# Fips code clean up
usafacts_cdc_data['fips_code'] = usafacts_cdc_data['countyfips_cdc']
usafacts_cdc_data.loc[(usafacts_cdc_data['county name_cdc']=='Grand Princess Cruise Ship'), 'fips_code'] = '99999'

In [130]:
# State code
usafacts_cdc_data['state_code'] = usafacts_cdc_data['state_cdc']

In [131]:
# Attach State Name
lefton = ['state_code']
righton = ['state_code']

usafacts_cdc_data = pd.merge(
    usafacts_cdc_data, state_ref_data, 
    how='left', left_on=lefton, right_on=righton)

#### Checks

In [132]:
getDataInfo(usafacts_cdc_data, 'confirmed_cdc','deaths_cdc')

Dataframe Shape:  (421740, 12)
Number of States:  51
Number of Counties:  3146
Minimum Date:  2020-01-22 00:00:00
Maximum Date:  2020-06-01 00:00:00
Duplicate State-Fips-Date:  0
Null State Code:  0
Null County Code:  0
Null Dates:  0
Case Control Total:  67241604.0
Death Control Total:  3754779.0


### Google Mobility Data

#### Download data

In [133]:
# Get data
url = "https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv?cachebust=57b4ac4fc4052"
data = pd.read_csv(url, dtype=str)

In [134]:
# Clean up
data = data[data['country_region_code'] == 'US']
data['date'] = data['date'].astype('datetime64')
google_data = data

#### State Level Table

In [135]:
# Create state table
google_state_data = google_data[
    (google_data['sub_region_1'].notnull()) &
    (~google_data['sub_region_2'].notnull())
]
google_state_data = google_state_data.add_suffix("_goog_st")

In [136]:
# Add state_code
lefton = ['sub_region_1_goog_st']
righton = ['state_name']

google_state_data = pd.merge(
    google_state_data, state_ref_data, 
    how='left', left_on=lefton, right_on=righton)

In [137]:
# Drop columns not needed
google_state_data['date'] = google_state_data['date_goog_st']
google_state_data = google_state_data.drop(columns=
                                             ['country_region_code_goog_st',
                                              'country_region_goog_st', 'sub_region_1_goog_st','sub_region_2_goog_st',
                                              'date_goog_st','state_name'])

#### County Level Table

In [138]:
# Create county table
google_county_data = google_data[
    (google_data['sub_region_1'].notnull()) &
    (google_data['sub_region_2'].notnull())
]
google_county_data = google_county_data.add_suffix("_goog_cnty")

In [139]:
# Add state code
lefton = ['sub_region_1_goog_cnty']
righton = ['state_name']

google_county_data = pd.merge(
    google_county_data, state_ref_data, 
    how='left', left_on=lefton, right_on=righton)

In [140]:
# Create state_county concatenation
google_county_data['state_county'] = google_county_data['state_code'] + google_county_data['sub_region_2_goog_cnty']

In [141]:
# Manual state-county name fixes
google_county_data.loc[google_county_data['state_county']=='AKAnchorage', 'state_county'] = 'AKAnchorage Municipality'
google_county_data.loc[google_county_data['state_county']=='AKBethel', 'state_county'] = 'AKBethel Census Area'
google_county_data.loc[google_county_data['state_county']=='AKFairbanks North Star', 'state_county'] = 'AKFairbanks North Star Borough'
google_county_data.loc[google_county_data['state_county']=='AKJuneau', 'state_county'] = 'AKJuneau City and Borough'
google_county_data.loc[google_county_data['state_county']=='AKKetchikan Gateway', 'state_county'] = 'AKKetchikan Gateway Borough'
google_county_data.loc[google_county_data['state_county']=='AKKodiak Island', 'state_county'] = 'AKKodiak Island Borough'
google_county_data.loc[google_county_data['state_county']=='AKMatanuska-Susitna', 'state_county'] = 'AKMatanuska-Susitna Borough'
google_county_data.loc[google_county_data['state_county']=='AKNorth Slope', 'state_county'] = 'AKNorth Slope Borough'
google_county_data.loc[google_county_data['state_county']=='AKSitka', 'state_county'] = 'AKSitka City and Borough'
google_county_data.loc[google_county_data['state_county']=='AKSoutheast Fairbanks', 'state_county'] = 'AKSoutheast Fairbanks Census Area'
google_county_data.loc[google_county_data['state_county']=='AKValdez-Cordova', 'state_county'] = 'AKValdez-Cordova Census Area'
google_county_data.loc[google_county_data['state_county']=='MDBaltimore', 'state_county'] = 'MDBaltimore city'

google_county_data.loc[google_county_data['state_county']=='MOSt. Louis', 'state_county'] = 'MOSt. Louis city'
google_county_data.loc[google_county_data['state_county']=='NMDoña Ana County', 'state_county'] = 'NMDona Ana County'
google_county_data.loc[google_county_data['state_county']=='VAAlexandria', 'state_county'] = 'VAAlexandria city'
google_county_data.loc[google_county_data['state_county']=='VABristol', 'state_county'] = 'VABristol city'
google_county_data.loc[google_county_data['state_county']=='VABuena Vista', 'state_county'] = 'VABuena Vista city'
google_county_data.loc[google_county_data['state_county']=='VACharlottesville', 'state_county'] = 'VACharlottesville city'
google_county_data.loc[google_county_data['state_county']=='VAChesapeake', 'state_county'] = 'VAChesapeake city'
google_county_data.loc[google_county_data['state_county']=='VAColonial Heights', 'state_county'] = 'VAColonial Heights city'
google_county_data.loc[google_county_data['state_county']=='VACovington', 'state_county'] = 'VACovington city'

google_county_data.loc[google_county_data['state_county']=='VADanville', 'state_county'] = 'VADanville city'
google_county_data.loc[google_county_data['state_county']=='VAEmporia', 'state_county'] = 'VAEmporia city'
google_county_data.loc[google_county_data['state_county']=='VAFairfax', 'state_county'] = 'VAFairfax city'
google_county_data.loc[google_county_data['state_county']=='VAFalls Church', 'state_county'] = 'VAFalls Church city'
google_county_data.loc[google_county_data['state_county']=='VAFranklin', 'state_county'] = 'VAFranklin city'
google_county_data.loc[google_county_data['state_county']=='VAFredericksburg', 'state_county'] = 'VAFredericksburg city'
google_county_data.loc[google_county_data['state_county']=='VAGalax', 'state_county'] = 'VAGalax city'
google_county_data.loc[google_county_data['state_county']=='VAHampton', 'state_county'] = 'VAHampton city'
google_county_data.loc[google_county_data['state_county']=='VAHarrisonburg', 'state_county'] = 'VAHarrisonburg city'
google_county_data.loc[google_county_data['state_county']=='VAHopewell', 'state_county'] = 'VAHopewell city'

google_county_data.loc[google_county_data['state_county']=='VALexington', 'state_county'] = 'VALexington city'
google_county_data.loc[google_county_data['state_county']=='VALynchburg', 'state_county'] = 'VALynchburg city'
google_county_data.loc[google_county_data['state_county']=='VAManassas', 'state_county'] = 'VAManassas city'
google_county_data.loc[google_county_data['state_county']=='VAManassas Park', 'state_county'] = 'VAManassas Park city'
google_county_data.loc[google_county_data['state_county']=='VAMartinsville', 'state_county'] = 'VAMartinsville city'
google_county_data.loc[google_county_data['state_county']=='VANewport News', 'state_county'] = 'VANewport News city'
google_county_data.loc[google_county_data['state_county']=='VANorfolk', 'state_county'] = 'VANorfolk city'
google_county_data.loc[google_county_data['state_county']=='VANorton', 'state_county'] = 'VANorton city'
google_county_data.loc[google_county_data['state_county']=='VAPetersburg', 'state_county'] = 'VAPetersburg city'
google_county_data.loc[google_county_data['state_county']=='VAPoquoson', 'state_county'] = 'VAPoquoson city'
google_county_data.loc[google_county_data['state_county']=='VAPortsmouth', 'state_county'] = 'VAPortsmouth city'
google_county_data.loc[google_county_data['state_county']=='VARadford', 'state_county'] = 'VARadford city'
google_county_data.loc[google_county_data['state_county']=='VARichmond', 'state_county'] = 'VARichmond city'
google_county_data.loc[google_county_data['state_county']=='VARoanoke', 'state_county'] = 'VARoanoke city'

google_county_data.loc[google_county_data['state_county']=='VASalem', 'state_county'] = 'VASalem city'
google_county_data.loc[google_county_data['state_county']=='VAStaunton', 'state_county'] = 'VAStaunton city'
google_county_data.loc[google_county_data['state_county']=='VASuffolk', 'state_county'] = 'VASuffolk city'
google_county_data.loc[google_county_data['state_county']=='VAVirginia Beach', 'state_county'] = 'VAVirginia Beach city'
google_county_data.loc[google_county_data['state_county']=='VAWaynesboro', 'state_county'] = 'VAWaynesboro city'
google_county_data.loc[google_county_data['state_county']=='VAWilliamsburg', 'state_county'] = 'VAWilliamsburg city'
google_county_data.loc[google_county_data['state_county']=='VAWinchester', 'state_county'] = 'VAWinchester city'

In [142]:
# Attach FIPS code
lefton = ['state_county']
righton = ['state_county']

google_county_data = pd.merge(
    google_county_data, safegraph_ref_data_fips, 
    how='left', left_on=lefton, right_on=righton)

In [143]:
# Create fips_code and data columns
google_county_data['fips_code'] = google_county_data['fips']
google_county_data['date'] = google_county_data['date_goog_cnty']

In [144]:
# List counties which did not get a fips code
list(google_county_data[google_county_data.fips.isnull()]['state_county'].unique())

['SDOglala Lakota County']

In [145]:
# Add missing county manually
google_county_data.loc[google_county_data['state_county']=='SDOglala Lakota County', 'fips_code'] = 'SDOGL'

In [146]:
# Drop columns not needed
google_county_data = google_county_data.drop(columns=['country_region_code_goog_cnty',
       'country_region_goog_cnty', 'sub_region_1_goog_cnty',
      'date_goog_cnty','state_name',
       'state_county', 'state', 'state_fips', 'county_fips', 'county',
       'class_code', 'fips'])

## Combine Data Sources

### Merge Data

In [147]:
# Build a list of tables to join
raw_data_list = [
    enigma_agg_data,
    #enigma_jh_data,
    enigma_nyt_data,
    rearc_nyt_data,
    usafacts_cdc_data
]

In [148]:
# Since we want the sum columns to have different names, put them in a list
sum_cols = [
    ['cases_ea', 'deaths_ea'],
    #['confirmed_ejhu', 'deaths_ejhu'],
    ['cases_enyt', 'deaths_enyt'],
    ['cases_rnyt', 'deaths_rnyt'],
    ['confirmed_cdc', 'deaths_cdc']
]

In [149]:
# Recursively group the tables and put that in a new list
grouped_data_list = []
for data, sum_col in zip(raw_data_list, sum_cols):
    data = data.groupby(['state_code','fips_code', 'date'])[sum_col].agg('sum').reset_index()
    grouped_data_list.append(data)

In [150]:
# Add google mobility county data
grouped_data_list.append(
    google_county_data
)

In [151]:
# Join all the tables together
on_col = ['state_code', 'fips_code', 'date']
covid_data = pd.DataFrame(columns=on_col)
for data in grouped_data_list:
    covid_data = pd.merge(
        covid_data, data, how='outer', left_on=on_col, right_on=on_col
    )

In [152]:
# Add google mobility state data
on_col = ['state_code', 'date']
covid_data = covid_data = pd.merge(covid_data, google_state_data, how='left', left_on=on_col, right_on=on_col)

### Attach County Name

In [153]:
# Create county name lookup table from all data used so far
cols = [
    ['county_name_ea'],
    #['?'],
    ['county_enyt'],
    ['county_rnyt'],
    ['county name_cdc'],
    ['sub_region_2_goog_cnty']  
]

raw_data_list = [
    enigma_agg_data,
    #enigma_jh_data,
    enigma_nyt_data,
    rearc_nyt_data,
    usafacts_cdc_data,
    google_county_data
]

county_data_list = []
for data, col in zip(raw_data_list, cols):
    data = data.groupby(['state_code','fips_code','date']+col).size().reset_index().drop(columns=[0])
    data = data.rename(columns={col[0]:"county_name"})
    county_data_list.append(data)
data = pd.concat(county_data_list).dropna(subset=['county_name']).drop(columns=['date'])
data = data.drop_duplicates(subset=['state_code','fips_code'], keep="first").reset_index(drop=True)
county_names = data

In [154]:
# Attach county name
on_col = ['state_code', 'fips_code']
covid_data = pd.merge(covid_data, county_names, how='left', left_on=on_col, right_on=on_col)

In [155]:
# Attach land area
on_col = ['fips_code']
covid_data = pd.merge(covid_data, safegraph_land, how='left', left_on=on_col, right_on=on_col)

In [156]:
# Drop extra google county name columns
covid_data = covid_data.drop(columns=['sub_region_2_goog_cnty'])

### Attach Lat and Long

In [157]:
# Create lat long lookup table
lat_long_data = enigma_agg_data.groupby(['state_code','fips_code','lat_ea','long_ea']).size().reset_index().drop(columns=[0])
lat_long_data = lat_long_data.dropna(subset=['lat_ea','long_ea'])
lat_long_data = lat_long_data.drop_duplicates(subset=['state_code','fips_code'], keep="first").reset_index(drop=True)

In [158]:
# Join lat long data
on_col = ['state_code', 'fips_code']
covid_data = pd.merge(covid_data, lat_long_data, how='left', left_on=on_col, right_on=on_col)

### Attach Safegraph Census Data

In [159]:
# Join safegraph census data
covid_data = pd.merge(covid_data, safegraph_census_data, how='outer', left_on='fips_code', right_on='fips_code')

### Save Master Table

In [160]:
with open('covid_data.p', 'wb') as f:
    pickle.dump(covid_data, f)

### Control Totals

In [161]:
# Safegraph census data control
covid_data[covid_data['date']=='2020-05-10'].pop_total.sum() + covid_data[covid_data['date'].isnull()].pop_total.sum() - safegraph_census_data.pop_total.sum()

0.0

In [162]:
print(enigma_agg_data.cases_ea.sum() - covid_data.cases_ea.sum())
#print(enigma_jh_data.cases_ejhu.sum() - covid_data.cases_ejhu.sum()
print(enigma_nyt_data.cases_enyt.sum() - covid_data.cases_enyt.sum())
print(rearc_nyt_data.cases_rnyt.sum() - covid_data.cases_rnyt.sum())
print(usafacts_cdc_data.confirmed_cdc.sum() - covid_data.confirmed_cdc.sum())

0.0
0.0
0.0
0.0


In [163]:
print(enigma_agg_data.deaths_ea.sum() - covid_data.deaths_ea.sum())
#print(enigma_jh_data.deaths_ejhu.sum() - covid_data.deaths_ejhu.sum()
print(enigma_nyt_data.deaths_enyt.sum() - covid_data.deaths_enyt.sum())
print(rearc_nyt_data.deaths_rnyt.sum() - covid_data.deaths_rnyt.sum())
print(usafacts_cdc_data.deaths_cdc.sum() - covid_data.deaths_cdc.sum())

0.0
0.0
0.0
0.0
