# Create the Master Data Table

The following data sources were used:

USA Facts (Affiliated with the CDC)
* usafacts_cdc.p created with the notebook usafacts_cdc_data_explore.ipynb 
* has NYC county level time series data, that no other source has
* has FIPS code already on
* comes with its own population attached

Census Data (ACS Survey)
* census_master.p created with the notebook cencus_data_explore.ipynb
* has a wealth of fields to attach to counties
* comes from the 'covid' area of the census FTP site

Google Mobility Data
* Apple data is not automatically downloadable
* simple and easy to use



In [1]:
import pandas as pd, ftplib, io, pickle, boto3, numpy as np

# Data Pipepline

## Download and Aggregate Census Data

In [2]:
def getCensusData():
    ftp = ftplib.FTP('ftp2.census.gov')
    print(ftp.login())
    ftp.cwd('programs-surveys/acs/data/covid_19/Data_Profiles_for_HHS/050-County_By_State')
    
    files = ftp.nlst()
    
    # Load and transform files
    data = pd.DataFrame()
    data_list = []
    for file in files:
        download_file = io.BytesIO()
        ftp.retrbinary("RETR {}".format(file), download_file.write)
        download_file.seek(0) # after writing go back to the start of the virtual file
        df = pd.read_csv(download_file, encoding = "ISO-8859-1") # read virtual file into pandas

        # Transform the df to get ready to transpose
        df = df.drop(columns=['Line Number','Table ID'])
        df = df[df.columns.drop(list(df.filter(regex='Percent')))]
        df.columns = df.columns.str.rstrip(' Estimate')  

        df = df.rename(columns = {'Description':'County'})

        # Transpose the df so that we can aggregate files with rows for each county
        df = df.T

        # Fix the column names
        col_names = df.iloc[0,]
        df.columns = col_names

        # Drop the row with the column names
        df = df.drop(df.index[0])

        # Add a state columns
        df["State"] = file[-6:-4].upper()

        # Add the data to a list
        data_list.append(df)

    print(ftp.quit())
    print("Data successfully downloaded.")
    return data_list

In [3]:
data_list = getCensusData()

230-Server: ftp2.census.gov
230-
230-Personal Identifiable Information (PII) shall not be placed on the FTP
230-server without prior special arrangement and in conjunction with ITSO.
230-
230-NOTE: The data available for anonymous FTP download on this FTP server are
230-also available over the Web:
230-http://www2.census.gov
230 Login successful.
221 Goodbye.
Data successfully downloaded.


In [4]:
def createCensusMasterTable(data_list):
    data = pd.DataFrame()
    data_list_2 = []
    
    # Concatenate columns
    for i in range(int(len(data_list)/4)):
        df = pd.concat([data_list[i], data_list[i+52], data_list[i+2*52], data_list[i+3*52]], axis=1)
        data_list_2.append(df)
    
    # Puerto Rico table title says "Selected Social Characteristics in "Puerto Rico" which causes a problem with concatentate
    data_list_2.pop(39)
    
    # Concatenate rows
    data = pd.concat(data_list_2)
    return data

In [5]:
census_data = createCensusMasterTable(data_list)

In [6]:
census_data.head(2)

County,SELECTED SOCIAL CHARACTERISTICS IN THE UNITED STATES,HOUSEHOLDS BY TYPE,Total households,Family households (families),With own children of the householder under 18 years,Married-couple family,With own children of the householder under 18 years.1,"Male householder, no wife present, family",With own children of the householder under 18 years.2,"Female householder, no husband present, family",...,Two races including Some other race,"Two races excluding Some other race, and Three or more races",NaN,Total housing units,NaN.1,"CITIZEN, VOTING AGE POPULATION","Citizen, 18 and over population",Male,Female,State
Aleutians East Borough,,,860,538,243,346,122,79,29,113,...,0,222,,1106,,,2122,1221,901,AK
Aleutians West Census Ar,,,1176,716,407,510,288,87,51,119,...,7,255,,1967,,,3511,2308,1203,AK


## Helper Functions
* Be sure to set your aws credentials in the virtual environment using 
* pipenv shell
* aws configure
* enter your ID and secret key

In [1075]:
def getS3Data(bucket, file_loc, csv=True):
    client = boto3.client('s3')
    obj = client.get_object(Bucket=bucket, Key=file_loc)
    if(csv):
        data = pd.read_csv(obj['Body'] , dtype=str)
    else:
        data = pd.read_json(obj['Body'])
    
    return data

In [984]:
def dataCleanUp(data, date_field, fips_field, case_field, death_field, suffix): 
    print("Shape: ", data.shape)
    data[fips_field] = data[fips_field].astype('str').apply(lambda x: x.zfill(5))
    data[date_field] = data[date_field].astype('datetime64')
    data[case_field] = data[case_field].astype('float64')
    print("Case control total: ", data[case_field].sum())
    data[death_field] = data[death_field].astype('float64')
    print("Death control total: ", data[death_field].sum())
    data = data.add_suffix(suffix)
    data['date'] = data[date_field + suffix]
    data = data.rename(str.lower, axis='columns') 
    return data

In [877]:
def getDataInfo(data, case_field, death_field):
    print("Dataframe Shape: ", data.shape)
    print("Number of States: ", len(data['state_code'].unique()))
    print("Number of Counties: ", len(data['fips_code'].unique()))
    print("Minimum Date: ", data['date'].min())
    print("Maximum Date: ", data['date'].max())
    print("Duplicate State-Fips-Date: ", 
          data.groupby(['state_code','fips_code', 'date']).size().reset_index().rename(columns={0:'count'})['count'].sum() 
          - data.shape[0])
    print("Null State Code: ", data[data.state_code.isnull()].size)
    print("Null County Code: ", data[data.fips_code.isnull()].size)
    print("Null Dates: ", enigma_agg_data[enigma_agg_data.date.isnull()].size)
    
    print("Case Control Total: ", data[case_field].sum())
    print("Death Control Total: ", data[death_field].sum())

In [10]:
def sumDups(data, county_field, date_field, sum_field):
    out = data[
        data.duplicated([county_field, date_field]) & 
        (data[date_field] == data[date_field].max())
    ][sum_field].sum()
    return out

## Static Data

### County reference data

In [11]:
county_ref_data = getS3Data('covid19-lake', 'static-datasets/csv/CountyPopulation/County_Population.csv')

In [12]:
county_ref_data.columns = ['big_fips', 'fips', 'county', 'state', 'population_2018']

In [13]:
county_ref_data['fips'] = county_ref_data['fips'].astype('Int64').astype('str').apply(lambda x: x.zfill(5))

In [14]:
county_ref_data.head(2)

Unnamed: 0,big_fips,fips,county,state,population_2018
0,0500000US01001,1001,Autauga,Alabama,55601
1,0500000US01003,1003,Baldwin,Alabama,218022


In [15]:
county_ref_data.shape

(3220, 5)

### State reference data

In [16]:
state_ref_data = getS3Data('covid19-lake', 'static-datasets/csv/state-abv/states_abv.csv')

In [17]:
state_ref_data.columns = ['state_name','state_code']

In [312]:
state_ref_data.head(2)

Unnamed: 0,state_name,state_code
0,Alabama,AL
1,Alaska,AK


In [19]:
state_ref_data.shape

(51, 2)

### Safegraph Open Census Data
* See notebook: safe_graph_census_data_explore.ipynb

In [20]:
#Add these later as needed

#safegraph_census_data = pickle.load(open('safegraph_census.p','rb'))
#safegraph_census_data_fields = pickle.load(open('safegraph_metadata_fields.p','rb'))

#### FIPS Reference Data

In [21]:
safegraph_ref_data_fips = pickle.load(open('safegraph_ref_data_fips.p','rb'))

In [554]:
# Create state_county concatenation for lookup on google mobility data
safegraph_ref_data_fips['state_county'] = safegraph_ref_data_fips['state'] + safegraph_ref_data_fips['county']

In [22]:
safegraph_ref_data_fips.head(2)

Unnamed: 0,state,state_fips,county_fips,county,class_code,fips
0,AL,1,1,Autauga County,H1,1001
1,AL,1,3,Baldwin County,H1,1003


#### Land area

In [23]:
safegraph_land = pickle.load(open('safegraph_land.p','rb'))

In [24]:
safegraph_land.reset_index(level=0, inplace=True)

In [25]:
safegraph_land.head(2)

Unnamed: 0,fips,amount_land,amount_water
0,1001,1539609015,25749942
1,1003,4117584019,1133130502


## COVID Cases and Deaths Timeseries Data

### Enigma Aggregation

#### Download data and cleanup

In [1137]:
enigma_agg_data = getS3Data('covid19-lake', 'enigma-aggregation/csv/us_counties/enigma_covid_19_us_counties.csv')

In [1138]:
enigma_agg_data = dataCleanUp(enigma_agg_data, 'date', 'county_fips', 'cases','deaths','_ea')

Shape:  (158981, 10)
Case control total:  32826237.0
Death control total:  1929121.0


In [1139]:
# Change a few fips codes
enigma_agg_data['fips_code'] = enigma_agg_data['county_fips_ea']
enigma_agg_data.loc[enigma_agg_data['fips_code'] == '0<NA>', 'fips_code'] = '00000'

In [1140]:
# Attach 2 digit state code from state fips code
tmp_state = safegraph_ref_data_fips.groupby(['state_fips','state']).size().reset_index().rename(columns={0:'count'})
enigma_agg_data['state_fips_ea'] = enigma_agg_data['state_fips_ea'].astype('str').apply(lambda x: x.zfill(2))

lefton = ['state_fips_ea']
righton = ['state_fips']

enigma_agg_data = pd.merge(enigma_agg_data, tmp_state[['state_fips','state']], how='left', left_on=lefton, right_on=righton)

enigma_agg_data['state_code'] = enigma_agg_data['state']
enigma_agg_data['state_name'] = enigma_agg_data['state_name_ea']

enigma_agg_data = enigma_agg_data.drop(columns=['state', 'state_fips'])

#### Checks

In [1141]:
getDataInfo(enigma_agg_data, 'cases_ea', 'deaths_ea')

Dataframe Shape:  (158981, 14)
Number of States:  55
Number of Counties:  2918
Minimum Date:  2020-01-21 00:00:00
Maximum Date:  2020-05-19 00:00:00
Duplicate State-Fips-Date:  0
Null State Code:  0
Null County Code:  0
Null Dates:  0
Case Control Total:  32826237.0
Death Control Total:  1929121.0


### Enigma Johns Hopkins University

#### Download data and basic summary

In [1079]:
# NEED TO CHANGE FUNCTION TO , dtype=str and then add some typing logic
### THIS SECTION NEEDS FIXING - they just removed the csv!
#enigma_jh_data = getS3Data('covid19-lake', 'enigma-jhu-timeseries/csv/jhu_csse_covid_19_timeseries_merged.csv')

In [1082]:
# Filter to only US
  enigma_jh_data = enigma_jh_data[enigma_jh_data['iso2']=='US']

In [325]:
enigma_jh_data = dataCleanUp(enigma_jh_data, 'date', 'fips', 'confirmed', 'deaths', '_ejhu')

In [328]:
# Create clean fips_code
enigma_jh_data['fips_code'] = enigma_jh_data['fips_ejhu']
enigma_jh_data.loc[enigma_jh_data['admin2_ejhu'].isin(jh_unknown_fips_counties), 'fips_code'] = '00000'

In [None]:
# Add "Out of XX" counties to "Unknown"
jh_unknown_fips_counties = ['Out of AL', 'Out of AK',
       'Out of AZ', 'Out of AR', 'Out of CA', 'Out of CO', 'Out of CT',
       'Out of DE', 'Out of DC', 'Out of FL', 'Out of GA', 'Out of HI',
       'Out of ID', 'Out of IL', 'Out of IN', 'Out of IA', 'Out of KS',
       'Out of KY', 'Out of LA', 'Out of ME', 'Out of MD', 'Out of MA',
       'Out of MI', 'Out of MN', 'Out of MS', 'Out of MO', 'Out of MT',
       'Out of NE', 'Out of NV', 'Out of NH', 'Out of NJ', 'Out of NM',
       'Out of NY', 'Out of NC', 'Out of ND', 'Out of OH', 'Out of OK',
       'Out of OR', 'Out of PA', 'Out of RI', 'Out of SC', 'Out of SD',
       'Out of TN', 'Out of TX', 'Out of UT', 'Out of VT', 'Out of VA',
       'Out of WA', 'Out of WV', 'Out of WI', 'Out of WY']
enigma_jh_data.loc[enigma_jh_data['admin2_ejhu'].isin(jh_unknown_fips_counties), 'fips_code'] = '00000'

In [329]:
enigma_jh_data.loc[enigma_jh_data['admin2_ejhu']=='Dukes and Nantucket', 'fips_code'] = 'n0001'
enigma_jh_data.loc[enigma_jh_data['admin2_ejhu']=='Kansas City', 'fips_code'] = 'n0002'
enigma_jh_data.loc[enigma_jh_data['admin2_ejhu']=='Michigan Department of Corrections (MDOC)', 'fips_code'] = 'n0003'
enigma_jh_data.loc[enigma_jh_data['admin2_ejhu']=='Federal Correctional Institution (FCI)','fips_code'] = 'n0004'
enigma_jh_data.loc[enigma_jh_data['admin2_ejhu']=='Bear River', 'fips_code'] = 'n0005'
enigma_jh_data.loc[enigma_jh_data['admin2_ejhu']=='Central Utah', 'fips_code'] = 'n0006'
enigma_jh_data.loc[enigma_jh_data['admin2_ejhu']=='Southeast Utah', 'fips_code'] = 'n0007'
enigma_jh_data.loc[enigma_jh_data['admin2_ejhu']=='Southwest Utah', 'fips_code'] = 'n0008'
enigma_jh_data.loc[enigma_jh_data['admin2_ejhu']=='TriCounty', 'fips_code'] = 'n0009'
enigma_jh_data.loc[enigma_jh_data['admin2_ejhu']=='Weber-Morgan', 'fips_code'] = 'n0010'

In [330]:
lefton = ['province_state_ejhu']
righton = ['state_name']

enigma_jh_data = pd.merge(
    enigma_jh_data, state_ref_data, 
    how='left', left_on=lefton, right_on=righton)

In [348]:
enigma_jh_data.loc[enigma_jh_data['province_state_ejhu']=='Diamond Princess', 'state_code'] = 'CA'
enigma_jh_data.loc[enigma_jh_data['province_state_ejhu']=='Grand Princess', 'state_code'] = 'CA'

enigma_jh_data.loc[enigma_jh_data['province_state_ejhu']=='Diamond Princess', 'state_name'] = 'California'
enigma_jh_data.loc[enigma_jh_data['province_state_ejhu']=='Grand Princess', 'state_name'] = 'California'

In [375]:
enigma_jh_data['date'] = enigma_jh_data['date_ejhu']

In [376]:
enigma_jh_data.head(2)

Unnamed: 0,uid_ejhu,fips_ejhu,iso2_ejhu,iso3_ejhu,code3_ejhu,admin2_ejhu,latitude_ejhu,longitude_ejhu,province_state_ejhu,country_region_ejhu,date_ejhu,confirmed_ejhu,deaths_ejhu,recovered_ejhu,fips_code,state_name,state_code,date
0,84001001.0,1001,US,USA,840.0,Autauga,32.539527,-86.644082,Alabama,US,2020-01-22,0,0,,1001,Alabama,AL,2020-01-22
1,84001003.0,1003,US,USA,840.0,Baldwin,30.72775,-87.722071,Alabama,US,2020-01-22,0,0,,1003,Alabama,AL,2020-01-22


#### Checks

In [1085]:
getDataInfo(enigma_jh_data, 'confirmed_ejhu', 'deaths_ejhu')

Dataframe Shape:  (377696, 18)
Number of States:  51
Number of Counties:  3206
Minimum Date:  2020-01-22 00:00:00
Maximum Date:  2020-05-16 00:00:00
Duplicate State-Fips-Date:  0
Null State Code:  0
Null County Code:  0
Null Dates:  0
Case Control Total:  41132706
Death Control Total:  2271471


In [1086]:
pickle.dump(enigma_jh_data, open('enigma_jh_data_20200517.p','wb'))

### Enigma New York Times

#### Download data and clean data

In [1201]:
enigma_nyt_data = getS3Data('covid19-lake', 'enigma-nytimes-data-in-usa/csv/us_county/us_county.csv')

In [1202]:
enigma_nyt_data = dataCleanUp(enigma_nyt_data, 'date', 'fips', 'cases','deaths','_enyt')

Shape:  (129747, 6)
Case control total:  31616769.0
Death control total:  1652495.0


In [1203]:
# Fix Fips codes
enigma_nyt_data['fips_code'] = enigma_nyt_data['fips_enyt']
enigma_nyt_data.loc[enigma_nyt_data['county_enyt']=='New York City', 'fips_code'] = 'NYC000'
enigma_nyt_data.loc[enigma_nyt_data['county_enyt']=='New York City', 'fips_code'] = 'KC0000'
enigma_nyt_data.loc[enigma_nyt_data['county_enyt']=='Unknown', 'fips_code'] = '00000'

In [1204]:
# Add state
lefton = ['state_enyt']
righton = ['state_name']

enigma_nyt_data = pd.merge(
    enigma_nyt_data, state_ref_data, 
    how='left', left_on=lefton, right_on=righton)

In [1205]:
# Fix for extra states
enigma_nyt_data.loc[enigma_nyt_data['state_enyt']=='Puerto Rico', 'state_code'] = 'PR'
enigma_nyt_data.loc[enigma_nyt_data['state_enyt']=='Virgin Islands', 'state_code'] = 'VI'
enigma_nyt_data.loc[enigma_nyt_data['state_enyt']=='Guam', 'state_code'] = 'GU'
enigma_nyt_data.loc[enigma_nyt_data['state_enyt']=='Northern Mariana Islands', 'state_code'] = 'MP'

enigma_nyt_data.loc[enigma_nyt_data['state_enyt']=='Puerto Rico', 'state_name'] = 'Puerto Rico'
enigma_nyt_data.loc[enigma_nyt_data['state_enyt']=='Virgin Islands', 'state_name'] = 'Virgin Islands'
enigma_nyt_data.loc[enigma_nyt_data['state_enyt']=='Guam', 'state_name'] = 'Guam'
enigma_nyt_data.loc[enigma_nyt_data['state_enyt']=='Northern Mariana Islands', 'state_name'] = 'Northern Mariana Islands'

In [1206]:
enigma_nyt_data.head(2)

Unnamed: 0,date_enyt,county_enyt,state_enyt,fips_enyt,cases_enyt,deaths_enyt,date,fips_code,state_name,state_code
0,2020-01-21,Snohomish,Washington,53061,1.0,0.0,2020-01-21,53061,Washington,WA
1,2020-01-22,Snohomish,Washington,53061,1.0,0.0,2020-01-22,53061,Washington,WA


#### Checks

In [1207]:
getDataInfo(enigma_nyt_data, 'cases_enyt', 'deaths_enyt')

Dataframe Shape:  (129747, 10)
Number of States:  55
Number of Counties:  2885
Minimum Date:  2020-01-21 00:00:00
Maximum Date:  2020-05-09 00:00:00
Duplicate State-Fips-Date:  0
Null State Code:  0
Null County Code:  0
Null Dates:  0
Case Control Total:  31616769.0
Death Control Total:  1652495.0


### Rearc New York Times

#### Download data and basic summary

In [1170]:
rearc_nyt_data = getS3Data('covid19-lake', 'rearc-covid-19-nyt-data-in-usa/csv/us-counties/us-counties.csv')

In [1171]:
rearc_nyt_data = dataCleanUp(rearc_nyt_data, 'date', 'fips', 'cases','deaths','_rnyt')

Shape:  (161930, 6)
Case control total:  47539415.0
Death control total:  2606722.0


In [1172]:
rearc_nyt_data['fips_code'] = rearc_nyt_data['fips_rnyt']
rearc_nyt_data.loc[rearc_nyt_data['county_rnyt']=='New York City', 'fips_code'] = 'NYC000'
rearc_nyt_data.loc[rearc_nyt_data['county_rnyt']=='New York City', 'fips_code'] = 'KC0000'
rearc_nyt_data.loc[rearc_nyt_data['county_rnyt']=='Unknown', 'fips_code'] = '00000'

In [1173]:
lefton = ['state_rnyt']
righton = ['state_name']

rearc_nyt_data = pd.merge(
    rearc_nyt_data, state_ref_data, 
    how='left', left_on=lefton, right_on=righton)

In [1174]:
rearc_nyt_data.loc[rearc_nyt_data['state_rnyt']=='Puerto Rico', 'state_code'] = 'PUR'
rearc_nyt_data.loc[rearc_nyt_data['state_rnyt']=='Virgin Islands', 'state_code'] = 'VGI'
rearc_nyt_data.loc[rearc_nyt_data['state_rnyt']=='Guam', 'state_code'] = 'GUM'
rearc_nyt_data.loc[rearc_nyt_data['state_rnyt']=='Northern Mariana Islands', 'state_code'] = 'NMI'

rearc_nyt_data.loc[rearc_nyt_data['state_rnyt']=='Puerto Rico', 'state_name'] = 'Puerto Rico'
rearc_nyt_data.loc[rearc_nyt_data['state_rnyt']=='Virgin Islands', 'state_name'] = 'Virgin Islands'
rearc_nyt_data.loc[rearc_nyt_data['state_rnyt']=='Guam', 'state_name'] = 'Guam'
rearc_nyt_data.loc[rearc_nyt_data['state_rnyt']=='Northern Mariana Islands', 'state_name'] = 'Northern Mariana Islands'

In [1175]:
rearc_nyt_data['date'] = rearc_nyt_data['date_rnyt']

In [1185]:
rearc_nyt_data.head(2)

Unnamed: 0,date_rnyt,county_rnyt,state_rnyt,fips_rnyt,cases_rnyt,deaths_rnyt,date,fips_code,state_name,state_code
0,2020-01-21,Snohomish,Washington,53061,1.0,0.0,2020-01-21,53061,Washington,WA
1,2020-01-22,Snohomish,Washington,53061,1.0,0.0,2020-01-22,53061,Washington,WA


#### Checks

In [924]:
getDataInfo(rearc_nyt_data, 'cases_rnyt','deaths_rnyt')

Dataframe Shape:  (158981, 10)
Number of States:  55
Number of Counties:  2920
Minimum Date:  2020-01-21 00:00:00
Maximum Date:  2020-05-19 00:00:00
Duplicate State-Fips-Date:  0
Null State Code:  0
Null County Code:  0
Null Dates:  0
Case Control Total:  45979388.0
Death Control Total:  2513314.0


### USA Facts (CDC Affiliate)

#### Download data and basic summary

In [1005]:
# Download the data
url="https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_confirmed_usafacts.csv"
data_confirmed = pd.read_csv(url)

In [1006]:
url = "https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_deaths_usafacts.csv"
data_deaths = pd.read_csv(url)

In [1007]:
url = "https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_county_population_usafacts.csv"
data_county_pop = pd.read_csv(url)

In [1008]:
# Move the date column headers into a single column as rows
data_confirmed = data_confirmed.melt(
    id_vars=['countyFIPS','County Name','State','stateFIPS'], 
    var_name='Date', value_name='Confirmed')

data_deaths = data_deaths.melt(
    id_vars=['countyFIPS','County Name','State','stateFIPS'], 
    var_name='Date', value_name='Deaths')

In [1009]:
# Join the raw tables together

# Part 1
lefton = ['countyFIPS', 'County Name', 'State', 'stateFIPS', 'Date']
righton = ['countyFIPS', 'County Name', 'State', 'stateFIPS', 'Date']

data = pd.merge(data_confirmed, data_deaths, how='left', left_on=lefton, right_on=righton)

# Part 2
lefton = ['countyFIPS', 'County Name', 'State']
righton = ['countyFIPS', 'County Name', 'State']

usafacts_cdc_data = pd.merge(data, data_county_pop, how='left', left_on=lefton, right_on=righton)

In [1010]:
usafacts_cdc_data = dataCleanUp(usafacts_cdc_data, 'Date', 'countyFIPS', 'Confirmed','Deaths','_cdc')

Shape:  (380205, 8)
Case control total:  45500922.0
Death control total:  2478485.0


In [1012]:
# Fips code clean up
usafacts_cdc_data['fips_code'] = usafacts_cdc_data['countyfips_cdc']
usafacts_cdc_data.loc[(usafacts_cdc_data['county name_cdc']=='Grand Princess Cruise Ship'), 'fips_code'] = '99999'

In [1013]:
usafacts_cdc_data['state_code'] = usafacts_cdc_data['state_cdc']

In [1014]:
# Attach State Name
lefton = ['state_code']
righton = ['state_code']

usafacts_cdc_data = pd.merge(
    usafacts_cdc_data, state_ref_data, 
    how='left', left_on=lefton, right_on=righton)

In [1015]:
usafacts_cdc_data.head(2)

Unnamed: 0,countyfips_cdc,county name_cdc,state_cdc,statefips_cdc,date_cdc,confirmed_cdc,deaths_cdc,population_cdc,date,fips_code,state_code,state_name
0,0,Statewide Unallocated,AL,1,2020-01-22,0.0,0.0,0.0,2020-01-22,0,AL,Alabama
1,1001,Autauga County,AL,1,2020-01-22,0.0,0.0,55869.0,2020-01-22,1001,AL,Alabama


#### Checks

In [1016]:
getDataInfo(usafacts_cdc_data, 'confirmed_cdc','deaths_cdc')

Dataframe Shape:  (380205, 12)
Number of States:  51
Number of Counties:  3146
Minimum Date:  2020-01-22 00:00:00
Maximum Date:  2020-05-19 00:00:00
Duplicate State-Fips-Date:  0
Null State Code:  0
Null County Code:  0
Null Dates:  0
Case Control Total:  45500922.0
Death Control Total:  2478485.0


### Google Mobility Data

#### Download data and basic summary

In [1054]:
url = "https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv?cachebust=57b4ac4fc4052"
data = pd.read_csv(url, dtype=str)

In [1055]:
data = data[data['country_region_code'] == 'US']
#data = data[data.sub_region_2.notnull()] #Think about this filter in join - may want two tables, state, and county level
data['date'] = data['date'].astype('datetime64')

In [1056]:
google_data = data

In [1057]:
google_data.head(2)

Unnamed: 0,country_region_code,country_region,sub_region_1,sub_region_2,date,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline
108182,US,United States,,,2020-02-15,6,2,15,3,2,-1
108183,US,United States,,,2020-02-16,7,1,16,2,0,-1


#### State Level Table

In [1058]:
google_state_data = google_data[
    (google_data['sub_region_1'].notnull()) &
    (~google_data['sub_region_2'].notnull())
]
google_state_data = google_state_data.add_suffix("_goog_st")

In [1059]:
lefton = ['sub_region_1_goog_st']
righton = ['state_name']

google_state_data = pd.merge(
    google_state_data, state_ref_data, 
    how='left', left_on=lefton, right_on=righton)

In [1060]:
google_state_data.head(2)

Unnamed: 0,country_region_code_goog_st,country_region_goog_st,sub_region_1_goog_st,sub_region_2_goog_st,date_goog_st,retail_and_recreation_percent_change_from_baseline_goog_st,grocery_and_pharmacy_percent_change_from_baseline_goog_st,parks_percent_change_from_baseline_goog_st,transit_stations_percent_change_from_baseline_goog_st,workplaces_percent_change_from_baseline_goog_st,residential_percent_change_from_baseline_goog_st,state_name,state_code
0,US,United States,Alabama,,2020-02-15,5,2,39,7,2,-1,Alabama,AL
1,US,United States,Alabama,,2020-02-16,0,-2,-7,3,-1,1,Alabama,AL


#### County Level Table

In [1061]:
google_county_data = google_data[
    (google_data['sub_region_1'].notnull()) &
    (google_data['sub_region_2'].notnull())
]
google_county_data = google_county_data.add_suffix("_goog_cnty")

In [1062]:
google_county_data.head(2)

Unnamed: 0,country_region_code_goog_cnty,country_region_goog_cnty,sub_region_1_goog_cnty,sub_region_2_goog_cnty,date_goog_cnty,retail_and_recreation_percent_change_from_baseline_goog_cnty,grocery_and_pharmacy_percent_change_from_baseline_goog_cnty,parks_percent_change_from_baseline_goog_cnty,transit_stations_percent_change_from_baseline_goog_cnty,workplaces_percent_change_from_baseline_goog_cnty,residential_percent_change_from_baseline_goog_cnty
108360,US,United States,Alabama,Autauga County,2020-02-15,5,7,,,-4,
108361,US,United States,Alabama,Autauga County,2020-02-16,0,1,-23.0,,-4,


#### County Data

In [1063]:
# Add state code
lefton = ['sub_region_1_goog_cnty']
righton = ['state_name']

google_county_data = pd.merge(
    google_county_data, state_ref_data, 
    how='left', left_on=lefton, right_on=righton)

In [1064]:
# Create state_county concatenation
google_county_data['state_county'] = google_county_data['state_code'] + google_county_data['sub_region_2_goog_cnty']

In [1065]:
# Manual name fixes
google_county_data.loc[google_county_data['state_county']=='AKAnchorage', 'state_county'] = 'AKAnchorage Municipality'
google_county_data.loc[google_county_data['state_county']=='AKBethel', 'state_county'] = 'AKBethel Census Area'
google_county_data.loc[google_county_data['state_county']=='AKFairbanks North Star', 'state_county'] = 'AKFairbanks North Star Borough'
google_county_data.loc[google_county_data['state_county']=='AKJuneau', 'state_county'] = 'AKJuneau City and Borough'
google_county_data.loc[google_county_data['state_county']=='AKKetchikan Gateway', 'state_county'] = 'AKKetchikan Gateway Borough'
google_county_data.loc[google_county_data['state_county']=='AKKodiak Island', 'state_county'] = 'AKKodiak Island Borough'
google_county_data.loc[google_county_data['state_county']=='AKMatanuska-Susitna', 'state_county'] = 'AKMatanuska-Susitna Borough'
google_county_data.loc[google_county_data['state_county']=='AKNorth Slope', 'state_county'] = 'AKNorth Slope Borough'
google_county_data.loc[google_county_data['state_county']=='AKSitka', 'state_county'] = 'AKSitka City and Borough'
google_county_data.loc[google_county_data['state_county']=='AKSoutheast Fairbanks', 'state_county'] = 'AKSoutheast Fairbanks Census Area'
google_county_data.loc[google_county_data['state_county']=='AKValdez-Cordova', 'state_county'] = 'AKValdez-Cordova Census Area'
google_county_data.loc[google_county_data['state_county']=='MDBaltimore', 'state_county'] = 'MDBaltimore city'

google_county_data.loc[google_county_data['state_county']=='MOSt. Louis', 'state_county'] = 'MOSt. Louis city'
google_county_data.loc[google_county_data['state_county']=='NMDoña Ana County', 'state_county'] = 'NMDona Ana County'
google_county_data.loc[google_county_data['state_county']=='VAAlexandria', 'state_county'] = 'VAAlexandria city'
google_county_data.loc[google_county_data['state_county']=='VABristol', 'state_county'] = 'VABristol city'
google_county_data.loc[google_county_data['state_county']=='VABuena Vista', 'state_county'] = 'VABuena Vista city'
google_county_data.loc[google_county_data['state_county']=='VACharlottesville', 'state_county'] = 'VACharlottesville city'
google_county_data.loc[google_county_data['state_county']=='VAChesapeake', 'state_county'] = 'VAChesapeake city'
google_county_data.loc[google_county_data['state_county']=='VAColonial Heights', 'state_county'] = 'VAColonial Heights city'
google_county_data.loc[google_county_data['state_county']=='VACovington', 'state_county'] = 'VACovington city'

google_county_data.loc[google_county_data['state_county']=='VADanville', 'state_county'] = 'VADanville city'
google_county_data.loc[google_county_data['state_county']=='VAEmporia', 'state_county'] = 'VAEmporia city'
google_county_data.loc[google_county_data['state_county']=='VAFairfax', 'state_county'] = 'VAFairfax city'
google_county_data.loc[google_county_data['state_county']=='VAFalls Church', 'state_county'] = 'VAFalls Church city'
google_county_data.loc[google_county_data['state_county']=='VAFranklin', 'state_county'] = 'VAFranklin city'
google_county_data.loc[google_county_data['state_county']=='VAFredericksburg', 'state_county'] = 'VAFredericksburg city'
google_county_data.loc[google_county_data['state_county']=='VAGalax', 'state_county'] = 'VAGalax city'
google_county_data.loc[google_county_data['state_county']=='VAHampton', 'state_county'] = 'VAHampton city'
google_county_data.loc[google_county_data['state_county']=='VAHarrisonburg', 'state_county'] = 'VAHarrisonburg city'
google_county_data.loc[google_county_data['state_county']=='VAHopewell', 'state_county'] = 'VAHopewell city'

google_county_data.loc[google_county_data['state_county']=='VALexington', 'state_county'] = 'VALexington city'
google_county_data.loc[google_county_data['state_county']=='VALynchburg', 'state_county'] = 'VALynchburg city'
google_county_data.loc[google_county_data['state_county']=='VAManassas', 'state_county'] = 'VAManassas city'
google_county_data.loc[google_county_data['state_county']=='VAManassas Park', 'state_county'] = 'VAManassas Park city'
google_county_data.loc[google_county_data['state_county']=='VAMartinsville', 'state_county'] = 'VAMartinsville city'
google_county_data.loc[google_county_data['state_county']=='VANewport News', 'state_county'] = 'VANewport News city'
google_county_data.loc[google_county_data['state_county']=='VANorfolk', 'state_county'] = 'VANorfolk city'
google_county_data.loc[google_county_data['state_county']=='VANorton', 'state_county'] = 'VANorton city'
google_county_data.loc[google_county_data['state_county']=='VAPetersburg', 'state_county'] = 'VAPetersburg city'
google_county_data.loc[google_county_data['state_county']=='VAPoquoson', 'state_county'] = 'VAPoquoson city'
google_county_data.loc[google_county_data['state_county']=='VAPortsmouth', 'state_county'] = 'VAPortsmouth city'
google_county_data.loc[google_county_data['state_county']=='VARadford', 'state_county'] = 'VARadford city'
google_county_data.loc[google_county_data['state_county']=='VARichmond', 'state_county'] = 'VARichmond city'
google_county_data.loc[google_county_data['state_county']=='VARoanoke', 'state_county'] = 'VARoanoke city'

google_county_data.loc[google_county_data['state_county']=='VASalem', 'state_county'] = 'VASalem city'
google_county_data.loc[google_county_data['state_county']=='VAStaunton', 'state_county'] = 'VAStaunton city'
google_county_data.loc[google_county_data['state_county']=='VASuffolk', 'state_county'] = 'VASuffolk city'
google_county_data.loc[google_county_data['state_county']=='VAVirginia Beach', 'state_county'] = 'VAVirginia Beach city'
google_county_data.loc[google_county_data['state_county']=='VAWaynesboro', 'state_county'] = 'VAWaynesboro city'
google_county_data.loc[google_county_data['state_county']=='VAWilliamsburg', 'state_county'] = 'VAWilliamsburg city'
google_county_data.loc[google_county_data['state_county']=='VAWinchester', 'state_county'] = 'VAWinchester city'

In [1066]:
# Attach FIPS code
lefton = ['state_county']
righton = ['state_county']

google_county_data = pd.merge(
    google_county_data, safegraph_ref_data_fips, 
    how='left', left_on=lefton, right_on=righton)

In [1069]:
google_county_data['fips_code'] = google_county_data['fips']
google_county_data['date'] = google_county_data['date_goog_cnty']

In [1070]:
google_county_data.head(2)

Unnamed: 0,country_region_code_goog_cnty,country_region_goog_cnty,sub_region_1_goog_cnty,sub_region_2_goog_cnty,date_goog_cnty,retail_and_recreation_percent_change_from_baseline_goog_cnty,grocery_and_pharmacy_percent_change_from_baseline_goog_cnty,parks_percent_change_from_baseline_goog_cnty,transit_stations_percent_change_from_baseline_goog_cnty,workplaces_percent_change_from_baseline_goog_cnty,...,state_code,state_county,state,state_fips,county_fips,county,class_code,fips,fips_code,date
0,US,United States,Alabama,Autauga County,2020-02-15,5,7,,,-4,...,AL,ALAutauga County,AL,1,1,Autauga County,H1,1001,1001,2020-02-15
1,US,United States,Alabama,Autauga County,2020-02-16,0,1,-23.0,,-4,...,AL,ALAutauga County,AL,1,1,Autauga County,H1,1001,1001,2020-02-16


In [1051]:
# List counties which did not get a fips code
list(google_county_data[google_county_data.fips.isnull()]['state_county'].unique())

[]

#### Duplicate State - Date Combinations

In [1052]:
google_state_data[google_state_data.duplicated(['sub_region_1_goog_st','sub_region_2_goog_st', 'date_goog_st'])].size

0

#### Duplicate County - Date Combinations

In [1053]:
google_county_data[google_county_data.duplicated(['sub_region_1_goog_cnty','sub_region_2_goog_cnty', 'date_goog_cnty'])].size

0

## FIPS Code Research

### Compare FIPS Codes (Special Attention to NYC)

In [582]:
ref_fips = set(county_ref_data['fips'].tolist())
safegraph_ref_fips = set(safegraph_ref_data_fips['fips'].tolist())
safegraph_land_fips = set(safegraph_land['fips'].tolist())
google_county_fips = set(google_county_data['fips'].tolist())
# ADD google mobility?

ea_fips = set(enigma_agg_data['county_fips_ea'].tolist())
ejhu_fips = set(enigma_jh_data['fips_ejhu'].tolist())
enyt_fips = set(enigma_nyt_data['fips_enyt'].tolist())
rnyt_fips = set(rearc_nyt_data['fips_rnyt'].tolist())
ucdc_fips = set(usafacts_cdc_data['countyFIPS_cdc'].tolist())

#### Helper Functions

In [79]:
def list_diff(set1, set2):
    out = set1-set2
    print("Set 1 length: ", len(set1))
    print("Set 2 length: ", len(set2))
    print("Length of Set 1 - Set 2: ", len(out))
    print("Length of Set 2 - Set 1: ", len(set2-set1))
    return list(out)

#### FIPS Compare: Reference Data and Google County Data
* Shannon County (SD) is in google county data but not in reference data

In [583]:
comp = list_diff(google_county_fips, ref_fips)

Set 1 length:  2828
Set 2 length:  3220
Length of Set 1 - Set 2:  1
Length of Set 2 - Set 1:  393


In [584]:
comp

['46113']

In [587]:
google_county_data[google_county_data['fips']=='46113'].head()

Unnamed: 0,country_region_code_goog_cnty,country_region_goog_cnty,sub_region_1_goog_cnty,sub_region_2_goog_cnty,date_goog_cnty,retail_and_recreation_percent_change_from_baseline_goog_cnty,grocery_and_pharmacy_percent_change_from_baseline_goog_cnty,parks_percent_change_from_baseline_goog_cnty,transit_stations_percent_change_from_baseline_goog_cnty,workplaces_percent_change_from_baseline_goog_cnty,residential_percent_change_from_baseline_goog_cnty,state_name,state_code,state_county,state,state_fips,county_fips,county,class_code,fips
179550,US,United States,South Dakota,Shannon County,2020-02-17,,,,,-56,,South Dakota,SD,SDShannon County,SD,46,113,Shannon County,H1,46113
179551,US,United States,South Dakota,Shannon County,2020-02-18,,,,,-7,,South Dakota,SD,SDShannon County,SD,46,113,Shannon County,H1,46113
179552,US,United States,South Dakota,Shannon County,2020-02-19,,,,,3,,South Dakota,SD,SDShannon County,SD,46,113,Shannon County,H1,46113
179553,US,United States,South Dakota,Shannon County,2020-02-20,,,,,0,,South Dakota,SD,SDShannon County,SD,46,113,Shannon County,H1,46113
179554,US,United States,South Dakota,Shannon County,2020-02-21,,,,,2,,South Dakota,SD,SDShannon County,SD,46,113,Shannon County,H1,46113


#### FIPS Compare: Reference Data and Safegraph Reference Data

In [80]:
comp = list_diff(safegraph_ref_fips, ref_fips)

Set 1 length:  3235
Set 2 length:  3220
Length of Set 1 - Set 2:  17
Length of Set 2 - Set 1:  2


In [81]:
comp

['69085',
 '60040',
 '69110',
 '60020',
 '78030',
 '46113',
 '60030',
 '66010',
 '02270',
 '69120',
 '78020',
 '69100',
 '74300',
 '51515',
 '78010',
 '60050',
 '60010']

#### FIPS Compare: Reference Data and Safegraph Land Data

In [82]:
comp = list_diff(safegraph_land_fips, ref_fips)

Set 1 length:  3220
Set 2 length:  3220
Length of Set 1 - Set 2:  0
Length of Set 2 - Set 1:  0


In [83]:
comp

[]

#### FIPS Compare: Reference Data and Enigma Aggregation

* <font color='red'>No NYC Counties</font>
* All codes included in reference data except 1 unknown

In [84]:
# Reference data appears to have all fip codes
comp = list_diff(ea_fips, ref_fips)

Set 1 length:  2910
Set 2 length:  3220
Length of Set 1 - Set 2:  1
Length of Set 2 - Set 1:  311


In [85]:
comp

['0<NA>']

In [86]:
enigma_agg_data[enigma_agg_data['county_fips_ea']=='0<NA>'].head(2)

Unnamed: 0,state_fips_ea,state_name_ea,county_fips_ea,county_name_ea,area_name_ea,lat_ea,long_ea,date_ea,cases_ea,deaths_ea
3067,36,,0<NA>,,,,,2020-03-01,1.0,0.0
3068,44,,0<NA>,,,,,2020-03-01,2.0,0.0


In [87]:
# New York City counties do not show up
print(enigma_agg_data[enigma_agg_data['state_name_ea']=='New York'].county_name_ea.unique())

['Albany County' 'Allegany County' 'Broome County' 'Cattaraugus County'
 'Cayuga County' 'Chautauqua County' 'Chemung County' 'Chenango County'
 'Clinton County' 'Columbia County' 'Cortland County' 'Delaware County'
 'Dutchess County' 'Erie County' 'Essex County' 'Franklin County'
 'Fulton County' 'Genesee County' 'Greene County' 'Hamilton County'
 'Herkimer County' 'Jefferson County' 'Lewis County' 'Livingston County'
 'Madison County' 'Monroe County' 'Montgomery County' 'Nassau County'
 'Niagara County' 'Oneida County' 'Onondaga County' 'Ontario County'
 'Orange County' 'Orleans County' 'Oswego County' 'Otsego County'
 'Putnam County' 'Rensselaer County' 'Rockland County'
 'St. Lawrence County' 'Saratoga County' 'Schenectady County'
 'Schoharie County' 'Schuyler County' 'Seneca County' 'Steuben County'
 'Suffolk County' 'Sullivan County' 'Tioga County' 'Tompkins County'
 'Ulster County' 'Warren County' 'Washington County' 'Wayne County'
 'Westchester County' 'Wyoming County' 'Yates C

#### FIPS Compare: Reference Data and Enigma Johns Hopkins University
* <font color='red'>NYC Counties show, but are zero, with New York county taking deaths for all 5 boroughs</font>

In [168]:
# Reference data missing a few fip codes
comp = list_diff(ejhu_fips, ref_fips)

Set 1 length:  3247
Set 2 length:  3220
Length of Set 1 - Set 2:  105
Length of Set 2 - Set 1:  78


In [173]:
comp[0:5]

['80022', '80033', '80028', '99999', '80019']

In [198]:
enigma_jh_data[(enigma_jh_data['fips_ejhu'].isin(comp))].head(3)

Unnamed: 0,uid_ejhu,fips_ejhu,iso2_ejhu,iso3_ejhu,code3_ejhu,admin2_ejhu,latitude_ejhu,longitude_ejhu,province_state_ejhu,country_region_ejhu,date_ejhu,confirmed_ejhu,deaths_ejhu,recovered_ejhu
3147,84070002.0,0<NA>,US,USA,840.0,Dukes and Nantucket,41.406747,-70.687635,Massachusetts,US,2020-01-22,0,0,
3148,84070003.0,0<NA>,US,USA,840.0,Kansas City,39.0997,-94.5786,Missouri,US,2020-01-22,0,0,
3149,84080001.0,80001,US,USA,840.0,Out of AL,,,Alabama,US,2020-01-22,0,0,


In [205]:
# 'Out of XX' could go to unknown with fips_code = '00000'
# Others we should create fips_codes for to rpeserve the detail
enigma_jh_data[(enigma_jh_data['fips_ejhu'].isin(comp))]['admin2_ejhu'].unique()

array(['Dukes and Nantucket', 'Kansas City', 'Out of AL', 'Out of AK',
       'Out of AZ', 'Out of AR', 'Out of CA', 'Out of CO', 'Out of CT',
       'Out of DE', 'Out of DC', 'Out of FL', 'Out of GA', 'Out of HI',
       'Out of ID', 'Out of IL', 'Out of IN', 'Out of IA', 'Out of KS',
       'Out of KY', 'Out of LA', 'Out of ME', 'Out of MD', 'Out of MA',
       'Out of MI', 'Out of MN', 'Out of MS', 'Out of MO', 'Out of MT',
       'Out of NE', 'Out of NV', 'Out of NH', 'Out of NJ', 'Out of NM',
       'Out of NY', 'Out of NC', 'Out of ND', 'Out of OH', 'Out of OK',
       'Out of OR', 'Out of PA', 'Out of RI', 'Out of SC', 'Out of SD',
       'Out of TN', 'Out of TX', 'Out of UT', 'Out of VT', 'Out of VA',
       'Out of WA', 'Out of WV', 'Out of WI', 'Out of WY', nan,
       'Unassigned', 'Michigan Department of Corrections (MDOC)',
       'Federal Correctional Institution (FCI)', 'Bear River',
       'Central Utah', 'Southeast Utah', 'Southwest Utah', 'TriCounty',
       'Weber-Mo

In [220]:
# Do they have state codes?
enigma_jh_data[(enigma_jh_data['fips_ejhu'].isin(comp)) & (enigma_jh_data['admin2_ejhu'].isnull())].head(2)

Unnamed: 0,uid_ejhu,fips_ejhu,iso2_ejhu,iso3_ejhu,code3_ejhu,admin2_ejhu,latitude_ejhu,longitude_ejhu,province_state_ejhu,country_region_ejhu,date_ejhu,confirmed_ejhu,deaths_ejhu,recovered_ejhu
3200,84088888.0,88888,US,USA,840.0,,,,Diamond Princess,US,2020-01-22,0,0,
3252,84099999.0,99999,US,USA,840.0,,,,Grand Princess,US,2020-01-22,0,0,


In [225]:
jh_unknown_fips_counties = ['Out of AL', 'Out of AK',
       'Out of AZ', 'Out of AR', 'Out of CA', 'Out of CO', 'Out of CT',
       'Out of DE', 'Out of DC', 'Out of FL', 'Out of GA', 'Out of HI',
       'Out of ID', 'Out of IL', 'Out of IN', 'Out of IA', 'Out of KS',
       'Out of KY', 'Out of LA', 'Out of ME', 'Out of MD', 'Out of MA',
       'Out of MI', 'Out of MN', 'Out of MS', 'Out of MO', 'Out of MT',
       'Out of NE', 'Out of NV', 'Out of NH', 'Out of NJ', 'Out of NM',
       'Out of NY', 'Out of NC', 'Out of ND', 'Out of OH', 'Out of OK',
       'Out of OR', 'Out of PA', 'Out of RI', 'Out of SC', 'Out of SD',
       'Out of TN', 'Out of TX', 'Out of UT', 'Out of VT', 'Out of VA',
       'Out of WA', 'Out of WV', 'Out of WI', 'Out of WY']
# Map these states to state_code = GP or DP respectively

In [204]:
# Princess Cruise Records have that in the state, but county is null.
# These are fine to go to fips_code = '00000'
# Map these states to state_code = GP or DP respectively
# Map
enigma_jh_data[(enigma_jh_data['fips_ejhu'].isin(comp)) & enigma_jh_data['admin2_ejhu'].isnull()].head(2)

Unnamed: 0,uid_ejhu,fips_ejhu,iso2_ejhu,iso3_ejhu,code3_ejhu,admin2_ejhu,latitude_ejhu,longitude_ejhu,province_state_ejhu,country_region_ejhu,date_ejhu,confirmed_ejhu,deaths_ejhu,recovered_ejhu
3200,84088888.0,88888,US,USA,840.0,,,,Diamond Princess,US,2020-01-22,0,0,
3252,84099999.0,99999,US,USA,840.0,,,,Grand Princess,US,2020-01-22,0,0,


In [92]:
# New York City counties show up but are zero, all deaths in 'New York' county
enigma_jh_data[(enigma_jh_data.province_state_ejhu=='New York') & ((enigma_jh_data.date_ejhu=='2020-05-10'))].head(3)

Unnamed: 0,uid_ejhu,fips_ejhu,iso2_ejhu,iso3_ejhu,code3_ejhu,admin2_ejhu,latitude_ejhu,longitude_ejhu,province_state_ejhu,country_region_ejhu,date_ejhu,confirmed_ejhu,deaths_ejhu,recovered_ejhu
327933,84036001.0,36001,US,USA,840.0,Albany,42.600603,-73.977239,New York,US,2020-05-10,1432,59,
327934,84036003.0,36003,US,USA,840.0,Allegany,42.257484,-78.027505,New York,US,2020-05-10,36,0,
327935,84036005.0,36005,US,USA,840.0,Bronx,40.852093,-73.862828,New York,US,2020-05-10,0,0,


#### FIPS Compare: Reference Data and Enigma New York Times
* <font color='red'>No FIPS codes for counties near NYC, but deaths in 'New York City' county</font>
* <font color='red'>For counties near KC, some reporting existed early on, but then shifted to "Kansas City" similar to NYC
* <font color='red'>No FIPS codes for 'Unknown' county, but a state exists</font>

In [93]:
comp = list_diff(enyt_fips, ref_fips)

Set 1 length:  2886
Set 2 length:  3220
Length of Set 1 - Set 2:  1
Length of Set 2 - Set 1:  335


In [94]:
comp

['0<NA>']

In [95]:
enigma_nyt_data.head(2)

Unnamed: 0,date_enyt,county_enyt,state_enyt,fips_enyt,cases_enyt,deaths_enyt
0,2020-01-21,Snohomish,Washington,53061,1,0
1,2020-01-22,Snohomish,Washington,53061,1,0


In [96]:
enigma_nyt_data[enigma_nyt_data['fips_enyt']=='0<NA>'].county_enyt.unique()

array(['New York City', 'Unknown', 'Kansas City'], dtype=object)

In [291]:
enigma_nyt_data[enigma_nyt_data['county_enyt']=='New York City']['fips_enyt'].unique()

array(['0<NA>'], dtype=object)

In [295]:
enigma_nyt_data[enigma_nyt_data['county_enyt']=='Kansas City']['fips_enyt'].unique()

array(['0<NA>'], dtype=object)

In [97]:
enigma_nyt_data[(enigma_nyt_data['state_enyt']=='Kansas') & (enigma_nyt_data['county_enyt']=='Clay')].tail(2)

Unnamed: 0,date_enyt,county_enyt,state_enyt,fips_enyt,cases_enyt,deaths_enyt
127698,2020-05-09,Clay,Kansas,20027,4,1
130610,2020-05-10,Clay,Kansas,20027,4,1


In [302]:
enigma_nyt_data[(enigma_nyt_data['state_enyt']=='New York') & (enigma_nyt_data['county_enyt']=='Queens')].size

0

#### FIPS Compare: Reference Data and Rearc New York Times
* <font color='red'>Same issues 'New York City', 'Kansas City' and 'Unknown' as Enigma New York Times</font>

In [99]:
comp = list_diff(rnyt_fips, ref_fips)

Set 1 length:  2910
Set 2 length:  3220
Length of Set 1 - Set 2:  1
Length of Set 2 - Set 1:  311


In [100]:
comp

['0<NA>']

In [101]:
rearc_nyt_data[rearc_nyt_data['fips_rnyt']=='0<NA>'].county_rnyt.unique()

array(['New York City', 'Unknown', 'Kansas City'], dtype=object)

#### FIPS Compare: Reference Data and USA Facts CDC
* <font color='red'>New York City deaths are in the right counties</font>
* <font color='red'>Kansas City deaths are in the right counties</font>
* <font color='red'>FIPS = 00001 is for probable / unconfirmed new york city deaths</font>
* <font color='red'>FIPS = 06000 is for the cruise ship</font>
* <font color='red'>FIPS = 02270 is for Wade Hampton Census Area</font>

In [102]:
comp = list_diff(ucdc_fips, ref_fips)

Set 1 length:  3145
Set 2 length:  3220
Length of Set 1 - Set 2:  3
Length of Set 2 - Set 1:  78


In [103]:
comp

['02270', '00001', '06000']

In [104]:
usafacts_cdc_data[usafacts_cdc_data['countyFIPS_cdc']=='06000'].head(2)

Unnamed: 0,countyFIPS_cdc,County Name_cdc,State_cdc,stateFIPS_cdc,Date_cdc,Confirmed_cdc,Deaths_cdc,population_cdc
192,6000,Grand Princess Cruise Ship,CA,6,2020-01-22,0,0.0,0.0
3387,6000,Grand Princess Cruise Ship,CA,6,2020-01-23,0,0.0,0.0


In [105]:
usafacts_cdc_data[usafacts_cdc_data['countyFIPS_cdc']=='02270'].head(2)

Unnamed: 0,countyFIPS_cdc,County Name_cdc,State_cdc,stateFIPS_cdc,Date_cdc,Confirmed_cdc,Deaths_cdc,population_cdc
95,2270,Wade Hampton Census Area,AK,2,2020-01-22,0,0.0,0.0
3290,2270,Wade Hampton Census Area,AK,2,2020-01-23,0,0.0,0.0


In [106]:
usafacts_cdc_data[usafacts_cdc_data['countyFIPS_cdc']=='00001'].head(2)

Unnamed: 0,countyFIPS_cdc,County Name_cdc,State_cdc,stateFIPS_cdc,Date_cdc,Confirmed_cdc,Deaths_cdc,population_cdc
1862,1,New York City Unallocated/Probable,NY,36,2020-01-22,0,0.0,
5057,1,New York City Unallocated/Probable,NY,36,2020-01-23,0,0.0,


In [107]:
usafacts_cdc_data[(usafacts_cdc_data['State_cdc']=='NY') & (usafacts_cdc_data['Date_cdc']=='2020-05-10')].head()

Unnamed: 0,countyFIPS_cdc,County Name_cdc,State_cdc,stateFIPS_cdc,Date_cdc,Confirmed_cdc,Deaths_cdc,population_cdc
350117,1,New York City Unallocated/Probable,NY,36,2020-05-10,0,873.0,
350118,36001,Albany County,NY,36,2020-05-10,1432,74.0,305506.0
350119,36003,Allegany County,NY,36,2020-05-10,36,2.0,46091.0
350120,36005,Bronx County,NY,36,2020-05-10,41059,3847.0,1418207.0
350121,36007,Broome County,NY,36,2020-05-10,373,27.0,190488.0


In [1254]:
usafacts_cdc_data[(usafacts_cdc_data['State_cdc']=='KS') & (usafacts_cdc_data['Date_cdc']=='2020-05-10')].tail(3)

KeyError: 'State_cdc'

## Combine Data Sources

### Merge Data

In [109]:
# Drop the not needed google mobility columns 'state name etc..'

# Add JH data (which somehow became not available)

# Look into why record counts change during the group bys
#      investigate any duplicates and fix them

# refactor code to be more like a pipeline

# Decide census columns (education, income, race, land)
# Aggregate safegraph census stuff since it is more granular than county
# Join in census columns

# calculated columns (population density, percentiles, distance from epicenters)

# graphs

# Decide how to deal with NaNs in numeric columns (do they graph?)

# graph data comparison (counts)
### Deaths, cases, etc..

# graph death rates over time

# correlations
# graph epicenters, distance from epicenter, population density, income, education, race

In [1247]:
# Build a list of tables to join
raw_data_list = [
    enigma_agg_data,
    enigma_jh_data,
    enigma_nyt_data,
    rearc_nyt_data,
    usafacts_cdc_data,
]

In [1248]:
# Since we want the sum columns to have different names, put them in a list
sum_cols = [
    ['cases_ea', 'deaths_ea'],
    ['confirmed_ejhu', 'deaths_ejhu'],
    ['cases_enyt', 'deaths_enyt'],
    ['cases_rnyt', 'deaths_rnyt'],
    ['confirmed_cdc', 'deaths_cdc']
]

In [1249]:
# Recursively group the tables and put that in a new list
grouped_data_list = []
for data, sum_col in zip(raw_data_list, sum_cols):
    data = data.groupby(['state_code','fips_code', 'date'])[sum_col].agg('sum').reset_index()
    grouped_data_list.append(data)

In [1250]:
# Add google mobility data
grouped_data_list.append(
    google_county_data
)

In [1251]:
# Join all the tables together
on_col = ['state_code', 'fips_code', 'date']
covid_data = pd.DataFrame(columns=on_col)
for data in grouped_data_list:
    covid_data = pd.merge(
        covid_data, data, how='outer', left_on=on_col, right_on=on_col
    )

In [1253]:
covid_data.columns

Index(['state_code', 'fips_code', 'date', 'cases_ea', 'deaths_ea',
       'confirmed_ejhu', 'deaths_ejhu', 'cases_enyt', 'deaths_enyt',
       'cases_rnyt', 'deaths_rnyt', 'confirmed_cdc', 'deaths_cdc',
       'country_region_code_goog_cnty', 'country_region_goog_cnty',
       'sub_region_1_goog_cnty', 'sub_region_2_goog_cnty', 'date_goog_cnty',
       'retail_and_recreation_percent_change_from_baseline_goog_cnty',
       'grocery_and_pharmacy_percent_change_from_baseline_goog_cnty',
       'parks_percent_change_from_baseline_goog_cnty',
       'transit_stations_percent_change_from_baseline_goog_cnty',
       'workplaces_percent_change_from_baseline_goog_cnty',
       'residential_percent_change_from_baseline_goog_cnty', 'state_name',
       'state_county', 'state', 'state_fips', 'county_fips', 'county',
       'class_code', 'fips'],
      dtype='object')