In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import json
import requests
import csv

In [None]:
link = 'https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_confirmed_usafacts.csv'

# Retrieve the latest confirmed-cases CSV published by USAFacts.
with requests.Session() as session:
    # A timeout keeps the cell from hanging indefinitely on a stalled connection.
    download = session.get(link, timeout=60)
    # Fail loudly on an HTTP error instead of parsing an error page as CSV.
    download.raise_for_status()
    # 'utf-8-sig' strips a leading byte-order mark, if one is present,
    # so it does not end up glued to the first header name.
    decodedContent = download.content.decode('utf-8-sig')

    # csv.reader yields every field as a string, and the header line is still
    # data row 0 of the resulting frame at this point.
    content = csv.reader(decodedContent.splitlines(), delimiter = ',')
    coviddf = pd.DataFrame(content)

coviddf.head()

In [85]:
# Promote the first row (the CSV header line) to column labels, then drop it from the data.
coviddf.columns = coviddf.iloc[0]
# .copy() yields an independent frame, so the column assignment below does not
# write into a slice of the original (avoids pandas' SettingWithCopyWarning).
coviddf = coviddf.iloc[1:].copy()
# Fields were read as strings; make the FIPS code an integer so it can be
# compared and merged numerically.
coviddf['countyFIPS'] = coviddf['countyFIPS'].astype(int)
coviddf.head()

In [86]:
# Remove the "statewide unallocated" rows, i.e. those whose countyFIPS is 0
coviddf = coviddf.loc[coviddf['countyFIPS'].ne(0)]
coviddf.head()

Unnamed: 0,countyFIPS,County Name,State,stateFIPS,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,7/2/20,7/3/20,7/4/20,7/5/20,7/6/20,7/7/20,7/8/20,7/9/20,7/10/20,7/11/20
2,1001,Autauga County,AL,1,0,0,0,0,0,0,...,561,568,591,615,618,644,651,661,670,684
3,1003,Baldwin County,AL,1,0,0,0,0,0,0,...,751,845,863,881,911,997,1056,1131,1187,1224
4,1005,Barbour County,AL,1,0,0,0,0,0,0,...,335,348,350,352,356,360,366,371,381,398
5,1007,Bibb County,AL,1,0,0,0,0,0,0,...,179,189,190,193,197,199,201,211,218,224
6,1009,Blount County,AL,1,0,0,0,0,0,0,...,222,230,234,239,247,255,262,282,292,307


In [87]:
# Keep the first three columns plus the newest date column; everything from
# position 3 up to (but not including) the last column is dropped.
stale_columns = coviddf.columns[3:-1]
coviddf = coviddf.drop(columns=stale_columns)
coviddf.head()

In [88]:
# Load the county population estimates
popdf = pd.read_csv('co-est2019-alldata.csv')

# Keep only the state/county codes, the county name and the 2019 estimate
wanted_columns = ['STATE','COUNTY','CTYNAME','POPESTIMATE2019']
popdf = popdf.loc[:, wanted_columns]
# Left-pad the county code with zeros to three characters
popdf['COUNTY'] = popdf['COUNTY'].astype(str).str.zfill(3)
popdf.head()

Unnamed: 0,STATE,COUNTY,CTYNAME,POPESTIMATE2019
0,1,0,Alabama,4903185
1,1,1,Autauga County,55869
2,1,3,Baldwin County,223234
3,1,5,Barbour County,24686
4,1,7,Bibb County,22394


In [89]:
# Build a numeric FIPS code from the state code followed by the zero-padded
# county code (it has to match countyFIPS in the Covid data)
popdf = popdf.rename(columns={'COUNTY':'fips'})
popdf['fips'] = popdf['STATE'].astype(str) + popdf['fips']
popdf = popdf.drop(columns=['STATE'])
popdf['fips'] = pd.to_numeric(popdf['fips'])

# State-level rows have county code 000, so their fips is a multiple of 1000
is_county_level = popdf['fips'].mod(1000).ne(0)
popdf = popdf.loc[is_county_level]
popdf.head()

Unnamed: 0,fips,CTYNAME,POPESTIMATE2019
1,1001,Autauga County,55869
2,1003,Baldwin County,223234
3,1005,Barbour County,24686
4,1007,Bibb County,22394
5,1009,Blount County,57826


In [90]:
# List the FIPS codes present in the Covid data but absent from the population data
covid_fips = set(coviddf['countyFIPS'])
population_fips = set(popdf['fips'])
print(list(covid_fips - population_fips))

[6000, 1, 2158, 46102]


In [91]:
# Kusilvak and Wade Hampton are the same area; drop the Kusilvak row so the
# data lines up with the GeoJSON
coviddf = coviddf[coviddf['County Name'].ne('Kusilvak Census Area')]

In [92]:
# The GeoJSON file lists Oglala Lakota County under its old name, Shannon County
# (renamed in 2015), so move the row to FIPS 46113 and relabel it to match
is_oglala_lakota = coviddf['County Name'].eq('Oglala Lakota County')
coviddf.loc[is_oglala_lakota, 'countyFIPS'] = 46113
has_shannon_fips = coviddf['countyFIPS'].eq(46113)
coviddf.loc[has_shannon_fips, 'County Name'] = 'Shannon County'

In [93]:
# Give the population key the same column name as the Covid data so the two can be merged
popdf = popdf.rename(columns={'fips':'countyFIPS'})
popdf.head()

In [94]:
# Join the Covid counts with the population estimates on the shared FIPS key;
# the outer join keeps rows that appear in only one of the two tables
coviddf = coviddf.merge(popdf, on='countyFIPS', how='outer')
coviddf.head()

In [95]:
# Remove entries that are not officially recognized counties (Princess Cruise
# Ship, NYC, etc.): these rows have no CTYNAME after the merge
coviddf = coviddf.dropna(subset=['CTYNAME'])

In [96]:
# Discard the columns that are no longer needed
coviddf = coviddf.drop(columns=['CTYNAME','State'])
coviddf.head()

Unnamed: 0,countyFIPS,County Name,7/11/20,POPESTIMATE2019
0,1001,Autauga County,684,55869.0
1,1003,Baldwin County,1224,223234.0
2,1005,Barbour County,398,24686.0
3,1007,Bibb County,224,22394.0
4,1009,Blount County,307,57826.0


In [97]:
# The most recent date column sits second from the end; call it 'cases',
# then store both numeric columns as floats
latest_date_column = coviddf.columns[-2]
coviddf = coviddf.rename(columns={latest_date_column:'cases'})
coviddf = coviddf.astype({'cases': float, 'POPESTIMATE2019': float})
coviddf.head()

Unnamed: 0,countyFIPS,County Name,cases,POPESTIMATE2019
0,1001,Autauga County,684.0,55869.0
1,1003,Baldwin County,1224.0,223234.0
2,1005,Barbour County,398.0,24686.0
3,1007,Bibb County,224.0,22394.0
4,1009,Blount County,307.0,57826.0


In [98]:
# Cases per 100,000 residents
cases_per_resident = coviddf['cases'] / coviddf['POPESTIMATE2019']
coviddf['cases_per_100k'] = cases_per_resident * 100000
coviddf.head()

Unnamed: 0,countyFIPS,County Name,cases,POPESTIMATE2019,cases_per_100k
0,1001,Autauga County,684.0,55869.0,1224.292541
1,1003,Baldwin County,1224.0,223234.0,548.303574
2,1005,Barbour County,398.0,24686.0,1612.249858
3,1007,Bibb County,224.0,22394.0,1000.267929
4,1009,Blount County,307.0,57826.0,530.903054


In [99]:
# Build the GeoJSON id: the "0500000US" prefix followed by the FIPS code
# zero-padded to five characters
padded_fips = coviddf['countyFIPS'].astype(str).str.zfill(5)
coviddf['countyFIPS'] = "0500000US" + padded_fips
coviddf.head()

Unnamed: 0,countyFIPS,County Name,cases,POPESTIMATE2019,cases_per_100k
0,0500000US01001,Autauga County,684.0,55869.0,1224.292541
1,0500000US01003,Baldwin County,1224.0,223234.0,548.303574
2,0500000US01005,Barbour County,398.0,24686.0,1612.249858
3,0500000US01007,Bibb County,224.0,22394.0,1000.267929
4,0500000US01009,Blount County,307.0,57826.0,530.903054


In [100]:
# Switch to consistent snake_case column names
final_names = {
    'countyFIPS': 'county_fips',
    'County Name': 'county_name',
    'POPESTIMATE2019': 'population',
}
coviddf = coviddf.rename(columns=final_names)
coviddf.head()

Unnamed: 0,county_fips,county_name,cases,population,cases_per_100k
0,0500000US01001,Autauga County,684.0,55869.0,1224.292541
1,0500000US01003,Baldwin County,1224.0,223234.0,548.303574
2,0500000US01005,Barbour County,398.0,24686.0,1612.249858
3,0500000US01007,Bibb County,224.0,22394.0,1000.267929
4,0500000US01009,Blount County,307.0,57826.0,530.903054


In [101]:
# Write the per-county table to disk, leaving out the DataFrame index
coviddf.to_csv(path_or_buf='USCovidbyCounty.csv', index=False)

In [5]:
#Export choropleth map

uscounties = 'uscounties.json'

# Load the county boundaries; the context manager closes the file handle
# as soon as the GeoJSON has been parsed.
with open(uscounties) as geojson_file:
    usmap = json.load(geojson_file)

# Colours range from the dataset minimum to the 99th percentile.
# nanpercentile skips missing values: with plain np.percentile a single NaN in
# cases_per_100k (e.g. a county with no case figure) makes the upper bound NaN.
colour_floor = coviddf['cases_per_100k'].min()
colour_ceiling = np.nanpercentile(coviddf['cases_per_100k'].to_numpy(), 99)

fig1=px.choropleth_mapbox(data_frame=coviddf,
                    geojson=usmap,
                    # county_fips values are matched against each feature's GEO_ID property
                    featureidkey='properties.GEO_ID',
                    locations='county_fips',
                    color='cases_per_100k',
                    color_continuous_scale='Viridis',
                    zoom=3,
                    opacity=0.7,
                    range_color=(colour_floor, colour_ceiling),
                    mapbox_style='carto-positron',
                    hover_name='county_name')

# Save the interactive map as a standalone HTML page
fig1.write_html('uscoviddistribution_current.html')