In [1]:
# See covid_virtual_env.sh for environment setup
import boto3, pandas as pd

# Show up to 500 rows when a frame or series is displayed in full.
# (The same option was previously set twice; a single call is sufficient.)
pd.set_option('display.max_rows', 500)

# Load Data

Create the boto3 client (boto3 is Amazon's AWS SDK for Python) used to read from S3.

In [2]:
# S3 client used by the data-loading cell below
client = boto3.client(service_name='s3')

In [3]:
# Cases and deaths over time per US county (Johns Hopkins figures, per the original note).
# The CSV is fetched from S3 and parsed straight from the response body.
obj = client.get_object(
    Bucket='covid19-lake',
    Key='enigma-aggregation/csv/us_counties/enigma_covid_19_us_counties.csv',
)
data = pd.read_csv(obj['Body'])

# Data Exploration

## Fundamental exploration
What values are showing up?

In [4]:
# Peek at the first five records
data.head(n=5)

Unnamed: 0,state_fips,state_name,county_fips,county_name,area_name,lat,long,date,cases,deaths
0,1,Alabama,1001.0,Autauga County,,32.532237,-86.646439,2020-03-24,1.0,0.0
1,1,Alabama,1001.0,Autauga County,,32.532237,-86.646439,2020-03-25,4.0,0.0
2,1,Alabama,1001.0,Autauga County,,32.532237,-86.646439,2020-03-26,6.0,0.0
3,1,Alabama,1001.0,Autauga County,,32.532237,-86.646439,2020-03-27,6.0,0.0
4,1,Alabama,1001.0,Autauga County,,32.532237,-86.646439,2020-03-28,6.0,0.0


In [5]:
# Column dtypes as loaded from the CSV; the date column is still a plain object column here
data.dtypes

state_fips       int64
state_name      object
county_fips    float64
county_name     object
area_name      float64
lat            float64
long           float64
date            object
cases          float64
deaths         float64
dtype: object

In [6]:
# Fix the date data type: parse the date strings into datetime64[ns].
# pd.to_datetime is used instead of astype('datetime64') because the unit-less
# 'datetime64' cast raises a TypeError on pandas >= 2.0; the result is the same
# datetime64[ns] column on older versions.
data['date'] = pd.to_datetime(data['date'])

In [7]:
# Re-check the dtypes to confirm the date column is now a datetime type
data.dtypes

state_fips              int64
state_name             object
county_fips           float64
county_name            object
area_name             float64
lat                   float64
long                  float64
date           datetime64[ns]
cases                 float64
deaths                float64
dtype: object

In [8]:
# Preview the first five records again after the date conversion
data.head(n=5)

Unnamed: 0,state_fips,state_name,county_fips,county_name,area_name,lat,long,date,cases,deaths
0,1,Alabama,1001.0,Autauga County,,32.532237,-86.646439,2020-03-24,1.0,0.0
1,1,Alabama,1001.0,Autauga County,,32.532237,-86.646439,2020-03-25,4.0,0.0
2,1,Alabama,1001.0,Autauga County,,32.532237,-86.646439,2020-03-26,6.0,0.0
3,1,Alabama,1001.0,Autauga County,,32.532237,-86.646439,2020-03-27,6.0,0.0
4,1,Alabama,1001.0,Autauga County,,32.532237,-86.646439,2020-03-28,6.0,0.0


In [9]:
# Distinct state names present in the data (missing names show up as nan)
print(data['state_name'].unique())

['Alabama' 'Alaska' 'Arizona' 'Arkansas' 'California' 'Colorado'
 'Connecticut' 'Delaware' 'District of Columbia' 'Florida' 'Georgia'
 'Illinois' 'Indiana' 'Iowa' 'Kansas' 'Kentucky' 'Louisiana' 'Maine'
 'Maryland' 'Massachusetts' 'Michigan' 'Minnesota' 'Mississippi'
 'Missouri' 'Montana' 'Nebraska' 'Nevada' 'New Hampshire' 'New Jersey'
 'New Mexico' 'New York' 'North Carolina' 'North Dakota' 'Ohio' 'Oklahoma'
 'Oregon' 'Pennsylvania' 'Rhode Island' 'South Carolina' 'South Dakota'
 'Tennessee' 'Texas' 'Washington' 'West Virginia' 'Utah' 'Vermont'
 'Virginia' 'Wisconsin' 'Wyoming' nan 'Hawaii' 'Idaho']


In [10]:
# Distinct dates present in the data, in order of first appearance
print(data['date'].unique())

['2020-03-24T00:00:00.000000000' '2020-03-25T00:00:00.000000000'
 '2020-03-26T00:00:00.000000000' '2020-03-27T00:00:00.000000000'
 '2020-03-28T00:00:00.000000000' '2020-03-29T00:00:00.000000000'
 '2020-03-30T00:00:00.000000000' '2020-03-31T00:00:00.000000000'
 '2020-04-01T00:00:00.000000000' '2020-04-02T00:00:00.000000000'
 '2020-04-03T00:00:00.000000000' '2020-04-04T00:00:00.000000000'
 '2020-04-05T00:00:00.000000000' '2020-04-06T00:00:00.000000000'
 '2020-04-07T00:00:00.000000000' '2020-04-08T00:00:00.000000000'
 '2020-04-09T00:00:00.000000000' '2020-04-10T00:00:00.000000000'
 '2020-04-11T00:00:00.000000000' '2020-04-12T00:00:00.000000000'
 '2020-04-13T00:00:00.000000000' '2020-04-14T00:00:00.000000000'
 '2020-04-15T00:00:00.000000000' '2020-04-16T00:00:00.000000000'
 '2020-04-17T00:00:00.000000000' '2020-04-18T00:00:00.000000000'
 '2020-04-19T00:00:00.000000000' '2020-04-20T00:00:00.000000000'
 '2020-04-21T00:00:00.000000000' '2020-04-22T00:00:00.000000000'
 '2020-04-23T00:00:00.000

## Some basic group-by's
How many records, cases, and deaths are showing up for those values?

In [11]:
# Number of records per state (rows with a missing state_name are not counted)
data.groupby('state_name')['state_name'].count()

state_name
Alabama                 3536
Alaska                   644
Arizona                  915
Arkansas                3694
California              3539
Colorado                3163
Connecticut              492
Delaware                 185
District of Columbia      70
Florida                 3814
Georgia                 8353
Hawaii                   256
Idaho                   1664
Illinois                4641
Indiana                 4905
Iowa                    4271
Kansas                  3501
Kentucky                5254
Louisiana               3524
Maine                    850
Maryland                1412
Massachusetts            933
Michigan                4116
Minnesota               3885
Mississippi             4423
Missouri                4667
Montana                 1490
Nebraska                2571
Nevada                   610
New Hampshire            597
New Jersey              1336
New Mexico              1431
New York                3382
North Carolina          5011
Nor

In [12]:
# Quick look at New York: cases and deaths per county on the latest date in the data.
# Open question from the first pass: is all the NYC data missing?
# NOTE(review): groupby drops rows whose county_name is NaN by default, so any
# New York rows without a county name would not appear below — confirm.
max_date = data['date'].max()
(
    data.loc[(data['state_name'] == 'New York') & (data['date'] == max_date)]
    .groupby('county_name')[['cases', 'deaths']]
    .sum()
)

Unnamed: 0_level_0,cases,deaths
county_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Albany County,1538.0,124.0
Allegany County,44.0,0.0
Broome County,3.0,32.0
Cattaraugus County,64.0,1.0
Cayuga County,61.0,1.0
Chautauqua County,45.0,3.0
Chemung County,133.0,5.0
Chenango County,113.0,2.0
Clinton County,85.0,4.0
Columbia County,333.0,0.0


In [13]:
# Cases and deaths summed across every record in the data set, per date
# (the file holds US counties, so these are US-wide totals rather than global ones)
(
    data
    .groupby('date')[['cases', 'deaths']]
    .sum()
)

Unnamed: 0_level_0,cases,deaths
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-21,1.0,0.0
2020-01-22,1.0,0.0
2020-01-23,1.0,0.0
2020-01-24,2.0,0.0
2020-01-25,3.0,0.0
2020-01-26,5.0,0.0
2020-01-27,5.0,0.0
2020-01-28,5.0,0.0
2020-01-29,5.0,0.0
2020-01-30,6.0,0.0


In [14]:
# New York cases and deaths, summed per date
(
    data.loc[data['state_name'] == 'New York']
    .groupby('date')[['cases', 'deaths']]
    .sum()
)

Unnamed: 0_level_0,cases,deaths
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-04,9.0,0.0
2020-03-05,18.0,0.0
2020-03-06,39.0,0.0
2020-03-07,77.0,0.0
2020-03-08,92.0,0.0
2020-03-09,122.0,0.0
2020-03-10,36.0,0.0
2020-03-11,46.0,0.0
2020-03-12,230.0,0.0
2020-03-13,266.0,0.0
