In [1]:
# See covid_virtual_env.sh for environment setup
import boto3, pandas as pd

# Allow up to 500 rows to be rendered when a DataFrame/Series is displayed
pd.options.display.max_rows = 500

# Load Data

Create a boto3 (Amazon's AWS SDK for Python) client for reading from S3.

In [2]:
# S3 client used by the following cells to fetch the source CSV
client = boto3.client(service_name='s3')

In [4]:
# New York Times data: fetch the US counties CSV object from the
# 'covid19-lake' bucket and parse its streaming body into a DataFrame.
obj = client.get_object(
    Bucket='covid19-lake',
    Key='rearc-covid-19-nyt-data-in-usa/csv/us-counties/us-counties.csv',
)
data = pd.read_csv(obj['Body'])

# Data Exploration

## Fundamental exploration
What values are showing up?

In [5]:
# Preview the first five records
data.head(n=5)

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061.0,1,0
1,2020-01-22,Snohomish,Washington,53061.0,1,0
2,2020-01-23,Snohomish,Washington,53061.0,1,0
3,2020-01-24,Cook,Illinois,17031.0,1,0
4,2020-01-24,Snohomish,Washington,53061.0,1,0


In [6]:
# Inspect the dtype pandas inferred for each column
data.dtypes

date       object
county     object
state      object
fips      float64
cases       int64
deaths      int64
dtype: object

In [7]:
# Fix the date data type: convert the date strings to datetime64[ns].
# pd.to_datetime is used rather than .astype('datetime64') because pandas 2.x
# rejects the unit-less 'datetime64' dtype with a TypeError. The conversion is
# also safe to re-run on an already-converted column.
data['date'] = pd.to_datetime(data['date'])

In [8]:
# Re-check the column dtypes after the date conversion
data.dtypes

date      datetime64[ns]
county            object
state             object
fips             float64
cases              int64
deaths             int64
dtype: object

In [9]:
# Preview the first five records again after the date conversion
data.head(n=5)

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061.0,1,0
1,2020-01-22,Snohomish,Washington,53061.0,1,0
2,2020-01-23,Snohomish,Washington,53061.0,1,0
3,2020-01-24,Cook,Illinois,17031.0,1,0
4,2020-01-24,Snohomish,Washington,53061.0,1,0


In [10]:
# Distinct state / territory names present in the data
print(data['state'].unique())

['Washington' 'Illinois' 'California' 'Arizona' 'Massachusetts'
 'Wisconsin' 'Texas' 'Nebraska' 'Utah' 'Oregon' 'Florida' 'New York'
 'Rhode Island' 'Georgia' 'New Hampshire' 'North Carolina' 'New Jersey'
 'Colorado' 'Maryland' 'Nevada' 'Tennessee' 'Hawaii' 'Indiana' 'Kentucky'
 'Minnesota' 'Oklahoma' 'Pennsylvania' 'South Carolina'
 'District of Columbia' 'Kansas' 'Missouri' 'Vermont' 'Virginia'
 'Connecticut' 'Iowa' 'Louisiana' 'Ohio' 'Michigan' 'South Dakota'
 'Arkansas' 'Delaware' 'Mississippi' 'New Mexico' 'North Dakota' 'Wyoming'
 'Alaska' 'Maine' 'Alabama' 'Idaho' 'Montana' 'Puerto Rico'
 'Virgin Islands' 'Guam' 'West Virginia' 'Northern Mariana Islands']


In [11]:
# Distinct reporting dates.
# Looks to have an extra day compared to JH (presumably the Johns Hopkins data - confirm)
print(data['date'].unique())

['2020-01-21T00:00:00.000000000' '2020-01-22T00:00:00.000000000'
 '2020-01-23T00:00:00.000000000' '2020-01-24T00:00:00.000000000'
 '2020-01-25T00:00:00.000000000' '2020-01-26T00:00:00.000000000'
 '2020-01-27T00:00:00.000000000' '2020-01-28T00:00:00.000000000'
 '2020-01-29T00:00:00.000000000' '2020-01-30T00:00:00.000000000'
 '2020-01-31T00:00:00.000000000' '2020-02-01T00:00:00.000000000'
 '2020-02-02T00:00:00.000000000' '2020-02-03T00:00:00.000000000'
 '2020-02-04T00:00:00.000000000' '2020-02-05T00:00:00.000000000'
 '2020-02-06T00:00:00.000000000' '2020-02-07T00:00:00.000000000'
 '2020-02-08T00:00:00.000000000' '2020-02-09T00:00:00.000000000'
 '2020-02-10T00:00:00.000000000' '2020-02-11T00:00:00.000000000'
 '2020-02-12T00:00:00.000000000' '2020-02-13T00:00:00.000000000'
 '2020-02-14T00:00:00.000000000' '2020-02-15T00:00:00.000000000'
 '2020-02-16T00:00:00.000000000' '2020-02-17T00:00:00.000000000'
 '2020-02-18T00:00:00.000000000' '2020-02-19T00:00:00.000000000'
 '2020-02-20T00:00:00.000

## Some basic group-by's
How many records, cases, and deaths are showing up for those values?

In [12]:
# Number of records per state
data.groupby('state')['state'].count()

state
Alabama                     3536
Alaska                       646
Arizona                      925
Arkansas                    3751
California                  3556
Colorado                    3218
Connecticut                  539
Delaware                     220
District of Columbia          70
Florida                     3866
Georgia                     8409
Guam                          62
Hawaii                       281
Idaho                       1667
Illinois                    4690
Indiana                     4907
Iowa                        4294
Kansas                      3508
Kentucky                    5296
Louisiana                   3583
Maine                        904
Maryland                    1443
Massachusetts                994
Michigan                    4167
Minnesota                   3935
Mississippi                 4427
Missouri                    4760
Montana                     1490
Nebraska                    2591
Nevada                       660
New 

In [13]:
# Quick look at New York on the most recent date in the data.
# Note: NYC counties are not broken out, so Queens, Richmond, Kings and
# New York cannot be seen separately.
max_date = data['date'].max()
data.loc[
    (data['state'] == 'New York') & (data['date'] == max_date),
    ['county', 'cases', 'deaths'],
].groupby('county').sum()

Unnamed: 0_level_0,cases,deaths
county,Unnamed: 1_level_1,Unnamed: 2_level_1
Albany,1538,124
Allegany,44,0
Broome,390,32
Cattaraugus,64,1
Cayuga,61,1
Chautauqua,45,3
Chemung,133,5
Chenango,113,2
Clinton,85,4
Columbia,333,30


In [14]:
# US case and death counts: sum over all rows for each date
data[['date', 'cases', 'deaths']].groupby('date').sum()

Unnamed: 0_level_0,cases,deaths
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-21,1,0
2020-01-22,1,0
2020-01-23,1,0
2020-01-24,2,0
2020-01-25,3,0
2020-01-26,5,0
2020-01-27,5,0
2020-01-28,5,0
2020-01-29,5,0
2020-01-30,6,0


In [15]:
# New York cases and deaths over time: sum over the state's rows for each date
data.loc[
    data['state'] == 'New York',
    ['date', 'cases', 'deaths'],
].groupby('date').sum()

Unnamed: 0_level_0,cases,deaths
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-01,1,0
2020-03-02,1,0
2020-03-03,2,0
2020-03-04,11,0
2020-03-05,22,0
2020-03-06,44,0
2020-03-07,89,0
2020-03-08,106,0
2020-03-09,142,0
2020-03-10,173,0
