In [1]:
# See covid_virtual_env.sh for environment setup
import boto3, pandas as pd

# Let pandas display up to 500 rows before truncating output (set once; the call was duplicated)
pd.set_option('display.max_rows', 500)

# Load Data

Create a boto3 client (boto3 is Amazon's AWS SDK for Python) to read from S3.

In [2]:
# S3 client used by the data-loading cell below
client = boto3.client(service_name='s3')

In [3]:
# Johns Hopkins (JHU CSSE) merged time series of cases and deaths
obj = client.get_object(
    Key='enigma-jhu-timeseries/csv/jhu_csse_covid_19_timeseries_merged.csv',
    Bucket='covid19-lake',
)
# The response's 'Body' entry is handed straight to pandas for CSV parsing
data = pd.read_csv(filepath_or_buffer=obj['Body'])

# Data Exploration

## Fundamental exploration
What values are showing up?

In [4]:
# Peek at the first five records
data.head(n=5)

Unnamed: 0,uid,fips,iso2,iso3,code3,admin2,latitude,longitude,province_state,country_region,date,confirmed,deaths,recovered
0,16.0,60.0,AS,ASM,16.0,,-14.271,-170.132,American Samoa,US,2020-01-22,0,0,
1,316.0,66.0,GU,GUM,316.0,,13.4443,144.7937,Guam,US,2020-01-22,0,0,
2,580.0,69.0,MP,MNP,580.0,,15.0979,145.6739,Northern Mariana Islands,US,2020-01-22,0,0,
3,630.0,72.0,PR,PRI,630.0,,18.2208,-66.5901,Puerto Rico,US,2020-01-22,0,0,
4,850.0,78.0,VI,VIR,850.0,,18.3358,-64.8963,Virgin Islands,US,2020-01-22,0,0,


In [5]:
# Inspect the dtype pandas inferred for each column
data.dtypes

uid               float64
fips              float64
iso2               object
iso3               object
code3             float64
admin2             object
latitude          float64
longitude         float64
province_state     object
country_region     object
date               object
confirmed           int64
deaths              int64
recovered         float64
dtype: object

In [6]:
# Fix the date data type: the CSV loads `date` as plain strings (object dtype).
# pd.to_datetime is used instead of .astype('datetime64') because the unit-less
# 'datetime64' dtype is rejected by newer pandas releases; the result is datetime64[ns].
data['date'] = pd.to_datetime(data['date'])

In [7]:
# Re-check the column dtypes after converting `date`
data.dtypes

uid                      float64
fips                     float64
iso2                      object
iso3                      object
code3                    float64
admin2                    object
latitude                 float64
longitude                float64
province_state            object
country_region            object
date              datetime64[ns]
confirmed                  int64
deaths                     int64
recovered                float64
dtype: object

In [8]:
# First five records again, now that `date` has been converted
data.head(n=5)

Unnamed: 0,uid,fips,iso2,iso3,code3,admin2,latitude,longitude,province_state,country_region,date,confirmed,deaths,recovered
0,16.0,60.0,AS,ASM,16.0,,-14.271,-170.132,American Samoa,US,2020-01-22,0,0,
1,316.0,66.0,GU,GUM,316.0,,13.4443,144.7937,Guam,US,2020-01-22,0,0,
2,580.0,69.0,MP,MNP,580.0,,15.0979,145.6739,Northern Mariana Islands,US,2020-01-22,0,0,
3,630.0,72.0,PR,PRI,630.0,,18.2208,-66.5901,Puerto Rico,US,2020-01-22,0,0,
4,850.0,78.0,VI,VIR,850.0,,18.3358,-64.8963,Virgin Islands,US,2020-01-22,0,0,


In [9]:
# Distinct iso2 codes present in the data (missing values included)
print(data['iso2'].unique())

['AS' 'GU' 'MP' 'PR' 'VI' 'US' nan]


In [10]:
# Unique province_state values
print(data.province_state.unique())

['American Samoa' 'Guam' 'Northern Mariana Islands' 'Puerto Rico'
 'Virgin Islands' 'Alabama' 'Alaska' 'Arizona' 'Arkansas' 'California'
 'Colorado' 'Connecticut' 'Delaware' 'District of Columbia' 'Florida'
 'Georgia' 'Hawaii' 'Idaho' 'Illinois' 'Indiana' 'Iowa' 'Kansas'
 'Kentucky' 'Louisiana' 'Maine' 'Maryland' 'Massachusetts' 'Michigan'
 'Minnesota' 'Mississippi' 'Missouri' 'Montana' 'Nebraska' 'Nevada'
 'New Hampshire' 'New Jersey' 'New Mexico' 'New York' 'North Carolina'
 'North Dakota' 'Ohio' 'Oklahoma' 'Oregon' 'Pennsylvania' 'Rhode Island'
 'South Carolina' 'South Dakota' 'Tennessee' 'Texas' 'Utah' 'Vermont'
 'Virginia' 'Washington' 'West Virginia' 'Wisconsin' 'Wyoming'
 'Diamond Princess' 'Grand Princess' nan 'Australian Capital Territory'
 'New South Wales' 'Northern Territory' 'Queensland' 'South Australia'
 'Tasmania' 'Victoria' 'Western Australia' 'Anhui' 'Beijing' 'Chongqing'
 'Fujian' 'Gansu' 'Guangdong' 'Guangxi' 'Guizhou' 'Hainan' 'Hebei'
 'Heilongjiang' 'Henan' 'Hong 

In [11]:
# Distinct country_region values present in the data
print(data['country_region'].unique())

['US' 'Afghanistan' 'Albania' 'Algeria' 'Andorra' 'Angola'
 'Antigua and Barbuda' 'Argentina' 'Armenia' 'Australia' 'Austria'
 'Azerbaijan' 'Bahamas' 'Bahrain' 'Bangladesh' 'Barbados' 'Belarus'
 'Belgium' 'Benin' 'Bhutan' 'Bolivia' 'Bosnia and Herzegovina' 'Brazil'
 'Brunei' 'Bulgaria' 'Burkina Faso' 'Cabo Verde' 'Cambodia' 'Cameroon'
 'Central African Republic' 'Chad' 'Chile' 'China' 'Colombia'
 'Congo (Brazzaville)' 'Congo (Kinshasa)' 'Costa Rica' "Cote d'Ivoire"
 'Croatia' 'Diamond Princess' 'Cuba' 'Cyprus' 'Czechia' 'Denmark'
 'Djibouti' 'Dominican Republic' 'Ecuador' 'Egypt' 'El Salvador'
 'Equatorial Guinea' 'Eritrea' 'Estonia' 'Eswatini' 'Ethiopia' 'Fiji'
 'Finland' 'France' 'Gabon' 'Gambia' 'Georgia' 'Germany' 'Ghana' 'Greece'
 'Guatemala' 'Guinea' 'Guyana' 'Haiti' 'Holy See' 'Honduras' 'Hungary'
 'Iceland' 'India' 'Indonesia' 'Iran' 'Iraq' 'Ireland' 'Israel' 'Italy'
 'Jamaica' 'Japan' 'Jordan' 'Kazakhstan' 'Kenya' 'Korea, South' 'Kuwait'
 'Kyrgyzstan' 'Latvia' 'Lebanon' 'Liber

In [12]:
# Distinct dates present in the data, in order of first appearance
print(data['date'].unique())

['2020-01-22T00:00:00.000000000' '2020-01-23T00:00:00.000000000'
 '2020-01-24T00:00:00.000000000' '2020-01-25T00:00:00.000000000'
 '2020-01-26T00:00:00.000000000' '2020-01-27T00:00:00.000000000'
 '2020-01-28T00:00:00.000000000' '2020-01-29T00:00:00.000000000'
 '2020-01-30T00:00:00.000000000' '2020-01-31T00:00:00.000000000'
 '2020-02-10T00:00:00.000000000' '2020-02-11T00:00:00.000000000'
 '2020-02-12T00:00:00.000000000' '2020-02-13T00:00:00.000000000'
 '2020-02-14T00:00:00.000000000' '2020-02-15T00:00:00.000000000'
 '2020-02-16T00:00:00.000000000' '2020-02-17T00:00:00.000000000'
 '2020-02-18T00:00:00.000000000' '2020-02-19T00:00:00.000000000'
 '2020-02-01T00:00:00.000000000' '2020-02-20T00:00:00.000000000'
 '2020-02-21T00:00:00.000000000' '2020-02-22T00:00:00.000000000'
 '2020-02-23T00:00:00.000000000' '2020-02-24T00:00:00.000000000'
 '2020-02-25T00:00:00.000000000' '2020-02-26T00:00:00.000000000'
 '2020-02-27T00:00:00.000000000' '2020-02-28T00:00:00.000000000'
 '2020-02-29T00:00:00.000

## Some basic group-by's
How many records, cases, and deaths are showing up for those values?

In [13]:
# How iso2 codes map to iso3 codes, with the number of non-null uid records per pair
data.groupby(by=['iso2', 'iso3']).uid.count()

iso2  iso3
AS    ASM        112
GU    GUM        112
MP    MNP        112
PR    PRI        112
US    USA     364672
VI    VIR        112
Name: uid, dtype: int64

In [14]:
# Quick look at New York on the most recent date in the data
## can still see that NYC counties are not broken out
max_date = data['date'].max()
(
    data
    .loc[data['province_state'].eq('New York') & data['date'].eq(max_date)]
    .groupby('admin2')[['confirmed', 'deaths']]
    .sum()
)

Unnamed: 0_level_0,confirmed,deaths
admin2,Unnamed: 1_level_1,Unnamed: 2_level_1
Albany,1479,61
Allegany,42,0
Bronx,0,0
Broome,379,23
Cattaraugus,61,0
Cayuga,60,1
Chautauqua,42,4
Chemung,131,1
Chenango,112,0
Clinton,77,4


In [15]:
# Global confirmed cases and deaths, totalled per date
# NOTE(review): this sums every row in `data` regardless of geographic level —
# confirm that no region is reported at more than one level (double counting).
data[['date', 'confirmed', 'deaths']].groupby('date').sum()

Unnamed: 0_level_0,confirmed,deaths
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-22,556,17
2020-01-23,655,18
2020-01-24,943,26
2020-01-25,1436,42
2020-01-26,2122,56
2020-01-27,2931,82
2020-01-28,5581,131
2020-01-29,6169,133
2020-01-30,8237,171
2020-01-31,9930,213


In [16]:
# US confirmed cases and deaths, totalled per date
# NOTE(review): filtering on iso2 == 'US' leaves out rows carrying the other
# iso2 codes seen in the data (AS, GU, MP, PR, VI) — confirm that is intended.
(
    data
    .loc[data['iso2'].eq('US')]
    .groupby('date')[['confirmed', 'deaths']]
    .sum()
)

Unnamed: 0_level_0,confirmed,deaths
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-22,1,0
2020-01-23,1,0
2020-01-24,2,0
2020-01-25,2,0
2020-01-26,5,0
2020-01-27,5,0
2020-01-28,5,0
2020-01-29,5,0
2020-01-30,5,0
2020-01-31,7,0


In [17]:
# New York confirmed cases and deaths over time, totalled per date
(
    data
    .loc[data['province_state'].eq('New York')]
    .groupby('date')[['confirmed', 'deaths']]
    .sum()
)

Unnamed: 0_level_0,confirmed,deaths
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-22,0,0
2020-01-23,0,0
2020-01-24,0,0
2020-01-25,0,0
2020-01-26,0,0
2020-01-27,0,0
2020-01-28,0,0
2020-01-29,0,0
2020-01-30,0,0
2020-01-31,0,0
