# COVID-19 ETL

This notebook loads and cleans COVID-19 data and exports it to PostgreSQL. The data comes from the following sources:

* Data on COVID-19 (coronavirus) by Our World in Data: https://github.com/owid/covid-19-data/tree/master/public/data
* Data on COVID-19 (coronavirus) vaccinations by Our World in Data: https://github.com/owid/covid-19-data/tree/master/public/data/vaccinations
* COVID-19 Case Surveillance Public Use Data with Geography: https://data.cdc.gov/Case-Surveillance/COVID-19-Case-Surveillance-Public-Use-Data-with-Ge/n8mc-b4w4

In [1]:
import pandas as pd
from sqlalchemy import create_engine

## Global COVID-19 Data

In [6]:
# Relative path to the Our World in Data global COVID-19 CSV export
covid_data = "./resources/owid-covid-data.csv"

# Parse the whole file into a DataFrame with pandas' default settings
covid_data_df = pd.read_csv(filepath_or_buffer=covid_data)

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index
0,AFG,Asia,Afghanistan,2020-02-24,1.0,1.0,,,,,...,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511
1,AFG,Asia,Afghanistan,2020-02-25,1.0,0.0,,,,,...,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511
2,AFG,Asia,Afghanistan,2020-02-26,1.0,0.0,,,,,...,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511
3,AFG,Asia,Afghanistan,2020-02-27,1.0,0.0,,,,,...,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511
4,AFG,Asia,Afghanistan,2020-02-28,1.0,0.0,,,,,...,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80981,ZWE,Africa,Zimbabwe,2021-04-07,36984.0,18.0,14.571,1531.0,0.0,1.143,...,1899.775,21.4,307.846,1.82,1.6,30.7,36.791,1.7,61.49,0.571
80982,ZWE,Africa,Zimbabwe,2021-04-08,37052.0,68.0,22.286,1532.0,1.0,1.286,...,1899.775,21.4,307.846,1.82,1.6,30.7,36.791,1.7,61.49,0.571
80983,ZWE,Africa,Zimbabwe,2021-04-09,37147.0,95.0,34.857,1535.0,3.0,1.571,...,1899.775,21.4,307.846,1.82,1.6,30.7,36.791,1.7,61.49,0.571
80984,ZWE,Africa,Zimbabwe,2021-04-10,37273.0,126.0,51.714,1538.0,3.0,2.000,...,1899.775,21.4,307.846,1.82,1.6,30.7,36.791,1.7,61.49,0.571


In [44]:
# Latest reported row for every location, built in a single vectorized pass.
#
# drop_duplicates(keep="last") retains the final occurrence of each distinct
# "location" in file order, keeps the original row index and preserves the
# column dtypes. The previous approach grew `stats` with pd.concat inside a
# loop (the frame is re-copied on every iteration, and each country needs a
# full boolean scan) and went through Series.to_frame().T, which turned every
# column into object dtype.
#
# NOTE(review): "last row" means last in file order — this assumes the rows of
# each location are sorted by date; confirm against the source file.

# Distinct location names in order of first appearance (kept because other
# cells may rely on this name).
countries = covid_data_df["location"].unique().tolist()

stats = covid_data_df.drop_duplicates(subset="location", keep="last")

In [54]:
# DataFrame holding one row per location; intended as input for scatter plots
# and probably still needs some cleaning. The displayed output also contains
# aggregate rows (e.g. "Africa" and "World", with OWID_-prefixed iso codes and
# no continent).
# The lookup could use the column maximum instead of the last row,
# depending on the column it is applied to.
stats

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index
412,AFG,Asia,Afghanistan,2021-04-11,57160,16,69.143,2521,0,3.429,...,1803.99,,597.029,9.59,,,37.746,0.5,64.83,0.511
836,OWID_AFR,,Africa,2021-04-11,4.3502e+06,9684,11325.6,115710,288,265.286,...,,,,,,,,,,
1248,ALB,Europe,Albania,2021-04-11,128393,238,266,2317,7,7.429,...,11803.4,1.1,304.195,10.08,7.1,51.2,,2.89,78.57,0.795
1660,DZA,Africa,Algeria,2021-04-11,118516,138,127.714,3130,4,3.571,...,13913.8,0.5,278.364,6.73,0.7,30.4,83.741,1.9,76.88,0.748
2066,AND,Europe,Andorra,2021-04-11,12545,48,44.857,120,0,0.429,...,,,109.135,7.97,29,37.8,,,83.73,0.868
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79394,VNM,Asia,Vietnam,2021-04-11,2693,1,8.857,35,0,0,...,6171.88,2,245.465,6,1,45.9,85.847,2.6,75.4,0.704
79840,OWID_WRL,,World,2021-04-11,1.36047e+08,690739,674669,2.93636e+06,8557,11871,...,15469.2,10,233.07,8.51,6.434,34.635,60.13,2.705,72.58,0.737
80207,YEM,Asia,Yemen,2021-04-11,5357,81,79.857,1049,18,14.714,...,1479.15,18.8,495.003,5.35,7.6,29.2,49.542,0.7,66.12,0.47
80597,ZMB,Africa,Zambia,2021-04-11,90029,111,157,1226,0,0.857,...,3689.25,57.5,234.499,3.94,3.1,24.7,13.938,2,63.89,0.584


## US COVID-19 Data

In [39]:
# Keep only the rows whose location is the United States
covid_data_df_us = covid_data_df[covid_data_df["location"].eq("United States")]
covid_data_df_us

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index
76767,USA,North America,United States,2020-01-22,1.0,,,,,,...,54225.446,1.2,151.089,10.79,19.1,24.6,,2.77,78.86,0.926
76768,USA,North America,United States,2020-01-23,1.0,0.0,,,,,...,54225.446,1.2,151.089,10.79,19.1,24.6,,2.77,78.86,0.926
76769,USA,North America,United States,2020-01-24,2.0,1.0,,,,,...,54225.446,1.2,151.089,10.79,19.1,24.6,,2.77,78.86,0.926
76770,USA,North America,United States,2020-01-25,2.0,0.0,,,,,...,54225.446,1.2,151.089,10.79,19.1,24.6,,2.77,78.86,0.926
76771,USA,North America,United States,2020-01-26,5.0,3.0,,,,,...,54225.446,1.2,151.089,10.79,19.1,24.6,,2.77,78.86,0.926
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77208,USA,North America,United States,2021-04-07,30922386.0,75038.0,65936.000,559202.0,2570.0,989.286,...,54225.446,1.2,151.089,10.79,19.1,24.6,,2.77,78.86,0.926
77209,USA,North America,United States,2021-04-08,31002264.0,79878.0,66056.571,560202.0,1000.0,979.571,...,54225.446,1.2,151.089,10.79,19.1,24.6,,2.77,78.86,0.926
77210,USA,North America,United States,2021-04-09,31084962.0,82698.0,67896.000,561074.0,872.0,970.000,...,54225.446,1.2,151.089,10.79,19.1,24.6,,2.77,78.86,0.926
77211,USA,North America,United States,2021-04-10,31151495.0,66533.0,68404.429,561783.0,709.0,969.429,...,54225.446,1.2,151.089,10.79,19.1,24.6,,2.77,78.86,0.926


In [13]:
# US case and death figures per date: absolute counts followed by the
# corresponding per-million rates
covid_numbers_us_df = covid_data_df_us[
    [
        "date",
        "total_cases",
        "new_cases",
        "total_deaths",
        "new_deaths",
        "total_cases_per_million",
        "new_cases_per_million",
        "total_deaths_per_million",
        "new_deaths_per_million",
    ]
]
covid_numbers_us_df

Unnamed: 0,date,total_cases,new_cases,total_deaths,new_deaths,total_cases_per_million,new_cases_per_million,total_deaths_per_million,new_deaths_per_million
76767,2020-01-22,1.0,,,,0.003,,,
76768,2020-01-23,1.0,0.0,,,0.003,0.000,,
76769,2020-01-24,2.0,1.0,,,0.006,0.003,,
76770,2020-01-25,2.0,0.0,,,0.006,0.000,,
76771,2020-01-26,5.0,3.0,,,0.015,0.009,,
...,...,...,...,...,...,...,...,...,...
77208,2021-04-07,30922386.0,75038.0,559202.0,2570.0,93420.359,226.699,1689.419,7.764
77209,2021-04-08,31002264.0,79878.0,560202.0,1000.0,93661.680,241.321,1692.440,3.021
77210,2021-04-09,31084962.0,82698.0,561074.0,872.0,93911.521,249.841,1695.074,2.634
77211,2021-04-10,31151495.0,66533.0,561783.0,709.0,94112.525,201.004,1697.216,2.142


In [29]:
# US testing metrics per date.
#
# The cleaned frame is assigned back to covid_testdata_df_us: previously the
# dropna() result was only displayed and then discarded, so the variable still
# contained the rows without any testing figures.
covid_testdata_df_us = (
    covid_data_df_us[
        [
            "date",
            "new_tests",
            "total_tests",
            "total_tests_per_thousand",
            "new_tests_per_thousand",
            "positive_rate",
            "tests_per_case",
        ]
    ]
    # Keep only the rows with at least 2 non-NA values. When "date" is
    # populated this means at least one testing column must have a value.
    .dropna(thresh=2)
)
covid_testdata_df_us


Unnamed: 0,date,new_tests,total_tests,total_tests_per_thousand,new_tests_per_thousand,positive_rate,tests_per_case
76806,2020-03-01,372.0,372.0,0.001,0.001,,
76807,2020-03-02,550.0,922.0,0.003,0.002,,
76808,2020-03-03,933.0,1855.0,0.006,0.003,,
76809,2020-03-04,924.0,2779.0,0.008,0.003,,
76810,2020-03-05,1205.0,3984.0,0.012,0.004,,
...,...,...,...,...,...,...,...
77203,2021-04-02,1210503.0,382758324.0,1156.360,3.657,0.055,18.2
77204,2021-04-03,841444.0,383599768.0,1158.902,2.542,0.057,17.5
77205,2021-04-04,450322.0,384050090.0,1160.263,1.360,0.057,17.5
77206,2021-04-05,530002.0,384580092.0,1161.864,1.601,0.061,16.4


In [9]:
# Relative path to the CDC case-surveillance export (with geography columns)
covid_data_state = "./resources/COVID-19_Case_Surveillance_Public_Use_Data_with_Geography.csv"

# low_memory=False makes pandas process the file in one pass instead of in
# internal chunks, so each column's dtype is inferred from the whole column
covid_data_state_df = pd.read_csv(filepath_or_buffer=covid_data_state, low_memory=False)

covid_data_state_df

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,case_month,res_state,state_fips_code,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen_interval,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn
0,2020-02,,,,,,,,,,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,Yes,Missing,,
1,2020-02,,,,,,,,,3.0,0.0,Clinical evaluation,Yes,Laboratory-confirmed case,Symptomatic,Yes,Yes,,Yes
2,2020-02,,,,,,,,,,0.0,Clinical evaluation,Missing,Laboratory-confirmed case,Symptomatic,Yes,No,,Yes
3,2020-08,,,,,,,,,0.0,,Routine surveillance,Missing,Laboratory-confirmed case,Asymptomatic,No,No,Missing,Yes
4,2020-08,,,,,,,,,0.0,,Routine surveillance,Missing,Laboratory-confirmed case,Missing,Missing,Missing,Missing,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22507134,2020-12,AZ,4.0,YUMA,4027.0,65+ years,Male,White,Non-Hispanic/Latino,,,Missing,Missing,Laboratory-confirmed case,Missing,Yes,Missing,Yes,
22507135,2020-12,AZ,4.0,YUMA,4027.0,65+ years,Male,White,Non-Hispanic/Latino,,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,Yes,
22507136,2020-12,AZ,4.0,YUMA,4027.0,65+ years,Male,White,Non-Hispanic/Latino,,,Missing,Missing,Laboratory-confirmed case,Missing,Yes,Missing,Yes,
22507137,2020-12,AZ,4.0,YUMA,4027.0,65+ years,Male,White,Non-Hispanic/Latino,,,Missing,Missing,Laboratory-confirmed case,Missing,Yes,Missing,Yes,


In [11]:
# Case-surveillance rows whose res_state is "CA" (California)
covid_data_CA = covid_data_state_df[covid_data_state_df["res_state"].eq("CA")]
covid_data_CA

Unnamed: 0,case_month,res_state,state_fips_code,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen_interval,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn
205,2020-12,CA,6.0,,,Missing,,,,,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,Yes,Missing,,
206,2020-12,CA,6.0,,,Missing,,,,,,Missing,Missing,Laboratory-confirmed case,Unknown,No,Missing,,
207,2020-12,CA,6.0,,,,,,,,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,Yes,Unknown,,
7413,2020-03,CA,6.0,BUTTE,6007.0,,,,,,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,No,Missing,No,
7414,2020-03,CA,6.0,BUTTE,6007.0,,,,,,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,Yes,No,No,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22506961,2020-12,CA,6.0,YOLO,6113.0,65+ years,Male,White,Non-Hispanic/Latino,,,Missing,Missing,Laboratory-confirmed case,Unknown,No,Missing,,
22506962,2020-12,CA,6.0,YOLO,6113.0,65+ years,Male,White,Non-Hispanic/Latino,,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,Yes,Yes,,
22506963,2020-12,CA,6.0,YOLO,6113.0,65+ years,Male,White,Non-Hispanic/Latino,,,Missing,Missing,Laboratory-confirmed case,Unknown,Yes,Yes,,
22506964,2020-12,CA,6.0,YOLO,6113.0,65+ years,Male,White,Non-Hispanic/Latino,,,Missing,Missing,Laboratory-confirmed case,Unknown,Yes,Missing,,


## COVID-19 Vaccination Data

In [7]:
# Relative path to the per-state US vaccination CSV
us_vaccination = "./resources/us_state_vaccinations.csv"

# Parse the whole file into a DataFrame with pandas' default settings
us_vaccination_df = pd.read_csv(filepath_or_buffer=us_vaccination)

us_vaccination_df

Unnamed: 0,date,location,total_vaccinations,total_distributed,people_vaccinated,people_fully_vaccinated_per_hundred,total_vaccinations_per_hundred,people_fully_vaccinated,people_vaccinated_per_hundred,distributed_per_hundred,daily_vaccinations_raw,daily_vaccinations,daily_vaccinations_per_million,share_doses_used
0,2021-01-12,Alabama,78134.0,377025.0,70861.0,0.15,1.59,7270.0,1.45,7.69,,,,0.207
1,2021-01-13,Alabama,84040.0,378975.0,74792.0,0.19,1.71,9245.0,1.53,7.73,5906.0,5906.0,1205.0,0.222
2,2021-01-14,Alabama,92300.0,435350.0,80480.0,,1.88,,1.64,8.88,8260.0,7083.0,1445.0,0.212
3,2021-01-15,Alabama,100567.0,444650.0,86956.0,0.28,2.05,13488.0,1.77,9.07,8267.0,7478.0,1525.0,0.226
4,2021-01-16,Alabama,,,,,,,,,7557.0,7498.0,1529.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5933,2021-04-08,Wyoming,288814.0,436025.0,169230.0,20.75,49.90,120094.0,29.24,75.34,228.0,2655.0,4587.0,0.662
5934,2021-04-09,Wyoming,289028.0,447855.0,169409.0,20.78,49.94,120246.0,29.27,77.38,214.0,2633.0,4549.0,0.645
5935,2021-04-10,Wyoming,289340.0,450525.0,169683.0,20.83,49.99,120534.0,29.32,77.84,312.0,2640.0,4561.0,0.642
5936,2021-04-11,Wyoming,310702.0,450525.0,180223.0,22.80,53.68,131933.0,31.14,77.84,21362.0,3431.0,5928.0,0.690


In [55]:
# Vaccination rows whose location is California
ca_us_vaccination_df = us_vaccination_df[us_vaccination_df["location"].eq("California")]
ca_us_vaccination_df

Unnamed: 0,date,location,total_vaccinations,total_distributed,people_vaccinated,people_fully_vaccinated_per_hundred,total_vaccinations_per_hundred,people_fully_vaccinated,people_vaccinated_per_hundred,distributed_per_hundred,daily_vaccinations_raw,daily_vaccinations,daily_vaccinations_per_million,share_doses_used
546,2021-01-12,California,816301.0,3286050.0,703540.0,0.25,2.07,100089.0,1.78,8.32,,,,0.248
547,2021-01-13,California,891489.0,3435650.0,744545.0,0.34,2.26,133689.0,1.88,8.70,75188.00,75188.0,1903.0,0.259
548,2021-01-14,California,975293.0,3540175.0,801998.0,,2.47,,2.03,8.96,83804.00,79496.0,2012.0,0.275
549,2021-01-15,California,1072959.0,3548575.0,865387.0,0.52,2.72,204374.0,2.19,8.98,97666.00,85553.0,2165.0,0.302
550,2021-01-16,California,,,,,,,,,96867.75,88381.0,2237.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
632,2021-04-08,California,21243518.0,27861050.0,14123008.0,19.23,53.76,7599559.0,35.74,70.51,377626.00,377051.0,9543.0,0.762
633,2021-04-09,California,21725654.0,28532520.0,14445185.0,19.80,54.98,7822226.0,36.56,72.21,482136.00,391393.0,9906.0,0.761
634,2021-04-10,California,22281619.0,29034050.0,14803675.0,20.53,56.39,8110488.0,37.47,73.48,555965.00,404572.0,10239.0,0.767
635,2021-04-11,California,22754163.0,29034050.0,15123816.0,21.09,57.59,8332396.0,38.28,73.48,472544.00,398820.0,10094.0,0.784


In [8]:
# Relative path to the vaccinations-by-manufacturer CSV
vaccination_by_man = "./resources/vaccinations-by-manufacturer.csv"

# Parse the whole file into a DataFrame with pandas' default settings
vaccination_by_man_df = pd.read_csv(filepath_or_buffer=vaccination_by_man)

vaccination_by_man_df

Unnamed: 0,location,date,vaccine,total_vaccinations
0,Chile,2020-12-24,Pfizer/BioNTech,420
1,Chile,2020-12-25,Pfizer/BioNTech,5198
2,Chile,2020-12-26,Pfizer/BioNTech,8338
3,Chile,2020-12-27,Pfizer/BioNTech,8649
4,Chile,2020-12-28,Pfizer/BioNTech,8649
...,...,...,...,...
2261,United States,2021-04-10,Moderna,82622178
2262,United States,2021-04-10,Pfizer/BioNTech,94715143
2263,United States,2021-04-11,Johnson&Johnson,6453740
2264,United States,2021-04-11,Moderna,83847244


In [56]:
# Manufacturer-level vaccination rows whose location is the United States
vaccination_by_man_us_df = vaccination_by_man_df[
    vaccination_by_man_df["location"].eq("United States")
]
vaccination_by_man_us_df

Unnamed: 0,location,date,vaccine,total_vaccinations
2061,United States,2021-01-12,Moderna,3835859
2062,United States,2021-01-12,Pfizer/BioNTech,5488697
2063,United States,2021-01-13,Moderna,4249795
2064,United States,2021-01-13,Pfizer/BioNTech,6025872
2065,United States,2021-01-15,Moderna,5122662
...,...,...,...,...
2261,United States,2021-04-10,Moderna,82622178
2262,United States,2021-04-10,Pfizer/BioNTech,94715143
2263,United States,2021-04-11,Johnson&Johnson,6453740
2264,United States,2021-04-11,Moderna,83847244
