## COVID-19 vaccinations per country ETL

Connect to the database

In [1]:
import os

from sqlalchemy import create_engine

# The connection URL can be overridden through the VACCINATIONS_DB_URL environment
# variable so that real credentials never have to be written into the notebook.
# The fallback is the URL this notebook originally hard-coded, so a plain
# "Run All" keeps working unchanged when the variable is not set.
DB_URL = os.environ.get(
    'VACCINATIONS_DB_URL',
    'mysql+mysqlconnector://test:test123@192.168.99.100:3306/test',
)
source = create_engine(DB_URL)

Load the CSV file into a DataFrame

In [2]:
import pandas as pd
# Read the raw vaccination records; the path is relative to the notebook's
# working directory.
csv_path = 'vaccinations.csv'
df = pd.read_csv(csv_path)
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,country,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million,vaccines,source_name,source_website
0,Argentina,ARG,12/29/2020,700.0,,,,,0.00,,,,Sputnik V,Ministry of Health,http://datos.salud.gob.ar/dataset/vacunas-cont...
1,Argentina,ARG,12/30/2020,,,,,15656.0,,,,346.0,Sputnik V,Ministry of Health,http://datos.salud.gob.ar/dataset/vacunas-cont...
2,Argentina,ARG,12/31/2020,32013.0,,,,15656.0,0.07,,,346.0,Sputnik V,Ministry of Health,http://datos.salud.gob.ar/dataset/vacunas-cont...
3,Argentina,ARG,1/1/2021,,,,,11070.0,,,,245.0,Sputnik V,Ministry of Health,http://datos.salud.gob.ar/dataset/vacunas-cont...
4,Argentina,ARG,1/2/2021,,,,,8776.0,,,,194.0,Sputnik V,Ministry of Health,http://datos.salud.gob.ar/dataset/vacunas-cont...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1422,Wales,,1/19/2021,176186.0,175816.0,370.0,13989.0,10672.0,5.59,5.58,0.01,3385.0,"Oxford/AstraZeneca, Pfizer/BioNTech",Government of the United Kingdom,https://coronavirus.data.gov.uk/details/health...
1423,Wales,,1/20/2021,190831.0,190435.0,396.0,14645.0,11105.0,6.05,6.04,0.01,3522.0,"Oxford/AstraZeneca, Pfizer/BioNTech",Government of the United Kingdom,https://coronavirus.data.gov.uk/details/health...
1424,Wales,,1/21/2021,212732.0,212317.0,415.0,21901.0,12318.0,6.75,6.73,0.01,3907.0,"Oxford/AstraZeneca, Pfizer/BioNTech",Government of the United Kingdom,https://coronavirus.data.gov.uk/details/health...
1425,Wales,,1/22/2021,241016.0,240547.0,469.0,28284.0,15148.0,7.64,7.63,0.01,4804.0,"Oxford/AstraZeneca, Pfizer/BioNTech",Government of the United Kingdom,https://coronavirus.data.gov.uk/details/health...


### Clean data

Check NaN values and types

In [3]:
# Per column: does it contain at least one missing value?
df.isnull().any()

country                                False
iso_code                                True
date                                   False
total_vaccinations                      True
people_vaccinated                       True
people_fully_vaccinated                 True
daily_vaccinations_raw                  True
daily_vaccinations                      True
total_vaccinations_per_hundred          True
people_vaccinated_per_hundred           True
people_fully_vaccinated_per_hundred     True
daily_vaccinations_per_million          True
vaccines                               False
source_name                            False
source_website                         False
dtype: bool

In [4]:
# Show the dtype pandas inferred for every column (at this point `date` is
# still an object column, i.e. not yet parsed as a datetime).
df.dtypes

country                                 object
iso_code                                object
date                                    object
total_vaccinations                     float64
people_vaccinated                      float64
people_fully_vaccinated                float64
daily_vaccinations_raw                 float64
daily_vaccinations                     float64
total_vaccinations_per_hundred         float64
people_vaccinated_per_hundred          float64
people_fully_vaccinated_per_hundred    float64
daily_vaccinations_per_million         float64
vaccines                                object
source_name                             object
source_website                          object
dtype: object

Convert the date column from string to datetime type

In [5]:
# Parse the textual dates, then store the result back in the same column.
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates

Fill NaN values with forward fill and backward fill

In [6]:
# Numeric metric columns that contain gaps and need filling.
cols = [
    'total_vaccinations',
    'people_vaccinated',
    'people_fully_vaccinated',
    'daily_vaccinations_raw',
    'daily_vaccinations',
    'total_vaccinations_per_hundred',
    'people_vaccinated_per_hundred',
    'people_fully_vaccinated_per_hundred',
    'daily_vaccinations_per_million',
]

# Fill inside each country first. A plain ffill()/bfill() over the whole frame
# carries the last rows of one country into the first rows of the next one
# (and vice versa), mixing figures of unrelated countries.
filled = df.groupby('country')[cols].ffill()
filled = filled.groupby(df['country']).bfill()

# A country that never reports a given metric has nothing to borrow from its own
# rows. Fall back to the original frame-wide fill for those remaining gaps so the
# "no NaN left" check further down still holds.
# NOTE(review): confirm whether such gaps should rather stay NULL in the database.
df[cols] = filled.ffill().bfill()

Fill missing iso_code values with GBR (all of the countries with no ISO code belong to the United Kingdom)

In [7]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# df['iso_code'] selection: the in-place call operates on an intermediate object,
# which triggers a chained-assignment warning in recent pandas and leaves `df`
# unchanged once Copy-on-Write is enabled.
df['iso_code'] = df['iso_code'].fillna('GBR')
# Bare last expression so the notebook renders the cleaned frame.
df

Unnamed: 0,country,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million,vaccines,source_name,source_website
0,Argentina,ARG,2020-12-29,700.0,243539.0,4394.0,17791.0,15656.0,0.00,0.54,0.01,346.0,Sputnik V,Ministry of Health,http://datos.salud.gob.ar/dataset/vacunas-cont...
1,Argentina,ARG,2020-12-30,700.0,243539.0,4394.0,17791.0,15656.0,0.00,0.54,0.01,346.0,Sputnik V,Ministry of Health,http://datos.salud.gob.ar/dataset/vacunas-cont...
2,Argentina,ARG,2020-12-31,32013.0,243539.0,4394.0,17791.0,15656.0,0.07,0.54,0.01,346.0,Sputnik V,Ministry of Health,http://datos.salud.gob.ar/dataset/vacunas-cont...
3,Argentina,ARG,2021-01-01,32013.0,243539.0,4394.0,17791.0,11070.0,0.07,0.54,0.01,245.0,Sputnik V,Ministry of Health,http://datos.salud.gob.ar/dataset/vacunas-cont...
4,Argentina,ARG,2021-01-02,32013.0,243539.0,4394.0,17791.0,8776.0,0.07,0.54,0.01,194.0,Sputnik V,Ministry of Health,http://datos.salud.gob.ar/dataset/vacunas-cont...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1422,Wales,GBR,2021-01-19,176186.0,175816.0,370.0,13989.0,10672.0,5.59,5.58,0.01,3385.0,"Oxford/AstraZeneca, Pfizer/BioNTech",Government of the United Kingdom,https://coronavirus.data.gov.uk/details/health...
1423,Wales,GBR,2021-01-20,190831.0,190435.0,396.0,14645.0,11105.0,6.05,6.04,0.01,3522.0,"Oxford/AstraZeneca, Pfizer/BioNTech",Government of the United Kingdom,https://coronavirus.data.gov.uk/details/health...
1424,Wales,GBR,2021-01-21,212732.0,212317.0,415.0,21901.0,12318.0,6.75,6.73,0.01,3907.0,"Oxford/AstraZeneca, Pfizer/BioNTech",Government of the United Kingdom,https://coronavirus.data.gov.uk/details/health...
1425,Wales,GBR,2021-01-22,241016.0,240547.0,469.0,28284.0,15148.0,7.64,7.63,0.01,4804.0,"Oxford/AstraZeneca, Pfizer/BioNTech",Government of the United Kingdom,https://coronavirus.data.gov.uk/details/health...


In [8]:
# Re-run the per-column missing-value check on the cleaned frame.
df.isnull().any()

country                                False
iso_code                               False
date                                   False
total_vaccinations                     False
people_vaccinated                      False
people_fully_vaccinated                False
daily_vaccinations_raw                 False
daily_vaccinations                     False
total_vaccinations_per_hundred         False
people_vaccinated_per_hundred          False
people_fully_vaccinated_per_hundred    False
daily_vaccinations_per_million         False
vaccines                               False
source_name                            False
source_website                         False
dtype: bool

Write the DataFrame to the SQL database (`test`), table (`vaccinations`)

In [9]:

# Load the cleaned frame into test.vaccinations, replacing the table if it
# already exists; the DataFrame index is not stored as a column.
df.to_sql(
    name='vaccinations',
    con=source,
    schema='test',
    if_exists='replace',
    index=False,
)

Check what is inside the database table

In [10]:
# Read the table back through the same engine to inspect what was stored.
query = 'SELECT * FROM vaccinations'
pd.read_sql_query(query, con=source)

Unnamed: 0,country,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million,vaccines,source_name,source_website
0,Argentina,ARG,2020-12-29,700.0,243539.0,4394.0,17791.0,15656.0,0.00,0.54,0.01,346.0,Sputnik V,Ministry of Health,http://datos.salud.gob.ar/dataset/vacunas-cont...
1,Argentina,ARG,2020-12-30,700.0,243539.0,4394.0,17791.0,15656.0,0.00,0.54,0.01,346.0,Sputnik V,Ministry of Health,http://datos.salud.gob.ar/dataset/vacunas-cont...
2,Argentina,ARG,2020-12-31,32013.0,243539.0,4394.0,17791.0,15656.0,0.07,0.54,0.01,346.0,Sputnik V,Ministry of Health,http://datos.salud.gob.ar/dataset/vacunas-cont...
3,Argentina,ARG,2021-01-01,32013.0,243539.0,4394.0,17791.0,11070.0,0.07,0.54,0.01,245.0,Sputnik V,Ministry of Health,http://datos.salud.gob.ar/dataset/vacunas-cont...
4,Argentina,ARG,2021-01-02,32013.0,243539.0,4394.0,17791.0,8776.0,0.07,0.54,0.01,194.0,Sputnik V,Ministry of Health,http://datos.salud.gob.ar/dataset/vacunas-cont...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1422,Wales,GBR,2021-01-19,176186.0,175816.0,370.0,13989.0,10672.0,5.59,5.58,0.01,3385.0,"Oxford/AstraZeneca, Pfizer/BioNTech",Government of the United Kingdom,https://coronavirus.data.gov.uk/details/health...
1423,Wales,GBR,2021-01-20,190831.0,190435.0,396.0,14645.0,11105.0,6.05,6.04,0.01,3522.0,"Oxford/AstraZeneca, Pfizer/BioNTech",Government of the United Kingdom,https://coronavirus.data.gov.uk/details/health...
1424,Wales,GBR,2021-01-21,212732.0,212317.0,415.0,21901.0,12318.0,6.75,6.73,0.01,3907.0,"Oxford/AstraZeneca, Pfizer/BioNTech",Government of the United Kingdom,https://coronavirus.data.gov.uk/details/health...
1425,Wales,GBR,2021-01-22,241016.0,240547.0,469.0,28284.0,15148.0,7.64,7.63,0.01,4804.0,"Oxford/AstraZeneca, Pfizer/BioNTech",Government of the United Kingdom,https://coronavirus.data.gov.uk/details/health...


Write the DDL (CREATE TABLE) and the INSERT statements to the vaccinations.sql file

In [13]:
# Generate the CREATE TABLE statement for the frame exactly as it is loaded:
# - no reset_index(): the table is written with index=False, so an extra `index`
#   column in the DDL would not match the stored table or the INSERT column list;
# - the schema is passed separately: a dotted name such as 'test.vaccinations'
#   is treated as a single table identifier rather than schema + table.
create_table = pd.io.sql.get_schema(df, 'vaccinations', con=source, schema='test')

ddl = create_table.rstrip()
if not ddl.endswith(';'):
    # Terminate the statement so the INSERT appended to this file afterwards is
    # parsed as a separate statement.
    ddl += ';'

# 'w' truncates the file, so re-running the cell starts the script from scratch.
# The `with` block closes the file; no explicit close() is needed.
with open('../sql/vaccinations.sql', 'w', encoding='utf-8') as file:
    file.write(ddl + '\n\n')

In [14]:
def _sql_literal(value):
    """Render a single Python/pandas scalar as a MySQL literal."""
    import datetime
    import numbers

    # Missing values (None, NaN, NaT) become SQL NULL. This check must come first
    # because NaT would also pass the datetime test below.
    if value is None or pd.isna(value):
        return 'NULL'
    if isinstance(value, bool):
        return '1' if value else '0'
    if isinstance(value, numbers.Integral):
        return str(int(value))
    if isinstance(value, numbers.Real):
        return repr(float(value))
    # pandas Timestamp is a datetime subclass; emit a quoted DATETIME literal
    # instead of the Python repr "Timestamp('...')".
    if isinstance(value, datetime.datetime):
        return "'" + value.strftime('%Y-%m-%d %H:%M:%S') + "'"
    if isinstance(value, datetime.date):
        return "'" + value.isoformat() + "'"
    # Everything else is written as a quoted string with backslashes and single
    # quotes escaped.
    text = str(value).replace('\\', '\\\\').replace("'", "''")
    return "'" + text + "'"


def sql_insert(frame=None, table='test.vaccinations'):
    """Build one multi-row INSERT statement for a DataFrame.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Rows to serialise. Defaults to the notebook-level ``df`` so the original
        zero-argument call keeps working.
    table : str, optional
        Target table name, written into the statement as given.

    Returns
    -------
    str
        ``INSERT INTO <table> (`col`, ...)`` followed by one parenthesised value
        list per row and a terminating ``;``. An empty string is returned for a
        frame without rows, because an INSERT without value lists is not valid SQL.
    """
    if frame is None:
        frame = df  # notebook-level DataFrame

    # itertuples keeps each column's own scalar type and does not rely on the
    # index being 0..n-1 to recognise the last row.
    rows = [
        '(' + ', '.join(_sql_literal(value) for value in record) + ')'
        for record in frame.itertuples(index=False, name=None)
    ]
    if not rows:
        return ''

    column_list = ', '.join('`' + str(column) + '`' for column in frame.columns)
    return 'INSERT INTO ' + table + ' (' + column_list + ')\nVALUES ' + ',\n'.join(rows) + ';'

# Append the INSERT statement after the DDL already written to the script.
# The `with` block closes the file, so no explicit close() is needed; the
# explicit encoding keeps non-ASCII text from depending on the platform default.
with open('../sql/vaccinations.sql', 'a', encoding='utf-8') as file:
    file.write(sql_insert())
