# Global COVID-19 Vaccination Progress

## This Script contains:
### 1. Import Libraries & Data
### 2. Consistency Checks & Cleaning
### 3. Basic Descriptive Statistical Analysis

# 1. Import Libraries & Data

## 1.1 Libraries

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import os

## 1.2 Data

In [2]:
# Create Path
# Project root folder. It defaults to the original local location, but can be
# redirected by setting the VACCINATION_PROJECT_DIR environment variable so the
# notebook can run on another machine without editing this cell.
path = os.environ.get(
    'VACCINATION_PROJECT_DIR',
    r'C:\Users\M de Villiers\Desktop\Data Analytics\Achievement 6',
)

In [3]:
# Import Data
# Build the location of the raw CSV first, then load it without using any
# column as the index.
raw_csv = os.path.join(path, '02_Data', 'Original_Data', 'country_vaccinations.csv')
df = pd.read_csv(raw_csv, index_col=False)

In [4]:
# Preview the first five rows of the loaded data
df.head()

Unnamed: 0,country,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million,vaccines,source_name,source_website
0,Afghanistan,AFG,2021-02-22,0.0,0.0,,,,0.0,0.0,,,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi...",World Health Organization,https://covid19.who.int/
1,Afghanistan,AFG,2021-02-23,,,,,1367.0,,,,34.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi...",World Health Organization,https://covid19.who.int/
2,Afghanistan,AFG,2021-02-24,,,,,1367.0,,,,34.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi...",World Health Organization,https://covid19.who.int/
3,Afghanistan,AFG,2021-02-25,,,,,1367.0,,,,34.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi...",World Health Organization,https://covid19.who.int/
4,Afghanistan,AFG,2021-02-26,,,,,1367.0,,,,34.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi...",World Health Organization,https://covid19.who.int/


In [5]:
# Preview the last five rows of the loaded data
df.tail()

Unnamed: 0,country,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million,vaccines,source_name,source_website
81971,Zimbabwe,ZWE,2022-03-03,7921113.0,4372925.0,3406482.0,10373.0,8903.0,52.48,28.97,22.57,590.0,"Oxford/AstraZeneca, Sinopharm/Beijing, Sinovac...",Ministry of Health,https://www.arcgis.com/home/webmap/viewer.html...
81972,Zimbabwe,ZWE,2022-03-04,7930621.0,4374896.0,3408609.0,9508.0,8603.0,52.55,28.99,22.59,570.0,"Oxford/AstraZeneca, Sinopharm/Beijing, Sinovac...",Ministry of Health,https://www.arcgis.com/home/webmap/viewer.html...
81973,Zimbabwe,ZWE,2022-03-05,7936145.0,4377373.0,3410340.0,5524.0,8458.0,52.58,29.0,22.6,560.0,"Oxford/AstraZeneca, Sinopharm/Beijing, Sinovac...",Ministry of Health,https://www.arcgis.com/home/webmap/viewer.html...
81974,Zimbabwe,ZWE,2022-03-06,7938362.0,4378029.0,3410960.0,2217.0,8017.0,52.6,29.01,22.6,531.0,"Oxford/AstraZeneca, Sinopharm/Beijing, Sinovac...",Ministry of Health,https://www.arcgis.com/home/webmap/viewer.html...
81975,Zimbabwe,ZWE,2022-03-07,7943325.0,4379875.0,3412556.0,4963.0,7482.0,52.63,29.02,22.61,496.0,"Oxford/AstraZeneca, Sinopharm/Beijing, Sinovac...",Ministry of Health,https://www.arcgis.com/home/webmap/viewer.html...


In [6]:
# (rows, columns) of the loaded data
df.shape

(81976, 15)

# 2. Consistency Checks & Cleaning

### Dropping columns

In [7]:
# Drop unnecessary column: source_name
df = df.drop('source_name', axis=1)

In [8]:
# Drop unnecessary column: source_website
df = df.drop('source_website', axis=1)

In [9]:
# Preview the data to confirm the source columns are no longer present
df.head()

Unnamed: 0,country,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million,vaccines
0,Afghanistan,AFG,2021-02-22,0.0,0.0,,,,0.0,0.0,,,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi..."
1,Afghanistan,AFG,2021-02-23,,,,,1367.0,,,,34.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi..."
2,Afghanistan,AFG,2021-02-24,,,,,1367.0,,,,34.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi..."
3,Afghanistan,AFG,2021-02-25,,,,,1367.0,,,,34.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi..."
4,Afghanistan,AFG,2021-02-26,,,,,1367.0,,,,34.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi..."


In [10]:
# (rows, columns) after dropping the source columns
df.shape

(81976, 13)

### Checking Data Types

In [11]:
# Check for mixed data types: print every column that holds at least one value
# whose Python type differs from the type of that column's first value.
# Series.map is used because DataFrame.applymap is deprecated in pandas >= 2.1.
for col in df.columns:
    value_types = df[col].map(type)
    # Reference type is the mapped type of the first row, so both sides of the
    # comparison are produced the same way.
    weird = value_types != value_types.iloc[0]
    if weird.any():
        print(col)

No mixed data types

In [12]:
# Check data types
# List the dtype pandas assigned to each column
df.dtypes

country                                 object
iso_code                                object
date                                    object
total_vaccinations                     float64
people_vaccinated                      float64
people_fully_vaccinated                float64
daily_vaccinations_raw                 float64
daily_vaccinations                     float64
total_vaccinations_per_hundred         float64
people_vaccinated_per_hundred          float64
people_fully_vaccinated_per_hundred    float64
daily_vaccinations_per_million         float64
vaccines                                object
dtype: object

In [13]:
# Parse the date strings (loaded as object dtype) into datetime values
df['date'] = pd.to_datetime(df['date'])

In [14]:
# Preview the first rows after the date conversion
df.head()

Unnamed: 0,country,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million,vaccines
0,Afghanistan,AFG,2021-02-22,0.0,0.0,,,,0.0,0.0,,,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi..."
1,Afghanistan,AFG,2021-02-23,,,,,1367.0,,,,34.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi..."
2,Afghanistan,AFG,2021-02-24,,,,,1367.0,,,,34.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi..."
3,Afghanistan,AFG,2021-02-25,,,,,1367.0,,,,34.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi..."
4,Afghanistan,AFG,2021-02-26,,,,,1367.0,,,,34.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi..."


In [15]:
# Preview the last rows after the date conversion
df.tail()

Unnamed: 0,country,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million,vaccines
81971,Zimbabwe,ZWE,2022-03-03,7921113.0,4372925.0,3406482.0,10373.0,8903.0,52.48,28.97,22.57,590.0,"Oxford/AstraZeneca, Sinopharm/Beijing, Sinovac..."
81972,Zimbabwe,ZWE,2022-03-04,7930621.0,4374896.0,3408609.0,9508.0,8603.0,52.55,28.99,22.59,570.0,"Oxford/AstraZeneca, Sinopharm/Beijing, Sinovac..."
81973,Zimbabwe,ZWE,2022-03-05,7936145.0,4377373.0,3410340.0,5524.0,8458.0,52.58,29.0,22.6,560.0,"Oxford/AstraZeneca, Sinopharm/Beijing, Sinovac..."
81974,Zimbabwe,ZWE,2022-03-06,7938362.0,4378029.0,3410960.0,2217.0,8017.0,52.6,29.01,22.6,531.0,"Oxford/AstraZeneca, Sinopharm/Beijing, Sinovac..."
81975,Zimbabwe,ZWE,2022-03-07,7943325.0,4379875.0,3412556.0,4963.0,7482.0,52.63,29.02,22.61,496.0,"Oxford/AstraZeneca, Sinopharm/Beijing, Sinovac..."


### Missing Values

In [16]:
# Count the missing values in each column
df.isna().sum()

country                                    0
iso_code                                   0
date                                       0
total_vaccinations                     40103
people_vaccinated                      42338
people_fully_vaccinated                44857
daily_vaccinations_raw                 47943
daily_vaccinations                       279
total_vaccinations_per_hundred         40103
people_vaccinated_per_hundred          42338
people_fully_vaccinated_per_hundred    44857
daily_vaccinations_per_million           279
vaccines                                   0
dtype: int64

Upon further investigation, these values do not appear to be genuinely missing. The data is recorded on a daily basis, but some countries only report their statistics once a week, so the remaining days of the week show up as "missing" values. I will change all of these to 0 since I might want to aggregate them at a later stage.

In [17]:
# Impute missing values with 0.
# Reassigning instead of using inplace=True keeps the step explicit and chainable.
# NOTE(review): this also zero-fills the running-total columns (e.g.
# total_vaccinations), so non-reporting days read as 0 rather than the last
# reported total -- confirm this is intended before aggregating or summarising.
df = df.fillna(0)

In [18]:
# Preview the first rows after filling missing values
df.head()

Unnamed: 0,country,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million,vaccines
0,Afghanistan,AFG,2021-02-22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi..."
1,Afghanistan,AFG,2021-02-23,0.0,0.0,0.0,0.0,1367.0,0.0,0.0,0.0,34.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi..."
2,Afghanistan,AFG,2021-02-24,0.0,0.0,0.0,0.0,1367.0,0.0,0.0,0.0,34.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi..."
3,Afghanistan,AFG,2021-02-25,0.0,0.0,0.0,0.0,1367.0,0.0,0.0,0.0,34.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi..."
4,Afghanistan,AFG,2021-02-26,0.0,0.0,0.0,0.0,1367.0,0.0,0.0,0.0,34.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi..."


In [19]:
# Re-count missing values per column to confirm none remain
df.isnull().sum()

country                                0
iso_code                               0
date                                   0
total_vaccinations                     0
people_vaccinated                      0
people_fully_vaccinated                0
daily_vaccinations_raw                 0
daily_vaccinations                     0
total_vaccinations_per_hundred         0
people_vaccinated_per_hundred          0
people_fully_vaccinated_per_hundred    0
daily_vaccinations_per_million         0
vaccines                               0
dtype: int64

### Duplicates

In [20]:
# Show every row that pandas flags as a duplicate of another row
duplicate_mask = df.duplicated()
df[duplicate_mask]

Unnamed: 0,country,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million,vaccines


No duplicates

### Consistency

In [21]:
# Remove the row limit on displayed output so full frequency tables are shown
pd.set_option('display.max_rows', None)

In [22]:
# Country names
# Number of rows per country, used to inspect the spelling and format of the names
df['country'].value_counts()

Norway                              460
Latvia                              459
United States                       450
Canada                              449
Russia                              448
Denmark                             448
China                               448
Israel                              444
Switzerland                         441
Liechtenstein                       441
Qatar                               441
Chile                               438
Mexico                              438
Lithuania                           436
Slovenia                            436
Czechia                             436
Germany                             436
Italy                               436
Estonia                             436
Hungary                             435
Poland                              435
Greece                              435
France                              435
Portugal                            435
Romania                             435


All country names are accurate and the format is consistent

In [23]:
# Date
# Number of rows recorded for each date
df['date'].value_counts()

2021-08-28    220
2021-08-17    220
2021-08-26    220
2021-08-25    220
2021-08-24    220
2021-08-23    220
2021-08-22    220
2021-08-21    220
2021-08-20    220
2021-08-18    220
2021-08-16    220
2021-08-29    220
2021-08-15    220
2021-08-14    220
2021-08-13    220
2021-08-12    220
2021-08-11    220
2021-08-10    220
2021-08-09    220
2021-08-08    220
2021-08-27    220
2021-08-19    220
2021-07-30    219
2021-07-24    219
2021-07-29    219
2021-08-30    219
2021-08-31    219
2021-09-01    219
2021-07-16    219
2021-07-17    219
2021-07-18    219
2021-07-19    219
2021-07-20    219
2021-07-21    219
2021-07-22    219
2021-07-23    219
2021-07-25    219
2021-07-26    219
2021-07-27    219
2021-07-28    219
2021-08-07    219
2021-08-06    219
2021-08-05    219
2021-08-04    219
2021-08-03    219
2021-08-02    219
2021-08-01    219
2021-07-31    219
2021-07-13    218
2021-06-21    218
2021-07-15    218
2021-07-14    218
2021-06-22    218
2021-07-12    218
2021-07-11    218
2021-06-23

Not all daily dates have the same counts. I might have to aggregate the daily dates into monthly summaries for each country.

# 3. Basic Descriptive Statistical Analysis

In [24]:
# Summary statistics (count, mean, std, min, quartiles, max), kept in df1 for reuse
df1 = df.describe()

In [25]:
# Display the summary statistics table
df1

Unnamed: 0,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million
count,81976.0,81976.0,81976.0,81976.0,81976.0,81976.0,81976.0,81976.0,81976.0
mean,21791280.0,8045972.0,5914203.0,113150.1,133933.8,39.131932,19.218864,15.487147,3327.513882
std,153381900.0,46989620.0,35901410.0,802359.6,782905.8,60.609306,28.312537,25.393017,3968.625418
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,934.0,0.0,0.0,0.0,674.0
50%,7423.5,0.0,0.0,0.0,7557.5,0.13,0.0,0.0,2146.0
75%,3628498.0,1830708.0,1102393.0,14016.5,44892.0,65.9,37.09,24.0,4788.0
max,3165486000.0,1269302000.0,1234540000.0,24741000.0,22424290.0,336.16,124.65,121.53,117497.0


In [26]:
# Copy the summary statistics table to the system clipboard
df1.to_clipboard()

# Export Data

In [27]:
# Write the cleaned data to the prepared-data folder, leaving out the index
clean_csv = os.path.join(path, '02_Data', 'Prepared_Data', 'covid_vaccinations_clean.csv')
df.to_csv(clean_csv, index=False)