In [1]:
# Import the required libraries and set the plotting options.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import date

sns.set(rc = {'figure.figsize': (15,10)})

In [2]:
# Import necessary files onto Python for analysis purposes.
covid_cases = pd.read_csv("covid_19_uk_cases.csv")
vaccination_rates = pd.read_csv("covid_19_uk_vaccinated.csv")
tweets = pd.read_csv("tweets.csv")

## Exploring the Data

### 1. covid_cases

In [4]:
# Printing first 5 rows.
covid_cases.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,ISO 3166-1 Alpha 3-Codes,Sub-region Name,Intermediate Region Code,Date,Deaths,Cases,Recovered,Hospitalised
0,Anguilla,United Kingdom,18.2206,-63.0686,AIA,Latin America and the Caribbean,29,2020-01-22,0.0,0.0,0.0,0.0
1,Anguilla,United Kingdom,18.2206,-63.0686,AIA,Latin America and the Caribbean,29,2020-01-23,0.0,0.0,0.0,0.0
2,Anguilla,United Kingdom,18.2206,-63.0686,AIA,Latin America and the Caribbean,29,2020-01-24,0.0,0.0,0.0,0.0
3,Anguilla,United Kingdom,18.2206,-63.0686,AIA,Latin America and the Caribbean,29,2020-01-25,0.0,0.0,0.0,0.0
4,Anguilla,United Kingdom,18.2206,-63.0686,AIA,Latin America and the Caribbean,29,2020-01-26,0.0,0.0,0.0,0.0


In [5]:
# Printing last 5 rows of data.
covid_cases.tail()

Unnamed: 0,Province/State,Country/Region,Lat,Long,ISO 3166-1 Alpha 3-Codes,Sub-region Name,Intermediate Region Code,Date,Deaths,Cases,Recovered,Hospitalised
7579,Others,United Kingdom,55.3781,-3.436,GBR,Northern Europe,0,2021-10-10,137735.0,8154306.0,0.0,378.0
7580,Others,United Kingdom,55.3781,-3.436,GBR,Northern Europe,0,2021-10-11,137763.0,8193769.0,0.0,386.0
7581,Others,United Kingdom,55.3781,-3.436,GBR,Northern Europe,0,2021-10-12,137944.0,8231437.0,0.0,386.0
7582,Others,United Kingdom,55.3781,-3.436,GBR,Northern Europe,0,2021-10-13,138080.0,8272883.0,0.0,0.0
7583,Others,United Kingdom,55.3781,-3.436,GBR,Northern Europe,0,2021-10-14,138237.0,8317439.0,0.0,0.0


In [3]:
# Determining shape, data type, and concise summary of covid_cases data set.
print(covid_cases.shape)
print(covid_cases.dtypes)
# info() to determine the exact columns where we have missing data.
print(covid_cases.info())

(7584, 12)
Province/State               object
Country/Region               object
Lat                         float64
Long                        float64
ISO 3166-1 Alpha 3-Codes     object
Sub-region Name              object
Intermediate Region Code      int64
Date                         object
Deaths                      float64
Cases                       float64
Recovered                   float64
Hospitalised                float64
dtype: object
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7584 entries, 0 to 7583
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Province/State            7584 non-null   object 
 1   Country/Region            7584 non-null   object 
 2   Lat                       7584 non-null   float64
 3   Long                      7584 non-null   float64
 4   ISO 3166-1 Alpha 3-Codes  7584 non-null   object 
 5   Sub-region Name           7584 non-null   obje

In [7]:
# Number of rows with missing data.
covid_cases_na = covid_cases[covid_cases.isna().any(axis=1)]
# Printing shape of new dataframe, covid_cases_na.
print(covid_cases_na.shape)
# Printing rows with missing data.
covid_cases_na

(2, 12)


Unnamed: 0,Province/State,Country/Region,Lat,Long,ISO 3166-1 Alpha 3-Codes,Sub-region Name,Intermediate Region Code,Date,Deaths,Cases,Recovered,Hospitalised
875,Bermuda,United Kingdom,32.3078,-64.7505,BMU,Northern America,0,2020-09-21,,,,
876,Bermuda,United Kingdom,32.3078,-64.7505,BMU,Northern America,0,2020-09-22,,,,


### 2. vaccination_rates

In [8]:
# Printing first 5 rows of vaccination_rates data set.
vaccination_rates.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,ISO 3166-1 Alpha 3-Codes,Sub-region Name,Intermediate Region Code,Date,Vaccinated,First Dose,Second Dose
0,Anguilla,United Kingdom,18.2206,-63.0686,AIA,Latin America and the Caribbean,29,2020-01-22,0,0,0
1,Anguilla,United Kingdom,18.2206,-63.0686,AIA,Latin America and the Caribbean,29,2020-01-23,0,0,0
2,Anguilla,United Kingdom,18.2206,-63.0686,AIA,Latin America and the Caribbean,29,2020-01-24,0,0,0
3,Anguilla,United Kingdom,18.2206,-63.0686,AIA,Latin America and the Caribbean,29,2020-01-25,0,0,0
4,Anguilla,United Kingdom,18.2206,-63.0686,AIA,Latin America and the Caribbean,29,2020-01-26,0,0,0


In [9]:
# Printing last 5 rows of vaccination_rates data set.
vaccination_rates.tail()

Unnamed: 0,Province/State,Country/Region,Lat,Long,ISO 3166-1 Alpha 3-Codes,Sub-region Name,Intermediate Region Code,Date,Vaccinated,First Dose,Second Dose
7579,Others,United Kingdom,55.3781,-3.436,GBR,Northern Europe,0,2021-10-10,1070,1216,1070
7580,Others,United Kingdom,55.3781,-3.436,GBR,Northern Europe,0,2021-10-11,1300,1604,1300
7581,Others,United Kingdom,55.3781,-3.436,GBR,Northern Europe,0,2021-10-12,1482,2027,1482
7582,Others,United Kingdom,55.3781,-3.436,GBR,Northern Europe,0,2021-10-13,0,0,0
7583,Others,United Kingdom,55.3781,-3.436,GBR,Northern Europe,0,2021-10-14,0,0,0


In [10]:
# Determining shape, data type, and concise summary of vaccination_rates data set.
print(vaccination_rates.shape)
print(vaccination_rates.dtypes)
print(vaccination_rates.info())

(7584, 11)
Province/State               object
Country/Region               object
Lat                         float64
Long                        float64
ISO 3166-1 Alpha 3-Codes     object
Sub-region Name              object
Intermediate Region Code      int64
Date                         object
Vaccinated                    int64
First Dose                    int64
Second Dose                   int64
dtype: object
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7584 entries, 0 to 7583
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Province/State            7584 non-null   object 
 1   Country/Region            7584 non-null   object 
 2   Lat                       7584 non-null   float64
 3   Long                      7584 non-null   float64
 4   ISO 3166-1 Alpha 3-Codes  7584 non-null   object 
 5   Sub-region Name           7584 non-null   object 
 6   Intermediate Region Code  7

In [11]:
# Number of rows with missing data.
vaccination_rates_na = vaccination_rates[vaccination_rates.isna().any(axis=1)]
# Printing the shape of vaccination_rates_na, and the vaccination_rates_na dataframe.
print(vaccination_rates_na.shape)
vaccination_rates_na

(0, 11)


Unnamed: 0,Province/State,Country/Region,Lat,Long,ISO 3166-1 Alpha 3-Codes,Sub-region Name,Intermediate Region Code,Date,Vaccinated,First Dose,Second Dose


## Merging and exploring data

### 1. Merging covid_cases and vaccination_rates together

In [12]:
# Merging datasets. 
cov_vacc_merge = pd.merge(covid_cases, vaccination_rates)

In [13]:
# Prints first two rows of merged dataset.
cov_vacc_merge.head(2)

Unnamed: 0,Province/State,Country/Region,Lat,Long,ISO 3166-1 Alpha 3-Codes,Sub-region Name,Intermediate Region Code,Date,Deaths,Cases,Recovered,Hospitalised,Vaccinated,First Dose,Second Dose
0,Anguilla,United Kingdom,18.2206,-63.0686,AIA,Latin America and the Caribbean,29,2020-01-22,0.0,0.0,0.0,0.0,0,0,0
1,Anguilla,United Kingdom,18.2206,-63.0686,AIA,Latin America and the Caribbean,29,2020-01-23,0.0,0.0,0.0,0.0,0,0,0


In [14]:
# Prints last two rows of merged dataset.
cov_vacc_merge.tail(2)

Unnamed: 0,Province/State,Country/Region,Lat,Long,ISO 3166-1 Alpha 3-Codes,Sub-region Name,Intermediate Region Code,Date,Deaths,Cases,Recovered,Hospitalised,Vaccinated,First Dose,Second Dose
7582,Others,United Kingdom,55.3781,-3.436,GBR,Northern Europe,0,2021-10-13,138080.0,8272883.0,0.0,0.0,0,0,0
7583,Others,United Kingdom,55.3781,-3.436,GBR,Northern Europe,0,2021-10-14,138237.0,8317439.0,0.0,0.0,0,0,0


In [15]:
# Prints datatypes of the columns in merged dataset, the shape and concise info on merged dataset.
print(cov_vacc_merge.dtypes)
print(cov_vacc_merge.shape)
print(cov_vacc_merge.info())

Province/State               object
Country/Region               object
Lat                         float64
Long                        float64
ISO 3166-1 Alpha 3-Codes     object
Sub-region Name              object
Intermediate Region Code      int64
Date                         object
Deaths                      float64
Cases                       float64
Recovered                   float64
Hospitalised                float64
Vaccinated                    int64
First Dose                    int64
Second Dose                   int64
dtype: object
(7584, 15)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7584 entries, 0 to 7583
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Province/State            7584 non-null   object 
 1   Country/Region            7584 non-null   object 
 2   Lat                       7584 non-null   float64
 3   Long                      7584 non-null   float6

In [16]:
# Changing data type of Date column from object to DateTime.
cov_vacc_merge['Date'] = pd.to_datetime(cov_vacc_merge['Date'])

print(cov_vacc_merge.dtypes)

Province/State                      object
Country/Region                      object
Lat                                float64
Long                               float64
ISO 3166-1 Alpha 3-Codes            object
Sub-region Name                     object
Intermediate Region Code             int64
Date                        datetime64[ns]
Deaths                             float64
Cases                              float64
Recovered                          float64
Hospitalised                       float64
Vaccinated                           int64
First Dose                           int64
Second Dose                          int64
dtype: object


In [17]:
# Dropping unnecessary columns from merged DataFrame. 
cov_vacc_drop = cov_vacc_merge.drop(columns=['Lat', 'Long', 
                                             'ISO 3166-1 Alpha 3-Codes',
                                             'Sub-region Name', 'Intermediate Region Code'])

cov_vacc_drop.head()

Unnamed: 0,Province/State,Country/Region,Date,Deaths,Cases,Recovered,Hospitalised,Vaccinated,First Dose,Second Dose
0,Anguilla,United Kingdom,2020-01-22,0.0,0.0,0.0,0.0,0,0,0
1,Anguilla,United Kingdom,2020-01-23,0.0,0.0,0.0,0.0,0,0,0
2,Anguilla,United Kingdom,2020-01-24,0.0,0.0,0.0,0.0,0,0,0
3,Anguilla,United Kingdom,2020-01-25,0.0,0.0,0.0,0.0,0,0,0
4,Anguilla,United Kingdom,2020-01-26,0.0,0.0,0.0,0.0,0,0,0


In [21]:
# Checking for missing datasets.
covid_vacc_drop_na = cov_vacc_drop[cov_vacc_drop.isna().any(axis=1)]
# Printing the shape of vaccination_rates_na, and the vaccination_rates_na dataframe.
print(covid_vacc_drop_na.shape)
covid_vacc_drop_na

(2, 10)


Unnamed: 0,Province/State,Country/Region,Date,Deaths,Cases,Recovered,Hospitalised,Vaccinated,First Dose,Second Dose
875,Bermuda,United Kingdom,2020-09-21,,,,,0,0,0
876,Bermuda,United Kingdom,2020-09-22,,,,,0,0,0


### 2. Analysing merged datasets

In [25]:
# Grouping data in merged dataset by Province/State.
per_region = cov_vacc_merge.groupby('Province/State')

# Finding the total number of cases per Country/Region.
cases = per_region.apply(lambda x: x[x['Country/Region']=='United Kingdom']['Cases'].sum())
# Sorting the total from highest to lowest cases.
cases.sort_values(ascending=False)

Province/State
Others                                          1.621651e+09
Channel Islands                                 1.957978e+06
Gibraltar                                       1.413853e+06
Isle of Man                                     8.871330e+05
Turks and Caicos Islands                        7.526180e+05
Bermuda                                         6.854420e+05
British Virgin Islands                          2.849610e+05
Cayman Islands                                  2.177560e+05
Anguilla                                        3.531500e+04
Falkland Islands (Malvinas)                     2.048200e+04
Montserrat                                      9.556000e+03
Saint Helena, Ascension and Tristan da Cunha    1.438000e+03
dtype: float64

In [27]:
# Sum of deaths per region sorted from highest to lowest.
deaths = per_region.apply(lambda x: x[x['Country/Region']=='United Kingdom']['Deaths'].sum())

deaths.sort_values(ascending=False)

Province/State
Others                                          46987145.0
Channel Islands                                    37130.0
Gibraltar                                          25412.0
Isle of Man                                        15051.0
Bermuda                                            10353.0
Turks and Caicos Islands                            5612.0
British Virgin Islands                              3573.0
Cayman Islands                                       911.0
Montserrat                                           539.0
Anguilla                                              24.0
Saint Helena, Ascension and Tristan da Cunha           4.0
Falkland Islands (Malvinas)                            0.0
dtype: float64

In [28]:
# Sum of recoveries per region sorted from highest to lowest.
recovered = per_region.apply(lambda x: x[x['Country/Region']=='United Kingdom']['Recovered'].sum())

recovered.sort_values(ascending=False)

Province/State
Channel Islands                                 1027626.0
Gibraltar                                        956103.0
Turks and Caicos Islands                         515923.0
Bermuda                                          363999.0
Isle of Man                                      328319.0
Cayman Islands                                   152052.0
British Virgin Islands                            64359.0
Falkland Islands (Malvinas)                       14754.0
Anguilla                                          12708.0
Montserrat                                         6376.0
Others                                             4115.0
Saint Helena, Ascension and Tristan da Cunha       1135.0
dtype: float64

In [29]:
# Sum of first doses per region, sorted from highest to lowest.
first_dose = per_region.apply(lambda x: x[x['Country/Region']=='United Kingdom']['First Dose'].sum())

first_dose.sort_values(ascending=False)

Province/State
Gibraltar                                       5870786
Montserrat                                      5401128
British Virgin Islands                          5166303
Anguilla                                        4931470
Isle of Man                                     4226984
Falkland Islands (Malvinas)                     3757307
Cayman Islands                                  3522476
Channel Islands                                 3287646
Turks and Caicos Islands                        3052822
Bermuda                                         2817981
Others                                          2583151
Saint Helena, Ascension and Tristan da Cunha    2348310
dtype: int64

In [30]:
# Sum of second doses per region, sorted from highest to lowest.
second_dose = per_region.apply(lambda x: x[x['Country/Region']=='United Kingdom']['Vaccinated'].sum())

second_dose.sort_values(ascending=False)

Province/State
Gibraltar                                       5606041
Montserrat                                      5157560
British Virgin Islands                          4933315
Anguilla                                        4709072
Isle of Man                                     4036345
Falkland Islands (Malvinas)                     3587869
Cayman Islands                                  3363624
Channel Islands                                 3139385
Turks and Caicos Islands                        2915136
Bermuda                                         2690908
Others                                          2466669
Saint Helena, Ascension and Tristan da Cunha    2242421
dtype: int64

In [31]:
# Creating a new column which shows the difference between second vaccination and first vaccination per region.
cov_vacc_drop['Vaccination Difference'] = cov_vacc_drop['First Dose'] - cov_vacc_drop['Second Dose']

cov_vacc_drop.tail()

Unnamed: 0,Province/State,Country/Region,Date,Deaths,Cases,Recovered,Hospitalised,Vaccinated,First Dose,Second Dose,Vaccination Difference
7579,Others,United Kingdom,2021-10-10,137735.0,8154306.0,0.0,378.0,1070,1216,1070,146
7580,Others,United Kingdom,2021-10-11,137763.0,8193769.0,0.0,386.0,1300,1604,1300,304
7581,Others,United Kingdom,2021-10-12,137944.0,8231437.0,0.0,386.0,1482,2027,1482,545
7582,Others,United Kingdom,2021-10-13,138080.0,8272883.0,0.0,0.0,0,0,0,0
7583,Others,United Kingdom,2021-10-14,138237.0,8317439.0,0.0,0.0,0,0,0,0


In [32]:
# Displaying the total number of people who have had first doses but not second doses in each region.
region = cov_vacc_drop.groupby('Province/State')
Difference = region.apply(lambda x: x[x['Country/Region']=='United Kingdom']['Vaccination Difference'].sum())

Difference.sort_values(ascending=False)

Province/State
Gibraltar                                       264745
Montserrat                                      243568
British Virgin Islands                          232988
Anguilla                                        222398
Isle of Man                                     190639
Falkland Islands (Malvinas)                     169438
Cayman Islands                                  158852
Channel Islands                                 148261
Turks and Caicos Islands                        137686
Bermuda                                         127073
Others                                          116482
Saint Helena, Ascension and Tristan da Cunha    105889
dtype: int64

## Gibraltar analysis

In [34]:
# Allows all rows to be printed if whole DataFrame needs to be displayed.
pd.set_option("display.max_rows", None)

In [35]:
# Filtering only rows containing Gibraltar from cov_vacc_drop DataFrame.
Gibraltar = cov_vacc_drop[cov_vacc_drop['Province/State']=='Gibraltar']
Gibraltar

Unnamed: 0,Province/State,Country/Region,Date,Deaths,Cases,Recovered,Hospitalised,Vaccinated,First Dose,Second Dose,Vaccination Difference
3792,Gibraltar,United Kingdom,2020-01-22,0.0,0.0,0.0,0.0,0,0,0,0
3793,Gibraltar,United Kingdom,2020-01-23,0.0,0.0,0.0,0.0,0,0,0,0
3794,Gibraltar,United Kingdom,2020-01-24,0.0,0.0,0.0,0.0,0,0,0,0
3795,Gibraltar,United Kingdom,2020-01-25,0.0,0.0,0.0,0.0,0,0,0,0
3796,Gibraltar,United Kingdom,2020-01-26,0.0,0.0,0.0,0.0,0,0,0,0
3797,Gibraltar,United Kingdom,2020-01-27,0.0,0.0,0.0,0.0,0,0,0,0
3798,Gibraltar,United Kingdom,2020-01-28,0.0,0.0,0.0,0.0,0,0,0,0
3799,Gibraltar,United Kingdom,2020-01-29,0.0,0.0,0.0,0.0,0,0,0,0
3800,Gibraltar,United Kingdom,2020-01-30,0.0,0.0,0.0,0.0,0,0,0,0
3801,Gibraltar,United Kingdom,2020-01-31,0.0,0.0,0.0,0.0,0,0,0,0


In [36]:
# Creating a data set with only death, cases, recovery, hospitalisation and vaccination difference.
Gibraltar_new = Gibraltar.drop(columns=['Province/State', 'Country/Region', 
                                        'Date', 'Vaccinated',
                                        'First Dose', 'Second Dose'])

Gibraltar_new.tail()

Unnamed: 0,Deaths,Cases,Recovered,Hospitalised,Vaccination Difference
4419,97.0,5626.0,0.0,858.0,332
4420,97.0,5655.0,0.0,876.0,692
4421,97.0,5682.0,0.0,876.0,1238
4422,97.0,5707.0,0.0,0.0,0
4423,97.0,5727.0,0.0,0.0,0


In [37]:
# Descriptive statistics for Gibraltar data set.
Gibraltar_new.describe()

Unnamed: 0,Deaths,Cases,Recovered,Hospitalised,Vaccination Difference
count,632.0,632.0,632.0,632.0,632.0
mean,40.208861,2237.109177,1512.821203,1027.625,418.900316
std,45.332832,2136.26809,1817.096755,1145.681058,18579.415986
min,0.0,0.0,0.0,0.0,-49858.0
25%,0.0,177.0,109.5,157.75,-3514.5
50%,5.0,1036.5,323.5,675.5,0.0
75%,94.0,4286.0,4122.5,1548.0,0.0
max,97.0,5727.0,4670.0,4907.0,82541.0
