# Plotting vaccines vs covid cases

## Setup

In [2]:
import pandas as pd 
import numpy as np 
import plotly.express as px
from LeafPlotlyTools import *
import datetime
from tqdm import tqdm
import math

In [3]:
grapher = Graph()

## Loading and preparing the data

In [4]:
df = pd.read_csv('data/country_covid_data.csv')

In [5]:
def interpret_date(value):
    year, month, day = value.split('-')

    return datetime.date(int(year), int(month), int(day))

In [6]:
df['date'] = df['date'].apply(interpret_date)

df = df.sort_values('date')

# df = df[df['date'] >= datetime.date(2020, 12, 8)]

df

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,gdp_per_cap_percentile
0,ARG,South America,Argentina,2020-01-01,,,,,,,...,0.6,191.032,5.50,16.2,27.7,,5.00,76.67,0.845,2.0
1,MEX,North America,Mexico,2020-01-01,,,,,,,...,2.5,152.783,13.06,6.9,21.4,87.847,1.38,75.05,0.779,2.0
2,ARG,South America,Argentina,2020-01-02,,,,,,,...,0.6,191.032,5.50,16.2,27.7,,5.00,76.67,0.845,2.0
3,MEX,North America,Mexico,2020-01-02,,,,,,,...,2.5,152.783,13.06,6.9,21.4,87.847,1.38,75.05,0.779,2.0
4,ARG,South America,Argentina,2020-01-03,,,,,,,...,0.6,191.032,5.50,16.2,27.7,,5.00,76.67,0.845,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81848,CHN,Asia,China,2021-05-07,102596.0,10.0,14.571,4846.0,0.0,0.143,...,0.7,261.899,9.74,1.9,48.4,,4.34,76.91,0.761,2.0
81849,UGA,Africa,Uganda,2021-05-07,42224.0,72.0,51.143,346.0,1.0,0.571,...,41.6,213.333,2.50,3.4,16.7,21.222,0.50,63.37,0.544,4.0
81850,CHL,South America,Chile,2021-05-07,1235778.0,6530.0,5361.857,27004.0,109.0,93.000,...,1.3,127.993,8.46,34.2,41.5,,2.11,80.18,0.851,2.0
81852,UKR,Europe,Ukraine,2021-05-07,2160809.0,8529.0,5248.429,47717.0,386.0,256.286,...,0.1,539.849,7.11,13.5,47.4,,8.80,72.06,0.779,3.0


In [7]:
def days_since_start(date):
    start = datetime.date(2020, 12, 8)
    difference = date - start

    return difference.days

In [8]:
df['num_days'] = df['date'].apply(days_since_start)

In [9]:
def gdp_stringifyer(value):
    option_dict = {
        1: "High",
        2: "Upper middle",
        3: "Lower middle",
        4: "Low"
    }

    try: return option_dict[int(value)]
    except: return np.nan

In [10]:
df['Country wealth'] = df['gdp_per_cap_percentile'].apply(gdp_stringifyer)

df.dropna(subset = ['Country wealth'])

df

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,gdp_per_cap_percentile,num_days,Country wealth
0,ARG,South America,Argentina,2020-01-01,,,,,,,...,5.50,16.2,27.7,,5.00,76.67,0.845,2.0,-342,Upper middle
1,MEX,North America,Mexico,2020-01-01,,,,,,,...,13.06,6.9,21.4,87.847,1.38,75.05,0.779,2.0,-342,Upper middle
2,ARG,South America,Argentina,2020-01-02,,,,,,,...,5.50,16.2,27.7,,5.00,76.67,0.845,2.0,-341,Upper middle
3,MEX,North America,Mexico,2020-01-02,,,,,,,...,13.06,6.9,21.4,87.847,1.38,75.05,0.779,2.0,-341,Upper middle
4,ARG,South America,Argentina,2020-01-03,,,,,,,...,5.50,16.2,27.7,,5.00,76.67,0.845,2.0,-340,Upper middle
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81848,CHN,Asia,China,2021-05-07,102596.0,10.0,14.571,4846.0,0.0,0.143,...,9.74,1.9,48.4,,4.34,76.91,0.761,2.0,150,Upper middle
81849,UGA,Africa,Uganda,2021-05-07,42224.0,72.0,51.143,346.0,1.0,0.571,...,2.50,3.4,16.7,21.222,0.50,63.37,0.544,4.0,150,Low
81850,CHL,South America,Chile,2021-05-07,1235778.0,6530.0,5361.857,27004.0,109.0,93.000,...,8.46,34.2,41.5,,2.11,80.18,0.851,2.0,150,Upper middle
81852,UKR,Europe,Ukraine,2021-05-07,2160809.0,8529.0,5248.429,47717.0,386.0,256.286,...,7.11,13.5,47.4,,8.80,72.06,0.779,3.0,150,Lower middle


In [11]:
list(df)

['iso_code',
 'continent',
 'location',
 'date',
 'total_cases',
 'new_cases',
 'new_cases_smoothed',
 'total_deaths',
 'new_deaths',
 'new_deaths_smoothed',
 'total_cases_per_million',
 'new_cases_per_million',
 'new_cases_smoothed_per_million',
 'total_deaths_per_million',
 'new_deaths_per_million',
 'new_deaths_smoothed_per_million',
 'reproduction_rate',
 'icu_patients',
 'icu_patients_per_million',
 'hosp_patients',
 'hosp_patients_per_million',
 'weekly_icu_admissions',
 'weekly_icu_admissions_per_million',
 'weekly_hosp_admissions',
 'weekly_hosp_admissions_per_million',
 'new_tests',
 'total_tests',
 'total_tests_per_thousand',
 'new_tests_per_thousand',
 'new_tests_smoothed',
 'new_tests_smoothed_per_thousand',
 'positive_rate',
 'tests_per_case',
 'tests_units',
 'total_vaccinations',
 'people_vaccinated',
 'people_fully_vaccinated',
 'new_vaccinations',
 'new_vaccinations_smoothed',
 'total_vaccinations_per_hundred',
 'people_vaccinated_per_hundred',
 'people_fully_vaccinate

In [12]:
plot_df = df[['num_days',
'total_vaccinations_per_hundred',
'new_cases_per_million',
'location',
'population',
'Country wealth'
]]#.dropna()

In [13]:
# plot_df['total_vaccinations_per_hundred'] = plot_df['total_vaccinations_per_hundred'].fillna(0)

# plot_df = plot_df.dropna()

In [14]:
plot_df#[plot_df['Country wealth']!='High']

Unnamed: 0,num_days,total_vaccinations_per_hundred,new_cases_per_million,location,population,Country wealth
0,-342,,,Argentina,4.519578e+07,Upper middle
1,-342,,,Mexico,1.289328e+08,Upper middle
2,-341,,,Argentina,4.519578e+07,Upper middle
3,-341,,,Mexico,1.289328e+08,Upper middle
4,-340,,,Argentina,4.519578e+07,Upper middle
...,...,...,...,...,...,...
81848,150,21.41,0.007,China,1.439324e+09,Upper middle
81849,150,,1.574,Uganda,4.574100e+07,Low
81850,150,,341.595,Chile,1.911621e+07,Upper middle
81852,150,1.96,195.021,Ukraine,4.373376e+07,Lower middle


In [15]:
plot_df['Country wealth'].drop_duplicates()

0      Upper middle
14     Lower middle
55              NaN
80             High
117             Low
Name: Country wealth, dtype: object

This code goes through the countries and fills in a any days where the vaccines might not be reported and puts the number at 0 if it is nan

In [14]:
new_data = []
countries = plot_df['location'].sort_values().drop_duplicates()
data_start_date = plot_df['num_days'].min()
data_end_date = plot_df['num_days'].max()
for country in tqdm(countries):
    df_temp = plot_df[plot_df['location'] == country].sort_values('num_days')

    day = data_start_date.copy()
    vaccines_started = False
    while day <= data_end_date:
        try:
            row = df_temp[df_temp['num_days']==day].iloc[0] # I know there is a better way to do this but cba to find it

            if row['total_vaccinations_per_hundred'] > 0:
                vaccines_started = True
                
            if math.isnan(row['total_vaccinations_per_hundred']):
                if vaccines_started:
                    row['total_vaccinations_per_hundred'] = last_row['total_vaccinations_per_hundred']
                else:
                    row['total_vaccinations_per_hundred'] = 0
        except:
            row = last_row.copy()
            row['num_days'] = day

        new_data.append(list(row))

        day += 1
        last_row = row.copy()
        
plot_df_filled = pd.DataFrame(new_data, columns = list(plot_df)).sort_values('num_days')

plot_df_filled

100%|██████████| 208/208 [01:16<00:00,  2.71it/s]


Unnamed: 0,num_days,total_vaccinations_per_hundred,new_cases_per_million,location,population,Country wealth
2612,-342,0.00,,Argentina,45195777.0,Upper middle
47034,-342,0.00,,Mexico,128932753.0,Upper middle
47035,-341,0.00,,Mexico,128932753.0,Upper middle
2613,-341,0.00,,Argentina,45195777.0,Upper middle
2614,-340,0.00,,Argentina,45195777.0,Upper middle
...,...,...,...,...,...,...
66198,142,29.38,441.573,Slovenia,2078932.0,High
25507,142,6.25,5.578,Fiji,896444.0,Lower middle
21962,142,4.88,171.569,Ecuador,17643060.0,Lower middle
80738,142,3.21,2.960,Zimbabwe,14862927.0,Low


In [15]:
plot_df_filled = plot_df_filled.dropna()

plot_df_filled = plot_df_filled[plot_df_filled['num_days'] >= 0]

plot_df_filled

Unnamed: 0,num_days,total_vaccinations_per_hundred,new_cases_per_million,location,population,Country wealth
78934,0,0.00,16.880,Venezuela,28435943.0,Upper middle
3871,0,0.00,0.235,Australia,25499881.0,High
55587,0,0.00,264.828,Palestine,5101416.0,Lower middle
9032,0,0.00,16.191,Bolivia,11673029.0,Lower middle
57765,0,0.00,12.648,Philippines,109581085.0,Lower middle
...,...,...,...,...,...,...
66398,142,0.71,0.000,Solomon Islands,686878.0,Low
66198,142,29.38,441.573,Slovenia,2078932.0,High
25507,142,6.25,5.578,Fiji,896444.0,Lower middle
21962,142,4.88,171.569,Ecuador,17643060.0,Lower middle


## Creating the figure

Figuring out the colour order

In [16]:
color_dict = {
    "High": "#73a2ab",
    "Upper middle": "#ccdee0",
    "Lower middle": "#e2b6c2",
    "Low": "#bc586e"
}

# label_order = plot_df_filled['Country wealth'].drop_duplicates()

# colour_list = []
# for i in label_order:
#     colour_list.append(color_dict[i])

# colour_list

In [17]:
plot_df_filled['total_vaccinations_per_hundred'].max()

127.66

In [18]:
fig = px.scatter(plot_df_filled, 
            x="total_vaccinations_per_hundred", 
            y="new_cases_per_million", 
            labels={
                     "total_vaccinations_per_hundred": "Total vaccinations per hundred",
                     "new_cases_per_million": "New cases per million",
                     "num_days": "Days since first vaccine",
                 }, 
            color_discrete_map=color_dict,
            animation_frame="num_days", 
            animation_group="location",
           size="population", 
           color="Country wealth", 
           hover_name="location",
        #    log_y=True, 
           size_max=200, 
        #    size_min=5, 
           range_x=[-5,plot_df_filled['total_vaccinations_per_hundred'].max()], 
           range_y=[0,1100],
           )

fig

In [19]:
plot_df_filled['new_cases_per_million'].max()

3216.5690000000004

In [20]:
# Using the style_graph function and passing in varibles
fig = grapher.style_graph(fig,
                        title=False,
                        subtitle='',
                        x_axis_title="Total vaccinations per hundred",
                        y_axis_title='New cases per million',
                        data_source="",
                        data_source_position=[0,-0.16], # [x, y]
                        watermark=None, # "left" or "right"
                        watermark_position = [0,-0.16], # [x, y]
                        legend_position = [0,1], # [x, y]
                        ytozero = True, # Y-xis to zero True or False
                        xhovermode = False, # x-axis hover mode. This means y value hover text always appears regardless if your mouse is over the value or not
                        xaxis_tickangle = 0 # xaxis text angle in degrees of rotation clockwise
                       )
# fig.update_layout(
#     margin=dict(t=30, b=150),
# )

# fig.update_layout(yaxis_range=[0, 4])

# fig.update_layout(
#     yaxis = dict(
#         tickmode = 'array',
#         tickvals = [10,50, 100, 500, 1000, 3000, 5000],
#     )
# )

# fig.update_yaxes(type="log")

fig.update_layout(annotations=[])

fig.update_layout(legend=dict(
    orientation="v",
    yanchor="top",
    y=1,
    xanchor="right",
    x=1
))

# Showing figure and passing in the config setting from the grapher object. 
# Config settings determine the elements of the interactive options in the top right bar
fig.show(config = grapher.get_config())

In [21]:
grapher.save_graph_html(fig, 'figures/animated_plot.html')