# Exploring Our World in Data  COVID-19 data

Imports

In [97]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [98]:
import datetime


To seperate out the countries in this data we need country iso-3 codes. There is a data set of this all [here](https://www.iban.com/country-codes). Luckily we can download the data in one line of code with pandas' read_html function.

In [99]:
# country_codes = pd.read_html('https://www.iban.com/country-codes')[0]

# As I have already downloaded and saved the data I can now just load it in with this code
country_codes = pd.read_csv('data/country_codes.csv')

In [100]:
country_codes

Unnamed: 0,Country,Alpha-2 code,Alpha-3 code,Numeric
0,Afghanistan,AF,AFG,4
1,Åland Islands,AX,ALA,248
2,Albania,AL,ALB,8
3,Algeria,DZ,DZA,12
4,American Samoa,AS,ASM,16
...,...,...,...,...
244,Wallis and Futuna,WF,WLF,876
245,Western Sahara,EH,ESH,732
246,Yemen,YE,YEM,887
247,Zambia,ZM,ZMB,894


Lets save this for future use and incase the website ever goes down.

In [101]:
country_codes.to_csv('data/country_codes.csv', index=False)

Using the github download link we can pull the latest OWID COVID data straight from github

In [102]:
data_link = 'https://github.com/owid/covid-19-data/raw/master/public/data/owid-covid-data.csv'

df = pd.read_csv(data_link)

df

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality
0,AFG,Asia,Afghanistan,2020-02-24,1.0,1.0,,,,,...,,597.029,9.59,,,37.746,0.5,64.83,0.511,
1,AFG,Asia,Afghanistan,2020-02-25,1.0,0.0,,,,,...,,597.029,9.59,,,37.746,0.5,64.83,0.511,
2,AFG,Asia,Afghanistan,2020-02-26,1.0,0.0,,,,,...,,597.029,9.59,,,37.746,0.5,64.83,0.511,
3,AFG,Asia,Afghanistan,2020-02-27,1.0,0.0,,,,,...,,597.029,9.59,,,37.746,0.5,64.83,0.511,
4,AFG,Asia,Afghanistan,2020-02-28,1.0,0.0,,,,,...,,597.029,9.59,,,37.746,0.5,64.83,0.511,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95473,ZWE,Africa,Zimbabwe,2021-06-09,39432.0,111.0,57.286,1622.0,5.0,3.286,...,21.4,307.846,1.82,1.6,30.7,36.791,1.7,61.49,0.571,
95474,ZWE,Africa,Zimbabwe,2021-06-10,39496.0,64.0,57.714,1626.0,4.0,3.143,...,21.4,307.846,1.82,1.6,30.7,36.791,1.7,61.49,0.571,
95475,ZWE,Africa,Zimbabwe,2021-06-11,39688.0,192.0,77.714,1629.0,3.0,3.429,...,21.4,307.846,1.82,1.6,30.7,36.791,1.7,61.49,0.571,
95476,ZWE,Africa,Zimbabwe,2021-06-12,39852.0,164.0,97.714,1632.0,3.0,3.857,...,21.4,307.846,1.82,1.6,30.7,36.791,1.7,61.49,0.571,


In [103]:
def interpret_date(value):
    year, month, day = value.split('-')

    return datetime.date(int(year), int(month), int(day))

In [104]:
df['date'] = df['date'].apply(interpret_date)

df = df.sort_values('date')

In [105]:
df

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality
55550,MEX,North America,Mexico,2020-01-01,,,,,,,...,2.5,152.783,13.06,6.9,21.4,87.847,1.38,75.05,0.779,
3412,ARG,South America,Argentina,2020-01-01,,,,,,,...,0.6,191.032,5.50,16.2,27.7,,5.00,76.67,0.845,
3413,ARG,South America,Argentina,2020-01-02,,,,,,,...,0.6,191.032,5.50,16.2,27.7,,5.00,76.67,0.845,
55551,MEX,North America,Mexico,2020-01-02,,,,,,,...,2.5,152.783,13.06,6.9,21.4,87.847,1.38,75.05,0.779,
3414,ARG,South America,Argentina,2020-01-03,,,,,,,...,0.6,191.032,5.50,16.2,27.7,,5.00,76.67,0.845,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38026,HND,North America,Honduras,2021-06-13,247728.0,654.0,843.143,6631.0,25.0,21.714,...,16.0,240.208,7.21,2.0,,84.169,0.70,75.27,0.634,
89987,ARE,Asia,United Arab Emirates,2021-06-13,597986.0,1969.0,2130.714,1726.0,2.0,3.857,...,,317.840,17.26,1.2,37.4,,1.20,77.97,0.890,
37566,HTI,North America,Haiti,2021-06-13,16079.0,0.0,113.857,346.0,0.0,3.286,...,23.5,430.548,6.65,2.9,23.1,22.863,0.70,64.00,0.510,
39976,IND,Asia,India,2021-06-13,29510410.0,70421.0,85776.429,374305.0,3921.0,3588.429,...,21.2,282.280,10.39,1.9,20.6,59.550,0.53,69.66,0.645,


## Analysing the countries in the data set

In [106]:
country_iso_3_codes = list(country_codes['Alpha-3 code'])

In [107]:
country_df = df[df['iso_code'].isin(country_iso_3_codes)]

In [108]:
country_df

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality
55550,MEX,North America,Mexico,2020-01-01,,,,,,,...,2.5,152.783,13.06,6.9,21.4,87.847,1.38,75.05,0.779,
3412,ARG,South America,Argentina,2020-01-01,,,,,,,...,0.6,191.032,5.50,16.2,27.7,,5.00,76.67,0.845,
3413,ARG,South America,Argentina,2020-01-02,,,,,,,...,0.6,191.032,5.50,16.2,27.7,,5.00,76.67,0.845,
55551,MEX,North America,Mexico,2020-01-02,,,,,,,...,2.5,152.783,13.06,6.9,21.4,87.847,1.38,75.05,0.779,
3414,ARG,South America,Argentina,2020-01-03,,,,,,,...,0.6,191.032,5.50,16.2,27.7,,5.00,76.67,0.845,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38026,HND,North America,Honduras,2021-06-13,247728.0,654.0,843.143,6631.0,25.0,21.714,...,16.0,240.208,7.21,2.0,,84.169,0.70,75.27,0.634,
89987,ARE,Asia,United Arab Emirates,2021-06-13,597986.0,1969.0,2130.714,1726.0,2.0,3.857,...,,317.840,17.26,1.2,37.4,,1.20,77.97,0.890,
37566,HTI,North America,Haiti,2021-06-13,16079.0,0.0,113.857,346.0,0.0,3.286,...,23.5,430.548,6.65,2.9,23.1,22.863,0.70,64.00,0.510,
39976,IND,Asia,India,2021-06-13,29510410.0,70421.0,85776.429,374305.0,3921.0,3588.429,...,21.2,282.280,10.39,1.9,20.6,59.550,0.53,69.66,0.645,


This is a header with all the values I want for the data on the countries

In [109]:
country_data_header = ['iso_code',
 'continent',
 'location',
 'population',
 'population_density',
 'median_age',
 'aged_65_older',
 'aged_70_older',
 'gdp_per_capita',
 'extreme_poverty',
 'cardiovasc_death_rate',
 'diabetes_prevalence',
 'female_smokers',
 'male_smokers',
 'handwashing_facilities',
 'hospital_beds_per_thousand',
 'life_expectancy',
 'human_development_index']

Now we can pull these columns from the OWID data and drop all the duplicates

In [110]:
country_data = country_df.drop_duplicates(subset = 'location')[country_data_header].sort_values('gdp_per_capita', ascending=False).reset_index(drop=True).dropna(subset=['gdp_per_capita'])

country_data

Unnamed: 0,iso_code,continent,location,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index
0,QAT,Asia,Qatar,2881060.0,227.322,31.9,1.307,0.617,116935.600,,176.690,16.52,0.8,26.9,,1.20,80.23,0.848
1,MAC,Asia,Macao,649342.0,20546.766,39.2,9.798,4.991,104861.851,,,,,,,,84.24,
2,LUX,Europe,Luxembourg,625976.0,231.447,39.7,14.312,9.842,94277.965,0.2,128.275,4.42,20.9,26.0,,4.51,82.25,0.916
3,SGP,Asia,Singapore,5850343.0,7915.731,42.4,12.922,7.049,85535.383,,92.243,10.99,5.2,28.3,,2.40,83.62,0.938
4,BRN,Asia,Brunei,437483.0,81.347,32.4,4.591,2.382,71809.251,,201.285,12.79,2.0,30.9,,2.70,75.86,0.838
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187,NER,Africa,Niger,24206636.0,16.955,15.1,2.553,1.378,926.000,44.5,238.339,2.42,0.1,15.4,8.978,0.30,62.42,0.394
188,COD,Africa,Democratic Republic of Congo,89561404.0,35.879,17.0,3.020,1.745,808.133,77.1,318.949,6.10,,,4.472,,60.68,0.480
189,LBR,Africa,Liberia,5057677.0,49.127,19.2,3.057,1.756,752.788,38.6,272.509,2.42,1.5,18.1,1.188,0.80,64.10,0.480
190,BDI,Africa,Burundi,11890781.0,423.062,17.5,2.562,1.504,702.225,71.7,293.068,6.05,,,6.144,0.80,61.58,0.433


We now have a dataset with all our country specific data. To analyse this data by country gdp level we need to sort the dataset by gdp per capita and seperate it out into four sections; 1, 2, 3, 4 where 1 is the top perctile etc

In [111]:
gdp_percentile_list = []
number_of_countries = len(country_data)
number_of_sections = 4
section = 1
for index, row in country_data.iterrows():
    if index < (section/number_of_sections)*number_of_countries:
        gdp_percentile_list.append(section)
    else:
        section += 1
        gdp_percentile_list.append(section)

In [112]:
country_data['gdp_per_cap_percentile'] = gdp_percentile_list

country_data

Unnamed: 0,iso_code,continent,location,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,gdp_per_cap_percentile
0,QAT,Asia,Qatar,2881060.0,227.322,31.9,1.307,0.617,116935.600,,176.690,16.52,0.8,26.9,,1.20,80.23,0.848,1
1,MAC,Asia,Macao,649342.0,20546.766,39.2,9.798,4.991,104861.851,,,,,,,,84.24,,1
2,LUX,Europe,Luxembourg,625976.0,231.447,39.7,14.312,9.842,94277.965,0.2,128.275,4.42,20.9,26.0,,4.51,82.25,0.916,1
3,SGP,Asia,Singapore,5850343.0,7915.731,42.4,12.922,7.049,85535.383,,92.243,10.99,5.2,28.3,,2.40,83.62,0.938,1
4,BRN,Asia,Brunei,437483.0,81.347,32.4,4.591,2.382,71809.251,,201.285,12.79,2.0,30.9,,2.70,75.86,0.838,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187,NER,Africa,Niger,24206636.0,16.955,15.1,2.553,1.378,926.000,44.5,238.339,2.42,0.1,15.4,8.978,0.30,62.42,0.394,4
188,COD,Africa,Democratic Republic of Congo,89561404.0,35.879,17.0,3.020,1.745,808.133,77.1,318.949,6.10,,,4.472,,60.68,0.480,4
189,LBR,Africa,Liberia,5057677.0,49.127,19.2,3.057,1.756,752.788,38.6,272.509,2.42,1.5,18.1,1.188,0.80,64.10,0.480,4
190,BDI,Africa,Burundi,11890781.0,423.062,17.5,2.562,1.504,702.225,71.7,293.068,6.05,,,6.144,0.80,61.58,0.433,4


In [113]:
country_data.to_csv('data/country_data.csv', index=False)

Testing that worked okay. Each of the percentile sections should have the same number of countries in them

In [114]:
for option in country_data['gdp_per_cap_percentile'].drop_duplicates():
    print(option, len(country_data[country_data['gdp_per_cap_percentile'] == option]))

1 48
2 48
3 48
4 48


Finally lets apply this back to the country df, which is the dataset with all the country specific coviid data

In [115]:
gdp_percentile_dict = country_data.set_index('iso_code').to_dict('dict')['gdp_per_cap_percentile']

In [116]:
def get_gdp_percentile(iso_code):
    try: return int(round(gdp_percentile_dict[iso_code]))
    except: return np.nan

In [117]:
country_df['gdp_per_cap_percentile'] = country_df['iso_code'].apply(get_gdp_percentile)

country_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality,gdp_per_cap_percentile
55550,MEX,North America,Mexico,2020-01-01,,,,,,,...,152.783,13.06,6.9,21.4,87.847,1.38,75.05,0.779,,2.0
3412,ARG,South America,Argentina,2020-01-01,,,,,,,...,191.032,5.50,16.2,27.7,,5.00,76.67,0.845,,2.0
3413,ARG,South America,Argentina,2020-01-02,,,,,,,...,191.032,5.50,16.2,27.7,,5.00,76.67,0.845,,2.0
55551,MEX,North America,Mexico,2020-01-02,,,,,,,...,152.783,13.06,6.9,21.4,87.847,1.38,75.05,0.779,,2.0
3414,ARG,South America,Argentina,2020-01-03,,,,,,,...,191.032,5.50,16.2,27.7,,5.00,76.67,0.845,,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38026,HND,North America,Honduras,2021-06-13,247728.0,654.0,843.143,6631.0,25.0,21.714,...,240.208,7.21,2.0,,84.169,0.70,75.27,0.634,,3.0
89987,ARE,Asia,United Arab Emirates,2021-06-13,597986.0,1969.0,2130.714,1726.0,2.0,3.857,...,317.840,17.26,1.2,37.4,,1.20,77.97,0.890,,1.0
37566,HTI,North America,Haiti,2021-06-13,16079.0,0.0,113.857,346.0,0.0,3.286,...,430.548,6.65,2.9,23.1,22.863,0.70,64.00,0.510,,4.0
39976,IND,Asia,India,2021-06-13,29510410.0,70421.0,85776.429,374305.0,3921.0,3588.429,...,282.280,10.39,1.9,20.6,59.550,0.53,69.66,0.645,,3.0


In [118]:
country_df.to_csv('data/country_covid_data.csv', index=False)

## Aggregating covid data

Finally we can look at the covid data by country gdp per capita percentile

In [119]:
covid_fields = [
 'total_cases',
 'new_cases',
 'new_cases_smoothed',
 'total_deaths',
 'new_deaths',
 'new_deaths_smoothed',
 'icu_patients',
 'hosp_patients',
 'weekly_icu_admissions',
 'weekly_hosp_admissions',
 'new_tests',
 'total_tests',
 'new_tests_smoothed',
 'positive_rate',
#  'tests_units',
 'total_vaccinations',
 'people_vaccinated',
 'people_fully_vaccinated',
 'new_vaccinations',
 'new_vaccinations_smoothed',
#  'population',
#  'aged_65_older',
#  'aged_70_older',
 ]

In [120]:
cummalitive_covid_fields = ['total_cases',
 'total_deaths',
 'total_tests',
 'total_vaccinations',
 'people_vaccinated',
 'people_fully_vaccinated']

changing_covid_fields = [
 'new_cases',
 'new_cases_smoothed',
 'new_deaths',
 'new_deaths_smoothed',
 'icu_patients',
 'hosp_patients',
 'weekly_icu_admissions',
 'weekly_hosp_admissions',
 'new_tests',
 'new_tests_smoothed',
 'positive_rate',
 'new_vaccinations']

In [121]:
percentile_options = list(country_df['gdp_per_cap_percentile'].drop_duplicates())

dates = list(country_df['date'].drop_duplicates())

In [122]:
countries = list(country_df['location'].drop_duplicates())

As the streamgraph uses total vaccines per date we need to fill in any dates for countries with missing data for that date

In [123]:
new_data = []
for country in tqdm(countries):
    df_temp = country_df[country_df['location'] == country]
    country_dates = list(df_temp['date'].drop_duplicates())

    country_data_started = False
    for date in dates:
        try:
            row = df_temp[df_temp['date']==date].iloc[0]
            country_data_started = True

            new_data.append(list(row))

            last_row = row.copy()
        except:
            if country_data_started:
                # print(country, 'Missing data on date:', date)

                row = last_row.copy()

                row['date'] = date

                for field in changing_covid_fields:
                    row[field] = 0

                new_data.append(list(row))

            else:
                pass

new_country_df = pd.DataFrame(new_data, columns = list(country_df))

100%|██████████| 218/218 [01:35<00:00,  2.28it/s]


In [124]:
country_df = new_country_df

Running test on data

In [125]:
for country in country_df['location'].drop_duplicates().sort_values():
    df_temp = country_df[country_df['location']==country]
    old_value = 0

    date_start = df_temp['date'].min()

    day = date_start

    # print('Data starts at:', day)
    for index, row in df_temp.iterrows():
        value = row['total_vaccinations']
        date = row['date']

        if date != day:
            print(country, 'missing date:', day + datetime.timedelta(days=1))

        if value < old_value:
            print(country, 'decreases on', date)

        old_value = value
        day = day + datetime.timedelta(days=1)

    last_date = dates[-1]
    date_end = df_temp['date'].max()

    if last_date != date_end:
        print(country, 'data ends in', date_end)

Agregating the data to the level of the percentile group

In [126]:
def prep_numbers(value):
    try: 
        return float(value)
    except: 
        print('Failed to prep', value)
        return np.nan

In [127]:
def gdp_stringifyer(value):
    option_dict = {
        1: "High",
        2: "Upper middle",
        3: "Lower middle",
        4: "Low"
    }

    try: return option_dict[int(value)]
    except: return np.nan

In [128]:
list(country_df['gdp_per_cap_percentile'].drop_duplicates().dropna().sort_values())

[1.0, 2.0, 3.0, 4.0]

In [129]:
country_df['gdp_per_cap_percentile'] = country_df['gdp_per_cap_percentile'].apply(gdp_stringifyer)

In [130]:
percentile_options = list(country_df['gdp_per_cap_percentile'].drop_duplicates().dropna().sort_values())

In [131]:
new_data = []
new_header = ['gdp_per_cap_percentile', 'date'] + covid_fields
total_vaccines_list = []
for percentile_option in tqdm(percentile_options):
    # print(percentile_option)
    percentile_option_df = country_df[country_df['gdp_per_cap_percentile'] == percentile_option].copy()

    countries_found = []
    for date in dates:
        new_row = [percentile_option, date].copy()
        percentile_option_date_df = percentile_option_df[percentile_option_df['date'] == date].copy()

        # print(list(percentile_option_date_df['date'].drop_duplicates()))

        countries_found_for_date = list(percentile_option_date_df['location'].drop_duplicates())

        for c in countries_found_for_date:
            if c not in countries_found:
                countries_found.append(c)

        for j in countries_found:
            if j not in countries_found_for_date:
                print(j, 'not found in', percentile_option, 'on date:', date)

        for field in covid_fields:
            try:
                new_value = percentile_option_date_df[field].sum()
            except:
                print(percentile_option_date_df)
                new_value = percentile_option_date_df[field].sum()


            new_row.append(new_value)

        # if len(df_temp_date) == 47: 
        new_data.append(new_row)

gdp_percentile_data = pd.DataFrame(new_data, columns = new_header)

100%|██████████| 4/4 [00:13<00:00,  3.43s/it]


In [132]:
gdp_percentile_data#.head(5)

Unnamed: 0,gdp_per_cap_percentile,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,icu_patients,hosp_patients,...,weekly_hosp_admissions,new_tests,total_tests,new_tests_smoothed,positive_rate,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,new_vaccinations_smoothed
0,High,2020-01-01,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000,0.000000e+00,0.0,0.0,0.0,0.0
1,High,2020-01-02,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000,0.000000e+00,0.0,0.0,0.0,0.0
2,High,2020-01-03,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000,0.000000e+00,0.0,0.0,0.0,0.0
3,High,2020-01-04,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000,0.000000e+00,0.0,0.0,0.0,0.0
4,High,2020-01-05,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000,0.000000e+00,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2119,Upper middle,2021-06-10,57075515.0,226610.0,177598.570,1497650.0,5577.0,4471.998,0.0,0.0,...,0.0,1059319.0,298328662.0,978512.0,0.919,1.131704e+09,807209420.0,327974350.0,24603905.0,21341777.0
2120,Upper middle,2021-06-11,57293532.0,218017.0,184192.143,1502802.0,5152.0,4627.287,0.0,0.0,...,0.0,72844.0,32283416.0,78793.0,0.000,1.163483e+09,194989847.0,108425773.0,22390812.0,20993205.0
2121,Upper middle,2021-06-12,57491720.0,198188.0,186199.999,1507394.0,4592.0,4677.144,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000,1.044817e+09,103655323.0,62044459.0,17403799.0,19926265.0
2122,Upper middle,2021-06-13,57636067.0,144347.0,186905.284,1510475.0,3081.0,4693.569,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000,1.125746e+09,152435181.0,79743078.0,15723501.0,19718368.0


In [133]:
gdp_percentile_data.to_csv('data/gdp_percentile_data.csv', index=False)

Running tests

In [134]:
for country in gdp_percentile_data['gdp_per_cap_percentile'].drop_duplicates().sort_values():
    df_temp = gdp_percentile_data[gdp_percentile_data['gdp_per_cap_percentile']==country]
    old_value = 0

    date_start = df_temp['date'].min()

    day = date_start

    # print('Data starts at:', day)
    for index, row in df_temp.iterrows():
        value = row['total_vaccinations']
        date = row['date']

        if date != day:
            print(country, 'missing date:', day)

        if value < old_value:
            print(country, 'decreases on', date)

        old_value = value
        day = day + datetime.timedelta(days=1)

High decreases on 2020-12-22
High decreases on 2020-12-24
High decreases on 2020-12-27
High decreases on 2020-12-29
High decreases on 2020-12-31
High decreases on 2021-01-03
High decreases on 2021-01-09
High decreases on 2021-01-16
High decreases on 2021-02-15
High decreases on 2021-02-19
High decreases on 2021-03-05
High decreases on 2021-03-12
High decreases on 2021-03-19
High decreases on 2021-03-27
High decreases on 2021-04-02
High decreases on 2021-04-04
High decreases on 2021-04-10
High decreases on 2021-04-17
High decreases on 2021-04-20
High decreases on 2021-04-24
High decreases on 2021-04-26
High decreases on 2021-04-30
High decreases on 2021-05-03
High decreases on 2021-05-07
High decreases on 2021-05-14
High decreases on 2021-05-17
High decreases on 2021-05-18
High decreases on 2021-05-21
High decreases on 2021-05-22
High decreases on 2021-05-24
High decreases on 2021-05-27
High decreases on 2021-05-28
High decreases on 2021-05-31
High decreases on 2021-06-04
High decreases

## Rearanging data to work in the d3 streamgraph

In [135]:
list(gdp_percentile_data)

['gdp_per_cap_percentile',
 'date',
 'total_cases',
 'new_cases',
 'new_cases_smoothed',
 'total_deaths',
 'new_deaths',
 'new_deaths_smoothed',
 'icu_patients',
 'hosp_patients',
 'weekly_icu_admissions',
 'weekly_hosp_admissions',
 'new_tests',
 'total_tests',
 'new_tests_smoothed',
 'positive_rate',
 'total_vaccinations',
 'people_vaccinated',
 'people_fully_vaccinated',
 'new_vaccinations',
 'new_vaccinations_smoothed']

In [136]:
value_type = 'new_vaccinations'

In [137]:
gdp_percentile_data.head(5)

Unnamed: 0,gdp_per_cap_percentile,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,icu_patients,hosp_patients,...,weekly_hosp_admissions,new_tests,total_tests,new_tests_smoothed,positive_rate,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,new_vaccinations_smoothed
0,High,2020-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,High,2020-01-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,High,2020-01-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,High,2020-01-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,High,2020-01-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [138]:
dates = list(gdp_percentile_data['date'].sort_values().drop_duplicates())
wealth_groups = list(gdp_percentile_data['gdp_per_cap_percentile'].sort_values().drop_duplicates().dropna())

In [139]:
new_data = []
new_header = ['date'] + wealth_groups
new_values = [[0,0,0,0]]
for row_num, date in enumerate(dates):
    new_row = [date]
    date_df = gdp_percentile_data[gdp_percentile_data['date'] == date]

    last_values_row = new_values[row_num]
    new_values_row = []
    for col_num, group in enumerate(wealth_groups):
        group_date_df = date_df[date_df['gdp_per_cap_percentile'] == group]

        last_value = last_values_row[col_num]
        value = list(group_date_df[value_type])[0]
        new_value = value + last_value

        new_values_row.append(new_value)
        
        new_row.append(new_value)

    
    new_values.append(new_values_row)
    new_data.append(new_row)

stream_graph_data = pd.DataFrame(new_data, columns = new_header)

In [140]:
stream_graph_data = stream_graph_data[stream_graph_data['date'] >= datetime.date(2020, 12, 8)]

In [141]:
stream_graph_data

Unnamed: 0,date,High,Low,Lower middle,Upper middle
342,2020-12-08,0.0,0.0,0.0,0.000000e+00
343,2020-12-09,0.0,0.0,0.0,0.000000e+00
344,2020-12-10,0.0,0.0,0.0,0.000000e+00
345,2020-12-11,0.0,0.0,0.0,0.000000e+00
346,2020-12-12,0.0,0.0,0.0,0.000000e+00
...,...,...,...,...,...
526,2021-06-10,683672622.0,13751477.0,284064069.0,1.024871e+09
527,2021-06-11,690139558.0,13919360.0,288619235.0,1.047261e+09
528,2021-06-12,695083687.0,14045766.0,291851685.0,1.064665e+09
529,2021-06-13,697634973.0,14162066.0,295633192.0,1.080389e+09


In [142]:
stream_graph_data.to_csv('data/vaccine_stream_data.csv', index = False)

## Testing stream data

Running a test to see if the values ever decrease. They should not

In [143]:
for wealth_group in wealth_groups:
    old_value = 0
    for index, row in stream_graph_data.iterrows():
        value = row[wealth_group]
        date = row['date']

        if value < old_value:
            print(wealth_group, 'decreases on', date)

        old_value = value