# Exploring Our World in Data  COVID-19 data

Imports

In [48]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [49]:
import datetime


To seperate out the countries in this data we need country iso-3 codes. There is a data set of this all [here](https://www.iban.com/country-codes). Luckily we can download the data in one line of code with pandas read_html function.

In [50]:
country_codes = pd.read_html('https://www.iban.com/country-codes')[0]

In [51]:
country_codes

Unnamed: 0,Country,Alpha-2 code,Alpha-3 code,Numeric
0,Afghanistan,AF,AFG,4
1,Åland Islands,AX,ALA,248
2,Albania,AL,ALB,8
3,Algeria,DZ,DZA,12
4,American Samoa,AS,ASM,16
...,...,...,...,...
244,Wallis and Futuna,WF,WLF,876
245,Western Sahara,EH,ESH,732
246,Yemen,YE,YEM,887
247,Zambia,ZM,ZMB,894


Lets save this for future use and incase the website ever goes down.

In [52]:
country_codes.to_csv('data/country_codes.csv', index=False)

Using the github download link we can pull the latest OWID COVID data straight from github

In [53]:
data_link = 'https://github.com/owid/covid-19-data/raw/master/public/data/owid-covid-data.csv'

df = pd.read_csv(data_link)

df

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index
0,AFG,Asia,Afghanistan,2020-02-24,1.0,1.0,,,,,...,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511
1,AFG,Asia,Afghanistan,2020-02-25,1.0,0.0,,,,,...,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511
2,AFG,Asia,Afghanistan,2020-02-26,1.0,0.0,,,,,...,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511
3,AFG,Asia,Afghanistan,2020-02-27,1.0,0.0,,,,,...,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511
4,AFG,Asia,Afghanistan,2020-02-28,1.0,0.0,,,,,...,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85166,ZWE,Africa,Zimbabwe,2021-04-26,38102.0,16.0,34.714,1560.0,3.0,1.000,...,1899.775,21.4,307.846,1.82,1.6,30.7,36.791,1.7,61.49,0.571
85167,ZWE,Africa,Zimbabwe,2021-04-27,38164.0,62.0,41.286,1565.0,5.0,1.571,...,1899.775,21.4,307.846,1.82,1.6,30.7,36.791,1.7,61.49,0.571
85168,ZWE,Africa,Zimbabwe,2021-04-28,38191.0,27.0,30.143,1565.0,0.0,1.429,...,1899.775,21.4,307.846,1.82,1.6,30.7,36.791,1.7,61.49,0.571
85169,ZWE,Africa,Zimbabwe,2021-04-29,38235.0,44.0,31.000,1567.0,2.0,1.714,...,1899.775,21.4,307.846,1.82,1.6,30.7,36.791,1.7,61.49,0.571


In [54]:
def interpret_date(value):
    year, month, day = value.split('-')

    return datetime.date(int(year), int(month), int(day))

In [55]:
df['date'] = df['date'].apply(interpret_date)

df = df.sort_values('date')

In [56]:
df

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index
3061,ARG,South America,Argentina,2020-01-01,,,,,,,...,18933.907,0.6,191.032,5.50,16.2,27.7,,5.00,76.67,0.845
49485,MEX,North America,Mexico,2020-01-01,,,,,,,...,17336.469,2.5,152.783,13.06,6.9,21.4,87.847,1.38,75.05,0.779
3062,ARG,South America,Argentina,2020-01-02,,,,,,,...,18933.907,0.6,191.032,5.50,16.2,27.7,,5.00,76.67,0.845
49486,MEX,North America,Mexico,2020-01-02,,,,,,,...,17336.469,2.5,152.783,13.06,6.9,21.4,87.847,1.38,75.05,0.779
49487,MEX,North America,Mexico,2020-01-03,,,,,,,...,17336.469,2.5,152.783,13.06,6.9,21.4,87.847,1.38,75.05,0.779
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27361,FJI,Oceania,Fiji,2021-04-30,117.0,1.0,4.429,2.0,0.0,0.000,...,8702.975,1.4,412.820,14.49,10.2,34.8,,2.30,67.44,0.743
68821,SGP,Asia,Singapore,2021-04-30,61145.0,24.0,28.857,30.0,0.0,0.000,...,85535.383,,92.243,10.99,5.2,28.3,,2.40,83.62,0.938
2568,AGO,Africa,Angola,2021-04-30,26652.0,221.0,196.143,596.0,2.0,3.143,...,5819.495,,276.045,3.94,,,26.664,,61.15,0.581
67961,SYC,Africa,Seychelles,2021-04-30,5873.0,314.0,95.286,28.0,2.0,0.286,...,26382.287,1.1,242.648,10.55,7.1,35.7,,3.60,73.40,0.796


## Analysing the countries in the data set

In [57]:
country_iso_3_codes = list(country_codes['Alpha-3 code'])

In [58]:
country_df = df[df['iso_code'].isin(country_iso_3_codes)]

In [59]:
country_df

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index
3061,ARG,South America,Argentina,2020-01-01,,,,,,,...,18933.907,0.6,191.032,5.50,16.2,27.7,,5.00,76.67,0.845
49485,MEX,North America,Mexico,2020-01-01,,,,,,,...,17336.469,2.5,152.783,13.06,6.9,21.4,87.847,1.38,75.05,0.779
3062,ARG,South America,Argentina,2020-01-02,,,,,,,...,18933.907,0.6,191.032,5.50,16.2,27.7,,5.00,76.67,0.845
49486,MEX,North America,Mexico,2020-01-02,,,,,,,...,17336.469,2.5,152.783,13.06,6.9,21.4,87.847,1.38,75.05,0.779
49487,MEX,North America,Mexico,2020-01-03,,,,,,,...,17336.469,2.5,152.783,13.06,6.9,21.4,87.847,1.38,75.05,0.779
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27361,FJI,Oceania,Fiji,2021-04-30,117.0,1.0,4.429,2.0,0.0,0.000,...,8702.975,1.4,412.820,14.49,10.2,34.8,,2.30,67.44,0.743
68821,SGP,Asia,Singapore,2021-04-30,61145.0,24.0,28.857,30.0,0.0,0.000,...,85535.383,,92.243,10.99,5.2,28.3,,2.40,83.62,0.938
2568,AGO,Africa,Angola,2021-04-30,26652.0,221.0,196.143,596.0,2.0,3.143,...,5819.495,,276.045,3.94,,,26.664,,61.15,0.581
67961,SYC,Africa,Seychelles,2021-04-30,5873.0,314.0,95.286,28.0,2.0,0.286,...,26382.287,1.1,242.648,10.55,7.1,35.7,,3.60,73.40,0.796


In [60]:
country_data_header = ['iso_code',
 'continent',
 'location',
 'population',
 'population_density',
 'median_age',
 'aged_65_older',
 'aged_70_older',
 'gdp_per_capita',
 'extreme_poverty',
 'cardiovasc_death_rate',
 'diabetes_prevalence',
 'female_smokers',
 'male_smokers',
 'handwashing_facilities',
 'hospital_beds_per_thousand',
 'life_expectancy',
 'human_development_index']

In [61]:
country_data = country_df.drop_duplicates(subset = 'location')[country_data_header].sort_values('gdp_per_capita', ascending=False).reset_index(drop=True).dropna(subset=['gdp_per_capita'])

country_data

Unnamed: 0,iso_code,continent,location,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index
0,QAT,Asia,Qatar,2881060.0,227.322,31.9,1.307,0.617,116935.600,,176.690,16.52,0.8,26.9,,1.20,80.23,0.848
1,MAC,Asia,Macao,649342.0,20546.766,39.2,9.798,4.991,104861.851,,,,,,,,84.24,
2,LUX,Europe,Luxembourg,625976.0,231.447,39.7,14.312,9.842,94277.965,0.2,128.275,4.42,20.9,26.0,,4.51,82.25,0.916
3,SGP,Asia,Singapore,5850343.0,7915.731,42.4,12.922,7.049,85535.383,,92.243,10.99,5.2,28.3,,2.40,83.62,0.938
4,BRN,Asia,Brunei,437483.0,81.347,32.4,4.591,2.382,71809.251,,201.285,12.79,2.0,30.9,,2.70,75.86,0.838
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183,NER,Africa,Niger,24206636.0,16.955,15.1,2.553,1.378,926.000,44.5,238.339,2.42,0.1,15.4,8.978,0.30,62.42,0.394
184,COD,Africa,Democratic Republic of Congo,89561404.0,35.879,17.0,3.020,1.745,808.133,77.1,318.949,6.10,,,4.472,,60.68,0.480
185,LBR,Africa,Liberia,5057677.0,49.127,19.2,3.057,1.756,752.788,38.6,272.509,2.42,1.5,18.1,1.188,0.80,64.10,0.480
186,BDI,Africa,Burundi,11890781.0,423.062,17.5,2.562,1.504,702.225,71.7,293.068,6.05,,,6.144,0.80,61.58,0.433


We now have a dataset with all our country specific data. To analyse this data by country gdp level we need to sort the dataset by gdp per capita and seperate it out into four sections; 1, 2, 3, 4 where 1 is the top perctile etc

In [62]:
gdp_percentile_list = []
number_of_countries = len(country_data)
number_of_sections = 4
section = 1
for index, row in country_data.iterrows():
    if index < (section/number_of_sections)*number_of_countries:
        gdp_percentile_list.append(section)
    else:
        section += 1
        gdp_percentile_list.append(section)

In [63]:
country_data['gdp_per_cap_percentile'] = gdp_percentile_list

country_data

Unnamed: 0,iso_code,continent,location,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,gdp_per_cap_percentile
0,QAT,Asia,Qatar,2881060.0,227.322,31.9,1.307,0.617,116935.600,,176.690,16.52,0.8,26.9,,1.20,80.23,0.848,1
1,MAC,Asia,Macao,649342.0,20546.766,39.2,9.798,4.991,104861.851,,,,,,,,84.24,,1
2,LUX,Europe,Luxembourg,625976.0,231.447,39.7,14.312,9.842,94277.965,0.2,128.275,4.42,20.9,26.0,,4.51,82.25,0.916,1
3,SGP,Asia,Singapore,5850343.0,7915.731,42.4,12.922,7.049,85535.383,,92.243,10.99,5.2,28.3,,2.40,83.62,0.938,1
4,BRN,Asia,Brunei,437483.0,81.347,32.4,4.591,2.382,71809.251,,201.285,12.79,2.0,30.9,,2.70,75.86,0.838,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183,NER,Africa,Niger,24206636.0,16.955,15.1,2.553,1.378,926.000,44.5,238.339,2.42,0.1,15.4,8.978,0.30,62.42,0.394,4
184,COD,Africa,Democratic Republic of Congo,89561404.0,35.879,17.0,3.020,1.745,808.133,77.1,318.949,6.10,,,4.472,,60.68,0.480,4
185,LBR,Africa,Liberia,5057677.0,49.127,19.2,3.057,1.756,752.788,38.6,272.509,2.42,1.5,18.1,1.188,0.80,64.10,0.480,4
186,BDI,Africa,Burundi,11890781.0,423.062,17.5,2.562,1.504,702.225,71.7,293.068,6.05,,,6.144,0.80,61.58,0.433,4


In [64]:
country_data.to_csv('data/country_data.csv', index=False)

Testing that worked okay. Each of the percentile sections should have the same number of countries in them

In [65]:
for option in country_data['gdp_per_cap_percentile'].drop_duplicates():
    print(option, len(country_data[country_data['gdp_per_cap_percentile'] == option]))

1 47
2 47
3 47
4 47


Finally lets apply this back to the country df, which is the dataset with all the country specific coviid data

In [66]:
gdp_percentile_dict = country_data.set_index('iso_code').to_dict('dict')['gdp_per_cap_percentile']

In [67]:
def get_gdp_percentile(iso_code):
    try: return int(round(gdp_percentile_dict[iso_code]))
    except: return np.nan

In [68]:
country_df['gdp_per_cap_percentile'] = country_df['iso_code'].apply(get_gdp_percentile)

country_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,gdp_per_cap_percentile
3061,ARG,South America,Argentina,2020-01-01,,,,,,,...,0.6,191.032,5.50,16.2,27.7,,5.00,76.67,0.845,2.0
49485,MEX,North America,Mexico,2020-01-01,,,,,,,...,2.5,152.783,13.06,6.9,21.4,87.847,1.38,75.05,0.779,2.0
3062,ARG,South America,Argentina,2020-01-02,,,,,,,...,0.6,191.032,5.50,16.2,27.7,,5.00,76.67,0.845,2.0
49486,MEX,North America,Mexico,2020-01-02,,,,,,,...,2.5,152.783,13.06,6.9,21.4,87.847,1.38,75.05,0.779,2.0
49487,MEX,North America,Mexico,2020-01-03,,,,,,,...,2.5,152.783,13.06,6.9,21.4,87.847,1.38,75.05,0.779,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27361,FJI,Oceania,Fiji,2021-04-30,117.0,1.0,4.429,2.0,0.0,0.000,...,1.4,412.820,14.49,10.2,34.8,,2.30,67.44,0.743,3.0
68821,SGP,Asia,Singapore,2021-04-30,61145.0,24.0,28.857,30.0,0.0,0.000,...,,92.243,10.99,5.2,28.3,,2.40,83.62,0.938,1.0
2568,AGO,Africa,Angola,2021-04-30,26652.0,221.0,196.143,596.0,2.0,3.143,...,,276.045,3.94,,,26.664,,61.15,0.581,3.0
67961,SYC,Africa,Seychelles,2021-04-30,5873.0,314.0,95.286,28.0,2.0,0.286,...,1.1,242.648,10.55,7.1,35.7,,3.60,73.40,0.796,2.0


In [69]:
country_df.to_csv('data/country_covid_data.csv', index=False)

## Aggregating covid data

Finally we can look at the covid data by country gdp per capita percentile

In [70]:
covid_fields = [
 'total_cases',
 'new_cases',
 'new_cases_smoothed',
 'total_deaths',
 'new_deaths',
 'new_deaths_smoothed',
 'icu_patients',
 'hosp_patients',
 'weekly_icu_admissions',
 'weekly_hosp_admissions',
 'new_tests',
 'total_tests',
 'new_tests_smoothed',
 'positive_rate',
#  'tests_units',
 'total_vaccinations',
 'people_vaccinated',
 'people_fully_vaccinated',
 'new_vaccinations',
 'new_vaccinations_smoothed',
#  'population',
#  'aged_65_older',
#  'aged_70_older',
 ]

In [71]:
percentile_options = list(country_df['gdp_per_cap_percentile'].drop_duplicates())

dates = list(country_df['date'].drop_duplicates())

In [72]:
def prep_numbers(value):
    try: return float(value)
    except: return np.nan

In [73]:
new_data = []
new_header = ['gdp_per_cap_percentile', 'date'] + covid_fields
for percentile_option in tqdm(percentile_options):
    df_temp = country_df[country_df['gdp_per_cap_percentile'] == percentile_option]

    for date in dates:
        new_row = [percentile_option, date]
        df_temp_date = df_temp[df_temp['date'] == date]

        for field in covid_fields:
            new_value = df_temp_date[field].apply(prep_numbers).dropna().sum()
            new_row.append(new_value)

        # if len(df_temp_date) == 47: 
        new_data.append(new_row)

gdp_percentile_data = pd.DataFrame(new_data, columns = new_header)

100%|██████████| 5/5 [00:38<00:00,  7.68s/it]


In [74]:
list(gdp_percentile_data)

['gdp_per_cap_percentile',
 'date',
 'total_cases',
 'new_cases',
 'new_cases_smoothed',
 'total_deaths',
 'new_deaths',
 'new_deaths_smoothed',
 'icu_patients',
 'hosp_patients',
 'weekly_icu_admissions',
 'weekly_hosp_admissions',
 'new_tests',
 'total_tests',
 'new_tests_smoothed',
 'positive_rate',
 'total_vaccinations',
 'people_vaccinated',
 'people_fully_vaccinated',
 'new_vaccinations',
 'new_vaccinations_smoothed']

In [75]:
gdp_percentile_data.head(5)

Unnamed: 0,gdp_per_cap_percentile,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,icu_patients,hosp_patients,...,weekly_hosp_admissions,new_tests,total_tests,new_tests_smoothed,positive_rate,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,new_vaccinations_smoothed
0,2.0,2020-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,27.0,27.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.0,2020-01-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,108.0,135.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.0,2020-01-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,93.0,228.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.0,2020-01-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,112.0,340.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2.0,2020-01-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,91.0,431.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [76]:
gdp_percentile_data.to_csv('data/gdp_percentile_data.csv', index=False)

## Rearanging data to work in the d3 streamgraph

In [77]:
value_type = 'total_vaccinations'

In [78]:
gdp_percentile_data.head(5)

Unnamed: 0,gdp_per_cap_percentile,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,icu_patients,hosp_patients,...,weekly_hosp_admissions,new_tests,total_tests,new_tests_smoothed,positive_rate,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,new_vaccinations_smoothed
0,2.0,2020-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,27.0,27.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.0,2020-01-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,108.0,135.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.0,2020-01-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,93.0,228.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.0,2020-01-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,112.0,340.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2.0,2020-01-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,91.0,431.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [79]:
def gdp_stringifyer(value):
    option_dict = {
        1: "High",
        2: "Upper middle",
        3: "Lower middle",
        4: "Low"
    }

    try: return option_dict[int(value)]
    except: return np.nan

In [80]:
gdp_percentile_data['wealth_group'] = gdp_percentile_data['gdp_per_cap_percentile'].apply(gdp_stringifyer)

gdp_percentile_data

Unnamed: 0,gdp_per_cap_percentile,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,icu_patients,hosp_patients,...,new_tests,total_tests,new_tests_smoothed,positive_rate,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,new_vaccinations_smoothed,wealth_group
0,2.0,2020-01-01,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,...,27.0,27.0,0.0,0.000,0.0,0.0,0.0,0.0,0.0,Upper middle
1,2.0,2020-01-02,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,...,108.0,135.0,0.0,0.000,0.0,0.0,0.0,0.0,0.0,Upper middle
2,2.0,2020-01-03,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,...,93.0,228.0,0.0,0.000,0.0,0.0,0.0,0.0,0.0,Upper middle
3,2.0,2020-01-04,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,...,112.0,340.0,0.0,0.000,0.0,0.0,0.0,0.0,0.0,Upper middle
4,2.0,2020-01-05,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,...,91.0,431.0,0.0,0.000,0.0,0.0,0.0,0.0,0.0,Upper middle
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2425,4.0,2021-04-26,2352458.0,9848.0,10971.571,38701.0,216.0,216.146,0.0,0.0,...,63033.0,18230273.0,68780.0,1.594,14015307.0,10680363.0,3334944.0,216907.0,269774.0,Low
2426,4.0,2021-04-27,2363406.0,10948.0,10975.428,38910.0,209.0,213.570,0.0,0.0,...,61045.0,17145920.0,68194.0,1.575,14333595.0,10823178.0,3510417.0,254082.0,266171.0,Low
2427,4.0,2021-04-28,2375708.0,12302.0,11095.145,39122.0,212.0,212.002,0.0,0.0,...,59485.0,15845036.0,65876.0,1.476,13236251.0,10422676.0,2813575.0,209426.0,265744.0,Low
2428,4.0,2021-04-29,2393378.0,17670.0,11942.717,39443.0,321.0,228.289,0.0,0.0,...,45293.0,12033725.0,48752.0,0.568,12413247.0,8566479.0,3846768.0,150777.0,215655.0,Low


In [86]:
dates = list(gdp_percentile_data['date'].sort_values().drop_duplicates())
wealth_groups = list(gdp_percentile_data['wealth_group'].sort_values().drop_duplicates().dropna())

In [87]:
new_data = []
new_header = ['date'] + wealth_groups
for date in dates:
    new_row = [date]
    date_df = gdp_percentile_data[gdp_percentile_data['date'] == date]
    for group in wealth_groups:
        group_date_df = date_df[date_df['wealth_group'] == group]

        try:
            value = list(group_date_df[value_type])[0]
        except:
            value = 0
        
        new_row.append(value)

    new_data.append(new_row)

stream_graph_data = pd.DataFrame(new_data, columns = new_header)


In [88]:
stream_graph_data = stream_graph_data[stream_graph_data['date'] >= datetime.date(2020, 12, 8)]

In [89]:
stream_graph_data

Unnamed: 0,date,High,Low,Lower middle,Upper middle
342,2020-12-08,0.0,0.0,0.0,0.0
343,2020-12-09,0.0,0.0,0.0,0.0
344,2020-12-10,0.0,0.0,0.0,0.0
345,2020-12-11,0.0,0.0,0.0,0.0
346,2020-12-12,0.0,0.0,0.0,0.0
...,...,...,...,...,...
481,2021-04-26,434218982.0,14015307.0,180580641.0,374445548.0
482,2021-04-27,449315782.0,14333595.0,184210490.0,396008673.0
483,2021-04-28,453353925.0,13236251.0,184885713.0,395678849.0
484,2021-04-29,452792498.0,12413247.0,179375293.0,420841546.0


In [90]:
stream_graph_data.to_csv('data/vaccine_stream_data.csv', index = False)