In [635]:
from functools import reduce
import pandas as pd
import datetime
from polio_utils import download_polio_data, extract_wild_cases, extract_vd_cases, owid_population, standardise_countries


Download and extract the data from the latest wild polio virus pdf from polioeradication.org

In [636]:
# Download the wild poliovirus PDF via the project helper and keep a dated copy
# on disk, because the extraction helper reads from a file path.
res = download_polio_data(
    url_stub='https://polioeradication.org/wp-content/uploads/2022/03/weekly-polio-analyses-WPV-'
)
date = datetime.date.today().isoformat()  # YYYY-MM-DD, used to stamp the saved file
fp = f"data/polio_wild_cases_{date}.pdf"

# The response body is written as raw bytes.
with open(fp, 'wb') as f:
    f.write(res.content)

wt_df = extract_wild_cases(file_path=fp)

Calculate wild polio cases per million population

In [637]:
population = owid_population()

# Harmonise country names and make the year / case-count columns integer-typed.
wt_df['entity'] = standardise_countries(wt_df['entity'])
wt_df = wt_df.astype({'year': int, 'wild_polio_1_cases': int})

# Right join: every row of the case table is kept, with population attached
# wherever the shared columns match.
wt_pop = population.merge(wt_df, how="right")

# Rows without a population figure yield NaN here and are reported as 0.
wild_rate = wt_pop['wild_polio_1_cases'] / wt_pop['population'] * 1000000
wt_pop['wild_polio_1_cases_per_million'] = wild_rate.fillna(0).round(decimals=3)

wt_pop.head()

Unnamed: 0,entity,year,population,wild_polio_1_cases,wild_polio_1_cases_per_million
0,Pakistan,2016,203631360,20,0.098
1,Afghanistan,2016,35383028,13,0.367
2,Malawi,2016,17205254,0,0.0
3,Nigeria,2016,185960256,4,0.022
4,Iran,2016,79563992,0,0.0


Download and extract the data from the latest vaccine-derived polio cases pdf from polioeradication.org

In [638]:
# Download the circulating vaccine-derived poliovirus (cVDPV) PDF via the project
# helper and keep a dated copy on disk for the extraction step.
# The URL uses HTTPS, matching the wild-poliovirus download, so the file is not
# fetched over an unencrypted connection.
res = download_polio_data(
    url_stub='https://polioeradication.org/wp-content/uploads/2022/03/weekly-polio-analyses-cVDPV-'
)
date = datetime.date.today().strftime("%Y-%m-%d")
fp = f"data/polio_vaccine_derived_cases_{date}.pdf"

# The response body is written as raw bytes.
with open(fp, 'wb') as f:
    f.write(res.content)

vd_df = extract_vd_cases(file_path=fp)

Calculate cases per million population

In [639]:
population = owid_population()

# Harmonise country names and make the year column integer-typed.
vd_df['entity'] = standardise_countries(vd_df['entity'])
vd_df['year'] = vd_df['year'].astype(int)

# Right join: keep every vaccine-derived case row and attach population where available.
vd_pop = population.merge(vd_df, how="right")

# One per-million rate column for each strain count plus the overall total.
for case_col in ('cVDPV1', 'cVDPV2', 'cVDPV3', 'total_cVDPV'):
    vd_pop[f'{case_col}_per_million'] = vd_pop[case_col] / vd_pop['population'] * 1000000

vd_pop.head()

Unnamed: 0,entity,year,population,cVDPV1,cVDPV2,cVDPV3,total_cVDPV,cVDPV1_per_million,cVDPV2_per_million,cVDPV3_per_million,total_cVDPV_per_million
0,Afghanistan,2020,38928340,0.0,308.0,0.0,308.0,0.0,7.911974,0.0,7.911974
1,Afghanistan,2021,39835428,0.0,43.0,0.0,43.0,0.0,1.079441,0.0,1.079441
2,Angola,2019,31825298,0.0,138.0,0.0,138.0,0.0,4.336173,0.0,4.336173
3,Angola,2020,32866270,0.0,3.0,0.0,3.0,0.0,0.091279,0.0,0.091279
4,Benin,2019,11801151,0.0,8.0,0.0,8.0,0.0,0.6779,0.0,0.6779


Combine the Wild and Vaccine Derived cases

In [640]:
polio_dataframes = [wt_pop, vd_pop]
merge_keys = ["entity", "year", "population"]

# Fold the frames together left to right with outer joins, so an entity/year
# present in only one source is still kept.
combined, *remaining = polio_dataframes
for frame in remaining:
    combined = pd.merge(combined, frame, on=merge_keys, how="outer")

# Population was only needed for the per-million rates computed earlier.
polio_df = combined.drop(columns=['population'])


Combine cases for entities where strains are currently recorded on different rows, just Total I think.

In [641]:
# Collapse to a single row per entity/year by summing every remaining column.
# Note: sum() skips missing values, so a column that was NaN after the outer
# merge comes out as 0 here rather than staying missing.
polio_df = (
    polio_df
    .groupby(['entity', 'year'])
    .sum()
    .reset_index()
)
polio_df.head()

Unnamed: 0,entity,year,wild_polio_1_cases,wild_polio_1_cases_per_million,cVDPV1,cVDPV2,cVDPV3,total_cVDPV,cVDPV1_per_million,cVDPV2_per_million,cVDPV3_per_million,total_cVDPV_per_million
0,Afghanistan,2016,13.0,0.367,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Afghanistan,2017,14.0,0.386,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Afghanistan,2018,21.0,0.565,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Afghanistan,2019,29.0,0.762,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Afghanistan,2020,56.0,1.439,0.0,308.0,0.0,308.0,0.0,7.911974,0.0,7.911974


Add columns for: total polio = wild + vaccine derived

In [642]:
# Total polio = wild type 1 cases + all circulating vaccine-derived cases,
# both as raw counts and as per-million rates.
polio_df['total_polio'] = polio_df['wild_polio_1_cases'].add(polio_df['total_cVDPV'])
polio_df['total_polio_per_million'] = polio_df['wild_polio_1_cases_per_million'].add(
    polio_df['total_cVDPV_per_million']
)

# Any total that is still missing is reported as zero.
total_cols = ['total_polio', 'total_polio_per_million']
polio_df[total_cols] = polio_df[total_cols].fillna(0)

Data from 1980 onwards from WHO - download from http://www.who.int/entity/immunization/monitoring_surveillance/data/incidence_series.xls?ua=1

In [643]:
# Read the 'Polio' sheet of the WHO incidence workbook; the displayed output
# shows one row per country and one column per year.
who_polio = pd.read_excel(io='data/incidence_series.xls', sheet_name='Polio')
who_polio

Unnamed: 0,WHO_REGION,ISO_code,Cname,Disease,2019,2018,2017,2016,2015,2014,...,1989,1988,1987,1986,1985,1984,1983,1982,1981,1980
0,EMR,AFG,Afghanistan,polio,0.0,0.0,0.0,13.0,20.0,28.0,...,55.0,307.0,628.0,1843.0,1981.0,552.0,1991.0,1390.0,837.0,880.0
1,EUR,ALB,Albania,polio,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
2,AFR,DZA,Algeria,polio,0.0,0.0,0.0,0.0,0.0,0.0,...,18.0,9.0,35.0,29.0,66.0,108.0,132.0,71.0,114.0,116.0
3,EUR,AND,Andorra,polio,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
4,AFR,AGO,Angola,polio,138.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,15.0,37.0,14.0,3.0,0.0,6.0,12.0,32.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189,AMR,VEN,Venezuela (Bolivarian Republic of),polio,0.0,0.0,0.0,0.0,0.0,0.0,...,16.0,17.0,46.0,27.0,8.0,9.0,9.0,30.0,68.0,11.0
190,WPR,VNM,Viet Nam,polio,,,0.0,0.0,0.0,0.0,...,427.0,839.0,1449.0,938.0,1600.0,1158.0,1109.0,897.0,644.0,1741.0
191,EMR,YEM,Yemen,polio,1.0,0.0,0.0,0.0,0.0,0.0,...,701.0,114.0,179.0,601.0,336.0,767.0,633.0,235.0,541.0,722.0
192,AFR,ZMB,Zambia,polio,2.0,0.0,0.0,0.0,0.0,0.0,...,47.0,85.0,69.0,134.0,128.0,177.0,182.0,243.0,429.0,276.0


Create a regions table to assign countries to WHO regions later on.

In [644]:
# Build the country -> WHO region lookup before the region column is dropped below.
regions = (
    who_polio[['WHO_REGION', 'Cname']]
    .drop_duplicates()
    .rename(columns={'Cname': 'entity'})
    .reset_index(drop=True)
)
# Standardise the country names the same way as the case tables. Otherwise the
# later merge on 'entity' silently drops every country whose WHO name differs
# from its standardised name, and regional totals come out too low.
regions['entity'] = standardise_countries(regions['entity'])

# Keep only the country name and the per-year columns for reshaping.
who_polio.drop(columns=['Disease', 'WHO_REGION', 'ISO_code'], inplace=True)


Reformat the WHO data so it is long and can be joined to the data from Polio Eradication

In [645]:
# Wide -> long: one row per country and year column.
who_df = who_polio.melt(id_vars=['Cname'])
who_df['entity'] = standardise_countries(who_df['Cname'])

# Keep the standardised name and give the melted columns meaningful names.
who_df = (
    who_df[['entity', 'variable', 'value']]
    .rename(columns={'variable': 'year', 'value': 'total_polio'})
)
who_df[['year']] = who_df[['year']].astype(int)

# Only years before 2016 are taken from WHO; later years come from the
# Polio Eradication data.
who_df = who_df[who_df['year'] < 2016]

Calculate total polio cases per million

In [646]:
population = owid_population()

# Right join: keep every WHO row and attach population where available.
who_df = population.merge(who_df, how="right")
who_df['total_polio_per_million'] = who_df['total_polio'] / who_df['population'] * 1000000

# The original called who_df.fillna(0) without keeping the result, so nothing
# was filled. Fill the two value columns explicitly; this matches how missing
# totals are treated later in the notebook.
value_cols = ['total_polio', 'total_polio_per_million']
who_df[value_cols] = who_df[value_cols].fillna(0)

# Population was only needed for the rate.
who_df = who_df.drop(columns="population")

Polio Eradication only presents data for countries with recent polio cases, so we need to add 0 cases for the rest of the world. Create an empty dataframe for all countries for 2016 - latest year so that countries with zero cases are recorded as 0, rather than as missing.

In [647]:
# Placeholder rows for every WHO entity and every year from 2016 onwards, so that
# entities absent from the Polio Eradication data still get a row for each year.
# NOTE(review): range() excludes its upper bound, so the current calendar year is
# not included here - confirm that this is intended.
fill_years = range(2016, datetime.datetime.now().year)
fill_df = pd.DataFrame(
    [(entity, year) for entity in who_df['entity'].drop_duplicates() for year in fill_years],
    columns=['entity', 'year'],
)
fill_df['total_polio'] = None
fill_df['total_polio_per_million'] = None

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
who_fill = pd.concat([who_df, fill_df], ignore_index=True)

# Force both value columns to float so their dtype does not depend on how the
# installed pandas version treats the all-missing placeholder columns.
who_fill = who_fill.astype({'total_polio': float, 'total_polio_per_million': float})

Combine the WHO polio data (1980-2015) with the Polio Eradication data (2016-)

In [650]:
# Outer merge on all four columns: rows coincide only when entity, year and both
# totals are equal; otherwise both versions of an entity/year are kept.
shared_cols = ['entity', 'year', 'total_polio', 'total_polio_per_million']
who_df = who_fill.merge(polio_df[shared_cols], on=shared_cols, how='outer')

# Sorting puts missing totals last within each entity/year, so keeping the first
# row prefers a real value over an empty placeholder row.
who_df = (
    who_df
    .sort_values(['entity', 'year', 'total_polio'])
    .drop_duplicates(['year', 'entity'], keep='first')
)


In [651]:
# Spot-check the combined series for a single country.
who_df.loc[who_df['entity'] == 'Yemen']

Unnamed: 0,entity,year,total_polio,total_polio_per_million
6981,Yemen,1980,722.0,90.910201
6787,Yemen,1981,541.0,65.719903
6593,Yemen,1982,235.0,27.512394
6399,Yemen,1983,633.0,71.369274
6205,Yemen,1984,767.0,83.251222
6011,Yemen,1985,336.0,35.101759
5817,Yemen,1986,601.0,60.456074
5623,Yemen,1987,179.0,17.341527
5429,Yemen,1988,114.0,10.623562
5235,Yemen,1989,701.0,62.649782


In [652]:
# Attach the per-strain columns from polio_df to the full series of totals.
# Rows that exist only in who_df keep NaN in the strain columns.
join_cols = ['entity', 'year', 'total_polio', 'total_polio_per_million']
total_df = pd.merge(polio_df, who_df, on=join_cols, how="outer")

# Spot-check a single country.
total_df.loc[total_df['entity'] == 'Yemen']

Unnamed: 0,entity,year,wild_polio_1_cases,wild_polio_1_cases_per_million,cVDPV1,cVDPV2,cVDPV3,total_cVDPV,cVDPV1_per_million,cVDPV2_per_million,cVDPV3_per_million,total_cVDPV_per_million,total_polio,total_polio_per_million
101,Yemen,2019,0.0,0.0,1.0,0.0,0.0,1.0,0.034291,0.0,0.0,0.034291,1.0,0.034291
102,Yemen,2020,0.0,0.0,31.0,0.0,0.0,31.0,1.039363,0.0,0.0,1.039363,31.0,1.039363
103,Yemen,2021,0.0,0.0,3.0,13.0,0.0,16.0,0.098391,0.42636,0.0,0.524751,16.0,0.524751
8032,Yemen,1980,,,,,,,,,,,722.0,90.910201
8033,Yemen,1981,,,,,,,,,,,541.0,65.719903
8034,Yemen,1982,,,,,,,,,,,235.0,27.512394
8035,Yemen,1983,,,,,,,,,,,633.0,71.369274
8036,Yemen,1984,,,,,,,,,,,767.0,83.251222
8037,Yemen,1985,,,,,,,,,,,336.0,35.101759
8038,Yemen,1986,,,,,,,,,,,601.0,60.456074


In [653]:
# Missing totals are reported as zero; the strain columns are left untouched.
total_df = total_df.fillna({'total_polio': 0, 'total_polio_per_million': 0})


Summing cases for WHO regions and renaming WHO region abbreviations

In [654]:
# Full names for the WHO region abbreviations.
region_names = {
    'AFR': 'Africa',
    'AMR': 'Americas',
    'SEAR': 'South-East Asia',
    'EUR': 'Europe',
    'EMR': 'Eastern Mediterranean',
    'WPR': 'Western Pacific',
}

# Sum every column per WHO region and year. The country-name column is dropped
# first: on pandas >= 2.0 sum() would otherwise concatenate the names, and the
# rename below would then create a second 'entity' column.
# NOTE(review): the *_per_million columns are summed across countries too, which
# is not a population-weighted regional rate - confirm that this is intended.
regional_total = (
    regions.merge(total_df, on='entity')
    .drop(columns='entity')
    .groupby(['WHO_REGION', 'year'])
    .sum()
    .reset_index()
)

# Assign the result back rather than calling replace(..., inplace=True) on the
# selected column, which does not update the frame under pandas copy-on-write.
regional_total['WHO_REGION'] = regional_total['WHO_REGION'].replace(region_names)
regional_total = regional_total.rename(columns={'WHO_REGION': 'entity'})



Replacing NAs with 0s for the 'total_polio' columns - we can't do this for the strain/polio type columns as we don't know this.

In [655]:
# Stack the regional aggregates on top of the per-country rows.
# Missing totals become 0; every other missing value (the strain columns)
# becomes an empty string.
total_df = (
    pd.concat([regional_total, total_df])
    .fillna({'total_polio': 0, 'total_polio_per_million': 0})
    .fillna("")
)
total_df['total_polio_per_million'] = total_df['total_polio_per_million'].round(3)


In [656]:
# Write the final table to disk without the DataFrame index.
total_df.to_csv(path_or_buf='data/polio_cases_to_upload.csv', index=False)