# Import and clean up of the control variables for my model

In [1]:
import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

import seaborn as sns
import sidetable

In [2]:
## set a working directory
# Build the target directory once so that the directory we create is the same
# one we change into. Previously only the parent `thesis/` folder was created,
# so `os.chdir` raised on a machine where `control variables/` did not exist.
user = os.path.expanduser('~')
display(user)

# `os.chdir` returns None, so assigning its result left `path` unusable;
# `path` now holds the working-directory string itself.
path = os.path.join(user, 'Desktop', 'private', 'thesis', 'control variables')
os.makedirs(path, exist_ok=True)
os.chdir(path)

'/Users/NatStrom'

In [None]:
#import the ingester extension functions
from ingester3.scratch import cache_manager
cache_manager()
from ingester3.extensions import *

import viewser
from viewser import Queryset, Column
from views_transformation_library import utilities

import ingester3
from ingester3.Country import Country

In [None]:
def nullcounts(ser):
    """Return how many entries of ``ser`` are null (NaN/None)."""
    missing_mask = ser.isnull()
    return missing_mask.sum()

def custom_describe(frame, func=[nullcounts, 'sum', 'mean', 'median', 'max'],
                    numeric_only=True, **kwargs):
    """Aggregate ``frame`` with a list of summary functions.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data to summarise.
    func : list
        Aggregations handed to ``DataFrame.agg``; by default the null count,
        sum, mean, median and max of every column.
    numeric_only : bool
        When true, only columns with a numeric dtype are summarised.
    **kwargs
        Forwarded unchanged to ``DataFrame.agg``.

    Returns
    -------
    The result of ``DataFrame.agg`` (one row per aggregation when ``func`` is
    a list).
    """
    target = frame.select_dtypes(include=np.number) if numeric_only else frame
    return target.agg(func, **kwargs)

## 1. WDI indicator

In [None]:
# Read the three WDI extracts, skipping the first four lines of each file
# (presumably download metadata ahead of the header row — TODO confirm).
data_age, data_gdp, data_xpd_gdp = (
    pd.read_csv(csv_name, skiprows=4)
    for csv_name in ('pop0014.csv', 'gdp_pcap.csv', 'expense_gdp.csv')
)

In [None]:
# Column indexes of the three raw WDI frames first, then the frames themselves
# (display() renders its arguments one after another, in order).
display(data_age.columns, data_xpd_gdp.columns, data_gdp.columns)
display(data_age, data_xpd_gdp, data_gdp)

In [None]:
# Rename, filter and stack the three wide WDI frames into long format.
def _wdi_to_long(wide, value_name):
    """Reshape one wide WDI extract into long ``iso`` / ``year_id`` / value form.

    Parameters
    ----------
    wide : pandas.DataFrame
        Wide frame containing 'Country Name', 'Country Code', 'Indicator Name'
        and 'Indicator Code' columns plus one column per year.
    value_name : str
        Name given to the stacked value column.

    Returns
    -------
    pandas.DataFrame
        One row per (iso, year_id) with columns 'iso', 'year_id', ``value_name``.
    """
    descriptive_cols = ['Country Name', 'Indicator Name', 'Indicator Code']
    # The trailing empty column is 'Unnamed: 66' in these files but
    # 'Unnamed: 67' in pop_size.csv, so match it by prefix, not exact name.
    trailing_cols = [col for col in wide.columns if str(col).startswith('Unnamed:')]
    return (
        wide
        .rename(columns={'Country Code': 'iso'})
        .drop(columns=descriptive_cols + trailing_cols)
        .melt(id_vars='iso', var_name='year_id', value_name=value_name)
    )

data_age = _wdi_to_long(data_age, 'age0014_value')
data_xpd_gdp = _wdi_to_long(data_xpd_gdp, 'xpd_gdp_value')
data_gdp = _wdi_to_long(data_gdp, 'gdp_pcap_value')

display(data_age)
display(data_xpd_gdp)
display(data_gdp)

In [None]:
# After melting, year labels are column-name strings; cast them to integers
# in each of the three long WDI frames.
for wdi_frame in (data_gdp, data_age, data_xpd_gdp):
    wdi_frame['year_id'] = wdi_frame['year_id'].astype(str).astype(int)

In [None]:
# Outer-join the three indicators on country code and year so that no
# country-year present in any one of them is lost.
wdi_keys = ["iso", "year_id"]
data_wdi = (
    data_age
    .merge(data_gdp, how="outer", on=wdi_keys)
    .merge(data_xpd_gdp, how="outer", on=wdi_keys)
)
display(data_wdi)

In [None]:
## keep only the years 1990-2019 (both bounds inclusive)
# (the previous comment said 1980, but the filter has always started at 1990)
data_wdi = data_wdi.loc[data_wdi['year_id'].between(1990, 2019)]

In [None]:
# Keep only the 47 ISO3 country codes that make up the study sample.
ssa_iso_codes = [
    'AGO', 'BDI', 'BEN', 'BFA', 'BWA', 'CAF', 'CIV', 'CMR', 'COD',
    'COG', 'COM', 'CPV', 'DJI', 'ERI', 'ETH', 'GAB', 'GHA', 'GIN',
    'GMB', 'GNB', 'GNQ', 'KEN', 'LBR', 'LSO', 'MDG', 'MLI', 'MOZ',
    'MRT', 'MWI', 'NAM', 'NER', 'NGA', 'RWA', 'SDN', 'SEN', 'SLE',
    'SOM', 'SSD', 'STP', 'SWZ', 'TCD', 'TGO', 'TZA', 'UGA', 'ZAF',
    'ZMB', 'ZWE',
]
data_wdi = data_wdi[data_wdi['iso'].isin(ssa_iso_codes)]

In [None]:
# Show the year- and country-filtered WDI panel as the cell output.
data_wdi

In [None]:
# Missingness check: percentage of null entries in every column of data_wdi.
cols_list = list(data_wdi.columns)
values_list = [data_wdi[name].isnull().mean() * 100 for name in cols_list]

# One row per column: its name and its share of missing values (in percent).
pct_missing_df = pd.DataFrame({"col": cols_list, "pct_missing": values_list})
pct_missing_df

In [None]:
# Null count, sum, mean, median and max of the numeric WDI columns.
custom_describe(data_wdi)

In [None]:
# sidetable frequency tables by country and year, one per WDI variable.
for value_col in ('age0014_value', 'gdp_pcap_value', 'xpd_gdp_value'):
    display(data_wdi.stb.freq(['iso', 'year_id'], value=value_col))

In [None]:
# Histograms of the three WDI variables, one panel per variable.
# The Figure is bound to `fig` so that the `figure` function imported from
# matplotlib.pyplot at the top of the notebook is no longer overwritten.
fig, axes = plt.subplots(3, 1, figsize=(10, 55))
for ax, column in zip(axes, ('age0014_value', 'gdp_pcap_value', 'xpd_gdp_value')):
    sns.histplot(data_wdi[column], ax=ax)
# Save through the Figure object rather than pyplot's "current figure" state.
fig.savefig('wdi1.png')

In [None]:
#create trendlines
# One trend panel per WDI variable (seaborn draws the per-year estimate with
# error bars). Bound to `fig` so the imported `figure` function is not shadowed.
fig, axes = plt.subplots(3, 1, figsize=(10, 60))

for ax, column in zip(axes, ('age0014_value', 'gdp_pcap_value', 'xpd_gdp_value')):
    sns.lineplot(x='year_id', y=column, data=data_wdi, err_style='bars', ax=ax)

fig.savefig('wdi2.png')

In [None]:
# Add the natural log of GDP per capita as a new column `gdp_log`.
data_wdi = data_wdi.assign(gdp_log=np.log(data_wdi['gdp_pcap_value']))
data_wdi

In [None]:
# Compare the log-transformed GDP (top panel) with the raw, skewed GDP (bottom).
# Bound to `fig` so the `figure` function imported from matplotlib.pyplot is
# not overwritten.
fig, axes = plt.subplots(2, 1, figsize=(10, 55))
for ax, column in zip(axes, ('gdp_log', 'gdp_pcap_value')):
    sns.histplot(data_wdi[column], ax=ax)
fig.savefig('wdi_log.png')

In [None]:
# Persist the cleaned WDI panel. The row index is written too (pandas'
# default, spelled out here), so the file has a leading unnamed index column.
data_wdi.to_csv('data_wdi.csv', index=True)

## 2. Development assistance to health

In [None]:
# Read the development-assistance-for-health table (already cleaned up) and show it.
data_dah = pd.read_csv(filepath_or_buffer='DAH.csv')
display(data_dah)

In [None]:
# Keep only the 47 ISO3 country codes that make up the study sample.
ssa_iso_codes = [
    'AGO', 'BDI', 'BEN', 'BFA', 'BWA', 'CAF', 'CIV', 'CMR', 'COD',
    'COG', 'COM', 'CPV', 'DJI', 'ERI', 'ETH', 'GAB', 'GHA', 'GIN',
    'GMB', 'GNB', 'GNQ', 'KEN', 'LBR', 'LSO', 'MDG', 'MLI', 'MOZ',
    'MRT', 'MWI', 'NAM', 'NER', 'NGA', 'RWA', 'SDN', 'SEN', 'SLE',
    'SOM', 'SSD', 'STP', 'SWZ', 'TCD', 'TGO', 'TZA', 'UGA', 'ZAF',
    'ZMB', 'ZWE',
]
data_dah = data_dah[data_dah['iso'].isin(ssa_iso_codes)]

In [None]:
# Null count, sum, mean, median and max of the numeric DAH columns.
custom_describe(data_dah)

In [None]:
# Show the available columns before narrowing the frame. The bare
# `data_dah.columns` expression used here before was not the last statement
# of the cell, so its value was silently discarded.
display(data_dah.columns)
# Keep the identifiers and the total-aid variable only.
data_dah = data_dah.filter(['country', 'year_id', 'iso','ihme_dah_total_aid'])

In [None]:
# Missingness check: percentage of null entries in every column of data_dah.
cols_list = list(data_dah.columns)
values_list = [data_dah[name].isnull().mean() * 100 for name in cols_list]

# One row per column: its name and its share of missing values (in percent).
pct_missing_df = pd.DataFrame({"col": cols_list, "pct_missing": values_list})
pct_missing_df

In [None]:
# Histogram of total health aid with a kernel-density overlay; the figure
# that owns the returned Axes is written to disk.
dah_hist_ax = sns.histplot(data_dah['ihme_dah_total_aid'], kde=True)

dah_hist_ax.figure.savefig('dah1.png')

In [None]:
# Yearly trend of total health aid with error bars; saved through the figure
# that owns the returned Axes.
dah_trend_ax = sns.lineplot(data=data_dah, x='year_id', y='ihme_dah_total_aid',
                            err_style='bars')

dah_trend_ax.figure.savefig('dah2.png')

In [None]:
# Log-transform total aid into `dah_total_log`.
# NOTE(review): np.log yields -inf for zero aid and NaN for negative values;
# confirm the series is strictly positive (the disaster counts further down
# use np.log1p instead).
data_dah['dah_total_log'] = np.log(data_dah['ihme_dah_total_aid'])
display(data_dah)

# Raw aid (top) against its log (bottom). Bound to `fig` so the `figure`
# function imported from matplotlib.pyplot is not overwritten.
fig, axes = plt.subplots(2, 1, figsize=(10, 55))
for ax, column in zip(axes, ('ihme_dah_total_aid', 'dah_total_log')):
    sns.histplot(data_dah[column], ax=ax)

fig.savefig('dah_log.png')

In [None]:
# Null count, sum, mean, median and max of the numeric DAH columns,
# including the log-transformed aid added above.
custom_describe(data_dah)

In [None]:
# Persist the cleaned DAH panel; the row index is written as well (pandas'
# default, spelled out here).
data_dah.to_csv('data_dah.csv', index=True)

## 3.  WASH, ND-GAIN country indicators

In [None]:
# Read the drinking-water and sanitation access tables and show both.
data_water, data_sani = (
    pd.read_csv(csv_name) for csv_name in ('access_drink.csv', 'access_sani.csv')
)
display(data_water, data_sani)

In [None]:
# Bring both wide tables into long format: rename the country-code column to
# `iso`, drop the country name, then stack the year columns into `year_id`.
data_water = (
    data_water
    .rename(columns={'ISO3': 'iso'})
    .drop(columns=['Name'])
    .melt(id_vars='iso', var_name='year_id', value_name='access_drinkwater_index')
)
data_sani = (
    data_sani
    .rename(columns={'ISO3': 'iso'})
    .drop(columns=['Name'])
    .melt(id_vars='iso', var_name='year_id', value_name='access_sani_index')
)

In [None]:
# Show the dtypes before the cast, then turn the year labels into integers
# in both long frames.
display(data_sani.dtypes, data_water.dtypes)
for wash_frame in (data_water, data_sani):
    wash_frame['year_id'] = wash_frame['year_id'].astype(str).astype(int)

In [None]:
# Outer-join water and sanitation access on country code and year.
data_wash = data_water.merge(data_sani, how="outer", on=["iso", "year_id"])

In [None]:
# Keep only the sub-Saharan Africa (SSA) ISO codes, then collapse to one row
# per country-year.
ssa_iso_codes = [
    'AGO', 'BDI', 'BEN', 'BFA', 'BWA', 'CAF', 'CIV', 'CMR', 'COD',
    'COG', 'COM', 'CPV', 'DJI', 'ERI', 'ETH', 'GAB', 'GHA', 'GIN',
    'GMB', 'GNB', 'GNQ', 'KEN', 'LBR', 'LSO', 'MDG', 'MLI', 'MOZ',
    'MRT', 'MWI', 'NAM', 'NER', 'NGA', 'RWA', 'SDN', 'SEN', 'SLE',
    'SOM', 'SSD', 'STP', 'SWZ', 'TCD', 'TGO', 'TZA', 'UGA', 'ZAF',
    'ZMB', 'ZWE',
]
data_wash = data_wash.loc[data_wash['iso'].isin(ssa_iso_codes)]
# min_count=1 keeps a country-year NaN when all of its values are missing.
# A plain sum() turns those into 0, which reads as "index value 0" and makes
# the missingness check that follows report no gaps at all.
data_wash = data_wash.groupby(['iso','year_id']).sum(min_count=1).reset_index()
display(data_wash)

In [None]:
# Missingness check: percentage of null entries in every column of data_wash.
cols_list = list(data_wash.columns)
values_list = [data_wash[name].isnull().mean() * 100 for name in cols_list]

# One row per column: its name and its share of missing values (in percent).
pct_missing_df = pd.DataFrame({"col": cols_list, "pct_missing": values_list})
pct_missing_df

In [None]:
# Null count, sum, mean, median and max of the numeric WASH columns.
custom_describe(data_wash)

In [None]:
# Histograms (with KDE overlay) of the two access indices, one panel each.
# Bound to `fig` so the `figure` function imported from matplotlib.pyplot is
# not overwritten.
fig, axes = plt.subplots(2, 1, figsize=(10, 60))
for ax, column in zip(axes, ('access_drinkwater_index', 'access_sani_index')):
    sns.histplot(data_wash[column], kde=True, ax=ax)

fig.savefig('wash1.png')

In [None]:
# Yearly trend of the two access indices, one panel each.
# Bound to `fig` so the `figure` function imported from matplotlib.pyplot is
# not overwritten.
fig, axes = plt.subplots(2, 1, figsize=(10, 60))
for ax, column in zip(axes, ('access_drinkwater_index', 'access_sani_index')):
    sns.lineplot(x='year_id', y=column, data=data_wash, ax=ax)

fig.savefig('wash2.png')

In [None]:
# Persist the WASH panel; the row index is written as well (pandas' default,
# spelled out here).
data_wash.to_csv('data_wash.csv', index=True)

## 4. Exposure and Vulnerability

In [None]:
def report(df):
    """Print a one-line summary of a frame indexed by (time, unit).

    The summary states the number of columns, the smallest and largest value
    found on index level 0, and the number of distinct values on index
    level 1. A blank line is printed first. Nothing is returned.
    """
    print()
    time_values = df.index.get_level_values(0)
    unit_values = df.index.get_level_values(1)
    n_units = len(np.unique(unit_values))
    summary = (
        f"A dataset with {len(df.columns)} columns, with "
        f"data between t {min(time_values)} "
        f"and {max(time_values)}. "
        f"({n_units} units)"
    )
    print(summary)

In [None]:
# Check whether the data is already available in the VIEWS system by listing
# the tables through the viewser command-line tool.
!viewser tables list

In [None]:
# Show the `gdis_pgy` table through the viewser command-line tool.
!viewser tables show gdis_pgy

In [None]:
# Build a viewser Queryset named "gdis_pgy" at the "priogrid_year" level with
# three columns taken from the `gdis_pgy` table, then publish and fetch it.
# The time window (1990-2019) and the mapping from grid cells to countries are
# NOT applied here; both happen in the cells that follow.
# NOTE(review): the source columns are spelled `gids_*` while the table is
# `gdis_pgy` — confirm the spelling against `viewser tables show gdis_pgy`.
data_disaster = (Queryset("gdis_pgy", "priogrid_year")
                 .with_column(Column("gdis_count", from_table="gdis_pgy", from_column="gids_disasterno_count"))
                # NOTE(review): a `.agggregate('sum')` call (sic) was left commented out here.
                 .with_column(Column("gdis_nunique", from_table="gdis_pgy", from_column="gids_disasterno_nunique"))
                # NOTE(review): a `.agggregate('sum')` call (sic) was left commented out here.
                 .with_column(Column("gdis_type", from_table="gdis_pgy", from_column="gids_disastertype_join")))


# Publish the queryset definition and download the resulting frame.
data_disaster = data_disaster.publish().fetch()

data_disaster

In [None]:
# Flatten the index into columns and use the `pg_id` name for the grid id.
data_disaster = data_disaster.reset_index()
data_disaster = data_disaster.rename(columns={'priogrid_gid':'pg_id'})
# Study period 1990-2019, both bounds inclusive.
data_disaster = data_disaster[data_disaster['year_id'].between(1990, 2019)]
# Coarse bounding box around Africa; the exact country sample is selected by
# ISO code in a later cell. The previous box (lon -10..40) removed every grid
# cell west of 10°W and east of 40°E, i.e. countries such as Senegal,
# Cape Verde, Madagascar and most of Somalia that are part of the ISO sample.
# NOTE(review): assumes `.pgy.lon` / `.pgy.lat` are in decimal degrees.
data_disaster = data_disaster[(data_disaster.pgy.lon.between(-26, 52)) & (data_disaster.pgy.lat.between(-35, 35))]
data_disaster

In [None]:
# Attach the country id reported by the `pgy` accessor for every grid-cell
# row, in preparation for aggregating to country-year level.
data_disaster = data_disaster.assign(c_id=data_disaster.pgy.c_id)
data_disaster

In [None]:
# Add the ISO code exposed by the `c` accessor as `isoab`; it is matched
# against three-letter codes in the country filter further down.
data_disaster = data_disaster.assign(iso=data_disaster.c.isoab)
data_disaster

In [None]:
# Rows for which no ISO code could be attached.
missing_iso_mask = data_disaster['iso'].isna()
inspect = data_disaster[missing_iso_mask]
display(inspect)

In [None]:
# Keep only the 47 ISO3 codes of the study sample, then aggregate the grid
# cells to one row per country-year.
ssa_iso_codes = [
    'AGO', 'BDI', 'BEN', 'BFA', 'BWA', 'CAF', 'CIV', 'CMR', 'COD',
    'COG', 'COM', 'CPV', 'DJI', 'ERI', 'ETH', 'GAB', 'GHA', 'GIN',
    'GMB', 'GNB', 'GNQ', 'KEN', 'LBR', 'LSO', 'MDG', 'MLI', 'MOZ',
    'MRT', 'MWI', 'NAM', 'NER', 'NGA', 'RWA', 'SDN', 'SEN', 'SLE',
    'SOM', 'SSD', 'STP', 'SWZ', 'TCD', 'TGO', 'TZA', 'UGA', 'ZAF',
    'ZMB', 'ZWE',
]
data_disaster = data_disaster.loc[data_disaster['iso'].isin(ssa_iso_codes)]
# numeric_only=True: summing the text column `gdis_type` is meaningless and,
# depending on the pandas version, either concatenates strings or raises.
# NOTE(review): the summed `pg_id` and `c_id` columns are still present but
# carry no meaning after aggregation.
data_disaster = data_disaster.groupby(['iso','year_id']).sum(numeric_only=True).reset_index()
display(data_disaster)

In [None]:
# Remove the summed country id; it has no meaning after aggregation.
data_disaster = data_disaster.drop(columns=['c_id'])

In [None]:
#example distribution plot with kernel densities
%matplotlib inline 
figure, axes = plt.subplots(2,1,figsize=(10,60))
sns.displot(data_disaster['gdis_count'],kde=True,ax=axes[0])
sns.displot(data_disaster['gdis_nunique'],kde=True,ax=axes[1])
plt.savefig('data_disaster1.png')

In [None]:
# Yearly trend of both disaster measures with error bars, one panel each.
# Bound to `fig` so the `figure` function imported from matplotlib.pyplot is
# not overwritten.
fig, axes = plt.subplots(2, 1, figsize=(10, 60))

for ax, column in zip(axes, ('gdis_count', 'gdis_nunique')):
    sns.lineplot(x='year_id', y=column, data=data_disaster, err_style='bars', ax=ax)
fig.savefig('data_disaster2.png')

In [None]:
# Missingness check: percentage of null entries in every column of data_disaster.
cols_list = list(data_disaster.columns)
values_list = [data_disaster[name].isnull().mean() * 100 for name in cols_list]

# One row per column: its name and its share of missing values (in percent).
pct_missing_df = pd.DataFrame({"col": cols_list, "pct_missing": values_list})
pct_missing_df

In [None]:
# Log-transform the disaster count with log(1 + x), which is defined for zero counts.
# NOTE(review): the column is called `gdis_count_lag` although it holds a log
# transform, not a lag. The name is kept because it ends up in
# data_disaster.csv; rename only together with every consumer of that file.
data_disaster['gdis_count_lag'] = np.log1p(data_disaster['gdis_count'])
display(data_disaster)

# Raw count (top) against its log1p transform (bottom). Bound to `fig` so the
# `figure` function imported from matplotlib.pyplot is not overwritten.
fig, axes = plt.subplots(2, 1, figsize=(10, 55))
for ax, column in zip(axes, ('gdis_count', 'gdis_count_lag')):
    sns.histplot(data_disaster[column], ax=ax)

fig.savefig('disaster_log.png')

In [None]:
# Persist the country-year disaster panel; the row index is written as well
# (pandas' default, spelled out here).
data_disaster.to_csv('data_disaster.csv', index=True)

### EM-DAT data for vulnerability

In [None]:
import openpyxl

# Read the 'emdat data' worksheet of emdat.xlsx into a DataFrame.
workbook = openpyxl.load_workbook('emdat.xlsx')

# Select the worksheet to work with
worksheet = workbook['emdat data']

# values_only=True yields the plain cell values row by row, which replaces the
# manual cell-by-cell loop.
data = [list(row) for row in worksheet.iter_rows(values_only=True)]
if not data:
    raise ValueError("worksheet 'emdat data' in emdat.xlsx contains no rows")

# The first worksheet row holds the column names, the rest is data. Building
# the frame in one step gives an independent frame (no slice of another frame,
# hence no SettingWithCopyWarning when columns are cast later); its index
# starts at 0.
data_vn = pd.DataFrame(data[1:], columns=data[0])

In [None]:
# Show the EM-DAT frame as the cell output.
data_vn

In [None]:
# List the EM-DAT column names to pick the ones needed below.
data_vn.columns

In [None]:
# Cast the two impact measures and the year to float in one step.
data_vn = data_vn.astype({
    'Total Deaths': float,
    'Total Affected': float,
    'Year': float,
})

In [None]:
# Keep year, country code and the two impact measures under shorter names.
data_vn = (
    data_vn
    .filter(['Year', 'ISO', 'Total Deaths', 'Total Affected'])
    .rename(columns={'Year': 'year_id',
                     'Total Deaths': 'death_tot',
                     'Total Affected': 'tot_affect'})
)
# Temporal scope: 1990-2019, both bounds inclusive.
data_vn = data_vn[data_vn['year_id'].between(1990, 2019)]

In [None]:
# Keep only the 47 ISO3 codes of the study sample and add up the impact
# figures of all events within each country-year.
ssa_iso_codes = [
    'AGO', 'BDI', 'BEN', 'BFA', 'BWA', 'CAF', 'CIV', 'CMR', 'COD',
    'COG', 'COM', 'CPV', 'DJI', 'ERI', 'ETH', 'GAB', 'GHA', 'GIN',
    'GMB', 'GNB', 'GNQ', 'KEN', 'LBR', 'LSO', 'MDG', 'MLI', 'MOZ',
    'MRT', 'MWI', 'NAM', 'NER', 'NGA', 'RWA', 'SDN', 'SEN', 'SLE',
    'SOM', 'SSD', 'STP', 'SWZ', 'TCD', 'TGO', 'TZA', 'UGA', 'ZAF',
    'ZMB', 'ZWE',
]
in_sample = data_vn['ISO'].isin(ssa_iso_codes)
data_vn = data_vn[in_sample].groupby(['ISO', 'year_id']).sum().reset_index()
display(data_vn)

In [None]:
# Use the lower-case `iso` column name that the other frames in this notebook use.
data_vn = data_vn.rename({'ISO': 'iso'}, axis='columns')

In [None]:
# Descriptives: null count, sum, mean, median and max of the numeric EM-DAT columns.
custom_describe(data_vn)

In [None]:
#example distribution plot with kernel densities
%matplotlib inline 
figure, axes = plt.subplots(2,1,figsize=(10,60))

sns.displot(data_vn['death_tot'], kde=True, ax= axes[0])
sns.displot(data_vn['tot_affect'], kde=True, ax= axes[1])
plt.savefig('emdat1.png')

In [None]:
# Yearly trend of deaths and people affected with error bars, one panel each.
# Bound to `fig` so the `figure` function imported from matplotlib.pyplot is
# not overwritten.
# NOTE(review): this figure is saved as PDF while the other figures in the
# notebook are PNG — confirm that is intended.
fig, axes = plt.subplots(2, 1, figsize=(10, 60))

for ax, column in zip(axes, ('death_tot', 'tot_affect')):
    sns.lineplot(x='year_id', y=column, data=data_vn, err_style='bars', ax=ax)
fig.savefig('emdat2.pdf')

In [None]:
# Missingness check: percentage of null entries in every column of data_vn.
cols_list = list(data_vn.columns)
values_list = [data_vn[name].isnull().mean() * 100 for name in cols_list]

# One row per column: its name and its share of missing values (in percent).
pct_missing_df = pd.DataFrame({"col": cols_list, "pct_missing": values_list})
pct_missing_df

In [None]:
# Persist the country-year EM-DAT panel; the row index is written as well
# (pandas' default, spelled out here).
data_vn.to_csv('data_vn.csv', index=True)

## 6. Universal Health Coverage (UHC) + health worker density

In [None]:
# Read the universal-health-coverage and health-worker-density tables.
data_uhc, data_worker = (
    pd.read_csv(csv_name)
    for csv_name in ('Universal Health Coverage.csv', 'Health Worker Density.csv')
)

In [None]:
# Quick look at the UHC table: dtypes, column names, summary statistics of
# all columns, and the years that occur (each printed on its own line block).
print(
    data_uhc.dtypes,
    data_uhc.columns,
    data_uhc.describe(include='all'),
    data_uhc.year_id.unique(),
    sep='\n',
)

In [None]:
# Quick look at the health-worker table: dtypes, column names, summary
# statistics of all columns, and the years that occur.
print(
    data_worker.dtypes,
    data_worker.columns,
    data_worker.describe(include='all'),
    data_worker.year_id.unique(),
    sep='\n',
)

In [None]:
# Keep the all-worker density measure together with the year and country code.
worker_columns = ['ihme_healthworkers_all_mean', 'year_id', 'iso']
data_worker = data_worker.filter(items=worker_columns)
data_worker
                           

In [None]:
# Keep only the 47 ISO3 codes of the study sample, then collapse to one row
# per country-year.
ssa_iso_codes = [
    'COG', 'MWI', 'MDG', 'GNQ', 'AGO', 'COM', 'ETH', 'ERI', 'BWA',
    'BDI', 'CAF', 'NER', 'DJI', 'TCD', 'MRT', 'CPV', 'MOZ', 'ZMB',
    'CIV', 'GAB', 'SWZ', 'ZWE', 'BEN', 'COD', 'SOM', 'LSO', 'NAM',
    'STP', 'SSD', 'ZAF', 'UGA', 'KEN', 'NGA', 'MLI', 'SDN', 'LBR',
    'TGO', 'TZA', 'GMB', 'RWA', 'GIN', 'GHA', 'CMR', 'BFA', 'SEN',
    'SLE', 'GNB',
]
data_worker = data_worker.loc[data_worker['iso'].isin(ssa_iso_codes)]
# min_count=1 keeps a country-year NaN when its value is missing. A plain
# sum() turns it into 0, which reads as "no health workers" and hides the gap
# from the missingness check that follows.
data_worker = data_worker.groupby(['iso','year_id']).sum(min_count=1).reset_index()
display(data_worker)

In [None]:
## only years from 1990 onwards
# The mask is built from data_worker itself. It was previously built from
# data_uhc, a different frame with a different index, so rows were selected by
# the wrong table's years (or pandas raised on the unalignable boolean mask).
data_worker = data_worker.loc[data_worker['year_id'] >= 1990]

In [None]:
# Missingness check: percentage of null entries in every column of data_worker.
cols_list = list(data_worker.columns)
values_list = [data_worker[name].isnull().mean() * 100 for name in cols_list]

# One row per column: its name and its share of missing values (in percent).
pct_missing_df = pd.DataFrame({"col": cols_list, "pct_missing": values_list})
pct_missing_df

In [None]:
# Histogram of health-worker density with a kernel-density overlay; the
# figure that owns the returned Axes is written to disk.
worker_hist_ax = sns.histplot(data_worker['ihme_healthworkers_all_mean'], kde=True)


worker_hist_ax.figure.savefig('healthworker1.png')

In [None]:
# Distinct years that remain in the health-worker panel.
data_worker.year_id.unique()

In [None]:
# TODO: the health-worker variable probably needs to be log-transformed

In [None]:
# Yearly trend of health-worker density with error bars; saved through the
# figure that owns the returned Axes.
worker_trend_ax = sns.lineplot(data=data_worker, x='year_id',
                               y='ihme_healthworkers_all_mean', err_style='bars')
worker_trend_ax.figure.savefig('healthworker2.png')

In [None]:
# Persist the health-worker-density panel; the row index is written as well
# (pandas' default, spelled out here).
data_worker.to_csv('data_hwd.csv', index=True)

In [None]:
# The UHC index is only reported every five years or so and has a lot of missingness – I will impute the data so that it forms a panel from at least 2000 to 2019.

## 7. Population size

In [3]:
# Read the population-size extract, skipping the first four lines of the
# file, then show its column names and the frame itself.
dta = pd.read_csv(filepath_or_buffer='pop_size.csv', skiprows=4)
display(dta.columns)
dta

Index(['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code',
       '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968',
       '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977',
       '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986',
       '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995',
       '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004',
       '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013',
       '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022',
       'Unnamed: 67'],
      dtype='object')

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,Unnamed: 67
0,Aruba,ABW,"Population, total",SP.POP.TOTL,54608.0,55811.0,56682.0,57475.0,58178.0,58782.0,...,103594.0,104257.0,104874.0,105439.0,105962.0,106442.0,106585.0,106537.0,106445.0,
1,Africa Eastern and Southern,AFE,"Population, total",SP.POP.TOTL,130692579.0,134169237.0,137835590.0,141630546.0,145605995.0,149742351.0,...,583651101.0,600008424.0,616377605.0,632746570.0,649757148.0,667242986.0,685112979.0,702977106.0,720839314.0,
2,Afghanistan,AFG,"Population, total",SP.POP.TOTL,8622466.0,8790140.0,8969047.0,9157465.0,9355514.0,9565147.0,...,32716210.0,33753499.0,34636207.0,35643418.0,36686784.0,37769499.0,38972230.0,40099462.0,41128771.0,
3,Africa Western and Central,AFW,"Population, total",SP.POP.TOTL,97256290.0,99314028.0,101445032.0,103667517.0,105959979.0,108336203.0,...,397855507.0,408690375.0,419778384.0,431138704.0,442646825.0,454306063.0,466189102.0,478185907.0,490330870.0,
4,Angola,AGO,"Population, total",SP.POP.TOTL,5357195.0,5441333.0,5521400.0,5599827.0,5673199.0,5736582.0,...,27128337.0,28127721.0,29154746.0,30208628.0,31273533.0,32353588.0,33428486.0,34503774.0,35588987.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
261,Kosovo,XKX,"Population, total",SP.POP.TOTL,947000.0,966000.0,994000.0,1022000.0,1050000.0,1078000.0,...,1812771.0,1788196.0,1777557.0,1791003.0,1797085.0,1788878.0,1790133.0,1786038.0,1761985.0,
262,"Yemen, Rep.",YEM,"Population, total",SP.POP.TOTL,5542459.0,5646668.0,5753386.0,5860197.0,5973803.0,6097298.0,...,27753304.0,28516545.0,29274002.0,30034389.0,30790513.0,31546691.0,32284046.0,32981641.0,33696614.0,
263,South Africa,ZAF,"Population, total",SP.POP.TOTL,16520441.0,16989464.0,17503133.0,18042215.0,18603097.0,19187194.0,...,54729551.0,55876504.0,56422274.0,56641209.0,57339635.0,58087055.0,58801927.0,59392255.0,59893885.0,
264,Zambia,ZMB,"Population, total",SP.POP.TOTL,3119430.0,3219451.0,3323427.0,3431381.0,3542764.0,3658024.0,...,15737793.0,16248230.0,16767761.0,17298054.0,17835893.0,18380477.0,18927715.0,19473125.0,20017675.0,


In [5]:
# Show the raw population frame as the cell output.
dta

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,Unnamed: 67
0,Aruba,ABW,"Population, total",SP.POP.TOTL,54608.0,55811.0,56682.0,57475.0,58178.0,58782.0,...,103594.0,104257.0,104874.0,105439.0,105962.0,106442.0,106585.0,106537.0,106445.0,
1,Africa Eastern and Southern,AFE,"Population, total",SP.POP.TOTL,130692579.0,134169237.0,137835590.0,141630546.0,145605995.0,149742351.0,...,583651101.0,600008424.0,616377605.0,632746570.0,649757148.0,667242986.0,685112979.0,702977106.0,720839314.0,
2,Afghanistan,AFG,"Population, total",SP.POP.TOTL,8622466.0,8790140.0,8969047.0,9157465.0,9355514.0,9565147.0,...,32716210.0,33753499.0,34636207.0,35643418.0,36686784.0,37769499.0,38972230.0,40099462.0,41128771.0,
3,Africa Western and Central,AFW,"Population, total",SP.POP.TOTL,97256290.0,99314028.0,101445032.0,103667517.0,105959979.0,108336203.0,...,397855507.0,408690375.0,419778384.0,431138704.0,442646825.0,454306063.0,466189102.0,478185907.0,490330870.0,
4,Angola,AGO,"Population, total",SP.POP.TOTL,5357195.0,5441333.0,5521400.0,5599827.0,5673199.0,5736582.0,...,27128337.0,28127721.0,29154746.0,30208628.0,31273533.0,32353588.0,33428486.0,34503774.0,35588987.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
261,Kosovo,XKX,"Population, total",SP.POP.TOTL,947000.0,966000.0,994000.0,1022000.0,1050000.0,1078000.0,...,1812771.0,1788196.0,1777557.0,1791003.0,1797085.0,1788878.0,1790133.0,1786038.0,1761985.0,
262,"Yemen, Rep.",YEM,"Population, total",SP.POP.TOTL,5542459.0,5646668.0,5753386.0,5860197.0,5973803.0,6097298.0,...,27753304.0,28516545.0,29274002.0,30034389.0,30790513.0,31546691.0,32284046.0,32981641.0,33696614.0,
263,South Africa,ZAF,"Population, total",SP.POP.TOTL,16520441.0,16989464.0,17503133.0,18042215.0,18603097.0,19187194.0,...,54729551.0,55876504.0,56422274.0,56641209.0,57339635.0,58087055.0,58801927.0,59392255.0,59893885.0,
264,Zambia,ZMB,"Population, total",SP.POP.TOTL,3119430.0,3219451.0,3323427.0,3431381.0,3542764.0,3658024.0,...,15737793.0,16248230.0,16767761.0,17298054.0,17835893.0,18380477.0,18927715.0,19473125.0,20017675.0,


In [6]:
# Clean-up: drop the descriptive columns, rename the country code to `iso`
# and stack the year columns into long format.
dta = (
    dta
    .drop(columns=['Country Name', 'Indicator Name', 'Indicator Code', 'Unnamed: 67'])
    .rename(columns={'Country Code': 'iso'})
    .melt(id_vars='iso', var_name='year_id', value_name='pop_size')
)
dta['year_id'] = dta['year_id'].astype(str).astype(int)

# Restrict to the years 1990-2019 (inclusive) and to the countries in SSA.
ssa_iso_codes = [
    'AGO', 'BDI', 'BEN', 'BFA', 'BWA', 'CAF', 'CIV', 'CMR', 'COD',
    'COG', 'COM', 'CPV', 'DJI', 'ERI', 'ETH', 'GAB', 'GHA', 'GIN',
    'GMB', 'GNB', 'GNQ', 'KEN', 'LBR', 'LSO', 'MDG', 'MLI', 'MOZ',
    'MRT', 'MWI', 'NAM', 'NER', 'NGA', 'RWA', 'SDN', 'SEN', 'SLE',
    'SOM', 'SSD', 'STP', 'SWZ', 'TCD', 'TGO', 'TZA', 'UGA', 'ZAF',
    'ZMB', 'ZWE',
]
in_period = dta['year_id'].between(1990, 2019)
in_sample = dta['iso'].isin(ssa_iso_codes)
dta = dta[in_period & in_sample]

# Save the long population panel.
dta.to_csv('data_pop_size.csv')

In [None]:
# Summary statistics of the DAH and UHC frames, rendered one after the other.
display(custom_describe(data_dah), custom_describe(data_uhc))

In [None]:
# Null count, sum, mean, median and max of the numeric WDI columns.
custom_describe(data_wdi)

In [None]:
# Summary statistics of the WASH, disaster and EM-DAT frames, in that order.
for summary_frame in (data_wash, data_disaster, data_vn):
    display(custom_describe(summary_frame))