In [15]:
from pandas_datareader import wb
import pandas as pd
import awoc

In [16]:
# World Bank indicator codes to download.
# You can find the indicator code from https://data.worldbank.org/
indicator = [
    'NY.GDP.PCAP.PP.CD',   # per capita GDP
    'SI.POV.GINI',         # GINI index
    'SE.XPD.TOTL.GD.ZS',   # % of GDP spent on education
    'SE.PRM.CMPT.FE.ZS',   # % female primary education
    'SE.PRM.CMPT.MA.ZS',   # % male primary education
    'EG.ELC.ACCS.ZS',      # access to electricity (% of population)
    'ER.H2O.FWTL.ZS',      # % annual freshwater withdrawals (internal)
    'SH.XPD.CHEX.GD.ZS',   # % of GDP spent on health
    'SL.UEM.TOTL.FE.ZS',   # % unemployment, female
    'SL.UEM.TOTL.MA.ZS',   # % unemployment, male
]

Raw World Bank data for the above indicators

In [5]:
# Download every requested indicator for all World Bank entities over
# 2011-2022. The result is indexed by (country, year) and, as the preview
# shows, also contains regional aggregates such as
# 'Africa Eastern and Southern'.
df_wb_raw = wb.download(
    indicator=indicator,
    country='all',
    start=2011,
    end=2022,
)
df_wb_raw.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,NY.GDP.PCAP.PP.CD,SI.POV.GINI,SE.XPD.TOTL.GD.ZS,SE.PRM.CMPT.FE.ZS,SE.PRM.CMPT.MA.ZS,EG.ELC.ACCS.ZS,ER.H2O.FWTL.ZS,SH.XPD.CHEX.GD.ZS,SL.UEM.TOTL.FE.ZS,SL.UEM.TOTL.MA.ZS
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Africa Eastern and Southern,2021,3839.470058,,,,,,,,8.495897,7.401659
Africa Eastern and Southern,2020,3621.058106,,4.60817,70.825378,73.10321,45.609604,,,8.120844,7.192637
Africa Eastern and Southern,2019,3777.972368,,4.600084,69.990547,71.82766,44.073912,4.929973,6.272034,7.620241,6.673629
Africa Eastern and Southern,2018,3724.868177,,4.83459,69.4991,70.889862,42.880977,4.928115,6.323236,7.417152,6.461407
Africa Eastern and Southern,2017,3635.564622,,4.862555,68.167793,69.157753,40.092163,4.881813,6.317424,7.455044,6.480417


Creating new dataframe to work on

In [17]:
# Turn the (country, year) index into ordinary columns, then swap the
# World Bank indicator codes for readable column labels.
df_wb = (
    df_wb_raw
    .reset_index()
    .rename(columns={
        'country': 'Country',
        'year': 'Year',
        'NY.GDP.PCAP.PP.CD': 'Per Capita GDP',
        'SI.POV.GINI': 'GINI Index',
        'SE.XPD.TOTL.GD.ZS': '% GDP on Edu',
        'SE.PRM.CMPT.FE.ZS': '% Fem Pri Edu',
        'SE.PRM.CMPT.MA.ZS': '% Male Pri Edu',
        'EG.ELC.ACCS.ZS': 'Access to Electricity (% of pop)',
        'ER.H2O.FWTL.ZS': '% Annual Freshwater Withdrawals (internal)',
        'SH.XPD.CHEX.GD.ZS': '% of GDP on Health',
        'SL.UEM.TOTL.FE.ZS': '% Unemp Fem',
        'SL.UEM.TOTL.MA.ZS': '% Unemp Male',
    })
)
df_wb.head()

Unnamed: 0,Country,Year,Per Capita GDP,GINI Index,% GDP on Edu,% Fem Pri Edu,% Male Pri Edu,Access to Electricity (% of pop),% Annual Freshwater Withdrawals (internal),% of GDP on Health,% Unemp Fem,% Unemp Male
0,Africa Eastern and Southern,2021,3839.470058,,,,,,,,8.495897,7.401659
1,Africa Eastern and Southern,2020,3621.058106,,4.60817,70.825378,73.10321,45.609604,,,8.120844,7.192637
2,Africa Eastern and Southern,2019,3777.972368,,4.600084,69.990547,71.82766,44.073912,4.929973,6.272034,7.620241,6.673629
3,Africa Eastern and Southern,2018,3724.868177,,4.83459,69.4991,70.889862,42.880977,4.928115,6.323236,7.417152,6.461407
4,Africa Eastern and Southern,2017,3635.564622,,4.862555,68.167793,69.157753,40.092163,4.881813,6.317424,7.455044,6.480417


Extracting the data for only countries in Africa

In [18]:
# awoc world object, used to look up country names by continent.
my_world = awoc.AWOC()
countries_africa = my_world.get_countries_list_of('Africa')

# One row per African country name, each tagged with its continent.
df_country = (
    pd.DataFrame(countries_africa, columns=['Country'])
    .assign(Continent='Africa')
)
df_country.head()

Unnamed: 0,Country,Continent
0,Algeria,Africa
1,Angola,Africa
2,Benin,Africa
3,Botswana,Africa
4,Burkina Faso,Africa


In [19]:
# Number of distinct country names awoc returned for Africa.
df_country['Country'].nunique(dropna=True)

58

Checking countries with different spellings

In [20]:
# Trial inner join: only World Bank rows whose country name exactly matches
# an awoc name survive, so the distinct count reveals how many names line up.
df_temp = pd.merge(df_wb, df_country, how='inner', on='Country')
df_temp['Country'].nunique(dropna=True)

47

In [21]:
# Country names that matched in the trial join ...
temp_country = df_temp['Country'].drop_duplicates().tolist()
# ... and the awoc names that found no match in the World Bank data.
missing = [*(set(countries_africa) - set(temp_country))]
print(missing)

['Egypt', 'Democratic Republic of the Congo', 'Saint Helena', 'Mayotte', 'Ivory Coast', 'Reunion', 'Republic of the Congo', 'Cape Verde', 'Gambia', 'Swaziland', 'Western Sahara']


Renaming countries in df_country to match the names used in the World Bank data (df_wb). Of the 11 missing countries, 5 had a different spelling and data for the remaining 6 is not available. The spellings were updated to those used in the World Bank data.

In [22]:
# Align the awoc spellings with the names used in the World Bank data so the
# inner merge on 'Country' can match them.
#
# Only the 'Country' column is rewritten. A row-only assignment such as
# `df_country.loc[mask] = 'Egypt, Arab Rep.'` overwrites every column of the
# matched row, which would also replace 'Continent' with the country name.
#
# NOTE(review): 'Cape Verde' and 'Swaziland' may appear in the World Bank data
# as 'Cabo Verde' and 'Eswatini' - confirm against df_wb and add them here if so.
df_country['Country'] = df_country['Country'].replace({
    'Egypt': 'Egypt, Arab Rep.',
    'Ivory Coast': "Cote d'Ivoire",
    'Republic of the Congo': 'Congo, Rep.',
    'Democratic Republic of the Congo': 'Congo, Dem. Rep.',
    'Gambia': 'Gambia, The',
})

In [23]:
# Restrict the World Bank frame to African countries and attach 'Continent'
# via an inner join on the (now aligned) country names.
#
# The cell overwrites df_wb with its own merge result, so a plain re-run would
# merge a second time and produce 'Continent_x' / 'Continent_y'. Dropping any
# existing 'Continent' column first makes the cell safe to re-run; on the
# first run the drop is a no-op.
df_wb = (
    df_wb
    .drop(columns='Continent', errors='ignore')
    .merge(df_country, on='Country', how='inner')
)
df_wb.head()

Unnamed: 0,Country,Year,Per Capita GDP,GINI Index,% GDP on Edu,% Fem Pri Edu,% Male Pri Edu,Access to Electricity (% of pop),% Annual Freshwater Withdrawals (internal),% of GDP on Health,% Unemp Fem,% Unemp Male,Continent
0,Algeria,2021,12128.274991,,,,,,,,20.475,9.63,Africa
1,Algeria,2020,11438.691156,,7.042397,103.3442,103.970131,99.804131,,,20.802,10.211,Africa
2,Algeria,2019,12118.0045,,6.10036,101.582573,101.15535,99.5,87.152129,6.243371,18.679001,8.532,Africa
3,Algeria,2018,12006.010238,,5.86635,105.625076,104.681931,99.697838,87.152129,6.163026,18.587999,8.442,Africa
4,Algeria,2017,11809.483033,,6.50538,105.750702,105.527252,99.63549,87.152129,6.279384,18.487,8.346,Africa


In [24]:
# Number of distinct African countries left after the merge.
df_wb['Country'].nunique(dropna=True)

52