# Part1: Reading Indicators from World Bank DataCatalog

In [1]:
import pandas as pd

In [2]:
#---- Reading data sets ---- World Bank Indicators
# loading csv file of development indicators as dataframe
indicators_df = pd.read_csv('Resources/WDIData.csv')
indicators_df.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,Unnamed: 65
0,Africa Eastern and Southern,AFE,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.ZS,,,,,,,...,16.320475,16.643243,16.994695,17.313168,17.60495,,,,,
1,Africa Eastern and Southern,AFE,Access to electricity (% of population),EG.ELC.ACCS.ZS,,,,,,,...,32.224027,32.046478,31.323579,33.312163,38.380433,39.754201,42.168241,43.640661,,
2,Africa Eastern and Southern,AFE,"Access to electricity, rural (% of rural popul...",EG.ELC.ACCS.RU.ZS,,,,,,,...,20.525353,19.461383,17.790698,16.55347,23.907897,24.624725,26.8139,28.84115,,
3,Africa Eastern and Southern,AFE,"Access to electricity, urban (% of urban popul...",EG.ELC.ACCS.UR.ZS,,,,,,,...,66.303599,66.49601,65.828988,66.926692,68.722184,71.085418,71.994933,73.589886,,
4,Africa Eastern and Southern,AFE,Account ownership at a financial institution o...,FX.OWN.TOTL.ZS,,,,,,,...,,,,,,,,,,


In [3]:
# cleaning the dataframe to keep only our information of interest
# get rid off spaces in column names
indicators_df.columns =[column.replace(" ", "_") for column in indicators_df.columns]
# reading country codes file, saved from World Bank Website
country_codes = pd.read_csv('Resources/CountryCodes.csv')
# Keep only rows with actual data
country_codes = country_codes.dropna()
# Keep only rows with country codes
e_row = country_codes.index[country_codes['Country_Name'] == 'World'].tolist()
country_codes = country_codes.drop(e_row)
# convert the country codes as a list
countries  = country_codes.ISO3.tolist()
countries

['AFG',
 'ALB',
 'DZA',
 'ASM',
 'AND',
 'AGO',
 'AIA',
 'ATG',
 'ARG',
 'ARM',
 'ABW',
 'AUS',
 'AUT',
 'AZE',
 'BHS',
 'BHR',
 'BGD',
 'BRB',
 'BLR',
 'BEL',
 'BLX',
 'BLZ',
 'BEN',
 'BMU',
 'BTN',
 'BOL',
 'BIH',
 'BWA',
 'BAT',
 'BRA',
 'IOT',
 'VGB',
 'BRN',
 'BGR',
 'BFA',
 'BDI',
 'KHM',
 'CMR',
 'CAN',
 'CPV',
 'CYM',
 'CAF',
 'TCD',
 'CHL',
 'CHN',
 'CXR',
 'CCK',
 'COL',
 'COM',
 'ZAR',
 'COG',
 'COK',
 'CRI',
 'CIV',
 'HRV',
 'CUB',
 'CYP',
 'CZE',
 'CSK',
 'DNK',
 'DJI',
 'DMA',
 'DOM',
 'TMP',
 'ECU',
 'EGY',
 'SLV',
 'GNQ',
 'ERI',
 'EST',
 'ETH',
 'ETF',
 'EUN',
 'FRO',
 'FLK',
 'FJI',
 'FIN',
 'PCZ',
 'ZW1',
 'TAN',
 'VDR',
 'SVR',
 'ZPM',
 'ATF',
 'FRA',
 'FRE',
 'GUF',
 'PYF',
 'GAB',
 'GMB',
 'GAZ',
 'GEO',
 'DDR',
 'DEU',
 'GHA',
 'GIB',
 'GRC',
 'GRL',
 'GRD',
 'GLP',
 'GUM',
 'GTM',
 'GIN',
 'GNB',
 'GUY',
 'HTI',
 'VAT',
 'HND',
 'HKG',
 'HUN',
 'ISL',
 'IND',
 'IDN',
 'IRN',
 'IRQ',
 'IRL',
 'ISR',
 'ITA',
 'JAM',
 'JPN',
 'JTN',
 'JOR',
 'KAZ',
 'KEN',
 'KIR',


In [4]:
# list of development indicators of interest
indicators = ['GDP growth (annual %)', 'GNI growth (annual %)','Population, total']
# filtering the dataframe to keep only information corresponding to all the countries and the chosen indicators
indicators_df.query('Indicator_Name == @indicators & Country_Code == @countries', inplace = True)
indicators_df.head(10)

Unnamed: 0,Country_Name,Country_Code,Indicator_Name,Indicator_Code,1960,1961,1962,1963,1964,1965,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,Unnamed:_65
71175,Afghanistan,AFG,GDP growth (annual %),NY.GDP.MKTP.KD.ZG,,,,,,,...,12.75229,5.600745,2.724543,1.451315,2.260314,2.647003,1.189228,3.911603,-2.351101,
71201,Afghanistan,AFG,GNI growth (annual %),NY.GNP.MKTP.KD.ZG,,,,,,,...,,,,,,,,,,
71767,Afghanistan,AFG,"Population, total",SP.POP.TOTL,8996967.0,9169406.0,9351442.0,9543200.0,9744772.0,9956318.0,...,31161380.0,32269590.0,33370800.0,34413600.0,35383030.0,36296110.0,37171920.0,38041760.0,38928340.0,
72618,Albania,ALB,GDP growth (annual %),NY.GDP.MKTP.KD.ZG,,,,,,,...,1.417243,1.002018,1.774449,2.218726,3.314981,3.802227,4.01936,2.11342,-3.955398,
72644,Albania,ALB,GNI growth (annual %),NY.GNP.MKTP.KD.ZG,,,,,,,...,0.6441817,3.459208,0.9909446,2.477906,3.731449,2.449053,3.653887,0.9417562,,
73210,Albania,ALB,"Population, total",SP.POP.TOTL,1608800.0,1659800.0,1711319.0,1762621.0,1814135.0,1864791.0,...,2900401.0,2895092.0,2889104.0,2880703.0,2876101.0,2873457.0,2866376.0,2854191.0,2837743.0,
74061,Algeria,DZA,GDP growth (annual %),NY.GDP.MKTP.KD.ZG,,-13.60544,-19.68504,34.31373,5.839413,6.206898,...,3.4,2.8,3.8,3.7,3.2,1.3,1.1,1.0,-5.1,
74087,Algeria,DZA,GNI growth (annual %),NY.GNP.MKTP.KD.ZG,,-10.40133,-24.84556,23.28765,3.681994,3.638401,...,3.101364,2.122301,3.571831,3.694971,4.623157,0.8034515,0.1032025,1.033323,-4.593422,
74653,Algeria,DZA,"Population, total",SP.POP.TOTL,11057864.0,11336340.0,11619830.0,11912800.0,12221680.0,12550880.0,...,37383900.0,38140140.0,38923690.0,39728020.0,40551400.0,41389170.0,42228420.0,43053050.0,43851040.0,
75504,American Samoa,ASM,GDP growth (annual %),NY.GDP.MKTP.KD.ZG,,,,,,,...,-4.334828,-2.5,1.762821,3.149606,-1.679389,-6.987578,2.671119,-0.4878049,3.921569,


In [5]:
# Now we need to filter out the years.
# We are going to make an analysis from 1990-2019
years_to_keep = [str(i) for i in range(1990,2020)]
columns_to_keep = ['Country_Name','Country_Code','Indicator_Name']+ years_to_keep
# final indicators from World Bank DataCatalog
features_df = indicators_df[columns_to_keep]
features_df

Unnamed: 0,Country_Name,Country_Code,Indicator_Name,1990,1991,1992,1993,1994,1995,1996,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
71175,Afghanistan,AFG,GDP growth (annual %),,,,,,,,...,1.436244e+01,4.263548e-01,1.275229e+01,5.600745e+00,2.724543e+00,1.451315e+00,2.260314e+00,2.647003e+00,1.189228e+00,3.911603e+00
71201,Afghanistan,AFG,GNI growth (annual %),,,,,,,,...,,,,,,,,,,
71767,Afghanistan,AFG,"Population, total",1.241231e+07,1.329902e+07,1.448554e+07,1.581660e+07,1.707573e+07,1.811066e+07,1.885344e+07,...,2.918551e+07,3.011741e+07,3.116138e+07,3.226959e+07,3.337080e+07,3.441360e+07,3.538303e+07,3.629611e+07,3.717192e+07,3.804176e+07
72618,Albania,ALB,GDP growth (annual %),-9.575640e+00,-2.800214e+01,-7.187111e+00,9.559412e+00,8.302867e+00,1.332233e+01,9.099999e+00,...,3.706938e+00,2.545406e+00,1.417243e+00,1.002018e+00,1.774449e+00,2.218726e+00,3.314981e+00,3.802227e+00,4.019360e+00,2.113420e+00
72644,Albania,ALB,GNI growth (annual %),,,,,,,,...,4.306606e+00,3.606888e+00,6.441817e-01,3.459208e+00,9.909446e-01,2.477906e+00,3.731449e+00,2.449053e+00,3.653887e+00,9.417562e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
381446,Zambia,ZMB,GNI growth (annual %),,,,,,,,...,,,,,,,,,,
382012,Zambia,ZMB,"Population, total",8.036849e+06,8.246662e+06,8.451346e+06,8.656484e+06,8.869745e+06,9.096608e+06,9.339740e+06,...,1.360599e+07,1.402320e+07,1.446515e+07,1.492655e+07,1.539979e+07,1.587937e+07,1.636345e+07,1.685361e+07,1.735171e+07,1.786103e+07
382863,Zimbabwe,ZWE,GDP growth (annual %),6.988553e+00,5.531782e+00,-9.015570e+00,1.051459e+00,9.235199e+00,1.580257e-01,1.036070e+01,...,1.967532e+01,1.419391e+01,1.666543e+01,1.989493e+00,2.376929e+00,1.779873e+00,7.558693e-01,4.709492e+00,4.824211e+00,-6.144236e+00
382889,Zimbabwe,ZWE,GNI growth (annual %),,,,,,,,...,1.991640e+01,1.368983e+01,1.668240e+01,2.172823e+00,2.127321e+00,1.501218e+00,6.742296e-01,4.907110e+00,5.094696e+00,


# Reading Human Development Index from UN Website

In [6]:
import pandas as pd
# loading csv file of development indicators as dataframe
hdi_df = pd.read_csv('Resources/HumanDevelopmentIndex (HDI).csv', sep=',')
hdi_df = hdi_df.dropna(how='all', axis=1)
hdi_df

Unnamed: 0,HDI Rank,Country,1990,1991,1992,1993,1994,1995,1996,1997,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,169,Afghanistan,0.302,0.307,0.316,0.312,0.307,0.331,0.335,0.339,...,0.472,0.477,0.489,0.496,0.500,0.500,0.502,0.506,0.509,0.511
1,69,Albania,0.650,0.631,0.615,0.618,0.624,0.637,0.646,0.645,...,0.745,0.764,0.775,0.782,0.787,0.788,0.788,0.790,0.792,0.795
2,91,Algeria,0.572,0.576,0.582,0.586,0.590,0.595,0.602,0.611,...,0.721,0.728,0.728,0.729,0.736,0.740,0.743,0.745,0.746,0.748
3,36,Andorra,..,..,..,..,..,..,..,..,...,0.837,0.836,0.858,0.856,0.863,0.862,0.866,0.863,0.867,0.868
4,148,Angola,..,..,..,..,..,..,..,..,...,0.517,0.533,0.544,0.555,0.565,0.572,0.578,0.582,0.582,0.581
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202,,Least Developed Countries,0.350,0.353,0.354,0.358,0.358,0.366,0.374,0.381,...,0.485,0.493,0.499,0.504,0.510,0.516,0.520,0.525,0.528,0.538
203,,Small Island Developing States,0.595,0.598,0.603,0.608,0.612,0.618,0.624,0.629,...,0.702,0.706,0.704,0.708,0.712,0.717,0.719,0.722,0.723,0.728
204,,Organization for Economic Co-operation and Dev...,0.785,0.790,0.788,0.800,0.807,0.812,0.817,0.817,...,0.873,0.877,0.879,0.883,0.886,0.889,0.892,0.894,0.895,0.900
205,,World,0.598,0.601,0.601,0.608,0.611,0.617,0.622,0.624,...,0.697,0.703,0.708,0.713,0.718,0.722,0.727,0.729,0.731,0.737


In [7]:
hdi_df["Country"].values

array([' Afghanistan', ' Albania', ' Algeria', ' Andorra', ' Angola',
       ' Antigua and Barbuda', ' Argentina', ' Armenia', ' Australia',
       ' Austria', ' Azerbaijan', ' Bahamas', ' Bahrain', ' Bangladesh',
       ' Barbados', ' Belarus', ' Belgium', ' Belize', ' Benin',
       ' Bhutan', ' Bolivia (Plurinational State of)',
       ' Bosnia and Herzegovina', ' Botswana', ' Brazil',
       ' Brunei Darussalam', ' Bulgaria', ' Burkina Faso', ' Burundi',
       ' Cabo Verde', ' Cambodia', ' Cameroon', ' Canada',
       ' Central African Republic', ' Chad', ' Chile', ' China',
       ' Colombia', ' Comoros', ' Congo',
       ' Congo (Democratic Republic of the)', ' Costa Rica', ' Croatia',
       ' Cuba', ' Cyprus', ' Czechia', " C�te d'Ivoire", ' Denmark',
       ' Djibouti', ' Dominica', ' Dominican Republic', ' Ecuador',
       ' Egypt', ' El Salvador', ' Equatorial Guinea', ' Eritrea',
       ' Estonia', ' Eswatini (Kingdom of)', ' Ethiopia', ' Fiji',
       ' Finland', ' France

In [7]:
last_i = hdi_df.index[hdi_df['Country'] == ' Zimbabwe'].tolist()
hdi_df = hdi_df.iloc[0:last_i[0]+1]
hdi_df

Unnamed: 0,HDI Rank,Country,1990,1991,1992,1993,1994,1995,1996,1997,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,169,Afghanistan,0.302,0.307,0.316,0.312,0.307,0.331,0.335,0.339,...,0.472,0.477,0.489,0.496,0.500,0.500,0.502,0.506,0.509,0.511
1,69,Albania,0.650,0.631,0.615,0.618,0.624,0.637,0.646,0.645,...,0.745,0.764,0.775,0.782,0.787,0.788,0.788,0.790,0.792,0.795
2,91,Algeria,0.572,0.576,0.582,0.586,0.590,0.595,0.602,0.611,...,0.721,0.728,0.728,0.729,0.736,0.740,0.743,0.745,0.746,0.748
3,36,Andorra,..,..,..,..,..,..,..,..,...,0.837,0.836,0.858,0.856,0.863,0.862,0.866,0.863,0.867,0.868
4,148,Angola,..,..,..,..,..,..,..,..,...,0.517,0.533,0.544,0.555,0.565,0.572,0.578,0.582,0.582,0.581
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184,113,Venezuela (Bolivarian Republic of),0.644,0.654,0.660,0.662,0.662,0.666,0.668,0.670,...,0.757,0.769,0.772,0.777,0.775,0.769,0.759,0.743,0.733,0.711
185,117,Viet Nam,0.483,0.493,0.504,0.514,0.525,0.537,0.548,0.547,...,0.661,0.671,0.676,0.681,0.683,0.688,0.693,0.696,0.700,0.704
186,179,Yemen,0.401,0.401,0.404,0.406,0.408,0.414,0.421,0.426,...,0.506,0.506,0.504,0.509,0.502,0.483,0.474,0.467,0.468,0.470
187,146,Zambia,0.421,0.417,0.416,0.419,0.414,0.415,0.416,0.416,...,0.527,0.534,0.549,0.557,0.561,0.569,0.571,0.578,0.582,0.584


## Joining the two data sets

In [8]:
# Before joining the two data sets, we need to reshape the tables such that the indicators will be columns and the
# years will be rows
# Convert Years columns (1990-2019) of features dataframe(WB Indicators) to Rows
df = features_df.melt(id_vars=["Country_Name","Country_Code", "Indicator_Name"], var_name="Year", value_name="Value")
# Convert Indicators Rows to Columns using Pivot Table
df_pivot = df.pivot_table('Value',['Country_Name','Country_Code','Year'],'Indicator_Name')
# Convert Multindex pivot table to a dataframe again 
features_df = pd.DataFrame(df_pivot.to_records())
features_df

Unnamed: 0,Country_Name,Country_Code,Year,GDP growth (annual %),GNI growth (annual %),"Population, total"
0,Afghanistan,AFG,1990,,,12412311.0
1,Afghanistan,AFG,1991,,,13299016.0
2,Afghanistan,AFG,1992,,,14485543.0
3,Afghanistan,AFG,1993,,,15816601.0
4,Afghanistan,AFG,1994,,,17075728.0
...,...,...,...,...,...,...
6106,Zimbabwe,ZWE,2015,1.779873,1.501218,13814642.0
6107,Zimbabwe,ZWE,2016,0.755869,0.674230,14030338.0
6108,Zimbabwe,ZWE,2017,4.709492,4.907110,14236599.0
6109,Zimbabwe,ZWE,2018,4.824211,5.094696,14438812.0


In [9]:
# Convert Years columns (1990-2019) to Rows
hdi_df = hdi_df.melt(id_vars=['Country','HDI Rank'], var_name = "Year", value_name = "HDI")
hdi_df

Unnamed: 0,Country,HDI Rank,Year,HDI
0,Afghanistan,169,1990,0.302
1,Albania,69,1990,0.650
2,Algeria,91,1990,0.572
3,Andorra,36,1990,..
4,Angola,148,1990,..
...,...,...,...,...
5665,Venezuela (Bolivarian Republic of),113,2019,0.711
5666,Viet Nam,117,2019,0.704
5667,Yemen,179,2019,0.470
5668,Zambia,146,2019,0.584


In [16]:
#hdi_df = hdi_df.set_index('Country')
#features_df = features_df.set_index('Country_Name')
join_inds = hdi_df.merge(features_df)

In [17]:
join_inds.dtypes

HDI Rank                  object
Year                      object
HDI                       object
Country_Code              object
GDP growth (annual %)    float64
GNI growth (annual %)    float64
Population, total        float64
dtype: object