## Data Preprocessing

The purpose of this notebook is to preprocess the datasets, i.e.
* create subsets with data of interest,
* deal with missing values,
* combine both datasets,
* create datasets to be used for statistical analyses and modelling.

The datasets to be analyzed are
* the Happy Planet Index for 2016 (see https://happyplanetindex.org/),
* the World Development Indicators (1960 - 2019) by the World Bank (see https://datacatalog.worldbank.org/dataset/world-development-indicators)
    

In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Import WDI data (World Development Indicators)
df_wdi = pd.read_csv('../data/raw_data/WDIData.csv')

In [3]:
# Import HPI dataset (Happy Planet Index)
df_hpi = pd.read_excel('../data/raw_data/hpi-data-2016.xlsx', sheet_name = 'Complete HPI data', header = 5, usecols = 'B:O')

In [4]:
# Create DataFrame with WDI data for one year
# (in this case 2016, to be congruent with the HPI data)
year = '2016'
custom_list = [[df_wdi['Indicator Code'][row],df_wdi['Country Name'][row],df_wdi[year][row]] for row in range(len(df_wdi))]

df_columns = ['Indicator Code', 'Country Name', year]
custom_wdi_df = pd.DataFrame(custom_list, columns = df_columns)

In [5]:
print(custom_wdi_df.head())
print('\nShape:',custom_wdi_df.shape)

      Indicator Code Country Name       2016
0      PA.NUS.PPP.05   Arab World        NaN
1  PA.NUS.PRVT.PP.05   Arab World        NaN
2     EG.CFT.ACCS.ZS   Arab World  84.510171
3     EG.ELC.ACCS.ZS   Arab World  89.678685
4  EG.ELC.ACCS.RU.ZS   Arab World  79.665635

Shape: (377256, 3)


In [6]:
# Add Happy Planet Index value per Country from the HPI dataset

# Prepare HPI DataFrame for concatenation
# Include only relevant columns
df_hpi_mod = df_hpi[['Country', 'Happy Planet Index']]

# Include only relevant rows
df_hpi_mod = df_hpi_mod.iloc[:140]    

#df_hpi_mod

In [7]:
# Find out which Country Names in both datasets match
custom_wdi_df_match = custom_wdi_df.merge(df_hpi_mod, left_on='Country Name', right_on='Country', how='inner', suffixes=('WDI ','HPI '))
matching_countries_num = custom_wdi_df_match['Country Name'].unique().shape[0]
matching_countries_num

125

In [8]:
# Find out which Country Names in both datasets do not match
custom_wdi_df_all = custom_wdi_df.merge(df_hpi_mod, left_on='Country Name', right_on='Country', how='outer', suffixes=('WDI ','HPI '))
non_matching_countries_wdi = custom_wdi_df_all['Country Name'][custom_wdi_df_all['Country Name'] != custom_wdi_df_all['Country']]
non_matching_countries_wdi = non_matching_countries_wdi[non_matching_countries_wdi.isnull() == False].unique()
print('WDI not matching countries:',non_matching_countries_wdi.shape[0])

non_matching_countries_hpi = custom_wdi_df_all['Country'][custom_wdi_df_all['Country'] != custom_wdi_df_all['Country Name']]
non_matching_countries_hpi = non_matching_countries_hpi[non_matching_countries_hpi.isnull() == False].unique()
print('HPI not matching countries:',non_matching_countries_hpi.shape[0])

print(non_matching_countries_wdi)
print(non_matching_countries_hpi)

WDI not matching countries: 139
HPI not matching countries: 15
['Arab World' 'Caribbean small states' 'Central Europe and the Baltics'
 'Early-demographic dividend' 'East Asia & Pacific'
 'East Asia & Pacific (excluding high income)'
 'East Asia & Pacific (IDA & IBRD countries)' 'Euro area'
 'Europe & Central Asia' 'Europe & Central Asia (excluding high income)'
 'Europe & Central Asia (IDA & IBRD countries)' 'European Union'
 'Fragile and conflict affected situations'
 'Heavily indebted poor countries (HIPC)' 'High income' 'IBRD only'
 'IDA & IBRD total' 'IDA blend' 'IDA only' 'IDA total'
 'Late-demographic dividend' 'Latin America & Caribbean'
 'Latin America & Caribbean (excluding high income)'
 'Latin America & the Caribbean (IDA & IBRD countries)'
 'Least developed countries: UN classification' 'Low & middle income'
 'Low income' 'Lower middle income' 'Middle East & North Africa'
 'Middle East & North Africa (excluding high income)'
 'Middle East & North Africa (IDA & IBRD countri

In [9]:
# Manually create a matching table of non-matching country names
non_matching_countries_hpi
wdi_pair_list = ['Egypt, Arab Rep.',
                 'Hong Kong SAR, China',
                 'Iran, Islamic Rep.',
                 'Kyrgyz Republic',
                 'North Macedonia',
                 'Palestine',   # no match found in WDI
                 'Congo, Rep.',
                 'Russian Federation',
                 'Slovak Republic',
                 'Korea, Rep.',
                 'Eswatini',
                 'Syrian Arab Republic',
                 'United States',
                 'Venezuela, RB',
                 'Yemen, Rep.'
                ]
wdi_pair_list

['Egypt, Arab Rep.',
 'Hong Kong SAR, China',
 'Iran, Islamic Rep.',
 'Kyrgyz Republic',
 'North Macedonia',
 'Palestine',
 'Congo, Rep.',
 'Russian Federation',
 'Slovak Republic',
 'Korea, Rep.',
 'Eswatini',
 'Syrian Arab Republic',
 'United States',
 'Venezuela, RB',
 'Yemen, Rep.']

In [10]:
# Transform Indicator rows into Indicator columns
custom_wdi_df_indcols = custom_wdi_df.pivot(index = 'Country Name', columns = 'Indicator Code', values = '2016')
custom_wdi_df_indcols = custom_wdi_df_indcols.rename_axis(None, axis=1).reset_index()
custom_wdi_df_indcols.head()

Unnamed: 0,Country Name,AG.AGR.TRAC.NO,AG.CON.FERT.PT.ZS,AG.CON.FERT.ZS,AG.LND.AGRI.K2,AG.LND.AGRI.ZS,AG.LND.ARBL.HA,AG.LND.ARBL.HA.PC,AG.LND.ARBL.ZS,AG.LND.CREL.HA,...,per_sa_allsa.cov_q4_tot,per_sa_allsa.cov_q5_tot,per_si_allsi.adq_pop_tot,per_si_allsi.ben_q1_tot,per_si_allsi.cov_pop_tot,per_si_allsi.cov_q1_tot,per_si_allsi.cov_q2_tot,per_si_allsi.cov_q3_tot,per_si_allsi.cov_q4_tot,per_si_allsi.cov_q5_tot
0,Afghanistan,,590.067055,12.1823,379100.0,58.06758,7729000.0,0.218437,11.838679,2793133.0,...,,,,,,,,,,
1,Albania,,,126.138483,11816.999512,43.127735,620300.0,0.215674,22.638686,148084.0,...,,,,,,,,,,
2,Algeria,,8.209376,22.315273,413601.992188,17.365539,7404200.0,0.182588,3.108736,3376331.0,...,,,,,,,,,,
3,American Samoa,,,,49.000001,24.5,3000.0,0.05382,15.0,,...,,,,,,,,,,
4,Andorra,,,,187.800007,39.957448,780.0,0.010091,1.659574,,...,,,,,,,,,,


In [11]:
# Replace non-matching country names before
df_hpi_mod2 = df_hpi_mod.replace(non_matching_countries_hpi,wdi_pair_list)

In [12]:
# Merge DataFrames
custom_wdi_df_match2 = custom_wdi_df_indcols.merge(df_hpi_mod2, left_on='Country Name', right_on='Country', how='inner')
matching_countries_num2 = custom_wdi_df_match2['Country Name'].unique().shape[0]
matching_countries_num2

139

In [13]:
# Remove columns with only missing data and duplicated Country column
nan_columns = set(custom_wdi_df_match2.columns[custom_wdi_df_match2.isnull().mean() == 1])
nan_columns

wdi_hpi_2016_df = custom_wdi_df_match2.drop(nan_columns, axis = 1)
wdi_hpi_2016_df = wdi_hpi_2016_df.drop('Country', axis = 1)

In [14]:
# Show columns without missing data
columns_no_nulls = set(wdi_hpi_2016_df.columns[wdi_hpi_2016_df.isnull().mean()==0])
columns_no_nulls

{'AG.LND.AGRI.K2',
 'AG.LND.AGRI.ZS',
 'AG.LND.ARBL.HA',
 'AG.LND.ARBL.HA.PC',
 'AG.LND.ARBL.ZS',
 'AG.LND.TOTL.K2',
 'AG.PRD.CREL.MT',
 'AG.PRD.CROP.XD',
 'AG.PRD.FOOD.XD',
 'AG.PRD.LVSK.XD',
 'AG.SRF.TOTL.K2',
 'Country Name',
 'EG.ELC.ACCS.RU.ZS',
 'EG.ELC.ACCS.UR.ZS',
 'EG.ELC.ACCS.ZS',
 'EN.POP.DNST',
 'ER.FSH.CAPT.MT',
 'ER.FSH.PROD.MT',
 'ER.LND.PTLD.ZS',
 'ER.PTD.TOTL.ZS',
 'Happy Planet Index',
 'IT.CEL.SETS',
 'IT.CEL.SETS.P2',
 'IT.MLT.MAIN',
 'IT.MLT.MAIN.P2',
 'IT.NET.SECR',
 'IT.NET.SECR.P6',
 'IT.NET.USER.ZS',
 'NY.ADJ.DMIN.CD',
 'NY.ADJ.DNGY.CD',
 'SH.H2O.BASW.ZS',
 'SH.STA.BASS.ZS',
 'SH.TBS.DTEC.ZS',
 'SH.TBS.INCD',
 'SL.AGR.EMPL.FE.ZS',
 'SL.AGR.EMPL.MA.ZS',
 'SL.AGR.EMPL.ZS',
 'SL.EMP.1524.SP.FE.ZS',
 'SL.EMP.1524.SP.MA.ZS',
 'SL.EMP.1524.SP.ZS',
 'SL.EMP.MPYR.FE.ZS',
 'SL.EMP.MPYR.MA.ZS',
 'SL.EMP.MPYR.ZS',
 'SL.EMP.SELF.FE.ZS',
 'SL.EMP.SELF.MA.ZS',
 'SL.EMP.SELF.ZS',
 'SL.EMP.TOTL.SP.FE.ZS',
 'SL.EMP.TOTL.SP.MA.ZS',
 'SL.EMP.TOTL.SP.ZS',
 'SL.EMP.VULN.FE.ZS',
 'S

In [15]:
# Evaluate columns with most missing data
columns_90_nulls = set(wdi_hpi_2016_df.columns[wdi_hpi_2016_df.isnull().mean() > 0.90])
columns_75_nulls = set(wdi_hpi_2016_df.columns[wdi_hpi_2016_df.isnull().mean() > 0.75])    # more than 75% of data is missing
columns_50_nulls = set(wdi_hpi_2016_df.columns[wdi_hpi_2016_df.isnull().mean() > 0.50])

print(len(columns_90_nulls))
print(len(columns_75_nulls))
print(len(columns_50_nulls))
print(wdi_hpi_2016_df.shape[1])

99
192
330
1274


In [16]:
# Remove columns with more than 50% of missing data
wdi_hpi_2016_df = wdi_hpi_2016_df.drop(wdi_hpi_2016_df.columns[wdi_hpi_2016_df.isnull().mean() > 0.50], axis = 1)
print(wdi_hpi_2016_df.shape)

(139, 944)


In [17]:
# Show remaining columns in order to choose columns for the model
col_descr = pd.DataFrame(wdi_hpi_2016_df.columns, columns = ['Indicator Code'])

df_wdi_inds = pd.read_csv('../data/raw_data/WDISeries.csv')

col_descr = col_descr.merge(df_wdi_inds, left_on='Indicator Code', right_on='Series Code', how='left')
col_descr

Unnamed: 0,Indicator Code,Series Code,Topic,Indicator Name,Short definition,Long definition,Unit of measure,Periodicity,Base Period,Other notes,...,Notes from original source,General comments,Source,Statistical concept and methodology,Development relevance,Related source links,Other web links,Related indicators,License Type,Unnamed: 20
0,Country Name,,,,,,,,,,...,,,,,,,,,,
1,AG.CON.FERT.ZS,AG.CON.FERT.ZS,Environment: Agricultural production,Fertilizer consumption (kilograms per hectare ...,,Fertilizer consumption measures the quantity o...,,Annual,,,...,,,"Food and Agriculture Organization, electronic ...",Fertilizer consumption measures the quantity o...,"Factors such as the green revolution, has led ...",,,,CC BY-4.0,
2,AG.LND.AGRI.K2,AG.LND.AGRI.K2,Environment: Land use,Agricultural land (sq. km),,Agricultural land refers to the share of land ...,,Annual,,,...,,,"Food and Agriculture Organization, electronic ...",Agricultural land constitutes only a part of a...,Agricultural land covers more than one-third o...,,,,CC BY-4.0,
3,AG.LND.AGRI.ZS,AG.LND.AGRI.ZS,Environment: Land use,Agricultural land (% of land area),,Agricultural land refers to the share of land ...,,Annual,,,...,,,"Food and Agriculture Organization, electronic ...",Agriculture is still a major sector in many ec...,Agricultural land covers more than one-third o...,,,,CC BY-4.0,
4,AG.LND.ARBL.HA,AG.LND.ARBL.HA,Environment: Land use,Arable land (hectares),,Arable land (in hectares) includes land define...,,Annual,,,...,,,"Food and Agriculture Organization, electronic ...",Temporary fallow land refers to land left fall...,Agricultural land covers more than one-third o...,,,,CC BY-4.0,
5,AG.LND.ARBL.HA.PC,AG.LND.ARBL.HA.PC,Environment: Land use,Arable land (hectares per person),,Arable land (hectares per person) includes lan...,,Annual,,,...,,,"Food and Agriculture Organization, electronic ...",Temporary fallow land refers to land left fall...,Agricultural land covers about one-third of th...,,,,CC BY-4.0,
6,AG.LND.ARBL.ZS,AG.LND.ARBL.ZS,Environment: Land use,Arable land (% of land area),,Arable land includes land defined by the FAO a...,,Annual,,,...,,,"Food and Agriculture Organization, electronic ...",Temporary fallow land refers to land left fall...,Agricultural land covers more than one-third o...,,,,CC BY-4.0,
7,AG.LND.CREL.HA,AG.LND.CREL.HA,Environment: Agricultural production,Land under cereal production (hectares),,Land under cereal production refers to harvest...,,Annual,,,...,,,"Food and Agriculture Organization, electronic ...","Cereals production includes wheat, rice, maize...",The cultivation of cereals varies widely in di...,,,,CC BY-4.0,
8,AG.LND.CROP.ZS,AG.LND.CROP.ZS,Environment: Land use,Permanent cropland (% of land area),,Permanent cropland is land cultivated with cro...,,Annual,,,...,,,"Food and Agriculture Organization, electronic ...",The data on Permanent cropland and land area a...,Agricultural land covers more than one-third o...,,,,CC BY-4.0,
9,AG.LND.FRST.K2,AG.LND.FRST.K2,Environment: Land use,Forest area (sq. km),,Forest area is land under natural or planted s...,,Annual,,,...,,,"Food and Agriculture Organization, electronic ...",Forest is determined both by the presence of t...,"As threats to biodiversity mount, the internat...",,,,CC BY-4.0,


In [18]:
# Reduce DataFrame to chosen columns
keep_columns = pd.DataFrame(['Country Name', 'Happy Planet Index'])
chosen_columns = pd.read_csv('../data/raw_data/WDISeries_Choice.csv')
chosen_columns = chosen_columns['Series Code'][chosen_columns['Choice'] == True]
chosen_columns.reset_index().drop('index', axis = 1)
keep_columns = pd.concat([keep_columns, chosen_columns], ignore_index = True).rename(columns = {0:'Indicator Code'})
keep_columns = keep_columns['Indicator Code'].tolist()

drop_columns = []
for col in wdi_hpi_2016_df.columns:
    if not col in keep_columns:
        drop_columns.append(col)

wdi_hpi_2016_df = wdi_hpi_2016_df.drop(drop_columns, axis = 1)

wdi_hpi_2016_df


Unnamed: 0,Country Name,AG.CON.FERT.ZS,AG.LND.AGRI.ZS,AG.LND.CROP.ZS,AG.LND.FRST.ZS,AG.PRD.FOOD.XD,AG.PRD.LVSK.XD,BG.GSR.NFSV.GD.ZS,BM.GSR.CMCP.ZS,BM.GSR.INSF.ZS,...,SH.H2O.BASW.ZS,SH.H2O.SMDW.ZS,SH.MED.PHYS.ZS,SH.STA.BASS.ZS,SP.DYN.LE00.IN,SP.POP.GROW,ST.INT.ARVL,ST.INT.DPRT,ST.INT.XPND.MP.ZS,Happy Planet Index
0,Afghanistan,12.182300,58.067580,0.277242,2.067825,125.41,98.70,8.214071,11.539220,1.514437,...,64.286484,,0.2840,42.054183,63.763000,2.778317,,,1.214557,20.225350
1,Albania,126.138483,43.127735,3.036496,28.121897,150.93,113.07,37.276873,13.357326,2.139191,...,91.020370,69.984329,1.1998,97.704877,78.194000,-0.159880,4070000.0,4852000.0,24.589211,36.766874
2,Algeria,22.315273,17.365539,0.425487,0.824439,151.62,147.06,8.941519,60.278868,2.153545,...,93.515851,,1.8300,87.542316,76.298000,2.051355,2039000.0,4530000.0,0.923644,33.300543
3,Argentina,50.336331,54.335712,0.365405,9.798406,129.11,107.00,6.331674,31.524771,2.931377,...,99.078375,,,94.258505,76.221000,1.057182,6655000.0,10446000.0,15.551549,35.190244
4,Armenia,110.495136,58.897086,2.037232,11.668423,135.37,143.85,31.207718,10.972255,3.670673,...,99.898536,83.005976,,93.406972,74.640000,0.361431,1260000.0,1263000.0,24.774536,25.666417
5,Australia,68.103696,48.241944,0.042902,16.258278,105.58,102.02,9.967677,26.699697,3.812020,...,99.969972,,3.5874,99.991152,82.448780,1.561940,8269000.0,10380000.0,12.605527,21.228966
6,Austria,141.837426,32.356676,0.810320,46.905713,101.29,108.27,28.130637,44.644297,5.861771,...,100.000000,98.907808,5.1441,99.974814,81.641463,1.081396,28121000.0,11534000.0,6.046445,30.478224
7,Bangladesh,289.402272,70.632327,6.376277,10.957978,145.31,135.88,5.122781,13.063639,11.458420,...,96.882530,55.491263,0.4822,47.005803,71.785000,1.091300,,,1.750125,38.390694
8,Belarus,146.618640,42.035489,0.548308,42.630106,133.94,147.85,23.605768,45.339389,5.899941,...,96.474184,94.497432,,97.777751,73.826829,0.125511,10935400.0,467000.0,2.970743,21.718916
9,Belgium,318.484804,44.610305,0.759577,22.583885,84.89,81.32,44.952060,56.161278,7.070418,...,99.999999,99.523184,3.3234,99.486058,81.439024,0.506300,7481000.0,13372000.0,5.780736,23.726523


In [19]:
print(wdi_hpi_2016_df['SH.H2O.SMDW.ZS'].mean(),'\n')
wdi_hpi_2016_df.mean()

77.22718636250488 



AG.CON.FERT.ZS          1.975917e+02
AG.LND.AGRI.ZS          4.286560e+01
AG.LND.CROP.ZS          3.363702e+00
AG.LND.FRST.ZS          2.994156e+01
AG.PRD.FOOD.XD          1.239777e+02
AG.PRD.LVSK.XD          1.189442e+02
BG.GSR.NFSV.GD.ZS       2.397951e+01
BM.GSR.CMCP.ZS          3.599249e+01
BM.GSR.INSF.ZS          7.619859e+00
BM.GSR.TRAN.ZS          3.313865e+01
BM.KLT.DINV.WD.GD.ZS    2.401294e+00
BX.GSR.MRCH.CD          1.062062e+11
BX.GSR.NFSV.CD          3.428883e+10
BX.GSR.ROYL.CD          3.072441e+09
BX.KLT.DINV.WD.GD.ZS    4.895063e+00
CM.MKT.INDX.ZG          5.305658e+00
DT.DOD.DIMF.CD          1.146466e+09
DT.DOD.DSTC.IR.ZS       5.256618e+01
EG.ELC.ACCS.ZS          8.424617e+01
EN.POP.DNST             1.864113e+02
ER.LND.PTLD.ZS          1.811230e+01
ER.MRN.PTMR.ZS          6.842138e+00
ER.PTD.TOTL.ZS          1.353181e+01
FB.ATM.TOTL.P5          5.434421e+01
FB.BNK.CAPA.ZS          1.018189e+01
FB.CBK.BRCH.P5          1.771323e+01
FD.AST.PRVT.GD.ZS       5.663540e+01
F

In [20]:
# Dealing with NaN values: Impute column mean for NaN values
wdi_hpi_2016_df = wdi_hpi_2016_df.fillna(wdi_hpi_2016_df.mean())

In [21]:
# Save preprocessed DataFrame
wdi_hpi_2016_df.to_pickle('../data/wdi_hpi_2016_df.pkl')