In [339]:
import requests   
import json
import pprint
import numpy as np
import pandas as pd
from functools import reduce

In [358]:
# Raise pandas' display limits so long and wide frames are rendered without truncation.
pd.set_option('display.max_rows', 3000)
pd.set_option('display.max_columns', 40)

### 1.a Static data

In [362]:
# Read the spreadsheet, move its first column back out of the index, and
# rename the two key columns to the names used for joining later on.
wh = (
    pd.read_excel('WHRData.xls', index_col=0)
    .reset_index()
    .rename(columns={'Country name': 'country', 'Year': 'date'})
)
# Optional (disabled): drop every column whose name starts with 'Most'.
# wh = wh[[c for c in wh.columns if not c.startswith('Most')]]

wh

Unnamed: 0,country,date,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect,Confidence in national government,Democratic Quality,Delivery Quality,Standard deviation of ladder by country-year,Standard deviation/Mean of ladder by country-year,GINI index (World Bank estimate),"GINI index (World Bank estimate), average 2000-16","gini of household income reported in Gallup, by wp5-year","Most people can be trusted, Gallup","Most people can be trusted, WVS round 1981-1984","Most people can be trusted, WVS round 1989-1993","Most people can be trusted, WVS round 1994-1998","Most people can be trusted, WVS round 1999-2004","Most people can be trusted, WVS round 2005-2009","Most people can be trusted, WVS round 2010-2014"
0,Afghanistan,2008,3.72359,7.16869,0.450662,50.799999,0.718114,0.177889,0.881686,0.517637,0.258195,0.612072,-1.92969,-1.655084,1.774662,0.4766,,,,,,,,,,
1,Afghanistan,2009,4.401778,7.33379,0.552308,51.200001,0.678896,0.200178,0.850035,0.583926,0.237092,0.611545,-2.044093,-1.635025,1.722688,0.391362,,,0.441906,0.286315,,,,,,
2,Afghanistan,2010,4.758381,7.386629,0.539075,51.599998,0.600127,0.134353,0.706766,0.618265,0.275324,0.299357,-1.99181,-1.617176,1.878622,0.394803,,,0.327318,0.275833,,,,,,
3,Afghanistan,2011,3.831719,7.415019,0.521104,51.919998,0.495901,0.172137,0.731109,0.611387,0.267175,0.307386,-1.919018,-1.616221,1.78536,0.465942,,,0.336764,,,,,,,
4,Afghanistan,2012,3.782938,7.517126,0.520637,52.240002,0.530935,0.244273,0.77562,0.710385,0.267919,0.43544,-1.842996,-1.404078,1.798283,0.475367,,,0.34454,,,,,,,
5,Afghanistan,2013,3.5721,7.522238,0.483552,52.560001,0.577955,0.070403,0.823204,0.620585,0.273328,0.482847,-1.879709,-1.403036,1.22369,0.342569,,,0.304368,,,,,,,
6,Afghanistan,2014,3.130896,7.516955,0.525568,52.880001,0.508514,0.113184,0.871242,0.531691,0.374861,0.409048,-1.773257,-1.312503,1.395396,0.445686,,,0.413974,,,,,,,
7,Afghanistan,2015,3.982855,7.500539,0.528597,53.200001,0.388928,0.089091,0.880638,0.553553,0.339276,0.260557,-1.844364,-1.291594,2.160618,0.54248,,,0.596918,,,,,,,
8,Afghanistan,2016,4.220169,7.497038,0.559072,53.0,0.522566,0.051365,0.793246,0.564953,0.348332,0.32499,-1.855426,-1.392713,1.796219,0.425627,,,0.418629,,,,,,,
9,Afghanistan,2017,2.661718,7.497755,0.49088,52.799999,0.427011,-0.112198,0.954393,0.496349,0.371326,0.261179,-1.886566,-1.437808,1.454051,0.546283,,,0.286599,,,,,,,


### 1.b Dynamic data

In [324]:
class WorldBankDataFetcher:
    """Download World Bank indicator series and merge them into one DataFrame.

    Parameters
    ----------
    indicators : list of str
        World Bank indicator codes, e.g. ``'SP.POP.TOTL'``.

    Attributes
    ----------
    indicators : list of str
        The codes passed to the constructor.
    jsons : list
        Decoded API payloads, one per indicator, in request order.
    dataframes : list of pandas.DataFrame
        One frame per payload with columns ``country``, ``code``, ``date``
        and a value column named after the indicator.
    merged_data : pandas.DataFrame or None
        Outer join of ``dataframes`` on ``country``/``code``/``date``;
        ``None`` until :meth:`fetch` has run.
    """

    def __init__(self, indicators):
        self.indicators = indicators
        # Separate list objects: a chained ``a = b = []`` would make both
        # attributes alias a single list.
        self.jsons = []
        self.dataframes = []
        self.merged_data = None

    @staticmethod
    def fetch_json_by(indicator):
        """Request one indicator for all countries (2000-2018) and decode it.

        Returns the decoded JSON payload. Raises ``requests.HTTPError`` on a
        non-2xx response instead of trying to parse an error page.
        """
        print('Fetching:' + indicator)
        url = (
            "http://api.worldbank.org/v2/countries/all/indicators/"
            "%s?date=2000:2018&format=json&per_page=9000" % indicator
        )
        r = requests.get(url, timeout=60)
        r.raise_for_status()
        return json.loads(r.content)

    @staticmethod
    def json_to_df(j):
        """Turn one decoded payload into a tidy DataFrame.

        ``j`` is expected to be a two-element list whose second element is the
        list of observation records. The result has the columns ``country``,
        ``code``, ``date`` and one value column named after the indicator.

        Raises ``ValueError`` when the payload carries no observation records.
        """
        records = j[1] if isinstance(j, list) and len(j) > 1 else None
        if not records:
            raise ValueError('World Bank response contains no data rows')

        indicator_name = records[0]['indicator']['value']
        df = pd.DataFrame(records)
        # Each 'country' cell is a nested dict; keep only its display name.
        df['country'] = df['country'].map(lambda c: c['value'])
        df = df[['country', 'countryiso3code', 'date', 'value']]
        # 'date' keeps its name because fetch() merges on it.
        return df.rename(columns={'countryiso3code': 'code', 'value': indicator_name})

    def fetch(self):
        """Download every indicator, build per-indicator frames and merge them.

        Populates ``jsons``, ``dataframes`` and ``merged_data``. Each call
        starts from scratch, so calling it again does not accumulate
        duplicate payloads.
        """
        print('Note: this operation make take a while depending on the network status...\n...')

        self.jsons = []
        total = len(self.indicators)
        for position, code in enumerate(self.indicators, start=1):
            self.jsons.append(self.fetch_json_by(code))
            print(' '.join(['%d/%d' % (position, total), 'Indicator', code]))

        print('Creating data frames...')
        self.dataframes = [self.json_to_df(j) for j in self.jsons]

        print('Merging data frames...')
        keys = ['country', 'code', 'date']
        if self.dataframes:
            self.merged_data = reduce(
                lambda left, right: left.merge(right, how='outer', on=keys),
                self.dataframes,
            )
        else:
            # Nothing requested: expose an empty frame with the key columns
            # rather than letting reduce() fail on an empty sequence.
            self.merged_data = pd.DataFrame(columns=keys)

        print('All done!')

In [325]:
# World Bank indicator codes to download; each trailing comment gives the
# indicator's display name, as seen in the merged frame's column headers.
indicators = [
    'NY.GDP.PCAP.CD',    # GDP per capita (current US$)    
    'AG.LND.AGRI.ZS',    # Agricultural land (% of land area)
    'EG.ELC.ACCS.ZS',    # Access to electricity (% of population)    
    'EN.ATM.CO2E.PC',    # CO2 emissions (metric tons per capita)
    'MS.MIL.XPND.GD.ZS', # Military expenditure (% of GDP)
    'SL.UEM.TOTL.ZS',    # Unemployment, total (% of total labor force) (modeled ILO estimate)
    'BM.GSR.ROYL.CD',    # Charges for the use of intellectual property, payments (BoP, current US$)
    'SP.POP.TOTL',       # Population, total
    'IS.AIR.DPRT',       # Air transport, registered carrier departures worldwide
    'IC.TAX.TOTL.CP.ZS'  # Total tax rate (% of commercial profits)
]

In [326]:
# Build the fetcher for the indicator list and download everything.
# fetch() issues one HTTP request per indicator, so this cell needs network access.
fetcher = WorldBankDataFetcher(indicators)
fetcher.fetch()

Note: this operation make take a while depending on the network status...
...
0/10 Indicator NY.GDP.PCAP.CD
1/10 Indicator AG.LND.AGRI.ZS
2/10 Indicator EG.ELC.ACCS.ZS
3/10 Indicator EN.ATM.CO2E.PC
4/10 Indicator MS.MIL.XPND.GD.ZS
5/10 Indicator SL.UEM.TOTL.ZS
6/10 Indicator BM.GSR.ROYL.CD
7/10 Indicator SP.POP.TOTL
8/10 Indicator IS.AIR.DPRT
9/10 Indicator IC.TAX.TOTL.CP.ZS
Creating data frames...
Merging data frames...
All done!


In [328]:
# Derive a cleaned frame without touching fetcher.merged_data, so this cell
# gives the same result no matter how often it is re-run.
wb = fetcher.merged_data.assign(date=lambda df: pd.to_numeric(df['date']))
# Keep only rows that carry a code (presumably this drops regional/income
# aggregates -- confirm against the API output). .copy() makes the result an
# independent frame rather than a slice of the merged data.
wb = wb[wb['code'] != ''].copy()

wb[wb['date'] < 2007]

Unnamed: 0,country,code,date,GDP per capita (current US$),Agricultural land (% of land area),Access to electricity (% of population),CO2 emissions (metric tons per capita),Military expenditure (% of GDP),"Unemployment, total (% of total labor force) (modeled ILO estimate)","Charges for the use of intellectual property, payments (BoP, current US$)","Population, total","Air transport, registered carrier departures worldwide",Total tax rate (% of commercial profits)
623,Afghanistan,AFG,2006,269.229693,58.067580,28.228613,0.063728,1.896234,9.057000,,25893450.0,,35.8
636,Albania,ALB,2006,2972.742924,40.875912,100.000000,1.302576,1.567769,16.040001,6.768509e+06,2992547.0,4481.0,56.9
649,Algeria,DZA,2006,3464.610079,17.290300,98.774414,2.992187,2.643808,12.270000,1.500000e+07,33777915.0,44822.0,76.9
662,American Samoa,ASM,2006,8456.947997,25.000000,,,,,,58650.0,,
675,Andorra,AND,2006,43748.772159,46.340427,100.000000,6.746219,,,,80991.0,,
688,Angola,AGO,2006,2585.133522,46.193952,29.103676,1.098884,3.761495,19.084999,1.366548e+06,20262399.0,4965.0,52.2
701,Antigua and Barbuda,ATG,2006,12812.764438,20.454545,92.434402,4.913644,,,7.693244e+05,90301.0,26175.0,47.6
714,Argentina,ARG,2006,5878.761027,51.560096,97.167221,4.434821,0.788234,10.080000,8.948241e+08,39558890.0,74162.0,107.3
727,Armenia,ARM,2006,2158.002909,61.693011,99.265732,1.481178,2.947613,16.827999,,2958500.0,6177.0,36.6
740,Aruba,ABW,2006,24045.749421,11.111111,92.613983,26.948260,,,1.034751e+07,100832.0,,


### 1.c Integration

In [266]:
# Mapping from World Bank country names to the spelling used in the WHR data.
# The pairs are listed as (WHR name, World Bank name) and flipped while the
# dict is built.
country_names = dict(
    (wb_name, whr_name)
    for whr_name, wb_name in [
        ('Congo (Brazzaville)', 'Congo, Rep.'),
        ('Congo (Kinshasa)', 'Congo, Dem. Rep.'),
        ('Egypt', 'Egypt, Arab Rep.'),
        ('Gambia', 'Gambia, The'),
        ('Hong Kong S.A.R. of China', 'Hong Kong SAR, China'),
        ('Iran', 'Iran, Islamic Rep.'),
        ('Ivory Coast', "Cote d'Ivoire"),
        ('Kyrgyzstan', 'Kyrgyz Republic'),
        ('Laos', 'Lao PDR'),
        ('Macedonia', 'North Macedonia'),
        ('Palestinian Territories', 'West Bank and Gaza'),
        ('Russia', 'Russian Federation'),
        ('Slovakia', 'Slovak Republic'),
        ('South Korea', 'Korea, Rep.'),
        ('Swaziland', 'Eswatini'),
        ('Syria', 'Syrian Arab Republic'),
        ('Venezuela', 'Venezuela, RB'),
        ('Yemen', 'Yemen, Rep.'),
    ]
)
country_names

{'Congo, Rep.': 'Congo (Brazzaville)',
 'Congo, Dem. Rep.': 'Congo (Kinshasa)',
 'Egypt, Arab Rep.': 'Egypt',
 'Gambia, The': 'Gambia',
 'Hong Kong SAR, China': 'Hong Kong S.A.R. of China',
 'Iran, Islamic Rep.': 'Iran',
 "Cote d'Ivoire": 'Ivory Coast',
 'Kyrgyz Republic': 'Kyrgyzstan',
 'Lao PDR': 'Laos',
 'North Macedonia': 'Macedonia',
 'West Bank and Gaza': 'Palestinian Territories',
 'Russian Federation': 'Russia',
 'Slovak Republic': 'Slovakia',
 'Korea, Rep.': 'South Korea',
 'Eswatini': 'Swaziland',
 'Syrian Arab Republic': 'Syria',
 'Venezuela, RB': 'Venezuela',
 'Yemen, Rep.': 'Yemen'}

In [246]:
# WHR country names with no exact match among the World Bank names.
# The names live in the 'country' column; wh's index is just row numbers.
set(wh['country']).difference(wb['country'])

{'Congo (Brazzaville)',
 'Congo (Kinshasa)',
 'Egypt',
 'Gambia',
 'Hong Kong S.A.R. of China',
 'Iran',
 'Ivory Coast',
 'Kyrgyzstan',
 'Laos',
 'Macedonia',
 'North Cyprus',
 'Palestinian Territories',
 'Russia',
 'Slovakia',
 'Somaliland region',
 'South Korea',
 'Swaziland',
 'Syria',
 'Taiwan Province of China',
 'Venezuela',
 'Yemen'}

In [247]:
# World Bank country names with no exact match among the WHR names.
# The names live in the 'country' column; wh's index is just row numbers.
set(wb['country']).difference(wh['country'])

{'American Samoa',
 'Andorra',
 'Antigua and Barbuda',
 'Aruba',
 'Bahamas, The',
 'Barbados',
 'Bermuda',
 'British Virgin Islands',
 'Brunei Darussalam',
 'Cabo Verde',
 'Cayman Islands',
 'Channel Islands',
 'Congo, Dem. Rep.',
 'Congo, Rep.',
 "Cote d'Ivoire",
 'Curacao',
 'Dominica',
 'Egypt, Arab Rep.',
 'Equatorial Guinea',
 'Eritrea',
 'Eswatini',
 'Faroe Islands',
 'Fiji',
 'French Polynesia',
 'Gambia, The',
 'Gibraltar',
 'Greenland',
 'Grenada',
 'Guam',
 'Guinea-Bissau',
 'Hong Kong SAR, China',
 'Iran, Islamic Rep.',
 'Isle of Man',
 'Kiribati',
 'Korea, Dem. People’s Rep.',
 'Korea, Rep.',
 'Kyrgyz Republic',
 'Lao PDR',
 'Liechtenstein',
 'Macao SAR, China',
 'Maldives',
 'Marshall Islands',
 'Micronesia, Fed. Sts.',
 'Monaco',
 'Nauru',
 'New Caledonia',
 'North Macedonia',
 'Northern Mariana Islands',
 'Palau',
 'Papua New Guinea',
 'Puerto Rico',
 'Russian Federation',
 'Samoa',
 'San Marino',
 'Sao Tome and Principe',
 'Seychelles',
 'Sint Maarten (Dutch part)',
 'S

In [263]:
# Translate World Bank country names to the WHR spelling. Only the 'country'
# column is mapped, so values in other columns are never looked up or altered;
# names without an entry in country_names are kept unchanged.
wb = wb.assign(country=wb['country'].map(lambda name: country_names.get(name, name)))
wb

Unnamed: 0,country,code,date,GDP per capita (current US$),Agricultural land (% of land area),Access to electricity (% of population),CO2 emissions (metric tons per capita),Military expenditure (% of GDP),"Unemployment, total (% of total labor force) (modeled ILO estimate)","Charges for the use of intellectual property, payments (BoP, current US$)","Population, total","Air transport, registered carrier departures worldwide",Total tax rate (% of commercial profits)
611,Afghanistan,AFG,2018,,,,,,8.808000,,,,71.4
612,Afghanistan,AFG,2017,550.068459,,,,0.906857,8.837000,1.806604e+05,35530081.0,23682.000000,71.4
613,Afghanistan,AFG,2016,549.582760,58.067580,84.137138,,0.955493,8.841000,3.680295e+04,34656032.0,22770.000000,47.9
614,Afghanistan,AFG,2015,590.076474,58.067580,71.500000,,0.993455,8.864000,2.424755e+05,33736494.0,23532.000000,36.1
615,Afghanistan,AFG,2014,625.339539,58.067580,89.500000,0.299445,1.298013,8.706000,1.078901e+05,32758020.0,25920.000000,36.1
616,Afghanistan,AFG,2013,647.966460,58.067580,67.259552,0.315602,1.076950,8.452000,1.094402e+07,31731688.0,21696.000000,35.6
617,Afghanistan,AFG,2012,648.511070,58.067580,69.100000,0.350371,1.175417,7.936000,1.073803e+07,30696958.0,17775.000000,35.8
618,Afghanistan,AFG,2011,599.297630,58.067580,43.222019,0.412017,1.821346,8.230000,4.708426e+07,29708599.0,25021.000000,35.8
619,Afghanistan,AFG,2010,550.514974,58.069111,42.700000,0.293837,1.945837,7.821000,2.757785e+07,28803167.0,21677.000000,35.8
620,Afghanistan,AFG,2009,444.184404,58.067580,44.854885,0.241723,2.087413,6.705000,9.476112e+04,28004331.0,,35.8


In [265]:
# WHR country names still unmatched after harmonising the World Bank names.
# The names live in the 'country' column; wh's index is just row numbers.
set(wh['country']).difference(wb['country'])

{'North Cyprus', 'Somaliland region', 'Taiwan Province of China'}

In [373]:
# Left join: keep every WHR row and attach the World Bank columns wherever
# both country and date match, then count the missing values per column.
integrated = pd.merge(wh, wb, on=['country', 'date'], how='left')
integrated.isnull().sum()

country                                                                         0
date                                                                            0
Life Ladder                                                                     0
Log GDP per capita                                                             28
Social support                                                                 13
Healthy life expectancy at birth                                               28
Freedom to make life choices                                                   29
Generosity                                                                     82
Perceptions of corruption                                                      96
Positive affect                                                                19
Negative affect                                                                13
Confidence in national government                                             174
Democratic Quali