In [80]:
%matplotlib inline

import pandas as pd
import matplotlib.pyplot as plt

# Data preparation

In [81]:
# Load the four HDI tier tables; the source CSVs are semicolon-delimited.
very_high_hdi_df, high_hdi_df, medium_hdi_df, low_hdi_df = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in (
        "./files/very_high_human_development.csv",
        "./files/high_human_development.csv",
        "./files/medium_human_development.csv",
        "./files/low_human_development.csv",
    )
)

**mark**
- VH - very_high
- H - high
- M - medium
- L - low

In [82]:
# Tag every row of each tier table with its short tier code.
for tier_df, tier_code in (
    (very_high_hdi_df, "VH"),
    (high_hdi_df, "H"),
    (medium_hdi_df, "M"),
    (low_hdi_df, "L"),
):
    tier_df['mark'] = pd.Series(tier_code, index=tier_df.index)

- hdi_rank = HDI rank
- country = Country
- hdi = Human Development Index (HDI)
- life_expectancy_at_birth = Life expectancy at birth
- expected_years_of_schooling = Expected years of schooling
- mean_years_of_schooling = Mean years of schooling
- gni = Gross national income (GNI) per capita (2011 PPP $)
- gni_rank_minus_hdi_rank = GNI per capita rank minus HDI rank

In [83]:
# Preview the first rows of the very-high tier table.
very_high_hdi_df.head()

Unnamed: 0,HDI rank,Country,Human Development Index (HDI),Life expectancy at birth,Expected years of schooling,Mean years of schooling,Gross national income (GNI) per capita (2011 PPP $),GNI per capita rank minus HDI rank,mark
0,1,Norway,944,816,175,126,64992,5,VH
1,2,Australia,935,824,202,130,42261,17,VH
2,3,Switzerland,930,830,158,128,56431,6,VH
3,4,Denmark,923,802,187,127,44025,11,VH
4,5,Netherlands,922,816,179,119,45435,9,VH


In [84]:
# Stack the four tier tables into one frame. ignore_index=True gives the
# result a fresh 0..n-1 row index; without it each tier keeps its own
# 0-based labels and the combined index contains duplicates.
raw_hdi_df = pd.concat(
    [very_high_hdi_df, high_hdi_df, medium_hdi_df, low_hdi_df],
    ignore_index=True,
)
# Replace the verbose CSV headers with short snake_case names. The assignment
# is positional, so the order here must match the column order of the tables.
raw_hdi_df.columns = ["hdi_rank", "country", "hdi",
                      "life_expectancy_at_birth",
                      "expected_years_of_schooling",
                      "mean_years_of_schooling", "gni",
                      "gni_rank_minus_hdi_rank", "mark"]

In [85]:
# Preview the combined table after the columns were renamed.
raw_hdi_df.head()

Unnamed: 0,hdi_rank,country,hdi,life_expectancy_at_birth,expected_years_of_schooling,mean_years_of_schooling,gni,gni_rank_minus_hdi_rank,mark
0,1,Norway,944,816,175,126,64992,5,VH
1,2,Australia,935,824,202,130,42261,17,VH
2,3,Switzerland,930,830,158,128,56431,6,VH
3,4,Denmark,923,802,187,127,44025,11,VH
4,5,Netherlands,922,816,179,119,45435,9,VH


In [86]:
def object_to_float(s):
    """Convert a Series of decimal-comma strings (e.g. "81,6") to float.

    A Series that is already numeric is cast to float directly, because the
    ``.str`` accessor is only available on string data.
    """
    if pd.api.types.is_numeric_dtype(s):
        return s.astype(float)
    # regex=False: the comma is a literal character, not a pattern.
    return s.str.replace(',', '.', regex=False).astype(float)


def object_to_int(s):
    """Cast a Series to integer dtype."""
    return s.astype(int)

In [87]:
# Columns grouped by the numeric type they are converted to.
object_to_float_columns = [
    "hdi",
    "life_expectancy_at_birth",
    "expected_years_of_schooling",
    "mean_years_of_schooling",
]
object_to_int_columns = [
    "hdi_rank",
    "gni",
    "gni_rank_minus_hdi_rank",
]

In [88]:
# Work on a copy so raw_hdi_df keeps the unconverted values.
hdi_df = raw_hdi_df.copy()
# Apply each converter column-wise to its group of columns.
for converter, column_names in (
    (object_to_float, object_to_float_columns),
    (object_to_int, object_to_int_columns),
):
    hdi_df[column_names] = raw_hdi_df[column_names].apply(converter)

hdi_df.dtypes

hdi_rank                         int64
country                         object
hdi                            float64
life_expectancy_at_birth       float64
expected_years_of_schooling    float64
mean_years_of_schooling        float64
gni                              int64
gni_rank_minus_hdi_rank          int64
mark                            object
dtype: object

In [89]:
# Promote the country name to the row index; set_index also removes it
# from the regular columns.
hdi_df = hdi_df.set_index("country")

In [90]:
# Preview the converted table, now indexed by country.
hdi_df.head()

Unnamed: 0_level_0,hdi_rank,hdi,life_expectancy_at_birth,expected_years_of_schooling,mean_years_of_schooling,gni,gni_rank_minus_hdi_rank,mark
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Norway,1,0.944,81.6,17.5,12.6,64992,5,VH
Australia,2,0.935,82.4,20.2,13.0,42261,17,VH
Switzerland,3,0.93,83.0,15.8,12.8,56431,6,VH
Denmark,4,0.923,80.2,18.7,12.7,44025,11,VH
Netherlands,5,0.922,81.6,17.9,11.9,45435,9,VH


In [91]:
# Export the cleaned table; the country index is written as the first column.
hdi_df.to_csv("files/hdi_with_components.csv")

In [92]:
# Show the first lines of the exported file (relies on the shell `head` utility).
!head "files/hdi_with_components.csv"

country,hdi_rank,hdi,life_expectancy_at_birth,expected_years_of_schooling,mean_years_of_schooling,gni,gni_rank_minus_hdi_rank,mark
Norway,1,0.944,81.6,17.5,12.6,64992,5,VH
Australia,2,0.935,82.4,20.2,13.0,42261,17,VH
Switzerland,3,0.93,83.0,15.8,12.8,56431,6,VH
Denmark,4,0.923,80.2,18.7,12.7,44025,11,VH
Netherlands,5,0.922,81.6,17.9,11.9,45435,9,VH
Germany,6,0.916,80.9,16.5,13.1,43919,11,VH
Ireland,6,0.916,80.9,18.6,12.2,39568,16,VH
United States,8,0.915,79.1,16.5,12.9,52947,3,VH
Canada,9,0.913,82.0,15.9,13.0,42155,11,VH


# Add ISO_A3 country code

In [93]:
# Load the GDI table, using its "Country" column as the row index.
gdi = pd.read_csv("files/gdi.csv", index_col="Country")

In [94]:
# Preview the GDI table; the CODE column holds the three-letter country codes.
gdi.head()

Unnamed: 0_level_0,CODE,GDI,GDI group,HDI F,HDI M,LEB F,LEB M,EYS F,EYS M,MYS F,MYS M,EGN F,EGN M
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Norway,NOR,0.9963,1,0.940411,0.943903,83.6,79.5,18.19054,16.8241,12.68,12.548,57139.785877,72824.953704
Australia,AUS,0.975716,1,0.922306,0.945261,84.5,80.3,20.72467,19.72474,13.0691,12.85581,33688.072522,50913.744323
Switzerland,CHE,0.950473,2,0.898293,0.945101,85.0,80.8,15.70207,15.87935,11.5,13.1,44131.598189,69076.742105
Denmark,DNK,0.976778,1,0.911965,0.933645,82.2,78.3,19.32247,18.07368,12.80576,12.65413,36439.439333,51726.638178
Netherlands,NLD,0.947139,3,0.893001,0.942841,83.3,79.7,17.96617,17.88358,11.61511,12.17535,29500.241616,61641.441877


In [95]:
# Keep only the country-code column (still indexed by country name)
# and give it a lowercase name.
iso_a3_df = gdi[["CODE"]].copy()
iso_a3_df.columns = ["iso_a3"]

In [96]:
# Export the country-name -> ISO code lookup table.
iso_a3_df.to_csv("files/iso_a3_country_codes.csv")

In [97]:
# Show the first lines of the exported lookup (relies on the shell `head` utility).
!head "files/iso_a3_country_codes.csv"

Country,iso_a3
Norway,NOR
Australia,AUS
Switzerland,CHE
Denmark,DNK
Netherlands,NLD
Germany,DEU
Ireland,IRL
United States,USA
Canada,CAN


In [98]:
# Series assignment aligns on the index, so codes are matched by country
# name; a country with no entry in iso_a3_df ends up with NaN.
hdi_df["iso_a3"] = iso_a3_df["iso_a3"]

In [99]:
# Spot-check the first two rows for the new iso_a3 column.
hdi_df.head(2)

Unnamed: 0_level_0,hdi_rank,hdi,life_expectancy_at_birth,expected_years_of_schooling,mean_years_of_schooling,gni,gni_rank_minus_hdi_rank,mark,iso_a3
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Norway,1,0.944,81.6,17.5,12.6,64992,5,VH,NOR
Australia,2,0.935,82.4,20.2,13.0,42261,17,VH,AUS


In [100]:
# Overwrite the earlier export so the file also carries the iso_a3 column.
hdi_df.to_csv("files/hdi_with_components.csv")

In [101]:
# Show the first lines of the rewritten file (relies on the shell `head` utility).
!head "files/hdi_with_components.csv"

country,hdi_rank,hdi,life_expectancy_at_birth,expected_years_of_schooling,mean_years_of_schooling,gni,gni_rank_minus_hdi_rank,mark,iso_a3
Norway,1,0.944,81.6,17.5,12.6,64992,5,VH,NOR
Australia,2,0.935,82.4,20.2,13.0,42261,17,VH,AUS
Switzerland,3,0.93,83.0,15.8,12.8,56431,6,VH,CHE
Denmark,4,0.923,80.2,18.7,12.7,44025,11,VH,DNK
Netherlands,5,0.922,81.6,17.9,11.9,45435,9,VH,NLD
Germany,6,0.916,80.9,16.5,13.1,43919,11,VH,DEU
Ireland,6,0.916,80.9,18.6,12.2,39568,16,VH,IRL
United States,8,0.915,79.1,16.5,12.9,52947,3,VH,USA
Canada,9,0.913,82.0,15.9,13.0,42155,11,VH,CAN
