In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pycountry



### Combining the files in /features
 - [GDP](features/GrossDomesticProduct.csv)
 - [HDI](features/HumanDevelopementIndex.csv)
 - [WDI](features/WorldDevelopmentIndicaton.csv)

In [2]:
## Preprocessing WDI
# Load the raw World Development Indicators export.
wdi_raw = pd.read_csv(
    'features/WorldDevelopmentIndicaton.csv',
    encoding='latin-1',
    engine='python',
)

# Reshape wide -> long: drop the descriptive columns, key each row by
# (country code, series name), then stack the remaining per-year columns
# so every (country, feature, year-label) combination becomes its own row.
wdi = (
    wdi_raw
    .drop(columns=['Country Name', 'Series Code'])
    .rename(columns={'Country Code': 'country', 'Series': 'feature'})
    .set_index(['country', 'feature'])
    .stack()
    .reset_index()
    .rename(columns={'level_2': 'year', 0: 'value'})
)

# The year labels begin with the four-digit year; keep only that part as an int.
wdi['year'] = wdi['year'].apply(lambda label: int(label[:4]))
wdi = wdi.set_index(['country', 'year'])

# '..' is treated as the missing-value marker: turn it into NaN and drop those rows.
# No numeric conversion happens here; non-missing values keep the type they were read as.
wdi['value'] = wdi['value'].apply(lambda cell: np.nan if cell == '..' else cell)
wdi = wdi.dropna().sort_index()
wdi

Unnamed: 0_level_0,Unnamed: 1_level_0,feature,value
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1
AUS,2000,All staff compensation as % of total expenditu...,73.66373
AUS,2000,All staff compensation as % of total expenditu...,66.72803
AUS,2000,All staff compensation as % of total expenditu...,75.69741
AUS,2000,All staff compensation as % of total expenditu...,68.32235
AUS,2000,All staff compensation as % of total expenditu...,72.82737
...,...,...,...
USA,2020,End month of the academic school year (pre-pri...,5
USA,2020,End of the academic school year (pre-primary t...,2020
USA,2020,Official entrance age to lower secondary educa...,12
USA,2020,Official entrance age to primary education (ye...,6


In [3]:
## Preprocessing HDI
# Read in HDI data
hdi_raw = pd.read_csv('features/HumanDevelopementIndex.csv', encoding='latin-1', engine='python')

# Key rows by the 'iso3' code: drop the existing 'country' column first so the
# renamed 'iso3' column can take over the 'country' name without a clash.
hdi = hdi_raw.drop(columns=['country'])
hdi = hdi.rename(columns={'iso3': 'country'})

# Keep exactly one HDI column per year for 2000-2020 inclusive.
# Column names are built straight from the year so that 2000-2009 are selected
# and no year is listed twice (a duplicated label would duplicate the column
# and, after stacking, every row for that year).
hdi_keep = ['country'] + [f'hdi_{year}' for year in range(2000, 2021)]
hdi = hdi[hdi_keep]

# Wide -> long: one row per (country, year-column) pair.
hdi = hdi.set_index('country')
hdi = hdi.stack().reset_index()
hdi = hdi.rename(columns={'level_1': 'year', 0: 'value'})

# Labels look like 'hdi_2015'; strip the 4-character 'hdi_' prefix to get the year.
hdi['year'] = hdi['year'].apply(lambda label: int(label[4:]))

# Tag every row with a constant feature name and index by (country, year).
hdi['feature'] = 'hdi'
hdi = hdi.set_index(['country', 'year'])

In [15]:
## Preprocessing GDP
# Load the raw GDP table; only the read happens in this cell.
gdp_raw = pd.read_csv(
    'features/GrossDomesticProduct.csv',
    encoding='latin-1',
    engine='python',
)


Now, preprocess all the features to select the most important ones.

In [7]:
# Load the combined feature table from disk.
combined_features = pd.read_csv(
    'features/combined_features.csv',
    encoding='latin-1',
    engine='python',
)


In [16]:
import sys
import numpy

# Raise NumPy's summarisation threshold to the largest int so arrays are
# printed in full rather than abbreviated with '...'.
numpy.set_printoptions(
    threshold=sys.maxsize,
)

In [18]:
# Quick overview of the combined table: first rows, total row count,
# number of distinct features, and the distinct feature names themselves.
feature_names = combined_features['feature'].unique()

print(combined_features.head())
print(len(combined_features))
print(len(feature_names))
print(feature_names)

  country  year                                            feature     value
0     AUS  2000  All staff compensation as % of total expenditu...  73.66373
1     AUS  2000  All staff compensation as % of total expenditu...  66.72803
2     AUS  2000  All staff compensation as % of total expenditu...  75.69741
3     AUS  2000  All staff compensation as % of total expenditu...  68.32235
4     AUS  2000  All staff compensation as % of total expenditu...  72.82737
386646
2423
['All staff compensation as % of total expenditure in lower secondary public institutions (%)'
 'All staff compensation as % of total expenditure in post-secondary non-tertiary public institutions (%)'
 'All staff compensation as % of total expenditure in primary public institutions (%)'
 'All staff compensation as % of total expenditure in public institutions (%)'
 'All staff compensation as % of total expenditure in secondary public institutions (%)'
 'All staff compensation as % of total expenditure in tertiary public