In [2]:
# Imports
import pandas as pd

In [3]:
# Load in data

# Encoding is specified for each import to resolve an "unexpected continuation byte" error

# https://www.kaggle.com/code/nelgiriyewithana/introduction-to-world-educational-data/notebook
world_education_data = pd.read_csv("Data/Global_Education.csv", encoding="iso-8859-1")

# https://databank.worldbank.org/source/education-statistics-%5E-all-indicators
world_bank_education_data = pd.read_csv("Data/World Bank Education Data.csv", encoding="ascii")

# The following are from the UNESCO Institute for Statistics Data Browser for Education available here: https://databrowser.uis.unesco.org/browser/EDUCATION/UIS-SDG4Monitoring
teacher_data = pd.read_csv("Data/UIS Teacher Data/data.csv")
education_years_data = pd.read_csv("Data/UIS Years of Education/data.csv")
teacher_attrition_data = pd.read_csv("Data/UIS Teacher Attrition/data.csv")

In [4]:
#UIS Data

#Dropping values we do not need
teacher_data = teacher_data.drop(["qualifier", "magnitude"], axis=1)
education_years_data = education_years_data.drop(["qualifier", "magnitude"], axis=1)
teacher_attrition_data = teacher_attrition_data.drop(["qualifier", "magnitude"], axis=1)

In [5]:
# Work out which year has the most complete data
print((teacher_data["year"].value_counts() + education_years_data["year"].value_counts() + teacher_attrition_data["year"].value_counts()).sort_values())

year
2024     150.0
2011     663.0
2012     691.0
2013     906.0
2014    1035.0
2015    1090.0
2017    1119.0
2016    1120.0
2018    1143.0
2020    1147.0
2023    1161.0
2019    1165.0
2022    1166.0
2021    1199.0
2010       NaN
Name: count, dtype: float64


In [6]:
#2021 is the most complete year, so we will use it
years = range(2010, 2022)

new_data = pd.DataFrame(columns=["indicator", "country"])

for i in years: 
    new_data[str(i)] = []

In [7]:
#Data in the files is sorted by indicatorId which makes it easier to add them to our new_data dataframe

def load_into_data(data: pd.DataFrame, dataset: pd.DataFrame) -> pd.DataFrame:
    for index, _ in dataset.iterrows():
        if index == 0:
            lastId = None
            lastGeo = None
        else:
            lastId = dataset.iloc[index -1]["indicatorId"]
            lastGeo = dataset.iloc[index -1]["geoUnit"]

        currId = dataset.iloc[index]["indicatorId"]
        currGeo = dataset.iloc[index]["geoUnit"]
        year = dataset.iloc[index]["year"]
        
        if year > 2021:
            continue

        if lastId != currId or lastGeo != currGeo:
            df = pd.DataFrame({"indicator": [currId], "country": [currGeo]})
            data = pd.concat([data, df], ignore_index=True)
        data.at[len(data) -1, str(year)] = dataset.iloc[index]["value"]
    return data

new_data = load_into_data(new_data,teacher_data)
new_data = load_into_data(new_data,education_years_data)
new_data = load_into_data(new_data,teacher_attrition_data)



In [8]:
data = new_data.copy(deep=True)

In [9]:
def get_average_for_row(row: pd.Series) -> float:
    total = 0
    count = 0
    for cell in row:
        try:
            cell = float(cell)
        except ValueError:
            continue
        if math.isnan(cell):
            continue
        total += cell
        count += 1

    if count == 0:
        return 0
    
    return total / count

In [10]:
# Replacing missing values

import math
# Fill in missing 2021 values with the average of previous years

# Get which indicies are missing a value of 2021.
indexes_to_fill = []
for index, row in data.iterrows():
    if math.isnan(row["2021"]):
        indexes_to_fill.append(index)

print(indexes_to_fill)

for i in indexes_to_fill:
    data.at[i, "2021"] = get_average_for_row(data.iloc[i])

data
    

[6, 9, 13, 17, 18, 19, 22, 26, 27, 28, 29, 32, 36, 40, 42, 45, 48, 53, 54, 55, 56, 58, 59, 61, 63, 64, 68, 69, 71, 72, 74, 76, 78, 82, 84, 86, 87, 88, 89, 94, 96, 97, 98, 101, 103, 107, 110, 112, 115, 118, 121, 125, 127, 133, 135, 139, 143, 148, 149, 150, 153, 159, 166, 169, 171, 174, 179, 180, 186, 190, 194, 195, 197, 201, 206, 207, 210, 212, 213, 215, 219, 222, 224, 227, 228, 229, 230, 236, 239, 241, 244, 246, 247, 248, 251, 254, 257, 260, 263, 264, 268, 272, 273, 275, 281, 283, 287, 291, 296, 299, 300, 303, 305, 314, 316, 317, 322, 328, 332, 334, 335, 337, 338, 343, 344, 345, 346, 347, 348, 349, 350, 351, 353, 354, 355, 357, 359, 361, 362, 363, 371, 372, 375, 376, 377, 378, 379, 380, 381, 383, 388, 389, 393, 396, 399, 400, 406, 408, 411, 415, 419, 420, 421, 422, 425, 429, 430, 431, 432, 435, 439, 441, 444, 446, 449, 453, 458, 459, 460, 461, 463, 464, 466, 468, 469, 473, 474, 476, 477, 479, 481, 483, 487, 489, 491, 492, 493, 494, 499, 501, 502, 503, 508, 512, 515, 517, 520, 523, 526,

Unnamed: 0,indicator,country,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,PTRHC.02.QUALIFIED,AGO,,,,,,,88.033783,,,,,67.013420
1,PTRHC.02.QUALIFIED,ALB,,,,28.42535,25.840099,24.001751,22.056330,20.786989,23.51306,24.01643,20.371010,20.236031
2,PTRHC.02.QUALIFIED,AND,,,,14.08721,13.703910,13.408840,13.684210,13.883440,13.27711,12.56213,12.051720,11.747060
3,PTRHC.02.QUALIFIED,ARE,,,,,,,,,27.26619,26.76902,23.275150,21.276951
4,PTRHC.02.QUALIFIED,ARM,,,,,,,,,6.27090,6.20682,6.918270,4.534200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1596,TATTRR.2T3,TCA,,,,,,,,,,,,9.879030
1597,TATTRR.2T3,TKL,,,,,,,,,,,0.000000,0.000000
1598,TATTRR.2T3,TUV,,,,,,,,,,,22.619049,1.515150
1599,TATTRR.2T3,UZB,,,,,,,3.150820,,,,,3.150820


In [11]:
# Converting country codes to names
import pycountry

for index, _ in data.iterrows():
    data.at[index, "country"] = pycountry.countries.get(alpha_3 = data.at[index, "country"]).name

data

Unnamed: 0,indicator,country,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,PTRHC.02.QUALIFIED,Angola,,,,,,,88.033783,,,,,67.013420
1,PTRHC.02.QUALIFIED,Albania,,,,28.42535,25.840099,24.001751,22.056330,20.786989,23.51306,24.01643,20.371010,20.236031
2,PTRHC.02.QUALIFIED,Andorra,,,,14.08721,13.703910,13.408840,13.684210,13.883440,13.27711,12.56213,12.051720,11.747060
3,PTRHC.02.QUALIFIED,United Arab Emirates,,,,,,,,,27.26619,26.76902,23.275150,21.276951
4,PTRHC.02.QUALIFIED,Armenia,,,,,,,,,6.27090,6.20682,6.918270,4.534200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1596,TATTRR.2T3,Turks and Caicos Islands,,,,,,,,,,,,9.879030
1597,TATTRR.2T3,Tokelau,,,,,,,,,,,0.000000,0.000000
1598,TATTRR.2T3,Tuvalu,,,,,,,,,,,22.619049,1.515150
1599,TATTRR.2T3,Uzbekistan,,,,,,,3.150820,,,,,3.150820


In [12]:
# Removing all but 2021

data = data.drop(["2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017", "2018", "2019", "2020"], axis=1)
data

Unnamed: 0,indicator,country,2021
0,PTRHC.02.QUALIFIED,Angola,67.013420
1,PTRHC.02.QUALIFIED,Albania,20.236031
2,PTRHC.02.QUALIFIED,Andorra,11.747060
3,PTRHC.02.QUALIFIED,United Arab Emirates,21.276951
4,PTRHC.02.QUALIFIED,Armenia,4.534200
...,...,...,...
1596,TATTRR.2T3,Turks and Caicos Islands,9.879030
1597,TATTRR.2T3,Tokelau,0.000000
1598,TATTRR.2T3,Tuvalu,1.515150
1599,TATTRR.2T3,Uzbekistan,3.150820


In [13]:
columns = ["country"]
columns.extend(data["indicator"].unique())

final_data = pd.DataFrame(columns=columns)

final_data

Unnamed: 0,country,PTRHC.02.QUALIFIED,PTRHC.1.QUALIFIED,PTRHC.2T3.QUALIFIED,QUTP.02,QUTP.1,QUTP.2T3,YEARS.FC.COMP.1T3,YEARS.FC.FREE.02,YEARS.FC.FREE.1T3,TATTRR.02,TATTRR.1,TATTRR.2T3


In [14]:
for index, _ in data.iterrows():
    country = data.at[index, "country"]
    indicator = data.at[index, "indicator"]
    indexes: list = final_data[final_data["country"] == country].index.tolist()

    if len(indexes) == 0:
        final_data.loc[len(final_data)] = pd.Series()
        final_data.at[final_data.index[-1], "country"] = country
        indexes.append(len(final_data) -1)
    final_data.at[indexes[0], indicator] = data.iloc[index]["2021"]
    

In [15]:
# World Bank Education data
world_bank_education_data = world_bank_education_data.drop(world_bank_education_data.columns[[1, 3]], axis=1)

for index, row in world_bank_education_data.iterrows():
    if world_bank_education_data.loc[index]["2021 [YR2021]"] == "..":
        world_bank_education_data.loc[index]["2021 [YR2021]"] = get_average_for_row(world_bank_education_data.loc[index])

world_bank_education_data

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  world_bank_education_data.loc[index]["2021 [YR2021]"] = get_average_for_row(world_bank_education_data.loc[index])


Unnamed: 0,Country Name,Series,2000 [YR2000],2001 [YR2001],2002 [YR2002],2003 [YR2003],2004 [YR2004],2005 [YR2005],2006 [YR2006],2007 [YR2007],...,2015 [YR2015],2016 [YR2016],2017 [YR2017],2018 [YR2018],2019 [YR2019],2020 [YR2020],2021 [YR2021],2022 [YR2022],2023 [YR2023],2024 [YR2024]
0,Afghanistan,Human Capital Index (HCI): Expected Years of S...,..,..,..,..,..,..,..,..,...,..,..,9.21,..,..,..,9.21,..,..,..
1,Afghanistan,Human Capital Index (HCI): Expected Years of S...,..,..,..,..,..,..,..,..,...,..,..,6.73,..,..,..,6.73,..,..,..
2,Afghanistan,Human Capital Index (HCI): Expected Years of S...,..,..,..,..,..,..,..,..,...,..,..,8.58,..,..,..,8.58,..,..,..
3,Afghanistan,Government expenditure on secondary education ...,..,..,..,..,..,..,..,..,...,0.79299,0.92321,0.90325,..,..,..,0.870019,..,..,..
4,Afghanistan,Government expenditure on tertiary education a...,..,..,..,..,..,..,..,..,...,0.51942,..,..,..,..,..,0.390228,..,..,..
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1627,Zimbabwe,Human Capital Index (HCI): Expected Years of S...,..,..,..,..,..,..,..,..,...,..,..,9.99,..,..,..,9.99,..,..,..
1628,Zimbabwe,Human Capital Index (HCI): Expected Years of S...,..,..,..,..,..,..,..,..,...,..,..,10.01,..,..,..,10.01,..,..,..
1629,Zimbabwe,Government expenditure on secondary education ...,..,..,..,..,..,..,..,..,...,..,..,..,..,..,..,1.305727,..,..,..
1630,Zimbabwe,Government expenditure on tertiary education a...,..,..,..,..,..,..,..,..,...,..,..,..,..,..,..,0.788906,..,..,..


In [16]:
world_bank_education_data.drop(world_bank_education_data.columns[[2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,-1, -2, -3]], axis = 1)

Unnamed: 0,Country Name,Series,2021 [YR2021]
0,Afghanistan,Human Capital Index (HCI): Expected Years of S...,9.21
1,Afghanistan,Human Capital Index (HCI): Expected Years of S...,6.73
2,Afghanistan,Human Capital Index (HCI): Expected Years of S...,8.58
3,Afghanistan,Government expenditure on secondary education ...,0.870019
4,Afghanistan,Government expenditure on tertiary education a...,0.390228
...,...,...,...
1627,Zimbabwe,Human Capital Index (HCI): Expected Years of S...,9.99
1628,Zimbabwe,Human Capital Index (HCI): Expected Years of S...,10.01
1629,Zimbabwe,Government expenditure on secondary education ...,1.305727
1630,Zimbabwe,Government expenditure on tertiary education a...,0.788906


In [17]:
world_bank_education_data.rename({"Country Name": "country"})
columns = ["country"]
columns.extend(world_bank_education_data["Series"].unique())
final_world_bank_education_data = pd.DataFrame(columns=columns)

for index, _ in world_bank_education_data.iterrows():
    country = world_bank_education_data.at[index, "Country Name"]
    indicator = world_bank_education_data.at[index, "Series"]
    indexes: list = final_world_bank_education_data[final_world_bank_education_data["country"] == country].index.to_list()

    if len(indexes) == 0:
        final_world_bank_education_data.loc[len(final_world_bank_education_data)] = pd.Series()
        final_world_bank_education_data.at[final_world_bank_education_data.index[-1], "country"] = country
        indexes.append(len(final_world_bank_education_data) -1)
    final_world_bank_education_data.at[indexes[0], indicator] = world_bank_education_data.iloc[index]["2021 [YR2021]"]

In [18]:
world_education_data = world_education_data.drop(world_education_data.columns[[1,2,-1,-4]], axis=1)

In [19]:
world_education_data.rename(columns={"Countries and areas": "country"}, inplace=True)

In [249]:
# Joining data together
data = pd.merge(final_world_bank_education_data, world_education_data, on="country")

In [250]:
data

Unnamed: 0,country,"Human Capital Index (HCI): Expected Years of School, Male","Human Capital Index (HCI): Expected Years of School, Female","Human Capital Index (HCI): Expected Years of School, Total",Government expenditure on secondary education as % of GDP (%),Government expenditure on tertiary education as % of GDP (%),Government expenditure on primary education as % of GDP (%),OOSR_Pre0Primary_Age_Male,OOSR_Pre0Primary_Age_Female,OOSR_Primary_Age_Male,...,Grade_2_3_Proficiency_Reading,Grade_2_3_Proficiency_Math,Primary_End_Proficiency_Reading,Primary_End_Proficiency_Math,Lower_Secondary_End_Proficiency_Reading,Lower_Secondary_End_Proficiency_Math,Youth_15_24_Literacy_Rate_Male,Youth_15_24_Literacy_Rate_Female,Gross_Primary_Education_Enrollment,Gross_Tertiary_Education_Enrollment
0,Afghanistan,9.21,6.73,8.58,0.870019,0.390228,1.94682,0,0,0,...,22,25,13,11,0,0,74,56,104.0,9.7
1,Albania,13.08,12.89,12.99,0.806952,0.735245,2.01613,4,2,6,...,0,0,0,0,48,58,99,100,107.0,55.0
2,Algeria,11.01,11.84,11.42,1.821173,1.16981,1.601133,0,0,0,...,0,0,0,0,21,19,98,97,109.9,51.4
3,Andorra,0,0,0,0.6337,0.134172,0.661137,0,0,0,...,0,0,0,0,0,0,0,0,106.4,0.0
4,Angola,8.69,7.07,7.89,1.06928,0.187065,0.745475,31,39,0,...,0,0,0,0,0,0,0,0,113.5,9.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166,Uruguay,0,0,11.78,1.141612,0.796504,0.95779,2,1,3,...,80,75,59,68,58,49,99,99,108.5,63.1
167,Uzbekistan,0,0,0,0,0,0,53,55,0,...,0,0,0,0,0,0,100,100,104.2,10.1
168,Vanuatu,10.53,10.77,10.64,2.375857,0.59755,2.458621,38,38,8,...,0,0,0,0,0,0,96,97,109.3,4.7
169,Zambia,0,0,9.15,0.633537,0.43014,1.921707,0,0,17,...,0,0,0,0,5,2,93,92,98.7,4.1


In [251]:
data = data.astype({col: float for col in data.columns[1:]})

In [252]:
# Multivariate Analysis
from matplotlib import pyplot as plt
import numpy as np
from scipy.stats import pearsonr

def multivariate_analysis(x: pd.Series, y: pd.Series) -> float:
    return pearsonr(x, y)

In [253]:
def calculate_corrs():
    correlations = []
    strong_corr = []
    medium_corr = []
    index = 2
    for series_name, series in data.items():
        if series_name == "country":
            continue
        for col in data.columns[index:]:
            corr = multivariate_analysis(series, data[col])
            correlations.append({"x": series_name, "y": data[col].name, "value": corr.statistic, "pvalue": corr.pvalue})
        index += 1
    correlations = pd.DataFrame.from_dict(correlations)
    # Get all results that indicate high correlations
    strong_corr = correlations.query("pvalue <= 0.05 & value >= 0.7")
    # Get all results that indicate medium correlations
    medium_corr = correlations.query("pvalue <= 0.05 & value < 0.7 & value >= 0.4")

    return (correlations, strong_corr, medium_corr)

correlations, strong_corr, medium_corr = calculate_corrs()

In [254]:
print(f"Total number of correlations: {len(correlations)}")
print(f"Number of strong correlations: {len(strong_corr)}")
print(f"Number of medium correlations: {len(medium_corr)}")

Total number of correlations: 435
Number of strong correlations: 33
Number of medium correlations: 37


In [255]:
strong_corr

Unnamed: 0,x,y,value,pvalue
0,Human Capital Index (HCI): Expected Years of S...,Human Capital Index (HCI): Expected Years of S...,0.996595,4.287882e-185
1,Human Capital Index (HCI): Expected Years of S...,Human Capital Index (HCI): Expected Years of S...,0.780694,2.437059e-36
29,Human Capital Index (HCI): Expected Years of S...,Human Capital Index (HCI): Expected Years of S...,0.784344,6.98352e-37
159,OOSR_Pre0Primary_Age_Male,OOSR_Pre0Primary_Age_Female,0.983515,1.86905e-127
204,OOSR_Primary_Age_Male,OOSR_Primary_Age_Female,0.968792,2.666759e-104
226,OOSR_Primary_Age_Female,OOSR_Lower_Secondary_Age_Female,0.706422,3.713119e-27
245,OOSR_Lower_Secondary_Age_Male,OOSR_Lower_Secondary_Age_Female,0.965014,3.5571419999999997e-100
246,OOSR_Lower_Secondary_Age_Male,OOSR_Upper_Secondary_Age_Male,0.805786,2.67763e-40
247,OOSR_Lower_Secondary_Age_Male,OOSR_Upper_Secondary_Age_Female,0.804199,4.955135e-40
264,OOSR_Lower_Secondary_Age_Female,OOSR_Upper_Secondary_Age_Male,0.765731,3.217123e-34


In [256]:
medium_corr

Unnamed: 0,x,y,value,pvalue
23,Human Capital Index (HCI): Expected Years of S...,Lower_Secondary_End_Proficiency_Reading,0.505283,1.813731e-12
24,Human Capital Index (HCI): Expected Years of S...,Lower_Secondary_End_Proficiency_Math,0.526808,1.348283e-13
28,Human Capital Index (HCI): Expected Years of S...,Gross_Tertiary_Education_Enrollment,0.454839,4.11575e-10
51,Human Capital Index (HCI): Expected Years of S...,Lower_Secondary_End_Proficiency_Reading,0.512396,7.840209e-13
52,Human Capital Index (HCI): Expected Years of S...,Lower_Secondary_End_Proficiency_Math,0.533664,5.664785e-14
56,Human Capital Index (HCI): Expected Years of S...,Gross_Tertiary_Education_Enrollment,0.46463,1.537232e-10
74,Human Capital Index (HCI): Expected Years of S...,Grade_2_3_Proficiency_Reading,0.441709,1.469593e-09
78,Human Capital Index (HCI): Expected Years of S...,Lower_Secondary_End_Proficiency_Reading,0.61161,6.358790999999999e-19
79,Human Capital Index (HCI): Expected Years of S...,Lower_Secondary_End_Proficiency_Math,0.62735,4.2764009999999997e-20
83,Human Capital Index (HCI): Expected Years of S...,Gross_Tertiary_Education_Enrollment,0.518983,3.542776e-13


In [257]:
# Scaling
from sklearn.decomposition import PCA
from sklearn import preprocessing

for series_name, series in data.items():
    if series_name == "country":
        continue
    data[series_name] = preprocessing.scale(series)

In [258]:
# Most of the strong correlations occur between completion rates and out of school rates. I have decided to use PCA to turn these into one variable each.
pca = PCA()
data_to_fit = data[["Completion_Rate_Primary_Male", "Completion_Rate_Primary_Female", "Completion_Rate_Lower_Secondary_Male", "Completion_Rate_Lower_Secondary_Female","Completion_Rate_Upper_Secondary_Male","Completion_Rate_Upper_Secondary_Female"]]
pca.fit(data_to_fit)
pca_data = pca.transform(data_to_fit)
per_var = np.round(pca.explained_variance_ratio_ * 100, decimals=1)
per_var

array([94.9,  3.9,  0.6,  0.4,  0.1,  0. ])

In [259]:
# PCA 1 explains most of the variance, so it will replace the completion rate categories
data = data.drop(data_to_fit, axis=1)
data["Completion_Rate_Component"] = pd.DataFrame.from_dict(pca_data)[0]

In [260]:
pca = PCA()
data_to_fit = data[["OOSR_Pre0Primary_Age_Male", "OOSR_Pre0Primary_Age_Female", "OOSR_Primary_Age_Male", "OOSR_Primary_Age_Female","OOSR_Lower_Secondary_Age_Male","OOSR_Lower_Secondary_Age_Female", "OOSR_Upper_Secondary_Age_Male", "OOSR_Upper_Secondary_Age_Female"]]
pca.fit(data_to_fit)
pca_data = pca.transform(data_to_fit)
per_var = np.round(pca.explained_variance_ratio_ * 100, decimals=1)
per_var

array([68. , 16.1, 10.6,  3.9,  0.7,  0.2,  0.2,  0.1])

In [261]:
# For the OOS rates, there are three components that explain most of the variance. These will replace the eight categories used to create them
data = data.drop(data_to_fit, axis=1)
data["OOS_Rate_Component_1"] = pd.DataFrame.from_dict(pca_data)[0]
data["OOS_Rate_Component_2"] = pd.DataFrame.from_dict(pca_data)[1]
data["OOS_Rate_Component_3"] = pd.DataFrame.from_dict(pca_data)[2]

In [262]:
data

Unnamed: 0,country,"Human Capital Index (HCI): Expected Years of School, Male","Human Capital Index (HCI): Expected Years of School, Female","Human Capital Index (HCI): Expected Years of School, Total",Government expenditure on secondary education as % of GDP (%),Government expenditure on tertiary education as % of GDP (%),Government expenditure on primary education as % of GDP (%),Grade_2_3_Proficiency_Reading,Grade_2_3_Proficiency_Math,Primary_End_Proficiency_Reading,...,Lower_Secondary_End_Proficiency_Reading,Lower_Secondary_End_Proficiency_Math,Youth_15_24_Literacy_Rate_Male,Youth_15_24_Literacy_Rate_Female,Gross_Primary_Education_Enrollment,Gross_Tertiary_Education_Enrollment,Completion_Rate_Component,OOS_Rate_Component_1,OOS_Rate_Component_2,OOS_Rate_Component_3
0,Afghanistan,0.276104,-0.176772,-0.053910,-0.469912,-0.076733,0.749592,-0.037060,0.225145,0.090075,...,-0.819213,-0.792686,0.797826,0.423888,0.306942,-0.828116,0.344230,-0.238755,1.984859,-0.313984
1,Albania,0.983405,0.933304,0.829305,-0.543257,-0.076699,0.830017,-0.659474,-0.612536,-0.436862,...,0.641912,1.032682,1.343715,1.390060,0.406199,0.670152,3.968097,-0.841514,0.303088,0.458349
2,Algeria,0.605081,0.744087,0.514873,0.636254,-0.076655,0.348467,-0.659474,-0.612536,-0.436862,...,-0.179970,-0.194721,1.321879,1.324184,0.502148,0.551085,1.685814,-1.977419,-0.106324,0.307244
3,Andorra,-1.407163,-1.389566,-1.772276,-0.744745,-0.076758,-0.742277,-0.659474,-0.612536,-0.436862,...,-0.819213,-0.792686,-0.818002,-0.805785,0.386348,-1.148938,-2.165288,-1.977419,-0.106324,0.307244
4,Angola,0.181066,-0.115501,-0.192101,-0.238177,-0.076753,-0.644413,-0.659474,-0.612536,-0.436862,...,-0.819213,-0.792686,-0.818002,-0.805785,0.621256,-0.841346,0.351325,-1.045277,-1.270958,-1.041499
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166,Uruguay,-1.407163,-1.389566,0.586972,-0.154057,-0.076693,-0.398049,1.603848,1.900508,1.954621,...,0.946313,0.749435,1.343715,1.368101,0.455828,0.938055,2.305490,-1.336099,0.131209,0.399614
167,Uzbekistan,-1.407163,-1.389566,-1.772276,-1.481720,-0.076772,-1.509440,-0.659474,-0.612536,-0.436862,...,-0.819213,-0.792686,1.365550,1.390060,0.313559,-0.814887,-2.165288,0.027520,-1.383693,-1.839081
168,Vanuatu,0.517354,0.551265,0.358657,1.281336,-0.076712,1.343471,-0.659474,-0.612536,-0.436862,...,-0.819213,-0.792686,1.278208,1.324184,0.482296,-0.993488,-2.165288,1.036925,-0.117382,-0.885149
169,Zambia,-1.407163,-1.389566,0.060247,-0.744935,-0.076729,0.720451,-0.659474,-0.612536,-0.436862,...,-0.667012,-0.729742,1.212701,1.214392,0.131588,-1.013333,1.200229,-0.984370,-1.037457,1.781639


In [264]:
# I have decided to turn expected years male and female, as well as the youth literacy rates into deltas
expected_years_delta = pd.Series(dtype='float64')
literacy_rate_delta = pd.Series(dtype='float64')

for _, row in data.iterrows():
    x = abs(float(row["Human Capital Index (HCI): Expected Years of School, Male"]) - float(row["Human Capital Index (HCI): Expected Years of School, Female"]))
    y = abs(float(row["Youth_15_24_Literacy_Rate_Male"]) - float(row["Youth_15_24_Literacy_Rate_Female"]))
    expected_years_delta[len(expected_years_delta)] = x
    literacy_rate_delta[len(literacy_rate_delta)] = y

data = data.drop(data[["Human Capital Index (HCI): Expected Years of School, Male", "Human Capital Index (HCI): Expected Years of School, Female", "Youth_15_24_Literacy_Rate_Male", "Youth_15_24_Literacy_Rate_Female"]], axis=1)
data["Expected Years of School Delta"] = expected_years_delta
data["Literacy Rate Delta"] = literacy_rate_delta

In [265]:
# I have decided to use the average of the math and reading scores
grade_2_3_proficiency = pd.Series(dtype='float64')
primary_end_proficiency = pd.Series(dtype='float64')
lower_secondary_end_proficiency = pd.Series(dtype='float64')

for _, row in data.iterrows():
    x = (row["Grade_2_3_Proficiency_Math"] + row["Grade_2_3_Proficiency_Reading"])/2.0
    y = (row["Primary_End_Proficiency_Math"] + row["Primary_End_Proficiency_Reading"])/2.0
    z = (row["Lower_Secondary_End_Proficiency_Math"] + row["Lower_Secondary_End_Proficiency_Reading"])/2.0
    grade_2_3_proficiency[len(grade_2_3_proficiency)] = x
    primary_end_proficiency[len(primary_end_proficiency)] = y
    lower_secondary_end_proficiency[len(lower_secondary_end_proficiency)] = z

data = data.drop(data[["Grade_2_3_Proficiency_Math", "Grade_2_3_Proficiency_Reading", "Primary_End_Proficiency_Math", "Primary_End_Proficiency_Reading", "Lower_Secondary_End_Proficiency_Math", "Lower_Secondary_End_Proficiency_Reading"]], axis=1)
data["Grade_2_3_Proficiency"] = grade_2_3_proficiency
data["Primary_End_Proficiency"] = primary_end_proficiency
data["Lower_Secondary_End_Proficiency"] = lower_secondary_end_proficiency

In [266]:
data

Unnamed: 0,country,"Human Capital Index (HCI): Expected Years of School, Total",Government expenditure on secondary education as % of GDP (%),Government expenditure on tertiary education as % of GDP (%),Government expenditure on primary education as % of GDP (%),Gross_Primary_Education_Enrollment,Gross_Tertiary_Education_Enrollment,Completion_Rate_Component,OOS_Rate_Component_1,OOS_Rate_Component_2,OOS_Rate_Component_3,Expected Years of School Delta,Literacy Rate Delta,Grade_2_3_Proficiency,Primary_End_Proficiency,Lower_Secondary_End_Proficiency
0,Afghanistan,-0.053910,-0.469912,-0.076733,0.749592,0.306942,-0.828116,0.344230,-0.238755,1.984859,-0.313984,0.452875,0.373939,0.094042,0.056947,-0.805949
1,Albania,0.829305,-0.543257,-0.076699,0.830017,0.406199,0.670152,3.968097,-0.841514,0.303088,0.458349,0.050100,0.046345,-0.636005,-0.452690,0.837297
2,Algeria,0.514873,0.636254,-0.076655,0.348467,0.502148,0.551085,1.685814,-1.977419,-0.106324,0.307244,0.139006,0.002305,-0.636005,-0.452690,-0.187346
3,Andorra,-1.772276,-0.744745,-0.076758,-0.742277,0.386348,-1.148938,-2.165288,-1.977419,-0.106324,0.307244,0.017598,0.012217,-0.636005,-0.452690,-0.805949
4,Angola,-0.192101,-0.238177,-0.076753,-0.644413,0.621256,-0.841346,0.351325,-1.045277,-1.270958,-1.041499,0.296567,0.012217,-0.636005,-0.452690,-0.805949
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166,Uruguay,0.586972,-0.154057,-0.076693,-0.398049,0.455828,0.938055,2.305490,-1.336099,0.131209,0.399614,0.017598,0.024387,1.752178,2.264819,0.847874
167,Uzbekistan,-1.772276,-1.481720,-0.076772,-1.509440,0.313559,-0.814887,-2.165288,0.027520,-1.383693,-1.839081,0.017598,0.024510,-0.636005,-0.452690,-0.805949
168,Vanuatu,0.358657,1.281336,-0.076712,1.343471,0.482296,-0.993488,-2.165288,1.036925,-0.117382,-0.885149,0.033912,0.045976,-0.636005,-0.452690,-0.805949
169,Zambia,0.060247,-0.744935,-0.076729,0.720451,0.131588,-1.013333,1.200229,-0.984370,-1.037457,1.781639,0.017598,0.001691,-0.636005,-0.452690,-0.698377


In [267]:
# Weighting and Aggregation

In [268]:
# Visualisation of Results