In [254]:
# Imports
import pandas as pd

In [255]:
# Load in data

# Encoding is specified for each import to resolve an "unexpected continuation byte" error

# https://www.kaggle.com/code/nelgiriyewithana/introduction-to-world-educational-data/notebook
world_education_data = pd.read_csv("Data/Global_Education.csv", encoding="iso-8859-1")

# https://databank.worldbank.org/source/education-statistics-%5E-all-indicators
world_bank_education_data = pd.read_csv("Data/World Bank Education Data.csv", encoding="ascii")

# The following are from the UNESCO Institute for Statistics Data Browser for Education available here: https://databrowser.uis.unesco.org/browser/EDUCATION/UIS-SDG4Monitoring
teacher_data = pd.read_csv("Data/UIS Teacher Data/data.csv")
education_years_data = pd.read_csv("Data/UIS Years of Education/data.csv")
teacher_attrition_data = pd.read_csv("Data/UIS Teacher Attrition/data.csv")

In [256]:
#Dropping values we do not need
teacher_data = teacher_data.drop(["qualifier", "magnitude"], axis=1)
education_years_data = education_years_data.drop(["qualifier", "magnitude"], axis=1)
teacher_attrition_data = teacher_attrition_data.drop(["qualifier", "magnitude"], axis=1)

In [257]:
# Work out which year has the most complete data
print((teacher_data["year"].value_counts() + education_years_data["year"].value_counts() + teacher_attrition_data["year"].value_counts()).sort_values())

year
2024     150.0
2011     663.0
2012     691.0
2013     906.0
2014    1035.0
2015    1090.0
2017    1119.0
2016    1120.0
2018    1143.0
2020    1147.0
2023    1161.0
2019    1165.0
2022    1166.0
2021    1199.0
2010       NaN
Name: count, dtype: float64


In [258]:
#2021 is the most complete year, so we will use it
years = range(2010, 2022)

new_data = pd.DataFrame(columns=["indicator", "country"])

for i in years: 
    new_data[str(i)] = []

In [259]:
#Data in the files is sorted by indicatorId which makes it easier to add them to our new_data dataframe

def load_into_data(data: pd.DataFrame, dataset: pd.DataFrame) -> pd.DataFrame:
    for index, _ in dataset.iterrows():
        if index == 0:
            lastId = None
            lastGeo = None
        else:
            lastId = dataset.iloc[index -1]["indicatorId"]
            lastGeo = dataset.iloc[index -1]["geoUnit"]

        currId = dataset.iloc[index]["indicatorId"]
        currGeo = dataset.iloc[index]["geoUnit"]
        year = dataset.iloc[index]["year"]
        
        if year > 2021:
            continue

        if lastId != currId or lastGeo != currGeo:
            df = pd.DataFrame({"indicator": [currId], "country": [currGeo]})
            data = pd.concat([data, df], ignore_index=True)
        data.at[len(data) -1, str(year)] = dataset.iloc[index]["value"]
    return data

new_data = load_into_data(new_data,teacher_data)
new_data = load_into_data(new_data,education_years_data)
new_data = load_into_data(new_data,teacher_attrition_data)



In [270]:
data = new_data.copy(deep=True)

In [273]:
# Replacing missing values

import math
# Fill in missing 2021 values with the average of previous years

# Get which indicies are missing a value of 2021.
indexes_to_fill = []
for index, row in data.iterrows():
    if math.isnan(row["2021"]):
        indexes_to_fill.append(index)

def get_average_for_row(row: pd.Series) -> float:
    total = 0
    count = 0
    for cell in row:
        try:
            int(cell)
        except ValueError:
            continue
        if math.isnan(cell):
            continue
        total += cell
        count += 1

    if count == 0:
        return 0
    
    return total / count

print(indexes_to_fill)

for i in indexes_to_fill:
    data.at[i, "2021"] = get_average_for_row(data.iloc[i])

data
    

[]


Unnamed: 0,indicator,country,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,PTRHC.02.QUALIFIED,AGO,,,,,,,88.033783,,,,,67.013420
1,PTRHC.02.QUALIFIED,ALB,,,,28.42535,25.840099,24.001751,22.056330,20.786989,23.51306,24.01643,20.371010,20.236031
2,PTRHC.02.QUALIFIED,AND,,,,14.08721,13.703910,13.408840,13.684210,13.883440,13.27711,12.56213,12.051720,11.747060
3,PTRHC.02.QUALIFIED,ARE,,,,,,,,,27.26619,26.76902,23.275150,21.276951
4,PTRHC.02.QUALIFIED,ARM,,,,,,,,,6.27090,6.20682,6.918270,4.534200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1596,TATTRR.2T3,TCA,,,,,,,,,,,,9.879030
1597,TATTRR.2T3,TKL,,,,,,,,,,,0.000000,0.000000
1598,TATTRR.2T3,TUV,,,,,,,,,,,22.619049,1.515150
1599,TATTRR.2T3,UZB,,,,,,,3.150820,,,,,3.150820


In [282]:
# Converting country codes to names
import pycountry

for index, _ in data.iterrows():
    data.at[index, "country"] = pycountry.countries.get(alpha_3 = data.at[index, "country"]).name

data

Unnamed: 0,indicator,country,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,PTRHC.02.QUALIFIED,Angola,,,,,,,88.033783,,,,,67.013420
1,PTRHC.02.QUALIFIED,Albania,,,,28.42535,25.840099,24.001751,22.056330,20.786989,23.51306,24.01643,20.371010,20.236031
2,PTRHC.02.QUALIFIED,Andorra,,,,14.08721,13.703910,13.408840,13.684210,13.883440,13.27711,12.56213,12.051720,11.747060
3,PTRHC.02.QUALIFIED,United Arab Emirates,,,,,,,,,27.26619,26.76902,23.275150,21.276951
4,PTRHC.02.QUALIFIED,Armenia,,,,,,,,,6.27090,6.20682,6.918270,4.534200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1596,TATTRR.2T3,Turks and Caicos Islands,,,,,,,,,,,,9.879030
1597,TATTRR.2T3,Tokelau,,,,,,,,,,,0.000000,0.000000
1598,TATTRR.2T3,Tuvalu,,,,,,,,,,,22.619049,1.515150
1599,TATTRR.2T3,Uzbekistan,,,,,,,3.150820,,,,,3.150820


In [None]:
# Multivariate Analysis

In [None]:
# Normalisation

In [None]:
# Weighting and Aggregation

In [None]:
# Visualisation of Results