# Import Packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load Data

In [130]:
df = pd.read_csv("unesco data.csv")

In [131]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9152 entries, 0 to 9151
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Region            9152 non-null   object 
 1   Country           9152 non-null   object 
 2   Target            9152 non-null   object 
 3   Indicator Number  9152 non-null   object 
 4   Indicator Name    9152 non-null   object 
 5   Year              9152 non-null   int64  
 6   Value             9152 non-null   float64
 7   Metadata          7342 non-null   object 
dtypes: float64(1), int64(1), object(6)
memory usage: 572.1+ KB


In [132]:
df.head(2)

Unnamed: 0,Region,Country,Target,Indicator Number,Indicator Name,Year,Value,Metadata
0,SDG: Sub-Saharan Africa,Zimbabwe,Education 2030 FFA,Education 2030 FFA,Government expenditure on education as a perce...,2010,1.54,
1,SDG: Latin America and the Caribbean,Guyana,4.5,4.5.6,Initial government expenditure on education as...,2010,2.37,SOURCE: UIS/UOE data


In [133]:
# drop irrelevant columns

df.drop(columns=['Region', 'Target', 'Indicator Number', 'Metadata'], inplace=True)

In [134]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9152 entries, 0 to 9151
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Country         9152 non-null   object 
 1   Indicator Name  9152 non-null   object 
 2   Year            9152 non-null   int64  
 3   Value           9152 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 286.1+ KB


In [135]:
df.head(2)

Unnamed: 0,Country,Indicator Name,Year,Value
0,Zimbabwe,Government expenditure on education as a perce...,2010,1.54
1,Guyana,Initial government expenditure on education as...,2010,2.37


In [136]:
df_pivoted = df.pivot_table(index=["Country", "Year"], columns="Indicator Name", values="Value").reset_index()

In [137]:
df_pivoted.head(2)

Indicator Name,Country,Year,"Expenditure on education (public, households, ODA) as a percentage of GDP","Expenditure on education (public, private, international) as a percentage of GDP",Expenditure on education as a percentage of total government expenditure (%),Government expenditure on education as a percentage of GDP (%),Initial government expenditure on education as a percentage of GDP (%),Initial private expenditure on education (household) as a percentage of GDP (%),Initial private expenditure on education (other non-educational private entities) as a percentage of GDP (%),International expenditure on education as a percentage of GDP (%),Percentage of total aid to education allocated to least developed countries (%)
0,Afghanistan,2010,,,6.4,3.48,2.85,,,0.63,
1,Afghanistan,2011,,,5.15,3.46,3.46,,0.0,2.08,


In [138]:
df_pivoted.columns

Index(['Country', 'Year',
       'Expenditure on education (public, households, ODA) as a percentage of GDP',
       'Expenditure on education (public, private, international) as a percentage of GDP',
       'Expenditure on education as a percentage of total government expenditure (%)',
       'Government expenditure on education as a percentage of GDP (%)',
       'Initial government expenditure on education as a percentage of GDP (%)',
       'Initial private expenditure on education (household) as a percentage of GDP (%)',
       'Initial private expenditure on education (other non-educational private entities) as a percentage of GDP (%)',
       'International expenditure on education as a percentage of GDP (%)',
       'Percentage of total aid to education allocated to least developed countries (%)'],
      dtype='object', name='Indicator Name')

In [139]:
df_pivoted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2274 entries, 0 to 2273
Data columns (total 11 columns):
 #   Column                                                                                                        Non-Null Count  Dtype  
---  ------                                                                                                        --------------  -----  
 0   Country                                                                                                       2274 non-null   object 
 1   Year                                                                                                          2274 non-null   int64  
 2   Expenditure on education (public, households, ODA) as a percentage of GDP                                     503 non-null    float64
 3   Expenditure on education (public, private, international) as a percentage of GDP                              255 non-null    float64
 4   Expenditure on education as a percentage of total gove

In [140]:
df_pivoted.index.name = None
df_pivoted.columns.name = None

In [141]:
df_pivoted.head()

Unnamed: 0,Country,Year,"Expenditure on education (public, households, ODA) as a percentage of GDP","Expenditure on education (public, private, international) as a percentage of GDP",Expenditure on education as a percentage of total government expenditure (%),Government expenditure on education as a percentage of GDP (%),Initial government expenditure on education as a percentage of GDP (%),Initial private expenditure on education (household) as a percentage of GDP (%),Initial private expenditure on education (other non-educational private entities) as a percentage of GDP (%),International expenditure on education as a percentage of GDP (%),Percentage of total aid to education allocated to least developed countries (%)
0,Afghanistan,2010,,,6.4,3.48,2.85,,,0.63,
1,Afghanistan,2011,,,5.15,3.46,3.46,,0.0,2.08,
2,Afghanistan,2012,,,6.49,2.6,2.22,,0.0,0.38,
3,Afghanistan,2013,,,7.13,3.45,2.81,,0.0,0.65,
4,Afghanistan,2014,,,7.96,3.7,2.99,,0.0,0.7,


In [142]:
df_pivoted.columns

Index(['Country', 'Year',
       'Expenditure on education (public, households, ODA) as a percentage of GDP',
       'Expenditure on education (public, private, international) as a percentage of GDP',
       'Expenditure on education as a percentage of total government expenditure (%)',
       'Government expenditure on education as a percentage of GDP (%)',
       'Initial government expenditure on education as a percentage of GDP (%)',
       'Initial private expenditure on education (household) as a percentage of GDP (%)',
       'Initial private expenditure on education (other non-educational private entities) as a percentage of GDP (%)',
       'International expenditure on education as a percentage of GDP (%)',
       'Percentage of total aid to education allocated to least developed countries (%)'],
      dtype='object')

In [143]:
# percentage of missing values

df_pivoted.isnull().mean() * 100

Country                                                                                                          0.000000
Year                                                                                                             0.000000
Expenditure on education (public, households, ODA) as a percentage of GDP                                       77.880387
Expenditure on education (public, private, international) as a percentage of GDP                                88.786280
Expenditure on education as a percentage of total government expenditure (%)                                    11.125770
Government expenditure on education as a percentage of GDP (%)                                                   7.959543
Initial government expenditure on education as a percentage of GDP (%)                                          40.809147
Initial private expenditure on education (household) as a percentage of GDP (%)                                 74.054529
Initial private expendit

In [144]:
# drop columns with more than 30% null values

df1 = df_pivoted.drop(columns= ["Expenditure on education (public, households, ODA) as a percentage of GDP",
                               "Expenditure on education (public, private, international) as a percentage of GDP",
                               "Initial government expenditure on education as a percentage of GDP (%)",
                               "Initial private expenditure on education (household) as a percentage of GDP (%)",
                               "Initial private expenditure on education (other non-educational private entities) as a percentage of GDP (%)",
                               "International expenditure on education as a percentage of GDP (%)",
                               "Percentage of total aid to education allocated to least developed countries (%)"])

In [145]:
df1.head()

Unnamed: 0,Country,Year,Expenditure on education as a percentage of total government expenditure (%),Government expenditure on education as a percentage of GDP (%)
0,Afghanistan,2010,6.4,3.48
1,Afghanistan,2011,5.15,3.46
2,Afghanistan,2012,6.49,2.6
3,Afghanistan,2013,7.13,3.45
4,Afghanistan,2014,7.96,3.7


In [146]:
# percentage of missing values

df1.isnull().mean() * 100

Country                                                                          0.000000
Year                                                                             0.000000
Expenditure on education as a percentage of total government expenditure (%)    11.125770
Government expenditure on education as a percentage of GDP (%)                   7.959543
dtype: float64

In [147]:
df1['Expenditure on education as a percentage of total government expenditure (%)'].describe()

count    2021.000000
mean       13.993345
std         4.885673
min         0.000000
25%        10.600000
50%        13.420000
75%        16.970000
max        35.010000
Name: Expenditure on education as a percentage of total government expenditure (%), dtype: float64

In [148]:
df1['Government expenditure on education as a percentage of GDP (%)'].describe()

count    2093.000000
mean        4.415968
std         1.875801
min         0.000000
25%         3.150000
50%         4.240000
75%         5.420000
max        16.580000
Name: Government expenditure on education as a percentage of GDP (%), dtype: float64

In [149]:
# filling missing values with the mean

df1["Government expenditure on education as a percentage of GDP (%)"] = df1["Government expenditure on education as a percentage of GDP (%)"].fillna(df1["Government expenditure on education as a percentage of GDP (%)"].mean())
df1['Expenditure on education as a percentage of total government expenditure (%)'] = df1["Expenditure on education as a percentage of total government expenditure (%)"].fillna(df1["Expenditure on education as a percentage of total government expenditure (%)"].mean())

In [150]:
# percentage of missing values

df1.isnull().mean() * 100

Country                                                                         0.0
Year                                                                            0.0
Expenditure on education as a percentage of total government expenditure (%)    0.0
Government expenditure on education as a percentage of GDP (%)                  0.0
dtype: float64

In [151]:
df1.head()

Unnamed: 0,Country,Year,Expenditure on education as a percentage of total government expenditure (%),Government expenditure on education as a percentage of GDP (%)
0,Afghanistan,2010,6.4,3.48
1,Afghanistan,2011,5.15,3.46
2,Afghanistan,2012,6.49,2.6
3,Afghanistan,2013,7.13,3.45
4,Afghanistan,2014,7.96,3.7


In [152]:
# a dataframe for 2023 which will be used to test the models built

df_2023 = df1[df1['Year'] == 2023]

In [153]:
df_2023.head()

Unnamed: 0,Country,Year,Expenditure on education as a percentage of total government expenditure (%),Government expenditure on education as a percentage of GDP (%)
35,Algeria,2023,13.26,4.415968
62,Angola,2023,7.73,4.415968
86,Antigua and Barbuda,2023,9.8,4.415968
171,Bahamas,2023,11.59,3.09
198,Bangladesh,2023,12.01,2.05


In [154]:
df_2023.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 56 entries, 35 to 2263
Data columns (total 4 columns):
 #   Column                                                                        Non-Null Count  Dtype  
---  ------                                                                        --------------  -----  
 0   Country                                                                       56 non-null     object 
 1   Year                                                                          56 non-null     int64  
 2   Expenditure on education as a percentage of total government expenditure (%)  56 non-null     float64
 3   Government expenditure on education as a percentage of GDP (%)                56 non-null     float64
dtypes: float64(2), int64(1), object(1)
memory usage: 2.2+ KB


In [155]:
# drop rows where year == '2023'
df1 = df1[df1['Year'] != 2023]

In [156]:
df1.head()

Unnamed: 0,Country,Year,Expenditure on education as a percentage of total government expenditure (%),Government expenditure on education as a percentage of GDP (%)
0,Afghanistan,2010,6.4,3.48
1,Afghanistan,2011,5.15,3.46
2,Afghanistan,2012,6.49,2.6
3,Afghanistan,2013,7.13,3.45
4,Afghanistan,2014,7.96,3.7


In [157]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2218 entries, 0 to 2273
Data columns (total 4 columns):
 #   Column                                                                        Non-Null Count  Dtype  
---  ------                                                                        --------------  -----  
 0   Country                                                                       2218 non-null   object 
 1   Year                                                                          2218 non-null   int64  
 2   Expenditure on education as a percentage of total government expenditure (%)  2218 non-null   float64
 3   Government expenditure on education as a percentage of GDP (%)                2218 non-null   float64
dtypes: float64(2), int64(1), object(1)
memory usage: 86.6+ KB


In [158]:
# save the dataframe as a .csv file

df1.to_csv("education_expenditure.csv", index=False)
df_2023.to_csv("df_2023.csv", index=False)