In [142]:
import pandas as pd
import matplotlib.pyplot as plt

## Load & Read Dataset

Deskripsi dataset:
1. SG_XPD_EDUC = Proporsi total pengeluaran pemerintah untuk pendidikan
2. SE_ADT_FUNS = Proporsi penduduk yang mencapai setidaknya tingkat kemahiran tertentu dalam keterampilan fungsional, berdasarkan jenis kelamin, usia dan jenis keterampilan (literasi dan numerasi).

In [143]:
# Read the 'Table format' worksheet of each source workbook. The *_original
# frames are the raw inputs from which later cells derive the working tables.
sg_xpd_educ_original = pd.read_excel(
    'Dataset/SG_XPD_EDUC.xlsx',
    sheet_name='Table format',
)
se_adt_funs_original = pd.read_excel(
    'Dataset/SE_ADT_FUNS.xlsx',
    sheet_name='Table format',
)

In [144]:
# Preview the first five rows of the raw SG_XPD_EDUC table.
sg_xpd_educ_original.head(n=5)

Unnamed: 0,Goal,Target,Indicator,SeriesCode,SeriesDescription,GeoAreaCode,GeoAreaName,Observation Status,Reporting Type,Units,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,1,1.a,1.a.2,SG_XPD_EDUC,Proportion of total government spending on ess...,4,Afghanistan,A,G,PERCENT,...,7.13207,7.96457,7.28491,8.26024,8.20247,,,,,
1,1,1.a,1.a.2,SG_XPD_EDUC,Proportion of total government spending on ess...,8,Albania,A,G,PERCENT,...,11.58003,10.98846,10.44102,10.82979,10.57019,10.9707,11.40018,10.13438,9.83378,
2,1,1.a,1.a.2,SG_XPD_EDUC,Proportion of total government spending on ess...,12,Algeria,A,G,PERCENT,...,,,,,,,,,,
3,1,1.a,1.a.2,SG_XPD_EDUC,Proportion of total government spending on ess...,20,Andorra,A,G,PERCENT,...,,,,,,14.57613,14.71529,13.22508,13.47384,13.29234
4,1,1.a,1.a.2,SG_XPD_EDUC,Proportion of total government spending on ess...,24,Angola,A,G,PERCENT,...,9.44777,6.85226,12.10089,10.8217,8.7603,8.2301,10.79581,,,


In [145]:
# Preview the first five rows of the raw SE_ADT_FUNS table.
se_adt_funs_original.head(n=5)

Unnamed: 0,Goal,Target,Indicator,SeriesCode,SeriesDescription,GeoAreaCode,GeoAreaName,Age,Reporting Type,Sex,Type of skill,Units,2006,2011,2012,2013,2014,2015,2017
0,4,4.6,4.6.1,SE_ADT_FUNS,Proportion of population achieving at least a ...,36,Australia,16-65,G,BOTHSEX,LITE,PERCENT,,,87.20045,,,,
1,4,4.6,4.6.1,SE_ADT_FUNS,Proportion of population achieving at least a ...,36,Australia,16-65,G,BOTHSEX,NUME,PERCENT,,,79.51601,,,,
2,4,4.6,4.6.1,SE_ADT_FUNS,Proportion of population achieving at least a ...,36,Australia,16-65,G,FEMALE,LITE,PERCENT,,,87.35749,,,,
3,4,4.6,4.6.1,SE_ADT_FUNS,Proportion of population achieving at least a ...,36,Australia,16-65,G,FEMALE,NUME,PERCENT,,,76.64972,,,,
4,4,4.6,4.6.1,SE_ADT_FUNS,Proportion of population achieving at least a ...,36,Australia,16-65,G,MALE,LITE,PERCENT,,,87.04392,,,,


## Menghapus fitur yang tidak diperlukan dan mengubah nama kolom

Pada tahap ini, dilakukan penghapusan fitur / kolom yang tidak relevan terhadap analisis. Fitur-fitur yang tidak diperlukan adalah Goal, Target, Indicator, SeriesCode, SeriesDescription, GeoAreaCode, Units, Reporting Type, dan Observation Status (kolom Observation Status hanya terdapat pada dataset SG_XPD_EDUC). Kemudian, dilakukan perubahan nama kolom GeoAreaName menjadi Country untuk mempermudah proses analisis.

In [146]:
# Build the working tables from the raw frames: strip the metadata columns that
# are not used in the analysis and expose 'GeoAreaName' under the shorter label
# 'Country'. The raw *_original frames are left unchanged.
sg_xpd_educ = (
    sg_xpd_educ_original
    .drop(columns=[
        'Goal', 'Target', 'Indicator', 'SeriesCode', 'SeriesDescription',
        'GeoAreaCode', 'Units', 'Reporting Type', 'Observation Status',
    ])
    .rename(columns={'GeoAreaName': 'Country'})
)

# SE_ADT_FUNS is handled the same way, except that 'Observation Status' is not
# in its drop list.
se_adt_funs = (
    se_adt_funs_original
    .drop(columns=[
        'Goal', 'Target', 'Indicator', 'SeriesCode', 'SeriesDescription',
        'GeoAreaCode', 'Units', 'Reporting Type',
    ])
    .rename(columns={'GeoAreaName': 'Country'})
)

In [147]:
# Check the trimmed spending table: 'Country' followed by the remaining columns.
sg_xpd_educ.head(n=5)

Unnamed: 0,Country,2000,2001,2002,2003,2004,2005,2006,2007,2008,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,Afghanistan,,,,12.80618,12.94666,15.08358,14.55921,10.81469,4.86059,...,7.13207,7.96457,7.28491,8.26024,8.20247,,,,,
1,Albania,,,2.50737,3.67385,3.99481,11.37165,10.76799,11.07502,10.79071,...,11.58003,10.98846,10.44102,10.82979,10.57019,10.9707,11.40018,10.13438,9.83378,
2,Algeria,,,,,,,15.5275,14.27719,13.96228,...,,,,,,,,,,
3,Andorra,,,,,,,,,,...,,,,,,14.57613,14.71529,13.22508,13.47384,13.29234
4,Angola,9.46676,9.43363,8.32275,6.52771,10.83334,7.26366,7.60744,9.28457,7.51377,...,9.44777,6.85226,12.10089,10.8217,8.7603,8.2301,10.79581,,,


In [148]:
# Check the trimmed skills table after the metadata columns were dropped.
se_adt_funs.head(n=5)

Unnamed: 0,Country,Age,Sex,Type of skill,2006,2011,2012,2013,2014,2015,2017
0,Australia,16-65,BOTHSEX,LITE,,,87.20045,,,,
1,Australia,16-65,BOTHSEX,NUME,,,79.51601,,,,
2,Australia,16-65,FEMALE,LITE,,,87.35749,,,,
3,Australia,16-65,FEMALE,NUME,,,76.64972,,,,
4,Australia,16-65,MALE,LITE,,,87.04392,,,,


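## Mengisi nilai N/A / missing values

Nilai yang hilang diisi dengan nilai terakhir yang tersedia pada tahun sebelumnya di baris yang sama (forward fill per baris). Tahun-tahun sebelum observasi pertama suatu baris tidak memiliki nilai acuan sehingga tetap kosong.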

In [149]:
# Forward-fill missing observations along each row (axis=1): a missing year takes
# the most recent earlier value in the same row. Years before a row's first
# observation have nothing to copy from and therefore stay NaN.
#
# DataFrame.ffill is used instead of fillna(method='ffill'), whose `method`
# argument is deprecated in recent pandas releases.
#
# NOTE(review): assumes every column other than the excluded label columns is a
# year column — confirm against the full column list of each table.
sg_year_cols = sg_xpd_educ.columns.difference(['Country'])
sg_xpd_educ[sg_year_cols] = sg_xpd_educ[sg_year_cols].ffill(axis=1)

# 'Type of skill' is a text label (e.g. LITE / NUME), not an observation, so it
# is excluded as well. Leaving it in the selection would make the row-wise fill
# depend on where that label sorts relative to the year columns, and could copy
# values between the label and the yearly figures.
se_year_cols = se_adt_funs.columns.difference(['Country', 'Age', 'Sex', 'Type of skill'])
se_adt_funs[se_year_cols] = se_adt_funs[se_year_cols].ffill(axis=1)

In [150]:
# Inspect the spending table after the row-wise forward fill.
sg_xpd_educ.head(n=5)

Unnamed: 0,Country,2000,2001,2002,2003,2004,2005,2006,2007,2008,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,Afghanistan,,,,12.80618,12.94666,15.08358,14.55921,10.81469,4.86059,...,7.13207,7.96457,7.28491,8.26024,8.20247,8.20247,8.20247,8.20247,8.20247,8.20247
1,Albania,,,2.50737,3.67385,3.99481,11.37165,10.76799,11.07502,10.79071,...,11.58003,10.98846,10.44102,10.82979,10.57019,10.9707,11.40018,10.13438,9.83378,9.83378
2,Algeria,,,,,,,15.5275,14.27719,13.96228,...,14.34175,14.34175,14.34175,14.34175,14.34175,14.34175,14.34175,14.34175,14.34175,14.34175
3,Andorra,,,,,,,,,,...,,,,,,14.57613,14.71529,13.22508,13.47384,13.29234
4,Angola,9.46676,9.43363,8.32275,6.52771,10.83334,7.26366,7.60744,9.28457,7.51377,...,9.44777,6.85226,12.10089,10.8217,8.7603,8.2301,10.79581,10.79581,10.79581,10.79581


In [151]:
# Inspect the skills table after the row-wise forward fill.
se_adt_funs.head(n=5)

Unnamed: 0,Country,Age,Sex,Type of skill,2006,2011,2012,2013,2014,2015,2017
0,Australia,16-65,BOTHSEX,LITE,,,87.20045,87.20045,87.20045,87.20045,87.20045
1,Australia,16-65,BOTHSEX,NUME,,,79.51601,79.51601,79.51601,79.51601,79.51601
2,Australia,16-65,FEMALE,LITE,,,87.35749,87.35749,87.35749,87.35749,87.35749
3,Australia,16-65,FEMALE,NUME,,,76.64972,76.64972,76.64972,76.64972,76.64972
4,Australia,16-65,MALE,LITE,,,87.04392,87.04392,87.04392,87.04392,87.04392


## Menggabungkan kedua dataset menjadi satu dataset yang sama