## Read and Rename

In [1]:
import pandas as pd
import sys
sys.path.append('../')
from utils.uuid_util import generate_uuid

# Load the three cleaned source tables used throughout the notebook.
education = pd.read_csv('../data/Cleaned/education/education.csv')
employment = pd.read_csv('../data/Cleaned/employment/european_employment_data.csv')
socioecono = pd.read_csv('../data/Cleaned/socio/socio.csv')

# Normalise the column names to snake_case. Each frame is rebound to the result of
# rename(): with inplace=True, rename() returns None, so its result must never be
# assigned to a variable.
education = education.rename(columns={
    'Country Name': 'country_name',
    'Country Code': 'country_code',
    'Indicator Name': 'indicator_name',
    'Value': 'value',
})

# The '2023' column is exposed as unemployment_rate; the 'Unnamed: 0' column read from
# the CSV is dropped. 'Indicator Code' is not part of the rename map and keeps its name.
employment = (
    employment
    .rename(columns={
        'Country Name': 'country_name',
        'Country Code': 'country_code',
        'Indicator Name': 'indicator_name',
        '2023': 'unemployment_rate',
    })
    .drop(columns=['Unnamed: 0'])
)

socioecono = socioecono.rename(columns={
    'country': 'country_name',
    'wbid': 'country_code',
    'class': 'socioeconomic_class',
})

## Indicator Selection

In [2]:
# Candidate indicators, written as "<indicator name>,<indicator code>".
#
# Government expenditure on education as % of GDP (%),SE.XPD.TOTL.GD.ZS
# Government expenditure on secondary education as % of GDP (%),UIS.XGDP.23.FSGOV
# Government expenditure on tertiary education as % of GDP (%),UIS.XGDP.56.FSGOV
# Government expenditure per secondary student as % of GDP per capita (%),SE.XPD.SECO.PC.ZS
# Government expenditure per tertiary student as % of GDP per capita (%),SE.XPD.TERT.PC.ZS
#
# "Graduates from tertiary education, female (number)",SE.TER.GRAD.FE
# "Gross enrolment ratio, tertiary, female (%)",SE.TER.ENRR.FE
# "Labor force with advanced education, female (% of female labor force)",SL.TLF.ADVN.FE.ZS
# UIS: Percentage of population age 25+ with at least a completed short-cycle tertiary degree (ISCED 5 or higher). Female,UIS.EA.5T8.AG25T99.F
#
# Labor force with basic education (% of total),SL.TLF.BASC.ZS
# Labor force with intermediate education (% of total),SL.TLF.INTM.ZS
# Labor force with advanced education (% of total),SL.TLF.ADVN.ZS
#
# PISA: Mean performance on the mathematics scale,LO.PISA.MAT
# PISA: Mean performance on the reading scale,LO.PISA.REA
# PISA: Mean performance on the science scale,LO.PISA.SCI
#
# UIS: Percentage of population age 25+ with at least a completed short-cycle tertiary degree (ISCED 5 or higher). Total,UIS.EA.5T8.AG25T99
#
# NOTE(review): SL.TLF.ADVN.FE.ZS and UIS.EA.5T8.AG25T99 are listed above but not
# selected below, while SE.TER.CMPL.ZS is selected without being listed here —
# confirm that the selection is the intended one.

# Category name -> indicator codes that belong to it.
categories_dict = {
    "Government Expenditure": [
        "SE.XPD.TOTL.GD.ZS",
        "UIS.XGDP.23.FSGOV",
        "UIS.XGDP.56.FSGOV",
        "SE.XPD.SECO.PC.ZS",
        "SE.XPD.TERT.PC.ZS",
    ],
    "Gender Equality": [
        "SE.TER.CMPL.ZS",
        "SE.TER.GRAD.FE",
        "SE.TER.ENRR.FE",
        "UIS.EA.5T8.AG25T99.F",
    ],
    "Labor Force Education": [
        "SL.TLF.BASC.ZS",
        "SL.TLF.INTM.ZS",
        "SL.TLF.ADVN.ZS",
    ],
    "PISA Score": [
        "LO.PISA.REA",
        "LO.PISA.MAT",
        "LO.PISA.SCI",
    ],
}

# Inverted lookup: indicator code -> category name, in the order the codes appear above.
categories_reverse = {
    indicator_code: category
    for category, indicator_codes in categories_dict.items()
    for indicator_code in indicator_codes
}

categories_reverse

{'SE.XPD.TOTL.GD.ZS': 'Government Expenditure',
 'UIS.XGDP.23.FSGOV': 'Government Expenditure',
 'UIS.XGDP.56.FSGOV': 'Government Expenditure',
 'SE.XPD.SECO.PC.ZS': 'Government Expenditure',
 'SE.XPD.TERT.PC.ZS': 'Government Expenditure',
 'SE.TER.CMPL.ZS': 'Gender Equality',
 'SE.TER.GRAD.FE': 'Gender Equality',
 'SE.TER.ENRR.FE': 'Gender Equality',
 'UIS.EA.5T8.AG25T99.F': 'Gender Equality',
 'SL.TLF.BASC.ZS': 'Labor Force Education',
 'SL.TLF.INTM.ZS': 'Labor Force Education',
 'SL.TLF.ADVN.ZS': 'Labor Force Education',
 'LO.PISA.REA': 'PISA Score',
 'LO.PISA.MAT': 'PISA Score',
 'LO.PISA.SCI': 'PISA Score'}

In [3]:
# Flat list of every selected indicator code, in the insertion order of categories_reverse.
selected_indicators_code = list(categories_reverse)
selected_indicators_code

['SE.XPD.TOTL.GD.ZS',
 'UIS.XGDP.23.FSGOV',
 'UIS.XGDP.56.FSGOV',
 'SE.XPD.SECO.PC.ZS',
 'SE.XPD.TERT.PC.ZS',
 'SE.TER.CMPL.ZS',
 'SE.TER.GRAD.FE',
 'SE.TER.ENRR.FE',
 'UIS.EA.5T8.AG25T99.F',
 'SL.TLF.BASC.ZS',
 'SL.TLF.INTM.ZS',
 'SL.TLF.ADVN.ZS',
 'LO.PISA.REA',
 'LO.PISA.MAT',
 'LO.PISA.SCI']

## Export countries

In [4]:
# Preview the first rows of the renamed employment table.
employment.head()

Unnamed: 0,country_name,country_code,indicator_name,Indicator Code,unemployment_rate
0,Austria,AUT,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,5.268
1,Belgium,BEL,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,5.541
2,Bulgaria,BGR,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,4.185
3,Switzerland,CHE,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,3.838
4,Germany,DEU,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,3.006


In [5]:
# Export country nodes: one row per employment record, keyed by name and code.
import uuid

# country_id is a name-based (version 3) UUID derived from the country code under the
# DNS namespace, so the same code always yields the same id.
country = (
    employment.loc[:, ['country_name', 'country_code']]
    .copy(deep=True)
    .assign(
        country_id=lambda frame: frame['country_code'].map(
            lambda code: uuid.uuid3(uuid.NAMESPACE_DNS, code)
        )
    )
)
country.to_csv('../data/neo4j/country.csv', index=False)



## Export education nodes

In [6]:
# Build an indicator name -> indicator code lookup from the raw EducationStats export.
edu_raw = pd.read_csv('../data/European/EducationStats/european_education_data.csv')

edu_raw_index = (
    edu_raw[['Indicator Name', 'Indicator Code']]
    .drop_duplicates()
    .rename(columns={'Indicator Name': 'indicator_name',
                     'Indicator Code': 'indicator_code'})
)

# Attach the indicator code to every education row. An 'indicator_code' column left
# behind by an earlier run of this cell is dropped first; otherwise the merge would
# emit indicator_code_x / indicator_code_y and the column selection below would fail.
education = pd.merge(
    education.drop(columns=['indicator_code'], errors='ignore'),
    edu_raw_index,
    on='indicator_name',
    how='left',
)

# Only the distinct name/code pairs are exported here; the full education table is not
# written at this stage.
indeces = education[['indicator_name', 'indicator_code']].drop_duplicates()
indeces.to_csv('../data/neo4j/index.csv', index=False)

In [7]:
# Give every education row its own identifier. generate_uuid() is called exactly once
# per row; a plain list avoids building a throwaway Series for each row (as
# DataFrame.apply(axis=1) does) and also works when the frame has no rows.
# NOTE(review): the ids presumably change on every run, unlike the name-based country
# ids — confirm against utils.uuid_util.generate_uuid.
education['educational_index_id'] = [generate_uuid() for _ in range(len(education))]

In [8]:
# Columns that make up an exported education node.
educational_index_fields = ['indicator_name', 'value', 'indicator_code', 'educational_index_id']
educational_index = education[educational_index_fields]

# Keep only the rows whose indicator code was selected, then write them out.
selected_mask = educational_index['indicator_code'].isin(selected_indicators_code)
educational_index_selected = educational_index.loc[selected_mask]
educational_index_selected.to_csv('../data/neo4j/educational_index.csv', index=False)

## Categories

In [9]:
# "Gross graduation ratio from first degree programmes (ISCED 6 and 7) in tertiary education, both sexes (%)",SE.TER.CMPL.ZS
# "Gross graduation ratio from first degree programmes (ISCED 6 and 7) in tertiary education, female (%)",SE.TER.CMPL.FE.ZS
# "Gross graduation ratio from first degree programmes (ISCED 6 and 7) in tertiary education, gender parity index (GPI)",UIS.GGR.5.A.GPI
# "Gross graduation ratio from first degree programmes (ISCED 6 and 7) in tertiary education, male (%)",SE.TER.CMPL.MA.ZS

# One row per selected indicator code with its category; the indicator name is looked
# up by code in `indeces` (left join, so codes without a name are kept).
category_by_code = pd.DataFrame(
    list(categories_reverse.items()),
    columns=['indicator_code', 'category'],
)
categories = category_by_code.merge(indeces, on='indicator_code', how='left')
categories = categories[['category', 'indicator_name', 'indicator_code']]

categories.to_csv('../data/neo4j/categories.csv', index=False)

## Education Index - Country relationship

In [10]:
# Relationship table linking every education row to its country node.
education_country = education[
    ['country_name', 'indicator_code', 'country_code', 'educational_index_id']
].copy(deep=True)

# country_id is derived from country_code, so look it up by code. Mapping through a
# code-indexed Series keeps one value per education row regardless of the frame's index,
# whereas assigning a column taken from a merge result relies on the merged frame's
# fresh 0..n-1 index lining up with this one. Codes without a country node become NaN.
country_id_by_code = (
    country.drop_duplicates(subset='country_code')
    .set_index('country_code')['country_id']
)
education_country['country_id'] = education_country['country_code'].map(country_id_by_code)
education_country.to_csv('../data/neo4j/education_country.csv', index=False)

## Socioeconomic Class

In [11]:
# Give every socio-economic row its own identifier. generate_uuid() is called exactly
# once per row; a plain list avoids building a throwaway Series for each row (as
# DataFrame.apply(axis=1) does) and also works when the frame has no rows.
socioecono['socioeconomic_id'] = [generate_uuid() for _ in range(len(socioecono))]
socioecono.to_csv('../data/neo4j/socioecono.csv', index=False)
socioecono.head()

Unnamed: 0,unid,country_code,country_name,year,ses,socioeconomic_class,gdppc,yrseduc,region5,regionUN,socioeconomic_id
0,100,BGR,Bulgaria,2000,64.527023,Middle(semi-per),8958.050781,9.3762,SE Europe,East Europe,d453d391-8a31-4471-ad10-6ffcadc103f5
1,100,BGR,Bulgaria,2010,63.851353,Middle(semi-per),15283.17969,9.851,SE Europe,East Europe,897f86f9-9e3e-4c1b-90af-e4d0a0703591
2,208,DNK,Denmark,2000,86.824326,High(core),42337.71484,9.7124,NW Europe,North Europe,079ecd4e-21c6-402e-a1d0-e41d828ee4a4
3,208,DNK,Denmark,2010,82.432434,High(core),43998.4375,10.0566,NW Europe,North Europe,96617ffc-ff43-4fa2-8492-e99642aa6d69
4,246,FIN,Finland,2000,70.608109,High(core),34887.17969,8.1934,NW Europe,North Europe,00915cb5-adff-4b1a-ad52-24b74a780c1d


# Employment

In [12]:
## Unemployment rate
# Give every employment row its own identifier. generate_uuid() is called exactly once
# per row; a plain list avoids building a throwaway Series for each row (as
# DataFrame.apply(axis=1) does) and also works when the frame has no rows.
employment['employment_id'] = [generate_uuid() for _ in range(len(employment))]
employment.to_csv('../data/neo4j/employment.csv', index=False)
employment.head()

Unnamed: 0,country_name,country_code,indicator_name,Indicator Code,unemployment_rate,employment_id
0,Austria,AUT,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,5.268,c1296e8b-593b-420e-8a9b-168fa946783e
1,Belgium,BEL,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,5.541,30fd7b77-c95d-44c9-91cd-747457712f28
2,Bulgaria,BGR,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,4.185,2962a524-2845-47a6-9cd7-75306fa01647
3,Switzerland,CHE,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,3.838,becb3554-b946-41fa-aac9-195379626432
4,Germany,DEU,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,3.006,036bc564-ef4b-42b0-9725-bafa72745acc


# Correlation

In [13]:
# Pair every education row with its country's employment row (inner join on the country
# code), then keep only the rows that belong to a selected indicator.
correlation_df = (
    education
    .merge(employment, on='country_code', how='inner')
    .loc[lambda merged: merged['indicator_code'].isin(selected_indicators_code)]
)
correlation_df

Unnamed: 0,country_name_x,country_code,indicator_name_x,value,indicator_code,educational_index_id,country_name_y,indicator_name_y,Indicator Code,unemployment_rate,employment_id
438,Austria,AUT,Government expenditure on education as % of GD...,5.499550,SE.XPD.TOTL.GD.ZS,75ba0a9b-69ce-4161-a764-4641d1ae9133,Austria,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,5.268,c1296e8b-593b-420e-8a9b-168fa946783e
439,Austria,AUT,Government expenditure on secondary education ...,2.293050,UIS.XGDP.23.FSGOV,3516fbf2-06c3-4884-bc04-28f636780d79,Austria,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,5.268,c1296e8b-593b-420e-8a9b-168fa946783e
440,Austria,AUT,Government expenditure on tertiary education a...,1.799560,UIS.XGDP.56.FSGOV,9ec7d4b2-b840-42e0-a8d6-d9282acedd25,Austria,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,5.268,c1296e8b-593b-420e-8a9b-168fa946783e
445,Austria,AUT,Government expenditure per secondary student a...,27.153971,SE.XPD.SECO.PC.ZS,60fdba8f-d925-4758-abd8-feef1dfc92cf,Austria,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,5.268,c1296e8b-593b-420e-8a9b-168fa946783e
446,Austria,AUT,Government expenditure per tertiary student as...,36.216961,SE.XPD.TERT.PC.ZS,43b466bb-8759-4fc0-b5f4-9bb9f587f00b,Austria,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,5.268,c1296e8b-593b-420e-8a9b-168fa946783e
...,...,...,...,...,...,...,...,...,...,...,...
23288,United Kingdom,GBR,Labor force with intermediate education (% of ...,74.199997,SL.TLF.INTM.ZS,a7a9d735-c047-4faf-b47c-f7da774edffd,United Kingdom,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,4.355,15611ab8-aff2-4a21-a411-365201c64e07
23513,United Kingdom,GBR,PISA: Mean performance on the mathematics scale,492.478500,LO.PISA.MAT,0da2ab71-2cc7-41e0-80a9-e05febd5b4ae,United Kingdom,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,4.355,15611ab8-aff2-4a21-a411-365201c64e07
23516,United Kingdom,GBR,PISA: Mean performance on the reading scale,497.971900,LO.PISA.REA,986a423f-70cc-4201-8c3b-7d614b662a48,United Kingdom,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,4.355,15611ab8-aff2-4a21-a411-365201c64e07
23519,United Kingdom,GBR,PISA: Mean performance on the science scale,509.221500,LO.PISA.SCI,a92b8cdc-b028-4f1e-a3ea-1d6b955319c0,United Kingdom,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,4.355,15611ab8-aff2-4a21-a411-365201c64e07


In [14]:
from utils.analysis import get_correlation

# One coefficient per selected indicator, in the order of selected_indicators_code.
# NOTE(review): presumably each indicator's values are correlated with
# unemployment_rate — confirm in utils.analysis.get_correlation.
corr_list = [
    get_correlation(correlation_df, indicator_code)
    for indicator_code in selected_indicators_code
]

# Attach the indicator name and category to every coefficient (left join on the code).
corr_df = pd.DataFrame({
    "indicator_code": selected_indicators_code,
    "correlation_coef": corr_list,
}).merge(categories, on='indicator_code', how='left')

corr_df.to_csv('../data/neo4j/correlation.csv', index=False)
corr_df


Unnamed: 0,indicator_code,correlation_coef,category,indicator_name
0,SE.XPD.TOTL.GD.ZS,-0.24014,Government Expenditure,Government expenditure on education as % of GD...
1,UIS.XGDP.23.FSGOV,-0.395974,Government Expenditure,Government expenditure on secondary education ...
2,UIS.XGDP.56.FSGOV,-0.116463,Government Expenditure,Government expenditure on tertiary education a...
3,SE.XPD.SECO.PC.ZS,-0.136313,Government Expenditure,Government expenditure per secondary student a...
4,SE.XPD.TERT.PC.ZS,-0.348315,Government Expenditure,Government expenditure per tertiary student as...
5,SE.TER.CMPL.ZS,-0.159856,Gender Equality,Gross graduation ratio from first degree progr...
6,SE.TER.GRAD.FE,0.197104,Gender Equality,"Graduates from tertiary education, female (num..."
7,SE.TER.ENRR.FE,0.487939,Gender Equality,"Gross enrolment ratio, tertiary, female (%)"
8,UIS.EA.5T8.AG25T99.F,-0.159318,Gender Equality,UIS: Percentage of population age 25+ with at ...
9,SL.TLF.BASC.ZS,0.023364,Labor Force Education,Labor force with basic education (% of total)
