# Data Cleaning and Analysis
#### Data regarding education from data.worldbank.org

## Imports

In [1]:
# Basic imports

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
sns.set()

%matplotlib inline

In [2]:
# Additional imports
import os
import education.utils as u

## Files with data

In [3]:
# Paths to data
raw_data = '../data/raw'
processed_data = '../data/processed'
clean_data = '../data/clean'

In [4]:
# Show all files 
os.listdir(processed_data)

['countries_info.csv',
 'data_explain.csv',
 'data_sources.csv',
 'data_year.csv',
 'ed_data.csv']

In [5]:
# File paths
ed_data = 'ed_data.csv'
countries = 'countries_info.csv'
data_info = 'data_explain.csv'

## Data Exploration

In [6]:
ed_df = pd.read_csv(os.path.join(processed_data, ed_data))

In [7]:
ed_df.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1970,1971,1972,1973,1974,1975,...,2055,2060,2065,2070,2075,2080,2085,2090,2095,2100
0,Arab World,ARB,"Adjusted net enrolment rate, lower secondary, ...",UIS.NERA.2,,,,,,,...,,,,,,,,,,
1,Arab World,ARB,"Adjusted net enrolment rate, lower secondary, ...",UIS.NERA.2.F,,,,,,,...,,,,,,,,,,
2,Arab World,ARB,"Adjusted net enrolment rate, lower secondary, ...",UIS.NERA.2.GPI,,,,,,,...,,,,,,,,,,
3,Arab World,ARB,"Adjusted net enrolment rate, lower secondary, ...",UIS.NERA.2.M,,,,,,,...,,,,,,,,,,
4,Arab World,ARB,"Adjusted net enrolment rate, primary, both sex...",SE.PRM.TENR,54.822121,54.894138,56.209438,57.267109,57.991138,59.36554,...,,,,,,,,,,


In [13]:
indicators = list(ed_df['Indicator Name'].unique())

In [15]:
for i in indicators:
    if 'Barro-Lee' in i:
        print(i)

Barro-Lee: Average years of primary schooling, age 15+, female
Barro-Lee: Average years of primary schooling, age 15+, total
Barro-Lee: Average years of primary schooling, age 15-19, female
Barro-Lee: Average years of primary schooling, age 15-19, total
Barro-Lee: Average years of primary schooling, age 20-24, female
Barro-Lee: Average years of primary schooling, age 20-24, total
Barro-Lee: Average years of primary schooling, age 25+, female
Barro-Lee: Average years of primary schooling, age 25+, total
Barro-Lee: Average years of primary schooling, age 25-29, female
Barro-Lee: Average years of primary schooling, age 25-29, total
Barro-Lee: Average years of primary schooling, age 30-34, female
Barro-Lee: Average years of primary schooling, age 30-34, total
Barro-Lee: Average years of primary schooling, age 35-39, female
Barro-Lee: Average years of primary schooling, age 35-39, total
Barro-Lee: Average years of primary schooling, age 40-44, female
Barro-Lee: Average years of primary scho

In [16]:
off_ed = [
'Barro-Lee: Percentage of female population age 25+ with no education',
'Barro-Lee: Percentage of female population age 25+ with primary schooling. Completed Primary',
'Barro-Lee: Percentage of female population age 25+ with secondary schooling. Completed Secondary',
'Barro-Lee: Percentage of female population age 25+ with tertiary schooling. Completed Tertiary',
'Barro-Lee: Percentage of population age 25+ with no education',
'Barro-Lee: Percentage of population age 25+ with primary schooling. Completed Primary',
'Barro-Lee: Percentage of population age 25+ with secondary schooling. Completed Secondary',
'Barro-Lee: Percentage of population age 25+ with tertiary schooling. Completed Tertiary',
]

In [27]:
df_off_reg = ed_df[ed_df['Indicator Name'].isin(off_ed)]
df_off_reg.dropna(axis=1, how='all', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_off_reg.dropna(axis=1, how='all', inplace=True)


In [28]:
df_off_reg.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1936 entries, 290 to 883665
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Country Name    1936 non-null   object 
 1   Country Code    1936 non-null   object 
 2   Indicator Name  1936 non-null   object 
 3   Indicator Code  1936 non-null   object 
 4   1970            1152 non-null   float64
 5   1975            1152 non-null   float64
 6   1980            1152 non-null   float64
 7   1985            1152 non-null   float64
 8   1990            1152 non-null   float64
 9   1995            1152 non-null   float64
 10  2000            1152 non-null   float64
 11  2005            1152 non-null   float64
 12  2010            1152 non-null   float64
dtypes: float64(9), object(4)
memory usage: 211.8+ KB


In [29]:
df_c = pd.read_csv(os.path.join(processed_data, countries))

In [30]:
df_c

Unnamed: 0,Country Code,Short Name,Region,Income Group,POP_census_year,Is_Country
0,ABW,Aruba,Latin America & Caribbean,High income: nonOECD,2010.0,True
1,AFG,Afghanistan,South Asia,Low income,1979.0,True
2,AGO,Angola,Sub-Saharan Africa,Upper middle income,1970.0,True
3,ALB,Albania,Europe & Central Asia,Upper middle income,2011.0,True
4,AND,Andorra,Europe & Central Asia,High income: nonOECD,2011.0,True
...,...,...,...,...,...,...
236,XKX,Kosovo,Europe & Central Asia,Lower middle income,2011.0,True
237,YEM,Yemen,Middle East & North Africa,Lower middle income,2004.0,True
238,ZAF,South Africa,Sub-Saharan Africa,Upper middle income,2011.0,True
239,ZMB,Zambia,Sub-Saharan Africa,Lower middle income,2010.0,True


In [35]:
df_c = df_c[['Country Code','Region','Income Group', 'Is_Country']]

In [44]:
df_off = df_off_reg.join(df_c.set_index('Country Code'), on='Country Code', how='left')

In [45]:
df_off.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1936 entries, 290 to 883665
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Country Name    1936 non-null   object 
 1   Country Code    1936 non-null   object 
 2   Indicator Name  1936 non-null   object 
 3   Indicator Code  1936 non-null   object 
 4   1970            1152 non-null   float64
 5   1975            1152 non-null   float64
 6   1980            1152 non-null   float64
 7   1985            1152 non-null   float64
 8   1990            1152 non-null   float64
 9   1995            1152 non-null   float64
 10  2000            1152 non-null   float64
 11  2005            1152 non-null   float64
 12  2010            1152 non-null   float64
 13  Region          1712 non-null   object 
 14  Income Group    1712 non-null   object 
 15  Is_Country      1928 non-null   object 
dtypes: float64(9), object(7)
memory usage: 321.7+ KB


In [47]:
df_off['Is_Country'].fillna(value=False, inplace=True)

In [49]:
df_off[~df_off['Is_Country']].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 208 entries, 290 to 190980
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Country Name    208 non-null    object 
 1   Country Code    208 non-null    object 
 2   Indicator Name  208 non-null    object 
 3   Indicator Code  208 non-null    object 
 4   1970            0 non-null      float64
 5   1975            0 non-null      float64
 6   1980            0 non-null      float64
 7   1985            0 non-null      float64
 8   1990            0 non-null      float64
 9   1995            0 non-null      float64
 10  2000            0 non-null      float64
 11  2005            0 non-null      float64
 12  2010            0 non-null      float64
 13  Region          0 non-null      object 
 14  Income Group    0 non-null      object 
 15  Is_Country      208 non-null    bool   
dtypes: bool(1), float64(9), object(6)
memory usage: 26.2+ KB


In [50]:
df_off[df_off['Is_Country']].to_csv(os.path.join(clean_data, 'education_level.csv'))