# Preprocessing

In [49]:
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Use matplotlib's "ggplot" style sheet for figures drawn after this cell
plt.style.use('ggplot')
# Let pandas print up to 500 rows before truncating displayed output
pd.set_option('display.max_rows', 500)

In [17]:
# Load the merged interim dataset; the (Code, Year) pair becomes the row MultiIndex
merged_df = pd.read_csv(
    "../00_data/1_interim/merged_data.csv",
    index_col=["Code", "Year"],
)

### Handle High Missing Values #1: by Code (Country ID)

In [24]:
# Missing values by country ID (index level 0 = "Code").
# NOTE: NaNs are counted over every column of merged_df, while the denominator
# only counts the N_FEATURES potential feature columns per row.
N_FEATURES = 9  # 9 main potential features

all_missing_values = merged_df.isnull().groupby(level=0).sum()

# Number of feature data points per country = (rows of that country) x N_FEATURES.
# This is a Series indexed by Code so the division below aligns per country.
# The previous version took the row count of the *first* country for everyone,
# which misstates the percentage for any country with a different number of rows.
values_count_per_country = merged_df.groupby(level=0).size() * N_FEATURES

all_missing_values["total_missing"] = all_missing_values.sum(axis=1)
all_missing_values["total_missing_%"] = (
    all_missing_values["total_missing"] / values_count_per_country * 100
).round(2)

# Rank countries by absolute number of missing values and report the top 12
top_missing_countries = all_missing_values.sort_values(by="total_missing", ascending=False)
top_12_missing = top_missing_countries["total_missing"].head(12)
print(f"Top 12 Countries with most missing values:\n{top_12_missing}")

# Threshold 50%: countries whose missing share reaches the threshold are excluded
# from the DataFrame in the next step.
# For a country with 6 rows: 9 x 6 = 54 data points --> 27 NaNs is the limit.
# Countries above that limit in the recorded run:
# AIA       49 (Anguilla)
# MSR       48 (Montserrat)
# OWID_KOS  48 (Kosovo)
# COK       42 (Cook Islands)
# NIU       42 (Niue)
# TCA       39 (Turks and Caicos Islands)
# VGB       38 (British Virgin Islands)

Top 12 Countries with most missing values:
Code
AIA         49
MSR         48
OWID_KOS    48
COK         42
NIU         42
TCA         39
VGB         38
PRK         26
SSD         26
MCO         25
GNQ         22
VEN         21
Name: total_missing, dtype: int64


In [64]:
# Drop every country whose missing-value share is at or above 50% and store
# the remaining rows as the first filtered interim dataset.
# In the recorded run the filtered frame holds 967 NaNs (306 fewer than before).
is_high_missing = top_missing_countries["total_missing_%"] >= 50
exclude_countries = top_missing_countries[is_high_missing]

# Boolean row mask over the "Code" index level: True for rows we keep
country_codes = merged_df.index.get_level_values(0)
keep_rows = ~country_codes.isin(exclude_countries.index.tolist())

filtered_df = merged_df[keep_rows].copy()
filtered_df.to_csv('../00_data/1_interim/filtered_data_01.csv', index=True)

# Total number of NaNs left in the filtered frame (cell output)
filtered_df.isna().sum().sum()

np.int64(967)

In [65]:
# NaN count per column of the filtered frame
filtered_df.isnull().sum(axis=0)

Entity                                        0
child_mortality_igme                          0
annual_healthcare_expenditure_per_capita     16
gdp_per_capita_worldbank                     42
nurses_and_midwives_per_1000_people         300
physicians_per_1000_people                  405
prevalence_of_undernourishment              156
share_of_population_urban                     0
share_without_improved_water                 16
vaccination_coverage_who_unicef               6
years_of_schooling                           26
dtype: int64

### Handle Missing Values #2: Missing Indicators

In [79]:
# Build one "<column>_missing" indicator (1 = value is NaN, 0 = present) for
# every column of filtered_df that contains at least one NaN, and append the
# indicators to the right of the original columns.
na_flags = filtered_df.isna()
indicator_columns = (
    na_flags.loc[:, na_flags.any()]  # only columns that actually have NaNs
    .astype(int)
    .add_suffix("_missing")
)
missing_indicators_df = pd.concat([filtered_df, indicator_columns], axis=1)

In [80]:
# Display the frame including the added *_missing indicator columns
missing_indicators_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Entity,child_mortality_igme,annual_healthcare_expenditure_per_capita,gdp_per_capita_worldbank,nurses_and_midwives_per_1000_people,physicians_per_1000_people,prevalence_of_undernourishment,share_of_population_urban,share_without_improved_water,vaccination_coverage_who_unicef,years_of_schooling,annual_healthcare_expenditure_per_capita_missing,gdp_per_capita_worldbank_missing,nurses_and_midwives_per_1000_people_missing,physicians_per_1000_people_missing,prevalence_of_undernourishment_missing,share_without_improved_water_missing,vaccination_coverage_who_unicef_missing,years_of_schooling_missing
Code,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
AFG,2013,Afghanistan,78.022990,177.62009,3046.5798,0.255,0.291,19.4,24.373,38.863914,64.0,0.672197,0,0,0,0,0,0,0,0
AFG,2014,Afghanistan,75.123625,201.79086,3017.9426,0.151,0.304,19.3,24.587,36.337383,62.0,0.710609,0,0,0,0,0,0,0,0
AFG,2015,Afghanistan,72.438707,215.22618,2967.6921,0.133,0.291,20.0,24.803,33.812515,64.0,0.749020,0,0,0,0,0,0,0,0
AFG,2016,Afghanistan,69.970040,238.74481,2958.7854,0.151,0.284,20.5,25.020,31.289620,66.0,1.009183,0,0,0,0,0,0,0,0
AFG,2017,Afghanistan,67.637370,264.20660,2952.9990,0.179,0.245,21.4,25.250,28.765442,64.0,1.269347,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZWE,2014,Zimbabwe,61.771426,203.30057,3352.3813,1.507,0.122,30.4,32.504,22.451942,91.0,7.490000,0,0,0,0,0,0,0,0
ZWE,2015,Zimbabwe,59.763910,191.68187,3366.6338,1.349,0.177,33.0,32.385,22.588303,87.0,7.702070,0,0,0,0,0,0,0,0
ZWE,2016,Zimbabwe,56.912365,194.29993,3345.3150,1.359,0.174,35.5,32.296,22.715965,90.0,7.881035,0,0,0,0,0,0,0,0
ZWE,2017,Zimbabwe,54.953800,143.32605,3453.5059,2.507,0.179,36.7,32.237,22.834778,89.0,8.060000,0,0,0,0,0,0,0,0
