# Preprocessing

In [8]:
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Notebook-wide display settings: show up to 500 DataFrame rows and
# render all matplotlib figures with the ggplot style sheet.
pd.options.display.max_rows = 500
plt.style.use('ggplot')

In [9]:
# Load the merged interim dataset; rows are indexed by (Code, Year).
merged_df = pd.read_csv(
    "../00_data/1_interim/merged_data.csv",
    index_col=["Code", "Year"],
)

### Handle High Missing Values #1: by Code (Country ID)

In [10]:
# Missing values by country ID (index level 0 = "Code").
# Each cell of `all_missing_values` is the NaN count of one column for one country.
all_missing_values = merged_df.isnull().groupby(level=0).sum()

# Expected number of data points per country:
#   (rows available for that country) x (9 main potential features).
# The row count is taken per country instead of from the first country only,
# so the percentage stays correct even if some countries have fewer rows.
N_MAIN_FEATURES = 9
rows_per_country = merged_df.groupby(level=0).size()
values_count_per_country = rows_per_country * N_MAIN_FEATURES

all_missing_values["total_missing"] = all_missing_values.sum(axis=1)
# NOTE(review): "total_missing" sums NaNs over *all* columns of merged_df, while the
# denominator only counts the 9 main features - confirm that the remaining columns
# never contain NaNs, otherwise the percentage is overstated.
# The division aligns both Series on the country code index.
all_missing_values["total_missing_%"] = (
    all_missing_values["total_missing"] / values_count_per_country * 100
).round(2)

# Show the 12 countries with the most missing values.
top_missing_countries = all_missing_values.sort_values(by="total_missing", ascending=False)
top_12_missing = top_missing_countries["total_missing"].head(12)
print(f"Top 12 Countries with most missing values:\n{top_12_missing}")

# Threshold 50%: every country with a missing share of 50% or more is excluded
# from the DataFrame (done in the next cell).
# For the current data: 9 x 6 = 54 data points per country --> 27 or more NaNs
# lead to exclusion. Countries above that limit:
# AIA       49 (Anguilla)
# MSR       48 (Montserrat)
# OWID_KOS  48 (Kosovo)
# COK       42 (Cook Islands)
# NIU       42 (Niue)
# TCA       39 (Turks and Caicos Islands)
# VGB       38 (British Virgin Islands)

Top 12 Countries with most missing values:
Code
AIA         49
MSR         48
OWID_KOS    48
COK         42
NIU         42
TCA         39
VGB         38
PRK         26
SSD         26
MCO         25
GNQ         22
VEN         21
Name: total_missing, dtype: int64


In [11]:
# Remove every country whose share of missing values reaches the 50% threshold
# and store the reduced frame as a new interim file.
# The filtered version has 967 missing values left (306 fewer than before).
over_threshold = top_missing_countries["total_missing_%"] >= 50
exclude_countries = top_missing_countries[over_threshold]

codes_to_drop = exclude_countries.index.tolist()
keep_rows = ~merged_df.index.get_level_values(0).isin(codes_to_drop)
filtered_df = merged_df[keep_rows].copy()

filtered_df.to_csv('../00_data/1_interim/filtered_data_01.csv', index=True)

# Total number of NaNs remaining in the filtered frame.
filtered_df.isna().sum().sum()

np.int64(967)

In [12]:
# Remaining NaN count per column after the high-missing countries were dropped.
filtered_df.isna().sum(axis=0)

Entity                                        0
child_mortality_igme                          0
annual_healthcare_expenditure_per_capita     16
gdp_per_capita_worldbank                     42
nurses_and_midwives_per_1000_people         300
physicians_per_1000_people                  405
prevalence_of_undernourishment              156
share_of_population_urban                     0
share_without_improved_water                 16
vaccination_coverage_who_unicef               6
years_of_schooling                           26
dtype: int64

### Testing: Handle Missing Values #2: Missing Indicators

In [19]:
# Add a "<column>_missing" indicator for every column that contains NaNs:
# 1 where the original value is missing, 0 otherwise.

example_train_set = pd.read_csv('../00_data/2_split/train_df_raw.csv', index_col=0)
missing_indicators_df = example_train_set.copy()

# Iterate over a snapshot of the column names, because new indicator columns
# are appended to the frame inside the loop.
for name in list(missing_indicators_df.columns):
    is_nan = missing_indicators_df[name].isna()
    if is_nan.any():
        missing_indicators_df[name + "_missing"] = is_nan.astype(int)

# Conclusion: the data is not missing at random, so the missing-indicator
# columns should be added to the training set in the pipeline later.

In [32]:
# Preview the first five rows, including the newly added "_missing" columns.
missing_indicators_df.head(5)

Unnamed: 0,Code,Year,Entity,child_mortality_igme,annual_healthcare_expenditure_per_capita,gdp_per_capita_worldbank,nurses_and_midwives_per_1000_people,physicians_per_1000_people,prevalence_of_undernourishment,share_of_population_urban,...,vaccination_coverage_who_unicef,years_of_schooling,annual_healthcare_expenditure_per_capita_missing,gdp_per_capita_worldbank_missing,nurses_and_midwives_per_1000_people_missing,physicians_per_1000_people_missing,prevalence_of_undernourishment_missing,share_without_improved_water_missing,vaccination_coverage_who_unicef_missing,years_of_schooling_missing
0,AFG,2013,Afghanistan,78.02299,177.62009,3046.5798,0.255,0.291,19.4,24.373,...,64.0,0.672197,0,0,0,0,0,0,0,0
1,AFG,2014,Afghanistan,75.123625,201.79086,3017.9426,0.151,0.304,19.3,24.587,...,62.0,0.710609,0,0,0,0,0,0,0,0
2,AFG,2015,Afghanistan,72.438707,215.22618,2967.6921,0.133,0.291,20.0,24.803,...,64.0,0.74902,0,0,0,0,0,0,0,0
3,AFG,2016,Afghanistan,69.97004,238.74481,2958.7854,0.151,0.284,20.5,25.02,...,66.0,1.009183,0,0,0,0,0,0,0,0
4,AFG,2017,Afghanistan,67.63737,264.2066,2952.999,0.179,0.245,21.4,25.25,...,64.0,1.269347,0,0,0,0,0,0,0,0
