# Preprocessing

In [94]:
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
plt.style.use('ggplot')

In [95]:
# Load the merged interim dataset; rows are keyed by (country code, year).
merged_df = pd.read_csv(
    "../00_data/1_interim/merged_data.csv",
    index_col=["Code", "Year"],
)

### Handle High Missing Values by Code (Country ID)

In [96]:
# Missing values per country: count NaNs per column, grouped by index level 0 ("Code").
all_missing_values = merged_df.isnull().groupby(level=0).sum()
all_missing_values["total_missing"] = all_missing_values.sum(axis=1)

# Expected number of data points per country: 9 indicators x 6 years = 54.
# NOTE(review): assumes every country has exactly 6 yearly rows and that only the
# 9 indicator columns can contain NaNs - confirm against merged_data.csv.
N_INDICATORS = 9
N_YEARS = 6
DATA_POINTS_PER_COUNTRY = N_INDICATORS * N_YEARS

all_missing_values["total_missing_%"] = (
    all_missing_values["total_missing"]
    .div(DATA_POINTS_PER_COUNTRY)
    .mul(100)
    .round(2)
)

# Rank countries by total missing values and print the 12 with the most gaps.
top_missing_countries = all_missing_values.sort_values(ascending=False, by="total_missing")
top_12_missing = top_missing_countries["total_missing"].head(12)
print(f"Top 12 Countries with most missing values:\n{top_12_missing}")

# Exclusion threshold: countries with >= 50% missing values (27 or more of the
# 54 data points) are removed from the DataFrame in the following cell.
# Countries above the threshold in the recorded run:
# AIA       49 (Anguilla)
# MSR       48 (Montserrat)
# OWID_KOS  48 (Kosovo)
# COK       42 (Cook Islands)
# NIU       42 (Niue)
# TCA       39 (Turks and Caicos Islands)
# VGB       38 (British Virgin Islands)

Top 12 Countries with most missing values:
Code
AIA         49
MSR         48
OWID_KOS    48
COK         42
NIU         42
TCA         39
VGB         38
PRK         26
SSD         26
MCO         25
GNQ         22
VEN         21
Name: total_missing, dtype: int64


In [97]:
# Drop every country whose share of missing values is at least 50 % and keep
# the remainder as a separate frame (the raw merged_df stays untouched).
# Recorded run: the filtered frame has 967 missing values (306 fewer than before).
high_missing_mask = top_missing_countries["total_missing_%"] >= 50
exclude_countries = top_missing_countries[high_missing_mask]

codes_to_drop = exclude_countries.index.tolist()
row_is_excluded = merged_df.index.get_level_values(0).isin(codes_to_drop)
filtered_df = merged_df[~row_is_excluded].copy()

# Persist the filtered frame, then show the remaining total NaN count.
filtered_df.to_csv('../00_data/1_interim/filtered_data_01.csv', index=True)
filtered_df.isna().sum().sum()

np.int64(967)

### Handle Missing Values for "vaccination_coverage_who_unicef"

In [98]:
# Remaining missing values per column after the country filter.
missing_per_column = filtered_df.isna().sum()
missing_per_column

Entity                                        0
child_mortality_igme                          0
annual_healthcare_expenditure_per_capita     16
gdp_per_capita_worldbank                     42
nurses_and_midwives_per_1000_people         300
physicians_per_1000_people                  405
prevalence_of_undernourishment              156
share_of_population_urban                     0
share_without_improved_water                 16
vaccination_coverage_who_unicef               6
years_of_schooling                           26
dtype: int64

In [99]:
# vaccination_coverage_who_unicef (3 doses of diphtheria, tetanus and pertussis
# vaccine): list the rows where it is missing. In the recorded run these are the
# 6 yearly rows of NIC (Nicaragua).
vaccination_col = "vaccination_coverage_who_unicef"
vaccination_is_missing = filtered_df[vaccination_col].isnull()
filtered_df.loc[vaccination_is_missing, [vaccination_col]]

Unnamed: 0_level_0,Unnamed: 1_level_0,vaccination_coverage_who_unicef
Code,Year,Unnamed: 2_level_1
NIC,2013,
NIC,2014,
NIC,2015,
NIC,2016,
NIC,2017,
NIC,2018,


In [110]:
# OWID has data from WHO, but no WUENIC data source exists for Nicaragua.
# Source: WHO immunizationdata,
# https://immunizationdata.who.int/dashboard/regions/region-of-the-americas/NIC (accessed 27.10.2025)
# Approach: attach the World Bank country group (7 groups) to every row so the
# missing vaccination values for "NIC" can be imputed with the median of its group.
world_regions = pd.read_csv("../00_data/1_interim/world-regions-worldbank.csv")
country_groups = world_regions.loc[:, ["Code", "World regions according to WB"]]

# Left-join the region onto the country-year rows. validate="many_to_one" makes
# pandas raise a MergeError if a country code appears more than once in the
# region table, instead of silently duplicating country-year rows (which would
# distort the regional medians computed afterwards).
new_col_group = (
    filtered_df
    .reset_index()
    .merge(country_groups, on=["Code"], how="left", validate="many_to_one")
    .set_index(["Code", "Year"])
)

In [111]:
# Nicaragua (NIC) belongs to the world region "Latin America and Caribbean (WB)".
# Compute the median vaccination coverage of each World Bank region; the value
# for Nicaragua's region is then used to fill its missing entries.
# Note: despite its name, `latin_caribbean` holds the medians of all regions.
region_col = "World regions according to WB"
vaccination_by_region = new_col_group.groupby(region_col)["vaccination_coverage_who_unicef"]
latin_caribbean = vaccination_by_region.median()
latin_caribbean

World regions according to WB
East Asia and Pacific (WB)                                  92.0
Europe and Central Asia (WB)                                95.5
Latin America and Caribbean (WB)                            92.0
Middle East, North Africa, Afghanistan and Pakistan (WB)    97.0
North America (WB)                                          92.5
South Asia (WB)                                             98.0
Sub-Saharan Africa (WB)                                     85.0
Name: vaccination_coverage_who_unicef, dtype: float64

In [112]:
# Disabled alternative: writes the imputed value into new_col_group rather than filtered_df.
#new_col_group.loc[new_col_group["Entity"] == "Nicaragua", "vaccination_coverage_who_unicef"] = 92.0

In [113]:
# Impute Nicaragua's missing vaccination coverage with the median of its World
# Bank region. The value is read from the computed medians instead of being
# hard-coded (92.0 in the recorded run), so it stays in sync with the data.
nic_region_median = latin_caribbean["Latin America and Caribbean (WB)"]

# Only fill rows that are actually missing, so observed values are never
# overwritten if data for Nicaragua becomes available.
nic_missing_vaccination = (
    (filtered_df["Entity"] == "Nicaragua")
    & filtered_df["vaccination_coverage_who_unicef"].isna()
)
filtered_df.loc[nic_missing_vaccination, "vaccination_coverage_who_unicef"] = nic_region_median

In [114]:
# Re-check missing values per column after the vaccination imputation.
missing_after_imputation = filtered_df.isna().sum()
missing_after_imputation

Entity                                        0
child_mortality_igme                          0
annual_healthcare_expenditure_per_capita     16
gdp_per_capita_worldbank                     42
nurses_and_midwives_per_1000_people         300
physicians_per_1000_people                  405
prevalence_of_undernourishment              156
share_of_population_urban                     0
share_without_improved_water                 16
vaccination_coverage_who_unicef               0
years_of_schooling                           26
dtype: int64

In [118]:
# Inspect all rows for Nicaragua to verify the imputed vaccination values.
is_nicaragua = filtered_df["Entity"].eq("Nicaragua")
filtered_df[is_nicaragua]

Unnamed: 0_level_0,Unnamed: 1_level_0,Entity,child_mortality_igme,annual_healthcare_expenditure_per_capita,gdp_per_capita_worldbank,nurses_and_midwives_per_1000_people,physicians_per_1000_people,prevalence_of_undernourishment,share_of_population_urban,share_without_improved_water,vaccination_coverage_who_unicef,years_of_schooling
Code,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
NIC,2013,Nicaragua,20.337691,356.79343,6494.1494,1.347,0.89,19.3,57.505,17.44706,92.0,8.581959
NIC,2014,Nicaragua,19.446352,399.16168,6712.2173,1.341,0.885,19.8,57.7,17.334541,92.0,8.781239
NIC,2015,Nicaragua,18.606286,426.6129,6939.0327,1.35,0.92,19.3,57.895,17.220947,92.0,8.980518
NIC,2016,Nicaragua,17.80585,463.54483,7158.758,1.447,0.941,18.3,58.09,17.106255,92.0,9.135801
NIC,2017,Nicaragua,17.066324,505.4194,7391.197,1.53,0.975,17.6,58.299,17.03334,92.0,9.291083
NIC,2018,Nicaragua,16.364048,503.79822,7049.0273,,0.664,17.6,58.522,16.955544,92.0,9.446366


In [117]:
# TODO: decide whether to keep the world region groups as a feature (one-hot encoded).
#new_col_group.head(2)