In [61]:
import pandas as pd
pd.set_option('display.max_rows', 500)
import os

In [62]:
# Directory that holds the raw input files (relative to this notebook).
PATH = "../00_data/0_raw/"
# Names of every entry in PATH, in whatever order the OS reports them.
all_files = list(os.listdir(PATH))

# File names double as column names of the merged frame later on.
def new_col_names(name):
    """Derive a column name from a file name or path.

    Takes the last path component, keeps everything before its first '.',
    and replaces every '-' with '_'.

    Parameters
    ----------
    name : str
        File name or path.

    Returns
    -------
    str
        The derived column name.
    """
    file_name = os.path.basename(name)
    # partition('.') cuts at the FIRST dot, so 'a.b.csv' yields 'a'
    stem, _, _ = file_name.partition('.')
    return stem.replace('-', '_')

# Make the label (u5mr) file the first entry of all_files.
# The file is located by the column name it produces rather than by a fixed
# list position: os.listdir() returns entries in arbitrary order, so a
# positional pop() can silently pick a different file on another machine or
# after a file is added to the folder.
LABEL_COLUMN = "child_mortality_igme"
label_file = next((f for f in all_files if new_col_names(f) == LABEL_COLUMN), None)
if label_file is None:
    raise FileNotFoundError(
        f"No file in {PATH!r} yields the label column {LABEL_COLUMN!r}"
    )
all_files.remove(label_file)
all_files.insert(0, label_file)

In [63]:
# Key columns shared by every raw file; they form the MultiIndex of the merged frame.
joins = ['Entity', 'Code', 'Year']

# Entities that are aggregates (continents, income groups, world) rather than countries.
EXCLUDE_NO_COUNTRIES = [
    "Africa", "Asia", "Europe", "European Union (27)",
    "High-income countries", "Low-income countries", "Lower-middle-income countries",
    "North America", "Oceania", "South America",
    "Upper-middle-income countries", "World",
]

# Filled by the loop below: one value column per file, indexed by (Entity, Code, Year).
big_df = None

for name in all_files:
    cols_names = new_col_names(name)

    # Read only the first four columns and rename them to the three keys
    # plus the column name derived from the file name.
    df = pd.read_csv(os.path.join(PATH, name), usecols=[0, 1, 2, 3])
    df.columns = [*joins, cols_names]

    # Drop the aggregate rows, then index by entity, code and year.
    # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.isin.html
    is_aggregate = df['Entity'].isin(EXCLUDE_NO_COUNTRIES)
    df = df[~is_aggregate].set_index(joins)

    if big_df is None:
        # The first file in all_files provides the base rows.
        big_df = df.copy()
        continue

    # Left join on the index: the rows of the base frame are kept and
    # keys missing from the current file become NaN.
    big_df = pd.merge(big_df, df, how='left', left_index=True, right_index=True)

big_df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 13038 entries, ('Afghanistan', 'AFG', np.int64(1957)) to ('Zimbabwe', 'ZWE', np.int64(2023))
Data columns (total 10 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   child_mortality_igme                      13038 non-null  float64
 1   nurses_and_midwives_per_1000_people       3109 non-null   float64
 2   annual_healthcare_expenditure_per_capita  4357 non-null   float64
 3   gdp_per_capita_worldbank                  6299 non-null   float64
 4   share_of_population_urban                 11580 non-null  float64
 5   years_of_schooling                        5931 non-null   float64
 6   physicians_per_1000_people                4986 non-null   float64
 7   vaccination_coverage_who_unicef           7897 non-null   float64
 8   share_without_improved_water              4433 non-null   float64
 9   prevalence_of_undernourishment        

In [64]:
# Goal: find the 6-year span with the fewest NaN values over the whole frame.
# Candidate windows start in 2000 (when the MDG goals started) and end no later
# than 2019; years after that may be more biased because of the Corona period.
# The result is meant to back the period choice in the thesis methodology chapter.
# Windows scanned: 2000-2005, 2001-2006, 2002-2007, ..., 2014-2019.
df_period = big_df.copy()
year_val = df_period.index.get_level_values(2)  # third index level, i.e. 'Year'

for year_start in range(2000, 2015):
    year_end = year_start + 5  # inclusive end, so each window spans 6 years
    # rows whose year falls inside the current window
    in_window = (year_start <= year_val) & (year_val <= year_end)
    df_filtered = df_period[in_window]
    # total number of NaN cells across all columns of the window
    nan_count = df_filtered.isna().sum().sum()
    print(f"From {year_start} - {year_end}, NaN values count: {nan_count}")

From 2000 - 2005, NaN values count: 2003
From 2001 - 2006, NaN values count: 1813
From 2002 - 2007, NaN values count: 1763
From 2003 - 2008, NaN values count: 1672
From 2004 - 2009, NaN values count: 1586
From 2005 - 2010, NaN values count: 1590
From 2006 - 2011, NaN values count: 1562
From 2007 - 2012, NaN values count: 1524
From 2008 - 2013, NaN values count: 1479
From 2009 - 2014, NaN values count: 1463
From 2010 - 2015, NaN values count: 1447
From 2011 - 2016, NaN values count: 1423
From 2012 - 2017, NaN values count: 1368
From 2013 - 2018, NaN values count: 1273
From 2014 - 2019, NaN values count: 1285


In [65]:
# The 2013 - 2018 window showed the fewest NaN values in the scan above,
# so it is used as the main starting dataframe.
main_period_mask = (year_val >= 2013) & (year_val <= 2018)
df_main_period = df_period[main_period_mask]
# Displayed as the cell result: total NaN count in the selected window.
df_main_period.isna().sum().sum()

np.int64(1273)

In [74]:
# Working copy of the merged frame
# (presumably for inspecting entities that are not countries - TODO confirm).
non_countries = big_df.copy()

# Cell result: the frame restricted to the selected main period.
df_main_period

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,child_mortality_igme,nurses_and_midwives_per_1000_people,annual_healthcare_expenditure_per_capita,gdp_per_capita_worldbank,share_of_population_urban,years_of_schooling,physicians_per_1000_people,vaccination_coverage_who_unicef,share_without_improved_water,prevalence_of_undernourishment
Entity,Code,Year,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Afghanistan,AFG,2013,7.802299,0.255,177.62009,3046.5798,24.373,0.672197,0.291,64.0,38.863914,19.4
Afghanistan,AFG,2014,7.512363,0.151,201.79086,3017.9426,24.587,0.710609,0.304,62.0,36.337383,19.3
Afghanistan,AFG,2015,7.243871,0.133,215.22618,2967.6921,24.803,0.749020,0.291,64.0,33.812515,20.0
Afghanistan,AFG,2016,6.997004,0.151,238.74481,2958.7854,25.020,1.009183,0.284,66.0,31.289620,20.5
Afghanistan,AFG,2017,6.763737,0.179,264.20660,2952.9990,25.250,1.269347,0.245,64.0,28.765442,21.4
...,...,...,...,...,...,...,...,...,...,...,...,...
Zimbabwe,ZWE,2014,6.177143,1.507,203.30057,3352.3813,32.504,7.490000,0.122,91.0,22.451942,30.4
Zimbabwe,ZWE,2015,5.976391,1.349,191.68187,3366.6338,32.385,7.702070,0.177,87.0,22.588303,33.0
Zimbabwe,ZWE,2016,5.691236,1.359,194.29993,3345.3150,32.296,7.881035,0.174,90.0,22.715965,35.5
Zimbabwe,ZWE,2017,5.495380,2.507,143.32605,3453.5059,32.237,8.060000,0.179,89.0,22.834778,36.7
