In [1]:
# read data
import kagglehub
import os
# scientific computing
import pandas as pd
import numpy as np
# data processing - pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.experimental import enable_iterative_imputer  # Required for IterativeImputer
from sklearn.impute import IterativeImputer

In [2]:
# read the data
def read_raw_data():
    """Download the WHO life-expectancy dataset from Kaggle and load it.

    The dataset "kumarajarshi/life-expectancy-who" is fetched with
    ``kagglehub.dataset_download`` and the CSV file found in the returned
    directory is read with pandas.

    Returns:
        pandas.DataFrame: the raw data exactly as stored in the CSV file.

    Raises:
        FileNotFoundError: if the downloaded directory contains no CSV file.
    """
    path = kagglehub.dataset_download("kumarajarshi/life-expectancy-who")

    # os.listdir returns entries in arbitrary order and may include non-CSV
    # files, so filter on the extension and sort to make the choice deterministic.
    csv_files = sorted(
        name for name in os.listdir(path) if name.lower().endswith(".csv")
    )
    if not csv_files:
        raise FileNotFoundError(
            f"No CSV file found in the downloaded dataset directory: {path}"
        )

    raw_data = pd.read_csv(os.path.join(path, csv_files[0]))

    return raw_data

In [3]:
# Download the raw data and give the row index an explicit name: the name is
# written as the header of the index column in the CSV, and later cells read
# the files back with index_col="index".
raw_data = read_raw_data()
raw_data.index.name = "index"

# store the raw data to data directory
# (create the directory first so the cell also works on a fresh checkout)
os.makedirs("../data", exist_ok=True)
raw_data.to_csv("../data/raw_data.csv")

raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2938 entries, 0 to 2937
Data columns (total 22 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          2938 non-null   object 
 1   Year                             2938 non-null   int64  
 2   Status                           2938 non-null   object 
 3   Life expectancy                  2928 non-null   float64
 4   Adult Mortality                  2928 non-null   float64
 5   infant deaths                    2938 non-null   int64  
 6   Alcohol                          2744 non-null   float64
 7   percentage expenditure           2938 non-null   float64
 8   Hepatitis B                      2385 non-null   float64
 9   Measles                          2938 non-null   int64  
 10   BMI                             2904 non-null   float64
 11  under-five deaths                2938 non-null   int64  
 12  Polio               

In [4]:
# split the original data into train and test data sets
def split_train_test(rawdata, store_path="../data", frac=0.8, random_state=42):
    """Randomly split a data frame into train and test sets and save both as CSV.

    A fraction ``frac`` of the rows is sampled for the train set; the rows
    whose index labels were not sampled form the test set. Both frames are
    written, index included, to ``raw_data_train.csv`` and
    ``raw_data_test.csv`` inside ``store_path``.

    Args:
        rawdata (pandas.DataFrame): the data to split. Its index labels are
            used to separate the two sets, so they should be unique.
        store_path (str): directory the two CSV files are written to; it is
            created if it does not exist.
        frac (float): fraction of rows assigned to the train set.
        random_state (int): seed for the random sampling, for reproducibility.

    Returns:
        None
    """
    # Split the argument itself rather than a notebook-level global, so the
    # function works for whichever frame the caller passes in.
    raw_data_train = rawdata.sample(frac=frac, random_state=random_state)
    # Everything that was not sampled into the train set becomes the test set.
    raw_data_test = rawdata.drop(raw_data_train.index)

    # Make sure the target directory exists before writing.
    os.makedirs(store_path, exist_ok=True)
    raw_data_train.to_csv(os.path.join(store_path, "raw_data_train.csv"))
    raw_data_test.to_csv(os.path.join(store_path, "raw_data_test.csv"))

    print(f"The test and train data sets are stored into path: {store_path}")

    return None

In [5]:
# Split the raw data into train and test sets and write both to the data
# directory as CSV files (the function returns None; the files are the result).
split_train_test(raw_data)

The test and train data sets are stored into path: ../data


## Data Cleaning

**From this section onward, all operations are performed on the training data set.**

**data = raw_data_train**

The test data set is set aside to simulate a real-world situation, in which unseen data is not available while the analysis is being done.

After computing statistics, calculating estimates, and training models on the training data, we can repeat these steps on the test data or apply the fitted models to it.

In [6]:
# Starting point for missing-data handling: load the training split from the
# data directory. The CSV column named "index" is restored as the DataFrame index.
data = pd.read_csv("../data/raw_data_train.csv", index_col = "index")
# Show per-column non-null counts and dtypes to see where values are missing.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2350 entries, 2546 to 933
Data columns (total 22 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          2350 non-null   object 
 1   Year                             2350 non-null   int64  
 2   Status                           2350 non-null   object 
 3   Life expectancy                  2343 non-null   float64
 4   Adult Mortality                  2343 non-null   float64
 5   infant deaths                    2350 non-null   int64  
 6   Alcohol                          2194 non-null   float64
 7   percentage expenditure           2350 non-null   float64
 8   Hepatitis B                      1925 non-null   float64
 9   Measles                          2350 non-null   int64  
 10   BMI                             2322 non-null   float64
 11  under-five deaths                2350 non-null   int64  
 12  Polio                  

In [7]:
# categorize numerical and categorical variables
# Automatically detect column types in a data frame

def cols_categorize(dataframe, max_unique=50):
    """Return the numerical and the low-cardinality categorical column names.

    Numerical columns are those with dtype ``int64`` or ``float64``.
    Categorical columns are those with dtype ``object`` that have strictly
    fewer than ``max_unique`` distinct values; object columns with more
    distinct values are left out of both lists.

    Args:
        dataframe (pandas.DataFrame): the data frame whose columns are inspected.
        max_unique (int): exclusive upper bound on the number of distinct
            values an object column may have to count as categorical.

    Returns:
        tuple[list[str], list[str]]: ``(numerical_cols, categorical_cols)``,
        each in the column order of ``dataframe``.
    """
    # Inspect the frame that was passed in, not a notebook-level global, so
    # the dtype detection and the cardinality filter look at the same data.
    numerical_cols = dataframe.select_dtypes(include=["int64", "float64"]).columns.tolist()
    categorical_cols = dataframe.select_dtypes(include=["object"]).columns.tolist()

    # Drop high-cardinality object columns (e.g. identifiers); one-hot encoding
    # them would create one output column per distinct value.
    categorical_cols = [
        col for col in categorical_cols if dataframe[col].nunique() < max_unique
    ]
    return numerical_cols, categorical_cols

In [8]:
# Column name lists consumed by the ColumnTransformers built in the next cell.
# NOTE(review): num_cols contains 'Life expectancy ' (visible in the transformer
# repr further down) — presumably the prediction target; confirm that imputing
# it together with the features is intended.
num_cols, cat_cols = cols_categorize(data)

In [9]:
# Candidate strategies for filling missing values in the numerical columns.
# Each one is wrapped in a single-step pipeline so they are interchangeable.
num_imputer_1 = Pipeline(steps=[("imputer", SimpleImputer(strategy="mean"))])  # column mean
num_imputer_2 = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])  # column median
num_imputer_3 = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="constant", fill_value=-1))]  # NaN -> -1
)
num_imputer_4 = Pipeline(steps=[("imputer", KNNImputer(n_neighbors=3))])  # 3 nearest neighbors
num_imputer_5 = Pipeline(
    steps=[("imputer", IterativeImputer(max_iter=10, random_state=42))]  # regression models
)

num_imputers = [
    num_imputer_1,
    num_imputer_2,
    num_imputer_3,
    num_imputer_4,
    num_imputer_5,
]

# Candidate strategies for the categorical columns: fill the gaps first, then
# one-hot encode (categories unseen during fitting are ignored at transform time).
cat_imputer_1 = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),  # NaN -> most frequent value
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])
cat_imputer_2 = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="Unknown")),  # NaN -> "Unknown"
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

cat_imputers = [cat_imputer_1, cat_imputer_2]

# Full grid of preprocessing pipelines, keyed by an integer id so each
# imputation strategy can be referred to when testing. The categorical strategy
# varies slowest: ids 0-4 use cat_imputer_1, ids 5-9 use cat_imputer_2.
imputers = dict(enumerate(
    ColumnTransformer(
        transformers=[("num", num_step, num_cols), ("cat", cat_step, cat_cols)]
    )
    for cat_step in cat_imputers
    for num_step in num_imputers
))
imputers

{0: ColumnTransformer(transformers=[('num',
                                  Pipeline(steps=[('imputer', SimpleImputer())]),
                                  ['Year', 'Life expectancy ', 'Adult Mortality',
                                   'infant deaths', 'Alcohol',
                                   'percentage expenditure', 'Hepatitis B',
                                   'Measles ', ' BMI ', 'under-five deaths ',
                                   'Polio', 'Total expenditure', 'Diphtheria ',
                                   ' HIV/AIDS', 'GDP', 'Population',
                                   ' thinness  1-19 years',
                                   ' thinness 5-9 years',
                                   'Income composition of resources',
                                   'Schooling']),
                                 ('cat',
                                  Pipeline(steps=[('imputer',
                                                   SimpleImputer(strategy='most_frequ

In [None]:
# One integer key per imputation strategy: 2 categorical x 5 numerical = 10 (ids 0-9).
imputers.keys()

dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])