# Data Preprocessing

## Objective
The goal of this notebook is to prepare the cleaned dataset for machine learning models.


In [2]:
## import needed libarary & set style 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
from pathlib import Path

sys.path.append(str(Path("../src").resolve()))

%load_ext autoreload
%autoreload 2
%matplotlib inline
from plot_style import set_plot_style

set_plot_style()
## 1. Data Loading


from data_loader import load_data
df = load_data("../data/processed/life_expectancy_clean.csv")


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Feature Engineering

In [3]:
df.head()

Unnamed: 0,Country,Year,Status,Life_expectancy,Adult_Mortality,infant_deaths,Alcohol,percentage_expenditure,Hepatitis_B,Measles,...,Polio,Total_expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness_1-19_years,thinness_5-9_years,Income_composition_of_resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,4.143135,0.01,4.280542,65.0,7.051856,...,6.0,8.16,65.0,0.09531,6.372055,17.334091,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,4.174387,0.01,4.311116,62.0,6.200509,...,58.0,8.18,62.0,0.09531,6.419501,12.699497,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,4.204693,0.01,4.307023,64.0,6.066108,...,62.0,8.13,64.0,0.09531,6.450067,17.272826,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,4.248495,0.01,4.371777,67.0,7.93308,...,67.0,8.52,67.0,0.09531,6.508708,15.123021,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,4.276666,0.01,2.091507,68.0,8.011023,...,68.0,7.87,68.0,0.09531,4.167242,14.906964,18.2,18.2,0.454,9.5


### Child Mortality Rate

This feature combines infant deaths and under-five deaths into a single indicator.
It represents the overall burden of child mortality in a country.

High child mortality is strongly associated with lower life expectancy.


In [4]:
df['child_mortality_rate'] = (
    df['infant_deaths'] + df['under-five_deaths']
)


### Immunization Coverage 

This feature represents the average immunization coverage for major diseases:
Hepatitis B, Polio, and Diphtheria.

Higher immunization coverage usually indicates better healthcare systems
and is expected to increase life expectancy.


In [5]:
df['immunization_avg'] = (
    df['Hepatitis_B'] + df['Polio'] + df['Diphtheria']
) / 3


### Economic Strength

This feature combines GDP and income composition of resources.
It reflects both economic power and income distribution quality.

Stronger economies tend to invest more in healthcare, education,
and living conditions, which positively impact life expectancy.


In [6]:
df['economic_strength'] = (
    df['GDP'] * df['Income_composition_of_resources']
)


### Education Index

This feature combines schooling years and income composition.
It represents access to education and overall human development.

Higher education levels are associated with healthier lifestyles
and better health awareness.


In [7]:
df['education_index'] = (
    df['Schooling'] * df['Income_composition_of_resources']
)


### Mortality Pressure

This feature combines adult mortality and HIV/AIDS prevalence.
It reflects the overall mortality pressure on the adult population.

Higher mortality pressure is expected to reduce life expectancy.


In [8]:
df['mortality_pressure'] = (
    df['Adult_Mortality'] + df['HIV/AIDS']
)


In [9]:
df[
    [
        'child_mortality_rate',
        'immunization_avg',
        'economic_strength',
        'education_index',
        'mortality_pressure'
    ]
].head()


Unnamed: 0,child_mortality_rate,immunization_avg,economic_strength,education_index,mortality_pressure
0,8.573952,45.333333,3.052214,4.8379,263.09531
1,8.640295,60.666667,3.055682,4.76,271.09531
2,8.704502,63.333333,3.031532,4.653,268.09531
3,8.79179,67.0,3.013532,4.5374,272.09531
4,8.861634,68.0,1.891928,4.313,275.09531


##  Save Dataset for Dashboard

At this stage, the dataset contains:
- Cleaned original features
- A small number of meaningful engineered features
- Country and year information for grouping and filtering

This version of the dataset is saved for dashboard visualization.
No encoding or scaling is applied to keep the data readable and interpretable.


In [10]:
df.to_csv(
    '../data/processed/life_expectancy_dashboard.csv',
    index=False
)


# Encoding Categorical

In [11]:
df['Status'] = df['Status'].map({
    'Developed': 1,
    'Developing': 0
})


In [12]:
df['Status'].value_counts()


Status
0    2416
1     512
Name: count, dtype: int64

In [18]:
# Drop non-numerical identifier
df_model = df.drop(columns=['Country'])


# Train / Test Split

In [19]:
X = df_model.drop(columns=['Life_expectancy'])
y = df_model['Life_expectancy']
X.shape, y.shape

((2928, 25), (2928,))

In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)


In [21]:
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (2342, 25)
X_test shape: (586, 25)
y_train shape: (2342,)
y_test shape: (586,)


# Scaling Numerical Features

In [22]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Fit on training data only
X_train_scaled = scaler.fit_transform(X_train)

# Transform test data using same scaler
X_test_scaled = scaler.transform(X_test)
X_train_scaled.shape, X_test_scaled.shape

((2342, 25), (586, 25))

# Feature Selection

In [23]:
# Convert scaled train data back to DataFrame
X_train_scaled_df = pd.DataFrame(
    X_train_scaled,
    columns=X_train.columns
)

# Compute correlation with target
correlation = X_train_scaled_df.corrwith(y_train).abs().sort_values(ascending=False)

correlation

Status                             0.042376
Polio                              0.041160
Population                         0.035120
Hepatitis_B                        0.026718
HIV/AIDS                           0.024111
mortality_pressure                 0.021708
Adult_Mortality                    0.021645
Year                               0.020489
economic_strength                  0.017585
GDP                                0.016554
Measles                            0.015296
Diphtheria                         0.014552
infant_deaths                      0.013471
Alcohol                            0.012333
immunization_avg                   0.012271
child_mortality_rate               0.012250
Total_expenditure                  0.011970
under-five_deaths                  0.011089
thinness_1-19_years                0.010944
thinness_5-9_years                 0.009207
Schooling                          0.008850
Income_composition_of_resources    0.007075
education_index                 

In [24]:
from sklearn.linear_model import LinearRegression
import pandas as pd
import numpy as np

lr = LinearRegression()
lr.fit(X_train_scaled, y_train)

coef_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficient': np.abs(lr.coef_)
}).sort_values(by='Coefficient', ascending=False)

coef_df


Unnamed: 0,Feature,Coefficient
9,under-five_deaths,7.97211
3,infant_deaths,7.173173
13,HIV/AIDS,3.502727
23,education_index,2.396972
22,economic_strength,2.064521
14,GDP,1.435344
18,Income_composition_of_resources,1.181872
24,mortality_pressure,0.870115
2,Adult_Mortality,0.851699
20,child_mortality_rate,0.663439


In [29]:
np.savez(
    '../data/processed/model_data.npz',
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train.values,
    y_test=y_test.values
)


## Saving Arrays for Modeling
