# Feature Engineering

## Import Libraries

In [6]:
## Import core libraries:

# For data
import pandas as pd
import numpy as np

# For scaling processes
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

# For modelling (specifically for generating the constant column as part of FE)
import statsmodels.api as sm

## Importing Data

In [7]:
## Read the data set:

# Create data frame from the WHO data.
# The CSV (life_expectancy_data.csv) is read directly from the raw GitHub URL,
# so running this cell requires network access.
who = pd.read_csv("https://raw.githubusercontent.com/Ale42RA/WHO_analytica/refs/heads/main/life_expectancy_data.csv")

## Feature Engineering

In [8]:
## Feature Engineering process:

# Custom function for all FE processes
def feature_eng(who: pd.DataFrame) -> pd.DataFrame:
    """Apply all feature-engineering steps and return a new data frame.

    The input is copied first, so the caller's data frame is not modified.
    Steps, in order:

    1. One-hot encode ``Region`` (first category dropped, integer dummies
       prefixed with ``Region_``).
    2. Add natural-log columns ``Incidents_HIV_log`` and
       ``GDP_per_capita_log``.
    3. Standard-scale ``BMI``, ``Schooling`` and ``Infant_deaths``.
    4. Min-max scale the two log columns.
    5. Robust-scale ``Under_five_deaths`` and ``Adult_mortality``.
    6. Add the ``const`` column required by statsmodels.

    Every scaler is fitted on the frame passed in, so the scaled values
    depend on the rows supplied to this call.

    Args:
        who: Data frame containing at least the columns ``Region``,
            ``Incidents_HIV``, ``GDP_per_capita``, ``BMI``, ``Schooling``,
            ``Infant_deaths``, ``Under_five_deaths`` and ``Adult_mortality``.

    Returns:
        A transformed copy of ``who`` that always includes a ``const``
        column.
    """
    # Work on a copy so the caller's frame is left untouched
    who = who.copy()

    # One Hot Encoding (OHE)
    who = pd.get_dummies(
        who,
        columns=['Region'],
        drop_first=True,
        prefix='Region',
        dtype=int,
    )

    # Converting features into logarithm
    # NOTE(review): np.log returns -inf for 0 and NaN for negative values;
    # this assumes both columns are strictly positive - confirm against the data.
    who['Incidents_HIV_log'] = np.log(who['Incidents_HIV'])
    who['GDP_per_capita_log'] = np.log(who['GDP_per_capita'])

    # Creating scaler objects
    standard_scaler = StandardScaler()
    minmax_scaler = MinMaxScaler()
    robust_scaler = RobustScaler()

    # Standardisation (zero mean, unit variance) for these features
    standard_cols = ['BMI', 'Schooling', 'Infant_deaths']
    who[standard_cols] = standard_scaler.fit_transform(who[standard_cols])

    # MinMax scaling for the log features, preserving the shape of their distribution
    minmax_cols = ['GDP_per_capita_log', 'Incidents_HIV_log']
    who[minmax_cols] = minmax_scaler.fit_transform(who[minmax_cols])

    # Robust scaling for features with outliers. RobustScaler centres and
    # scales each column independently, so one call over both columns gives
    # the same result as fitting them one at a time.
    robust_cols = ['Under_five_deaths', 'Adult_mortality']
    who[robust_cols] = robust_scaler.fit_transform(who[robust_cols])

    # Created for statsmodels; must always be present. has_constant='add'
    # forces the column in even when the frame already holds a constant
    # column (e.g. a single-row frame, or a subset where some column takes a
    # single non-zero value). The default, 'skip', would silently leave
    # 'const' out in that case and break any later lookup of 'const'.
    who = sm.add_constant(who, has_constant='add')

    # Return the results
    return who

## Feature Columns for the 'Main' model

In [9]:
## Main model feature columns:

feature_cols = [
    # Intercept column added by feature_eng via sm.add_constant
    "const",
    # Robust-scaled mortality features
    "Under_five_deaths",
    "Adult_mortality",
    # Standard-scaled feature
    "BMI",
    # Log-transformed, min-max scaled features
    "Incidents_HIV_log",
    "GDP_per_capita_log",
    # Standard-scaled feature
    "Schooling",
    # Region dummies produced by the one-hot encoding (first category dropped)
    "Region_Asia",
    "Region_Central America and Caribbean",
    "Region_European Union",
    "Region_Middle East",
    "Region_North America",
    "Region_Oceania",
    "Region_Rest of Europe",
    "Region_South America",
]

## Feature Columns for the 'Ethical' model

In [10]:
## Ethical model feature columns:

feature_cols_eth = [
    # Intercept column added by feature_eng via sm.add_constant
    "const",
    # Robust-scaled mortality features
    "Under_five_deaths",
    "Adult_mortality",
    # Region dummies produced by the one-hot encoding (first category dropped)
    "Region_Asia",
    "Region_Central America and Caribbean",
    "Region_European Union",
    "Region_Middle East",
    "Region_North America",
    "Region_Oceania",
    "Region_Rest of Europe",
    "Region_South America",
]