# Feature Engineering notebook


## Setup

In [2]:
import pandas as pd
import statsmodels.api as sm
import sys
# Extend the module search path with '../src' so the local imports below can
# resolve (presumably `preprocessing` and `features` live there — confirm).
# NOTE: the path is relative, so it depends on the kernel's working directory.
sys.path.append('../src')

from preprocessing import load_data
from features import splitting_data, feature_engineering, scaling, add_constant_column

## Load the data

In [3]:
# Load the dataset via the project helper; the data source and any cleaning
# it performs are defined in `preprocessing.load_data` (not visible here).
df = load_data()

In [4]:
# Columns carried forward from the raw frame, in the order they will appear.
# 'Country' and 'Year' are deliberately left out. 'Life_expectancy' is the
# target; it stays in this list so it can be separated from the predictors
# when the data is split.
feature_cols = [
    'Region',
    'Infant_deaths',
    'Under_five_deaths',
    'Adult_mortality',
    'Hepatitis_B',
    'Measles',
    'BMI',
    'Polio',
    'Diphtheria',
    'Incidents_HIV',
    'GDP_per_capita',
    'Population_mln',
    'Thinness_ten_nineteen_years',
    'Thinness_five_nine_years',
    'Schooling',
    'Economy_status_Developed',
    'Life_expectancy',
]

In [5]:
# Restrict the raw frame to the selected columns and take an independent copy,
# so later steps never touch `df` itself.
df_copy = df.loc[:, feature_cols].copy()

## Perform Train/Test Split

In [6]:
# Split the selected columns into train/test predictors and targets, with
# 'Life_expectancy' passed as the target column name.
# NOTE(review): split ratio and random seed are decided inside
# `features.splitting_data` (not visible here) — confirm it is seeded.
X_train, X_test, y_train, y_test = splitting_data(df_copy, 'Life_expectancy')

In [7]:
# Sanity check: predictors and target must share the same row index after the
# split. `Index.equals` returns a plain bool and, unlike an element-wise `==`
# wrapped in `all()`, does not raise when the two indexes differ in length.
print(f'Train data indices match: {X_train.index.equals(y_train.index)}')
print(f'Test data indices match: {X_test.index.equals(y_test.index)}')

Train data indices match: True
Test data indices match: True


## Perform Feature Engineering and Scaling

In [8]:
# Build the training design matrix in three stages using the project helpers
# imported from `features`:
#   1. feature_engineering - derive the engineered feature columns
#   2. scaling             - normalise the engineered features
#   3. add_constant_column - add the constant (intercept) column
# NOTE(review): only X_train is transformed in the cells visible here;
# presumably X_test is handled elsewhere — confirm.
X_train_fe = feature_engineering(X_train)
X_train_fe = scaling(X_train_fe)
X_train_fe = add_constant_column(X_train_fe)

# Record the final column set under a new name; `feature_cols` itself is left
# unchanged.
feature_cols_final = X_train_fe.columns

# Sanity check: the transformed predictors must still line up row-for-row with
# the training target. `Index.equals` reports False on a length mismatch
# instead of raising, as an element-wise `==` comparison would.
print(X_train_fe.index.equals(y_train.index))

True


## Compare the raw data with the engineered training features

In [9]:
# Preview the raw data as loaded, before column selection and feature
# engineering.
df.head()

Unnamed: 0,Country,Region,Year,Infant_deaths,Under_five_deaths,Adult_mortality,Alcohol_consumption,Hepatitis_B,Measles,BMI,...,Diphtheria,Incidents_HIV,GDP_per_capita,Population_mln,Thinness_ten_nineteen_years,Thinness_five_nine_years,Schooling,Economy_status_Developed,Economy_status_Developing,Life_expectancy
0,Turkiye,Middle East,2015,11.1,13.0,105.824,1.32,97,65,27.8,...,97,0.08,11006,78.53,4.9,4.8,7.8,0,1,76.5
1,Spain,European Union,2015,2.7,3.3,57.9025,10.35,97,94,26.0,...,97,0.09,25742,46.44,0.6,0.5,9.7,1,0,82.8
2,India,Asia,2007,51.5,67.9,201.0765,1.57,60,35,21.2,...,64,0.13,1076,1183.21,27.1,28.0,5.0,0,1,65.4
3,Guyana,South America,2006,32.8,40.5,222.1965,5.68,93,74,25.3,...,93,0.79,4146,0.75,5.7,5.5,7.9,0,1,67.0
4,Israel,Middle East,2012,3.4,4.3,57.951,2.89,97,89,27.0,...,94,0.08,33995,7.91,1.2,1.1,12.8,1,0,81.7


In [10]:
# Preview the training predictors after feature engineering, scaling and the
# added constant column, for comparison with the raw preview.
X_train_fe.head()

Unnamed: 0,const,Infant_deaths,Adult_mortality,BMI,Schooling,Economy_status_Developed,Asia,Central America and Caribbean,European Union,Middle East,...,South America,Under_five_deaths_log,Incidents_HIV_log,GDP_per_capita_log,Population_mln_log,Thinness_metric_log,Hepatitis_B_exp,Measles_exp,Polio_exp,Diphtheria_exp
2026,1.0,-0.298246,-0.369456,-0.8125,0.566038,0.0,1.0,0.0,0.0,0.0,...,0.0,-0.460733,-0.439473,-0.059503,0.422533,1.110856,0.595282,0.623665,0.401822,0.401822
651,1.0,-0.403509,-0.34836,0.34375,0.698113,1.0,0.0,0.0,1.0,0.0,...,0.0,-0.787833,-0.226796,0.539377,0.100125,-0.29027,0.53304,0.581694,0.197898,0.333166
2225,1.0,-0.110276,-0.147051,0.34375,0.396226,0.0,0.0,0.0,0.0,0.0,...,1.0,-0.132067,0.596838,0.59851,0.594279,-0.460829,-0.589593,0.0,-0.848928,-0.905138
2357,1.0,-0.200501,-0.581719,0.1875,0.264151,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.284608,-0.37724,-0.073915,-0.410034,-0.506439,0.595282,0.581694,0.401822,0.401822
670,1.0,0.588972,2.319636,-0.71875,-0.396226,0.0,0.0,0.0,0.0,0.0,...,0.0,0.607809,7.063437,-0.110276,-0.55825,1.061779,-0.329621,-0.621975,-0.676884,-0.848928
