# Feature Engineering notebook


## Setup

In [None]:
import sys
# Add the sibling `src` directory to the module search path. The path is
# relative, so it is resolved against the kernel's current working directory.
sys.path.append('../src')

# Project-local helpers; presumably these modules live in ../src, which is why
# the path is extended before importing them (confirm against the repo layout).
from preprocessing import load_data
from features import splitting_data, feature_engineering, scaling, add_constant_column

## Load the data

In [None]:
# Load the dataset through the project's `load_data` helper (called with its
# defaults); the result is used below as a DataFrame-like object.
df = load_data()

In [None]:
# Core column set: a handful of predictors plus the 'Life_expectancy' column,
# which is later passed to splitting_data as the target.
minimal_cols = [
    'Region',
    'Under_five_deaths',
    'Adult_mortality',
    'GDP_per_capita',
    'Schooling',
    'Economy_status_Developed',
    'Life_expectancy',
]

# Extended column set: everything in `minimal_cols` followed by the
# additional health indicators, in this order.
elaborate_cols = [
    *minimal_cols,
    'Alcohol_consumption',
    'Hepatitis_B',
    'Measles',
    'BMI',
    'Polio',
    'Diphtheria',
    'Incidents_HIV',
    'Thinness_ten_nineteen_years',
    'Thinness_five_nine_years',
]

In [14]:
# Restrict the data to the columns listed in `elaborate_cols`, taking an
# independent copy so later steps do not touch the loaded `df`.
df_copy = df.loc[:, elaborate_cols].copy()

## Perform Train Test Splitting

In [None]:
# Split the selected columns with the project's `splitting_data` helper,
# passing 'Life_expectancy' as the second argument (presumably the target
# column name; confirm against features.splitting_data). The four return
# values are unpacked as train/test features and train/test targets.
X_train, X_test, y_train, y_test = splitting_data(df_copy, 'Life_expectancy')

In [None]:
# Check that features and targets still share the same index after splitting.
# Index.equals is used rather than an element-wise `==` wrapped in all():
# comparing indexes of different lengths with `==` raises ValueError, whereas
# equals() simply reports False for any mismatch (length, labels or order).
print(f'Train data indices match: {X_train.index.equals(y_train.index)}')
print(f'Test data indices match: {X_test.index.equals(y_test.index)}')

## Perform Feature Engineering and Scaling

In [None]:
# Transform the training features: feature engineering first, then scaling
X_train_fe = feature_engineering(X_train)
X_train_fe = scaling(X_train_fe)

# Add a constant (intercept) column
X_train_fe = add_constant_column(X_train_fe)

# Record the column labels of the fully transformed training features
feature_cols_final = X_train_fe.columns

# Check the transformed features still line up with the training target.
# Index.equals returns False on any mismatch, including a length difference,
# where an element-wise `==` between the two indexes would raise ValueError.
print(X_train_fe.index.equals(y_train.index))

## Let's compare!

In [None]:
# Preview the first rows of the loaded data, before any column selection or
# transformation, for comparison with the engineered features.
df.head()

In [None]:
# Preview the first rows of the training features after feature engineering,
# scaling and the added constant column.
X_train_fe.head()