In [1]:
import numpy as np
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

import sys 
import os
sys.path.append(os.path.abspath("../src/data/"))
from make_dataset import *

In [2]:
train = pd.read_csv("../data/raw/train.csv")
test = pd.read_csv("../data/raw/test.csv")

# Feature Engineering

## Encode categorical features
As we have seen in the previuous notebook, we have in total 9 categorical features. Out of these, 5 features are binary - simple yes/no answers and Male/Female values for gender.

Let's convert categorical variable into dummy/indicator variables. Each variable is converted in as many 0/1 variables as there are different values. 

In [3]:
continuous_vars, categorical_vars = get_variable_types(train)
continuous_vars.remove('id'), categorical_vars.remove('NObeyesdad')
print("Continuous Variables:", continuous_vars)
print("Categorical Variables:", categorical_vars)

train = pd.get_dummies(train, columns=categorical_vars, drop_first=True)
test = pd.get_dummies(test, columns=categorical_vars, drop_first=True)

#Let's check the Shape of data
print(f'The encoded Train dataset has {train.shape[0]} rows and {train.shape[1]} columns')
print(f'The encoded Test dataset has {test.shape[0]} rows and {test.shape[1]} columns')
display(train.head())

Continuous Variables: ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
Categorical Variables: ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']
The encoded Train dataset has 20758 rows and 24 columns
The encoded Test dataset has 13840 rows and 24 columns


Unnamed: 0,id,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,NObeyesdad,...,CAEC_Sometimes,CAEC_no,SMOKE_yes,SCC_yes,CALC_Sometimes,CALC_no,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,0,24.443011,1.699998,81.66995,2.0,2.983297,2.763573,0.0,0.976473,Overweight_Level_II,...,True,False,False,False,True,False,False,False,True,False
1,1,18.0,1.56,57.0,2.0,3.0,2.0,1.0,1.0,Normal_Weight,...,False,False,False,False,False,True,False,False,False,False
2,2,18.0,1.71146,50.165754,1.880534,1.411685,1.910378,0.866045,1.673584,Insufficient_Weight,...,True,False,False,False,False,True,False,False,True,False
3,3,20.952737,1.71073,131.274851,3.0,3.0,1.674061,1.467863,0.780199,Obesity_Type_III,...,True,False,False,False,True,False,False,False,True,False
4,4,31.641081,1.914186,93.798055,2.679664,1.971472,1.979848,1.967973,0.931721,Overweight_Level_II,...,True,False,False,False,True,False,False,False,True,False


## Creation of new features

### BMI (Body Mass Index)
Calculate BMI by dividing weight (kg) by height squared (m²).

In [4]:
train['BMI'] = train['Weight'] / train['Height']**2

train[['Weight', 'Height', 'BMI']].head()

Unnamed: 0,Weight,Height,BMI
0,81.66995,1.699998,28.259565
1,57.0,1.56,23.422091
2,50.165754,1.71146,17.126706
3,131.274851,1.71073,44.855798
4,93.798055,1.914186,25.599151


### BMR (Basal Metabolic Rate)
BMR is calculated using individuals age, gender, height and weight and activity level. I will use the revised Harris–Benedict equation which is calculated differently for women and men:

• Male: (88.4 + 13.4 x weight in kilograms) + (4.8 x height in centimeters) – (5.68 x age)
• Female: (447.6 + 9.25 x weight in kilograms) + (3.10 x height in centimeters) – (4.33 x age)
The result is an estimated amount of energy in kcal required to maintain the body's basic metabolic activity (without additional activity, so sufficient only for the functioning of the vital organs).

In [5]:
def calculate_bmr(weight, height, age, is_male):
  """
  Calculates the BMR based on the revised Harris-Benedict equation.

  Args:
    weight: Weight in kilograms.
    height: Height in centimeters.
    age: Age in years.
    is_male: 0 if female, 1 if male.

  Returns:
    The BMR value.
  """
  if is_male:
    return (447.6 + 9.25 * weight) + (3.10 * height * 100) - 4.33 * age
  else:
    return (88.4 + 13.4 * weight) + (4.8 * height * 100) - 5.68 * age

train['BMR'] = train.apply(
    lambda row: calculate_bmr(row['Weight'], row['Height'], row['Age'], row['Gender_Male']), axis=1
)

# Check the results
print(train[['Weight', 'Height', 'Age', 'Gender_Male', 'BMR']].head())

       Weight    Height        Age  Gender_Male          BMR
0   81.669950  1.699998  24.443011         True  1624.208180
1   57.000000  1.560000  18.000000        False  1498.760000
2   50.165754  1.711460  18.000000        False  1479.881904
3  131.274851  1.710730  20.952737        False  2549.621857
4   93.798055  1.914186  31.641081         True  1771.623788


## Model ready dataset

In [6]:
X = train.drop(['NObeyesdad'], axis=1)
y = train['NObeyesdad']

In [7]:
# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [8]:
X_train.head()

Unnamed: 0,id,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,Gender_Male,...,SMOKE_yes,SCC_yes,CALC_Sometimes,CALC_no,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking,BMI,BMR
9958,9958,17.0,1.77,97.0,3.0,3.0,2.0,3.0,1.0,True,...,False,False,False,True,False,False,False,False,30.961729,1819.94
7841,7841,22.667596,1.753389,54.877111,2.0,4.0,2.0,2.0,1.0,True,...,False,False,False,True,False,False,True,False,17.849855,1400.613176
9293,9293,21.838323,1.819867,122.024954,3.0,2.880817,1.645338,0.739881,0.0,True,...,False,False,True,False,False,False,True,False,36.844219,2045.929656
15209,15209,41.0,1.58,80.0,2.0,3.0,1.0,0.0,0.0,False,...,False,False,True,False,False,False,False,False,32.046146,1685.92
16515,16515,23.0,1.8,95.0,3.0,3.0,3.0,2.0,1.0,True,...,False,False,False,False,False,False,True,False,29.320988,1784.76
