# This notebook will contain feature engineering steps:

- Encoding categorical variables

- Feature scaling (e.g., standardization, normalization)

- Feature selection (optional)

- Creating new features

In [4]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
import sys
import os
import warnings
warnings.filterwarnings('ignore')  # To avoid non-critical warnings

### Input 01-notebook file

In [5]:
path = '../data/cleaned/heart_2022_cleaned_01.csv'

In [6]:
# read data
df = pd.read_csv(path ,sep=',',encoding='utf-8')
df.sample(5)

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos,BMI_Category,SleepHours_Category
179220,MA,Male,Very good,3.0,3.0,Within past 2 years (1 year but less than 2 ye...,Yes,5.0,None of them,0,...,25.09,Yes,Yes,Yes,No,"Yes, received Tdap",No,No,Overweight,Short Sleep
128892,KS,Male,Good,1.0,4.0,Within past year (anytime less than 12 months ...,Yes,7.0,1 to 5,0,...,19.48,No,No,No,Yes,"Yes, received tetanus shot but not sure what type",No,No,Normal weight,Normal Sleep
377122,VA,Female,Good,15.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,"6 or more, but not all",0,...,27.44,No,No,Yes,Yes,"Yes, received Tdap",No,No,Overweight,Normal Sleep
42527,CO,Female,Excellent,1.0,0.0,Within past year (anytime less than 12 months ...,Yes,8.0,None of them,0,...,21.14,Yes,No,No,No,"No, did not receive any tetanus shot in the pa...",No,No,Normal weight,Normal Sleep
411727,WA,Male,Good,8.0,30.0,Within past 2 years (1 year but less than 2 ye...,Yes,5.0,None of them,0,...,33.96,Yes,No,Yes,No,"No, did not receive any tetanus shot in the pa...",No,Yes,Obese,Short Sleep


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442067 entries, 0 to 442066
Data columns (total 42 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   State                      442067 non-null  object 
 1   Sex                        442067 non-null  object 
 2   GeneralHealth              442067 non-null  object 
 3   PhysicalHealthDays         442067 non-null  float64
 4   MentalHealthDays           442067 non-null  float64
 5   LastCheckupTime            442067 non-null  object 
 6   PhysicalActivities         442067 non-null  object 
 7   SleepHours                 442067 non-null  float64
 8   RemovedTeeth               442067 non-null  object 
 9   HadHeartAttack             442067 non-null  int64  
 10  HadAngina                  442067 non-null  object 
 11  HadStroke                  442067 non-null  object 
 12  HadAsthma                  442067 non-null  object 
 13  HadSkinCancer              44

### Encoding
> Since there are many categorical features (like Sex, GeneralHealth, HadHeartAttack, etc.), we need to encode them. We will use Label Encoding for ordinal features and One-Hot Encoding for nominal features (those without an inherent order).

In [8]:
df.describe(include='object').T

Unnamed: 0,count,unique,top,freq
State,442067,54,WA,25997
Sex,442067,2,Female,234519
GeneralHealth,442067,5,Very good,149062
LastCheckupTime,442067,4,Within past year (anytime less than 12 months ...,356694
PhysicalActivities,442067,2,Yes,336750
RemovedTeeth,442067,4,None of them,243503
HadAngina,442067,2,No,415919
HadStroke,442067,2,No,423171
HadAsthma,442067,2,No,375918
HadSkinCancer,442067,2,No,406804


In [9]:
df['SleepHours_Category'].unique()

array(['Normal Sleep', 'Short Sleep', 'Long Sleep', 'Very Short Sleep',
       'Very Long Sleep'], dtype=object)

In [10]:
os.chdir('..')  # Go up one level if needed to reach the root
from src.feature_engineering.encoding import encode_features
df = encode_features(df)

In [11]:
df.describe(include='object').T

Unnamed: 0,count,unique,top,freq
State,442067,54,WA,25997
RaceEthnicityCategory,442067,5,"White only, Non-Hispanic",332480


In [12]:
df

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos,BMI_Category,SleepHours_Category
0,AL,0,3,0.0,0.0,3,0,8.0,0,0,...,27.44,0,0,1,0,1,0,0,2,0
1,AL,0,4,0.0,0.0,3,0,6.0,0,0,...,26.57,0,0,0,0,0,0,0,2,1
2,AL,0,3,2.0,3.0,3,1,5.0,0,0,...,25.61,0,0,0,0,0,0,2,2,1
3,AL,0,4,0.0,0.0,3,1,7.0,0,0,...,23.30,0,0,1,1,0,0,0,1,0
4,AL,0,1,2.0,0.0,3,1,9.0,0,0,...,21.77,1,0,0,1,0,0,0,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
442062,VI,0,2,0.0,3.0,2,1,6.0,0,0,...,25.63,1,1,0,0,0,0,2,2,1
442063,VI,0,4,2.0,2.0,3,1,7.0,0,0,...,28.66,0,1,1,0,1,0,0,2,0
442064,VI,0,0,30.0,30.0,0,0,5.0,1,0,...,17.23,1,0,0,0,0,0,0,0,1
442065,VI,1,3,0.0,0.0,3,0,5.0,0,1,...,32.55,0,1,1,1,0,0,2,3,1


### Creating new features

####  e.g. Interaction Features:

In [13]:
# Creating interactions between BMI, Weight, Height, etc.
df['BMI_Weight_Interaction'] = df['BMI'] * df['WeightInKilograms']
df['BMI_Height_Interaction'] = df['BMI'] * df['HeightInMeters']

df.sample(5)


Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos,BMI_Category,SleepHours_Category,BMI_Weight_Interaction,BMI_Height_Interaction
237982,NH,0,2,0.0,0.0,3,1,7.0,0,0,...,0,1,1,2,0,0,2,0,1795.78,42.42
237127,NH,0,2,0.0,2.0,3,0,6.0,2,0,...,1,1,1,1,0,0,2,1,2123.3982,45.9382
159495,MD,1,2,15.0,0.0,1,1,7.0,0,0,...,0,1,0,0,0,0,1,0,1778.1566,42.1998
178230,MA,0,4,2.0,5.0,3,1,7.0,1,0,...,0,1,0,0,0,0,2,0,2215.5056,46.648
225523,MT,0,2,0.0,0.0,3,1,8.0,1,0,...,1,1,1,2,0,2,1,0,1470.645,38.3625


#### Polynomial Features
> Creating higher-degree polynomial features from numerical features

In [14]:
numerical_columns = df.select_dtypes(include=['float64']).columns.tolist()
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(df[numerical_columns])

# Create new column names for the polynomial features
poly_features_df = pd.DataFrame(poly_features, columns=poly.get_feature_names_out(numerical_columns))

# Concatenate the new polynomial features to the original DataFrame
df = pd.concat([df, poly_features_df], axis=1)
df.sample(5)

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,WeightInKilograms^2,WeightInKilograms BMI,WeightInKilograms BMI_Weight_Interaction,WeightInKilograms BMI_Height_Interaction,BMI^2,BMI BMI_Weight_Interaction,BMI BMI_Height_Interaction,BMI_Weight_Interaction^2,BMI_Weight_Interaction BMI_Height_Interaction,BMI_Height_Interaction^2
164333,MD,1,4,0.0,0.0,3,1,7.0,0,0,...,4879.0225,1419.352,99141.7372,2625.8012,412.9024,28841.23264,763.86944,2014560.0,53356.280384,1413.158464
114217,IA,0,3,0.0,15.0,1,1,6.0,1,0,...,3477.4609,1237.1906,72957.129682,2078.480208,440.1604,25956.258788,739.469472,1530641.0,43606.514764,1242.308713
246257,NJ,0,3,0.0,0.0,3,1,7.0,0,0,...,2720.6656,1062.4992,55419.958272,1699.99872,414.9369,21643.108704,663.89904,1128905.0,34628.973926,1062.238464
48296,CT,0,2,0.0,0.0,3,1,8.0,0,0,...,6518.9476,2215.5056,178879.922144,3766.35952,752.9536,60793.473664,1280.02112,4908465.0,103348.905229,2176.035904
220244,MT,0,3,2.0,4.0,3,1,8.0,1,0,...,6301.1844,2242.485,178008.4593,3767.3748,798.0625,63350.20125,1340.745,5028739.0,106428.3381,2252.4516


#### Use domain knowledge and external data sources, e.g. create new features from spatial data, such as location, by calculating distances, angles, or areas.

### Feature scaling

In [15]:
# Apply StandardScaler to the numerical columns
scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

df.sample(5)

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,WeightInKilograms^2,WeightInKilograms BMI,WeightInKilograms BMI_Weight_Interaction,WeightInKilograms BMI_Height_Interaction,BMI^2,BMI BMI_Weight_Interaction,BMI BMI_Height_Interaction,BMI_Weight_Interaction^2,BMI_Weight_Interaction BMI_Height_Interaction,BMI_Height_Interaction^2
98260,ID,0,3,-0.258523,-0.515488,3,0,0.656772,3,0,...,6518.9476,2215.5056,178879.922144,3544.80896,752.9536,60793.473664,1204.72576,4908465.0,97269.557862,1927.561216
215236,MO,0,2,-0.49153,-0.515488,3,0,0.656772,1,0,...,2816.4249,1254.0441,66552.120387,1881.06615,558.3769,29633.062083,837.56535,1572627.0,44449.593124,1256.348025
276934,NC,1,2,-0.49153,-0.515488,3,1,-0.687687,1,0,...,13066.7761,3907.1158,446622.407098,7150.021914,1168.2724,133545.218044,2137.938492,15265550.0,244387.749021,3912.42744
286371,OH,0,2,-0.375027,-0.515488,3,1,-0.015457,0,0,...,6814.5025,2352.675,194213.32125,3999.5475,812.25,67051.2375,1380.825,5535080.0,113987.10375,2347.4025
214128,MO,0,4,-0.025516,-0.515488,3,1,0.656772,1,0,...,2913.8404,1137.8984,61423.755632,1820.63744,444.3664,23986.898272,710.98624,1294813.0,38379.037235,1137.577984


### Output new final data

In [16]:
output_path = 'data/cleaned/heart_2022_cleaned_02.csv'
df.to_csv(output_path, index=False)