# Feature engineering: this notebook performs the following steps

- Encoding categorical variables

- Feature scaling (e.g., standardization, normalization)

- Feature selection (optional)

- Creating new features

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
import sys
import os
import warnings
warnings.filterwarnings('ignore')  # To avoid non-critical warnings

### Load the cleaned data produced by notebook 01

In [2]:
# Location of the cleaned CSV, relative to the kernel's working directory at this point
# (the file name suggests it is the output of notebook 01 — TODO confirm).
path = '../data/cleaned/heart_2022_cleaned_01.csv'

In [3]:
# Load the cleaned dataset into a DataFrame, then preview five random rows
df = pd.read_csv(
    path,
    sep=',',
    encoding='utf-8',
)
df.sample(5)

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos,BMI_Category,SleepHours_Category
32349,California,Male,Very good,2.0,0.0,Within past year (anytime less than 12 months ...,No,6.0,1 to 5,1,...,25.55,No,No,Yes,No,"No, did not receive any tetanus shot in the pa...",No,No,Overweight,Short Sleep
329971,South Carolina,Male,Very good,7.0,0.0,Within past year (anytime less than 12 months ...,Yes,7.0,None of them,0,...,27.89,Yes,No,No,Yes,"Yes, received Tdap",No,No,Overweight,Normal Sleep
407677,Washington,Male,Excellent,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,7.0,None of them,0,...,28.25,Yes,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No,Overweight,Normal Sleep
71868,Florida,Male,Good,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,7.0,1 to 5,0,...,26.83,Yes,No,No,No,"Yes, received tetanus shot, but not Tdap",No,No,Overweight,Normal Sleep
240415,New Hampshire,Female,Fair,30.0,30.0,Within past year (anytime less than 12 months ...,No,7.0,1 to 5,0,...,38.74,Yes,No,No,Yes,"Yes, received tetanus shot but not sure what type",No,No,Extremly Obese,Normal Sleep


In [4]:
# Print the column names, non-null counts and dtypes of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442067 entries, 0 to 442066
Data columns (total 42 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   State                      442067 non-null  object 
 1   Sex                        442067 non-null  object 
 2   GeneralHealth              442067 non-null  object 
 3   PhysicalHealthDays         442067 non-null  float64
 4   MentalHealthDays           442067 non-null  float64
 5   LastCheckupTime            442067 non-null  object 
 6   PhysicalActivities         442067 non-null  object 
 7   SleepHours                 442067 non-null  float64
 8   RemovedTeeth               442067 non-null  object 
 9   HadHeartAttack             442067 non-null  int64  
 10  HadAngina                  442067 non-null  object 
 11  HadStroke                  442067 non-null  object 
 12  HadAsthma                  442067 non-null  object 
 13  HadSkinCancer              44

### Encoding
> The dataset contains many categorical features (e.g. Sex, GeneralHealth, HadAngina) that must be converted to numbers before modelling. We use Label Encoding for ordinal features (those with an inherent order) and One-Hot Encoding for nominal features (those without one). Note that HadHeartAttack is already stored as an integer.

In [5]:
# Summarize the object-dtype columns (count, unique, top, freq), transposed to one row per column
df.describe(include='object').T

Unnamed: 0,count,unique,top,freq
State,442067,54,Washington,25997
Sex,442067,2,Female,234519
GeneralHealth,442067,5,Very good,149062
LastCheckupTime,442067,4,Within past year (anytime less than 12 months ...,356694
PhysicalActivities,442067,2,Yes,336750
RemovedTeeth,442067,4,None of them,243503
HadAngina,442067,2,No,415919
HadStroke,442067,2,No,423171
HadAsthma,442067,2,No,375918
HadSkinCancer,442067,2,No,406804


In [6]:
# List the distinct labels present in SleepHours_Category before encoding
df['SleepHours_Category'].unique()

array(['Normal Sleep', 'Short Sleep', 'Long Sleep', 'Very Short Sleep',
       'Very Long Sleep'], dtype=object)

In [7]:
# Make the project root the working directory so the local `src` package can be imported.
# The move is guarded: an unconditional os.chdir('..') climbs one more level every time this
# cell is re-run (or when the kernel already starts at the root), which breaks the import below
# and every relative path used afterwards.
# NOTE(review): assumes the notebook sits exactly one level below the project root — confirm.
if not os.path.isdir(os.path.join('src', 'feature_engineering')):
    os.chdir('..')

# Imported here rather than in the top import cell because it only resolves once the working
# directory is the project root.
from src.feature_engineering.encoding import encode_features

# Replace df with its encoded version (the encoding rules live in src/feature_engineering/encoding.py).
df = encode_features(df)

In [8]:
# Re-run the object-dtype summary to see which columns are still non-numeric after encoding
df.describe(include='object').T

Unnamed: 0,count,unique,top,freq
State,442067,54,Washington,25997
RaceEthnicityCategory,442067,5,"White only, Non-Hispanic",332480


In [9]:
# Display the encoded frame (the notebook renders a truncated view: first and last rows)
df

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos,BMI_Category,SleepHours_Category
0,Alabama,0,3,0.0,0.0,3,0,8.0,0,0,...,27.44,0,0,1,0,1,0,0,2,0
1,Alabama,0,4,0.0,0.0,3,0,6.0,0,0,...,26.57,0,0,0,0,0,0,0,2,1
2,Alabama,0,3,2.0,3.0,3,1,5.0,0,0,...,25.61,0,0,0,0,0,0,2,2,1
3,Alabama,0,4,0.0,0.0,3,1,7.0,0,0,...,23.30,0,0,1,1,0,0,0,1,0
4,Alabama,0,1,2.0,0.0,3,1,9.0,0,0,...,21.77,1,0,0,1,0,0,0,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
442062,Virgin Islands,0,2,0.0,3.0,2,1,6.0,0,0,...,25.63,1,1,0,0,0,0,2,2,1
442063,Virgin Islands,0,4,2.0,2.0,3,1,7.0,0,0,...,28.66,0,1,1,0,1,0,0,2,0
442064,Virgin Islands,0,0,30.0,30.0,0,0,5.0,1,0,...,17.23,1,0,0,0,0,0,0,0,1
442065,Virgin Islands,1,3,0.0,0.0,3,0,5.0,0,1,...,32.55,0,1,1,1,0,0,2,3,1


### Creating new features

#### Interaction features

In [10]:
# Interaction terms: BMI multiplied by body weight and by height
bmi = df['BMI']
interaction_sources = {
    'BMI_Weight_Interaction': 'WeightInKilograms',
    'BMI_Height_Interaction': 'HeightInMeters',
}
for new_column, partner_column in interaction_sources.items():
    df[new_column] = bmi * df[partner_column]

df.sample(5)


Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos,BMI_Category,SleepHours_Category,BMI_Weight_Interaction,BMI_Height_Interaction
46851,Connecticut,0,2,7.0,15.0,2,0,8.0,1,0,...,1,0,1,3,0,2,2,0,2242.485,47.46
358151,Utah,1,4,0.0,0.0,3,1,6.0,0,0,...,0,1,0,0,0,0,1,1,1765.6281,42.0909
412275,Washington,0,4,0.0,0.0,3,1,6.0,1,0,...,1,1,0,3,0,0,2,1,1679.575,40.9975
324712,South Carolina,0,4,0.0,0.0,3,1,7.0,0,0,...,0,0,0,0,0,2,1,0,1765.6281,42.0909
3810,Alabama,1,2,0.0,0.0,3,0,10.0,1,0,...,1,1,1,1,0,0,2,2,2225.6388,47.124


#### Polynomial Features
> Creating higher-degree polynomial features from numerical features

In [12]:
# Expand every float64 column into its degree-2 polynomial terms (squares and pairwise products).
numerical_columns = df.select_dtypes(include=['float64']).columns.tolist()
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(df[numerical_columns])

# Name the generated columns and reuse df's index so the concat below aligns row-for-row
# even if df no longer carries a default RangeIndex.
poly_features_df = pd.DataFrame(
    poly_features,
    columns=poly.get_feature_names_out(numerical_columns),
    index=df.index,
)

# PolynomialFeatures also returns the degree-1 terms, i.e. copies of the input columns under
# their original names. Drop them: concatenating them would give df duplicate column labels,
# so df['BMI'] would return a two-column frame and the saved CSV would have repeated headers.
poly_features_df = poly_features_df.drop(columns=numerical_columns)

# Append only the genuinely new polynomial columns to the original DataFrame
df = pd.concat([df, poly_features_df], axis=1)
df.sample(5)

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,WeightInKilograms^2,WeightInKilograms BMI,WeightInKilograms BMI_Weight_Interaction,WeightInKilograms BMI_Height_Interaction,BMI^2,BMI BMI_Weight_Interaction,BMI BMI_Height_Interaction,BMI_Weight_Interaction^2,BMI_Weight_Interaction BMI_Height_Interaction,BMI_Height_Interaction^2
138687,Louisiana,0,1,30.0,0.0,3,0,8.0,1,1,...,6666.7225,2776.9165,226735.232225,4304.220575,1156.6801,94442.930165,1792.854155,7711265.0,146386.541756,2778.92394
241078,New Hampshire,0,2,0.0,0.0,3,1,8.0,0,0,...,3584.4169,1445.2618,86527.823966,2269.061026,582.7396,34888.619852,914.901172,2088782.0,54775.133168,1436.39484
418643,Wisconsin,0,2,0.0,0.0,3,0,9.0,0,0,...,5945.9521,2115.8984,163156.925624,3554.709312,752.9536,58060.252096,1264.962048,4477026.0,97541.223521,2125.136241
420656,Wisconsin,0,2,2.0,0.0,3,1,8.0,0,0,...,7426.9924,2418.2108,208401.406744,4231.8689,787.3636,67854.995048,1377.8863,5847743.0,118746.241334,2411.301025
118653,Iowa,0,3,0.0,20.0,3,0,9.0,1,0,...,3214.89,1144.206,64876.4802,1922.26608,407.2324,23090.07708,684.150432,1309207.0,38791.329494,1149.372726


#### TODO: Use domain knowledge and external data sources — e.g. derive new features from spatial data such as location by calculating distances, angles, or areas.

### Feature scaling

In [13]:
# Standardize the columns listed in numerical_columns with StandardScaler
# NOTE(review): numerical_columns was collected before the polynomial columns were appended,
# so those generated columns are not scaled here — confirm that this is intended.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[numerical_columns])
df[numerical_columns] = scaled_values

df.sample(5)

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,WeightInKilograms^2,WeightInKilograms BMI,WeightInKilograms BMI_Weight_Interaction,WeightInKilograms BMI_Height_Interaction,BMI^2,BMI BMI_Weight_Interaction,BMI BMI_Height_Interaction,BMI_Weight_Interaction^2,BMI_Weight_Interaction BMI_Height_Interaction,BMI_Height_Interaction^2
381411,Virginia,0,3,-0.49153,-0.515488,3,1,6.034607,1,0,...,9958.0441,3337.9755,333096.575145,5774.697615,1118.9025,111655.280475,1935.701325,11142080.0,193163.635222,3348.763292
308768,Oregon,1,3,-0.258523,-0.274857,3,1,0.656772,0,1,...,6965.5716,2267.6082,189254.580372,3968.31435,738.2089,61610.914794,1291.865575,5142047.0,107819.100889,2260.764756
69142,Florida,1,2,-0.49153,-0.515488,0,1,-0.015457,2,0,...,6666.7225,1993.0765,162734.696225,3647.329995,595.8481,48650.997365,1090.402023,3972354.0,89031.325178,1995.435702
10027,Alaska,0,0,3.003572,-0.515488,3,1,0.656772,0,0,...,6518.9476,2215.5056,178879.922144,3766.35952,752.9536,60793.473664,1280.02112,4908465.0,103348.905229,2176.035904
110899,Indiana,1,3,-0.49153,-0.515488,0,1,0.656772,0,0,...,6301.1844,1736.0406,137806.902828,3315.837546,478.2969,37967.207922,913.547079,3013837.0,72517.367131,1744.874921


### Output new final data

In [16]:
# Write the engineered frame to CSV without the index column.
# NOTE(review): unlike the input path, this path has no leading '../', so it presumably relies on
# the os.chdir('..') executed in the encoding cell — confirm the working directory before running.
output_path = 'data/cleaned/heart_2022_cleaned_02.csv'
df.to_csv(output_path, index=False)