# Feature Engineering

This notebook applies feature engineering steps using our modular code:

- Encoding categorical variables
- Feature scaling (standardization with `StandardScaler`)
- Creating new features using our modular feature engineering code

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import sys
import os
import warnings
# Silence every warning category so non-critical library notices do not clutter cell output
warnings.filterwarnings(action='ignore')

# Put the parent of the current working directory on the import path
# (only if it is not there already) so project packages can be imported.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

# Import our custom modules
from src.feature_engineering.encoding import encode_features
from src.feature_engineering.feature_engineering import engineer_features
from src.config import FEATURE_CONFIG

### Load the cleaned data from notebook 01

In [2]:
# Load the cleaned dataset (comma-separated, UTF-8 encoded CSV)
path = '../data/cleaned/heart_2022_cleaned_01.csv'
df = pd.read_csv(path, sep=',', encoding='utf-8')

# Preview five random rows. The fixed seed keeps the displayed sample
# identical across "Restart & Run All" executions.
df.sample(5, random_state=42)

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos,BMI_Category,SleepHours_Category
184186,MI,Male,Very good,0.0,2.0,Within past 5 years (2 years but less than 5 y...,Yes,7.0,1 to 5,0,...,34.18,No,Yes,Yes,No,"Yes, received tetanus shot but not sure what type",No,No,Obese,Normal Sleep
119914,IA,Female,Good,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,7.0,1 to 5,0,...,27.44,No,Yes,Yes,No,"Yes, received tetanus shot but not sure what type",No,Yes,Overweight,Normal Sleep
356784,TX,Female,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,6.0,None of them,0,...,33.28,Yes,No,Yes,No,"No, did not receive any tetanus shot in the pa...",No,No,Obese,Short Sleep
277698,NC,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,6.0,1 to 5,0,...,27.99,Yes,Yes,Yes,No,"No, did not receive any tetanus shot in the pa...",No,No,Overweight,Short Sleep
308782,OR,Male,Poor,5.0,29.0,Within past year (anytime less than 12 months ...,No,5.0,None of them,0,...,23.17,Yes,Yes,No,No,"Yes, received tetanus shot, but not Tdap",No,No,Normal weight,Short Sleep


### 1. Encoding Categorical Variables

In [3]:
# Run the project's encoder over the frame, then summarise whichever
# columns are still object-typed afterwards (one row per column).
df = encode_features(df)
print("Remaining object (categorical) columns:")
remaining_object_summary = df.describe(include='object')
remaining_object_summary.transpose()

Remaining object (categorical) columns:


Unnamed: 0,count,unique,top,freq
State,442067,54,WA,25997
RaceEthnicityCategory,442067,5,"White only, Non-Hispanic",332480


### 2. Feature Engineering

In [4]:
# Print the columns listed under FEATURE_CONFIG['interaction_features']
print("Creating interaction and polynomial features for:", FEATURE_CONFIG['interaction_features'])

# Build the engineered columns with the project's feature-engineering module.
# NOTE(review): in the recorded output every `*_interaction` column carries the same
# values as the matching `*_poly` cross term (e.g. BMI_WeightInKilograms_interaction vs
# BMI_WeightInKilograms_poly) — this looks redundant; confirm inside engineer_features.
df = engineer_features(df)

# Preview five random rows. The fixed seed keeps the displayed sample
# identical across "Restart & Run All" executions.
df.sample(5, random_state=42)

Creating interaction and polynomial features for: ['BMI', 'WeightInKilograms', 'HeightInMeters']


Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,SleepHours_Category,BMI_WeightInKilograms_interaction,BMI_HeightInMeters_interaction,WeightInKilograms_HeightInMeters_interaction,BMI^2_poly,BMI_WeightInKilograms_poly,BMI_HeightInMeters_poly,WeightInKilograms^2_poly,WeightInKilograms_HeightInMeters_poly,HeightInMeters^2_poly
81293,GA,0,2,0.0,0.0,3,1,6.0,0,0,...,1,1786.0893,42.1496,97.9032,768.9529,1786.0893,42.1496,4148.6481,97.9032,2.3104
127346,KS,0,3,2.0,0.0,3,1,9.0,0,0,...,2,2613.66,48.02,166.6875,752.9536,2613.66,48.02,9072.5625,166.6875,3.0625
267825,NY,1,1,5.0,20.0,3,0,3.0,0,0,...,3,9934.1099,99.6964,352.1804,2812.1809,9934.1099,99.6964,35092.5289,352.1804,3.5344
200012,MN,1,1,30.0,5.0,2,0,5.0,2,0,...,1,3008.492,54.913,173.5856,951.7225,3008.492,54.913,9510.1504,173.5856,3.1684
10420,AZ,1,1,0.0,20.0,3,0,7.0,1,0,...,0,7087.8215,84.283,266.4482,2242.0225,7087.8215,84.283,22407.0961,266.4482,3.1684


### Interaction Exploration and Custom Interaction Features (Rod's work)

In [None]:
import matplotlib.pyplot as plt  # `plt` is used throughout this cell but was never imported
import seaborn as sns


# Target variable and the candidate feature frame (every column except the target)
target = 'HadHeartAttack'
features = df.drop(columns=target)

# Categorical variable used as the hue (interaction baseline) in every plot
interaction_base = 'AgeCategory'

# Continuous variables are listed by hand; the remaining columns are treated as categorical.
# float64 columns are excluded from the categorical list: the scaling step of this notebook
# already treats float64 columns as numerical, and a grouped bar chart over a float column
# (e.g. the engineered *_interaction / *_poly columns) would draw one bar per distinct value.
# NOTE(review): if a genuinely categorical column is stored as float64 it is skipped here — confirm.
continuous_vars = ['PhysicalHealthDays', 'MentalHealthDays', 'SleepHours', 'HeightInMeters', 'WeightInKilograms', 'BMI']
non_float_columns = features.select_dtypes(exclude=['float64']).columns
categorical_vars = [f for f in non_float_columns if f not in continuous_vars and f != interaction_base]

# 1. Continuous variables: scatter plot against the target, coloured by AgeCategory
for feature in continuous_vars:
    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=df, x=feature, y=target, hue=interaction_base, s=100)
    plt.title(f'Relationship between {feature} and Heart Attack Risk by {interaction_base}', fontsize=14)
    plt.xlabel(feature, fontsize=12)
    plt.ylabel('Had Heart Attack (0 = No, 1 = Yes)', fontsize=12)
    plt.legend(title=interaction_base)
    plt.tight_layout()
    plt.show()

# 2. Categorical variables: grouped bar chart of the target mean, split by AgeCategory
for feature in categorical_vars:
    plt.figure(figsize=(10, 6))
    # Mean of the target for every (feature value, AgeCategory) combination
    grouped = df.groupby([feature, interaction_base])[target].mean().reset_index()
    sns.barplot(data=grouped, x=feature, y=target, hue=interaction_base)
    plt.title(f'Heart Attack Risk by {feature} and {interaction_base}', fontsize=14)
    plt.xlabel(feature, fontsize=12)
    plt.ylabel('Proportion of Heart Attack (0-1)', fontsize=12)
    plt.xticks(rotation=45, ha='right')
    plt.legend(title=interaction_base)
    plt.tight_layout()
    plt.show()

In [None]:
# These interaction terms are motivated by related papers.
# NOTE(review): the original comment names no sources — add the citations here.

# Numeric interactions: new column name -> (left operand, right operand).
# Each new column is the element-wise product of the two operands; the dict's
# insertion order is the order in which the columns are appended to df.
numeric_interactions = {
    "Sex_Age": ("Sex", "AgeCategory"),
    "Weight_MentalHealthDays": ("WeightInKilograms", "MentalHealthDays"),
    "Sex_Diabetes": ("Sex", "HadDiabetes"),
    "AgeCategory_Diabetes": ("AgeCategory", "HadDiabetes"),
    "BMI_Age": ("BMI", "AgeCategory"),
    "BMI_SleepHours": ("BMI", "SleepHours"),
    "SmokerStatus_Sex": ("SmokerStatus", "Sex"),
    "SmokerStatus_Age": ("SmokerStatus", "AgeCategory"),
    # This assignment appeared twice in the original cell; once is enough.
    "PhysicalActivities_AgeCategory": ("PhysicalActivities", "AgeCategory"),
    "AgeCategory_HadStroke": ("AgeCategory", "HadStroke"),
}
for new_column, (left, right) in numeric_interactions.items():
    df[new_column] = df[left] * df[right]

# Age x ethnicity: the encoding step's recorded output lists RaceEthnicityCategory as a
# remaining object (string) column, so a plain `*` would not yield a numeric interaction.
# Keep the product when the column is numeric; otherwise build a crossed category label
# ("<age>_<race>"), which still needs encoding before it is fed to a model.
race = df["RaceEthnicityCategory"]
if pd.api.types.is_numeric_dtype(race):
    df["Age_Ethnicity"] = df["AgeCategory"] * race
else:
    df["Age_Ethnicity"] = df["AgeCategory"].astype(str) + "_" + race.astype(str)


### 3. Feature Scaling

In [5]:
# Collect the names of all float64 columns as a plain Python list
float_frame = df.select_dtypes(include=['float64'])
numerical_columns = list(float_frame.columns)

In [6]:
# Standardize the selected numerical columns and write the result back into df.
# NOTE(review): the scaler is fitted on every row of df. If a train/test split happens
# downstream, the held-out rows have already influenced the scaling statistics —
# confirm this is acceptable or fit the scaler on the training split only.
scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

# Preview five random rows. The fixed seed keeps the displayed sample
# identical across "Restart & Run All" executions.
df.sample(5, random_state=42)

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,SleepHours_Category,BMI_WeightInKilograms_interaction,BMI_HeightInMeters_interaction,WeightInKilograms_HeightInMeters_interaction,BMI^2_poly,BMI_WeightInKilograms_poly,BMI_HeightInMeters_poly,WeightInKilograms^2_poly,WeightInKilograms_HeightInMeters_poly,HeightInMeters^2_poly
95093,ID,1,2,3.003572,-0.515488,3,1,0.656772,1,0,...,0,1.528528,1.611651,2.581372,0.753741,1.528528,1.611651,2.236352,2.581372,2.304482
245432,NJ,0,3,-0.49153,-0.515488,3,1,-0.687687,1,0,...,1,-0.104917,-0.002327,-0.652066,0.302309,-0.104917,-0.002327,-0.424933,-0.652066,-1.433055
191920,MN,1,2,-0.49153,-0.515488,3,1,0.656772,0,0,...,0,-0.43083,-0.409767,-0.177782,-0.513906,-0.43083,-0.409767,-0.337763,-0.177782,0.432322
295782,OH,1,2,-0.49153,1.890823,1,1,-0.015457,0,0,...,0,-0.073458,0.041618,-0.121954,-0.003703,-0.073458,0.041618,-0.155609,-0.121954,-0.246279
49848,CT,0,3,-0.14202,-0.395172,3,1,0.656772,0,0,...,0,-1.097478,-1.36703,-1.335204,-1.004447,-1.097478,-1.36703,-1.091714,-1.335204,-0.714037


### 4. Save the Engineered Dataset

In [7]:
# Persist the engineered frame as CSV (row index omitted), then confirm the destination
output_path = '../data/cleaned/heart_2022_cleaned_02.csv'
df.to_csv(path_or_buf=output_path, index=False)
print(f"Saved engineered dataset to {output_path}")

Saved engineered dataset to ../data/cleaned/heart_2022_cleaned_02.csv
