# Feature Engineering

This notebook applies feature engineering steps using our modular code:

- Encoding categorical variables
- Feature scaling (standardization, normalization)
- Creating new features using our modular feature engineering code

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import sys
import os
import warnings
warnings.filterwarnings('ignore')  # To avoid non-critical warnings
# Add the project root to the Python path
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

# Import our custom modules
from src.feature_engineering.encoding import encode_features
from src.feature_engineering.feature_engineering import engineer_features
from src.config import FEATURE_CONFIG

### Load the cleaned data from notebook 01

In [2]:
path = '../data/cleaned/heart_2022_cleaned_01.csv'
df = pd.read_csv(path, sep=',', encoding='utf-8')
df.sample(5)

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos,BMI_Category,SleepHours_Category
236108,Nevada,Female,Very good,30.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,All,0,...,28.29,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,No,Overweight,Normal Sleep
291709,Ohio,Female,Excellent,0.0,30.0,Within past year (anytime less than 12 months ...,Yes,8.0,None of them,0,...,20.41,Yes,Yes,Yes,Yes,"Yes, received Tdap",No,No,Normal weight,Normal Sleep
422401,Wisconsin,Female,Poor,30.0,0.0,Within past year (anytime less than 12 months ...,No,5.0,All,1,...,28.25,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No,Overweight,Short Sleep
389016,Virginia,Male,Good,3.0,10.0,Within past year (anytime less than 12 months ...,Yes,6.0,None of them,0,...,32.42,Yes,No,No,No,"Yes, received Tdap",No,Tested positive using home test without a heal...,Obese,Short Sleep
307098,Oregon,Male,Good,30.0,0.0,Within past year (anytime less than 12 months ...,Yes,7.0,None of them,0,...,23.75,Yes,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No,Normal weight,Normal Sleep


### 1. Encoding Categorical Variables

In [3]:
# Apply encoding
df = encode_features(df)
print("Remaining object (categorical) columns:")
df.describe(include='object').T

Remaining object (categorical) columns:


Unnamed: 0,count,unique,top,freq
State,445132,54,Washington,26152
RaceEthnicityCategory,445132,5,"White only, Non-Hispanic",334469


### 2. Feature Engineering

In [4]:
# Apply feature engineering
print("Creating interaction and polynomial features for:", FEATURE_CONFIG['interaction_features'])
df = engineer_features(df)
df.sample(5)

Creating interaction and polynomial features for: ['BMI', 'WeightInKilograms', 'HeightInMeters']


Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,SleepHours_Category,BMI_WeightInKilograms_interaction,BMI_HeightInMeters_interaction,WeightInKilograms_HeightInMeters_interaction,BMI^2_poly,BMI_WeightInKilograms_poly,BMI_HeightInMeters_poly,WeightInKilograms^2_poly,WeightInKilograms_HeightInMeters_poly,HeightInMeters^2_poly
249755,New Jersey,1,4,1.0,0.0,3,1,8.0,0,0,...,0,2486.76,49.98,191.1392,650.25,2486.76,49.98,9510.1504,191.1392,3.8416
3566,Alabama,0,3,0.0,0.0,3,1,8.0,0,0,...,0,2059.3799,45.3995,108.9805,857.9041,2059.3799,45.3995,4943.4961,108.9805,2.4025
22127,Arkansas,1,3,1.0,0.0,3,0,5.0,0,0,...,1,3353.9766,57.9228,204.6568,949.2561,3353.9766,57.9228,11850.4996,204.6568,3.5344
35930,California,1,4,0.0,0.0,2,0,5.0,0,0,...,1,1657.2067,40.7761,121.6363,555.5449,1657.2067,40.7761,4943.4961,121.6363,2.9929
41802,Colorado,0,2,0.0,2.0,3,1,9.0,1,1,...,2,1295.2851,36.0696,101.3544,460.9609,1295.2851,36.0696,3639.7089,101.3544,2.8224


### Rod can paste your work here

### visualizing interaction terms

In [5]:
import seaborn as sns
import matplotlib.pyplot as plt


# 定義特徵列表和目標變數
features = df.drop(columns="HadHeartAttack")
target = 'HadHeartAttack'

# 選擇一個主要分類變數作為交互基準
interaction_base = 'AgeCategory'

# 定義連續變數和分類變數
continuous_vars = ['PhysicalHealthDays', 'MentalHealthDays', 'SleepHours', 'HeightInMeters', 'WeightInKilograms', 'BMI']
categorical_vars = [f for f in features if f not in continuous_vars and f != interaction_base]

# 1. 連續變數：散點圖觀察交互作用（與AgeCategory）
for feature in continuous_vars:
    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=df, x=feature, y=target, hue=interaction_base, s=100)
    plt.title(f'Relationship between {feature} and Heart Attack Risk by {interaction_base}', fontsize=14)
    plt.xlabel(feature, fontsize=12)
    plt.ylabel('Had Heart Attack (0 = No, 1 = Yes)', fontsize=12)
    plt.legend(title=interaction_base)
    plt.tight_layout()
    plt.show()

# 2. 分類變數：分組條形圖觀察交互作用（與 AgeCategory）
for feature in categorical_vars:
    plt.figure(figsize=(10, 6))
    # 計算每個組合下的心臟病比例
    grouped = df.groupby([feature, interaction_base])[target].mean().reset_index()
    sns.barplot(data=grouped, x=feature, y=target, hue=interaction_base)
    plt.title(f'Heart Attack Risk by {feature} and {interaction_base}', fontsize=14)
    plt.xlabel(feature, fontsize=12)
    plt.ylabel('Proportion of Heart Attack (0-1)', fontsize=12)
    plt.xticks(rotation=45, ha='right')
    plt.legend(title=interaction_base)
    plt.tight_layout()
    plt.show()

KeyboardInterrupt: 

In [6]:
df["Sex_Age"] = df["Sex"] * df["AgeCategory"]
df["Weight_MentalHealthDays"] = df["WeightInKilograms"] * df["MentalHealthDays"]
df["Sex_Diabetes"] = df["Sex"] * df["HadDiabetes"]
df["AgeCategory_Diabetes"] = df["AgeCategory"] * df["HadDiabetes"]
df["BMI_Age"] = df["BMI_Category"] * df["AgeCategory"] 
df["BMI_SleepHours"] = df["BMI_Category"] * df["SleepHours_Category"] 
df["SmokerStatus_Sex"] = df["SmokerStatus"] * df["Sex"]
df["SmokerStatus_Age"] = df["SmokerStatus"] * df["AgeCategory"]
df["PhysicalActivities_AgeCategory"] = df["PhysicalActivities"] * df["AgeCategory"]
df["PhysicalActivities_AgeCategory"] = df["PhysicalActivities"] * df["AgeCategory"]
df["AgeCategory_HadStroke"] = df["AgeCategory"] * df["HadStroke"]
df["Age_Ethnicity"] = df["AgeCategory"] * df["RaceEthnicityCategory"]

### Construct interaction terms

In [7]:
cat_vars = [
    'Sex',
    'AgeCategory',
    'HadDiabetes',
    'SmokerStatus',
    'PhysicalActivities',
    'HadStroke',
    'RaceEthnicityCategory',
    'SleepHours_Category',
    'BMI_Category'  
]

formula = """
HadHeartAttack ~ 
C(Sex) * C(AgeCategory) + 
WeightInKilograms * MentalHealthDays + 
C(Sex) * C(HadDiabetes) + 
C(AgeCategory) * C(HadDiabetes) + 
C(BMI_Category) * C(AgeCategory) + 
C(BMI_Category) * C(SleepHours_Category) + 
C(SmokerStatus) * C(Sex) + 
C(SmokerStatus) * C(AgeCategory) + 
C(PhysicalActivities) * C(AgeCategory) + 
C(AgeCategory) * C(HadStroke) + 
C(AgeCategory) * C(RaceEthnicityCategory)
"""


from patsy import dmatrices

# 生成設計矩陣（包含自動的 one-hot + 交互作用項）
y, X = dmatrices(formula, data=df, return_type='dataframe')

# 合併目標變數與特徵
df = pd.concat([y, X], axis=1)

# # 輸出成 CSV（可選）
# df.to_csv('../data/cleaned/heart_2022_cleaned_02.csv', index=False)


### 3. Feature Scaling

In [8]:
# Get numerical columns
numerical_columns = df.select_dtypes(include=['float64']).columns.tolist()

In [9]:
# Apply StandardScaler to the numerical columns
scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

df.sample(5)

Unnamed: 0,HadHeartAttack,Intercept,C(Sex)[T.1],C(AgeCategory)[T.1],C(AgeCategory)[T.2],C(AgeCategory)[T.3],C(AgeCategory)[T.4],C(AgeCategory)[T.5],C(AgeCategory)[T.6],C(AgeCategory)[T.7],...,"C(AgeCategory)[T.6]:C(RaceEthnicityCategory)[T.White only, Non-Hispanic]","C(AgeCategory)[T.7]:C(RaceEthnicityCategory)[T.White only, Non-Hispanic]","C(AgeCategory)[T.8]:C(RaceEthnicityCategory)[T.White only, Non-Hispanic]","C(AgeCategory)[T.9]:C(RaceEthnicityCategory)[T.White only, Non-Hispanic]","C(AgeCategory)[T.10]:C(RaceEthnicityCategory)[T.White only, Non-Hispanic]","C(AgeCategory)[T.11]:C(RaceEthnicityCategory)[T.White only, Non-Hispanic]","C(AgeCategory)[T.12]:C(RaceEthnicityCategory)[T.White only, Non-Hispanic]",WeightInKilograms,MentalHealthDays,WeightInKilograms:MentalHealthDays
287002,-0.245134,0.0,-0.941811,-0.227966,-0.248081,-0.261672,-0.268545,-0.261697,-0.28594,-0.300298,...,-0.241279,-0.259519,-0.293971,-0.339246,-0.299769,-0.259564,-0.27825,-0.282491,-0.519475,-0.491521
366555,-0.245134,0.0,-0.941811,-0.227966,-0.248081,-0.261672,-0.268545,-0.261697,-0.28594,-0.300298,...,-0.241279,-0.259519,-0.293971,2.947711,-0.299769,-0.259564,-0.27825,-0.171665,-0.519475,-0.491521
283058,-0.245134,0.0,1.061784,-0.227966,-0.248081,-0.261672,-0.268545,-0.261697,-0.28594,-0.300298,...,-0.241279,-0.259519,-0.293971,-0.339246,3.335907,-0.259564,-0.27825,1.046441,-0.519475,-0.491521
12441,-0.245134,0.0,-0.941811,-0.227966,-0.248081,-0.261672,-0.268545,-0.261697,-0.28594,-0.300298,...,-0.241279,-0.259519,3.401692,-0.339246,-0.299769,-0.259564,-0.27825,-0.171665,1.882729,1.640786
156647,-0.245134,0.0,-0.941811,-0.227966,-0.248081,-0.261672,-0.268545,-0.261697,-0.28594,3.330027,...,-0.241279,-0.259519,-0.293971,-0.339246,-0.299769,-0.259564,-0.27825,0.093926,-0.519475,-0.491521


### 4. Save the Engineered Dataset

In [10]:
output_path = '../data/cleaned/heart_2022_cleaned_02.csv'
df.to_csv(output_path, index=False)
print(f"Saved engineered dataset to {output_path}")

Saved engineered dataset to ../data/cleaned/heart_2022_cleaned_02.csv
