# Feature Engineering

This notebook applies feature engineering steps using our modular code:

- Encoding categorical variables
- Feature scaling (standardization, normalization)
- Creating new features using our modular feature engineering code

In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import sys
import os
import warnings
warnings.filterwarnings('ignore')  # To avoid non-critical warnings
# Add the project root to the Python path
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

# Import our custom modules
from src.feature_engineering.feature_engineering import engineer_features
from src.config import FEATURE_CONFIG

### Load the cleaned data from notebook 01

In [3]:
path = '../data/processed/heart_2022_processed_01.csv'
df = pd.read_csv(path, sep=',', encoding='utf-8')
df.sample(5)

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos,BMI_Category,SleepHours_Category
187918,MN,1,1,0.0,30.0,3,0,5.0,0,0,...,17.79,1,1,0,0,0,1,0,0,1
97800,IN,0,2,2.0,3.0,3,0,4.0,0,0,...,34.61,0,0,0,0,2,0,0,3,3
301245,RI,0,0,20.0,20.0,3,1,5.0,2,0,...,29.05,0,1,1,0,1,0,2,2,1
333004,TX,0,1,10.0,20.0,3,0,6.0,1,0,...,28.34,0,0,0,0,0,0,2,2,1
295562,PA,0,2,1.0,1.0,2,1,6.0,0,0,...,24.33,0,0,1,1,0,0,0,1,1


### 1. Feature Engineering

In [4]:
# Apply feature engineering
print("Creating interaction and polynomial features for:", FEATURE_CONFIG['interaction_features'])
df = engineer_features(df)
df.sample(5)

Creating interaction and polynomial features for: ['BMI', 'WeightInKilograms', 'HeightInMeters', 'PhysicalHealthDays', 'MentalHealthDays', 'SleepHours']


Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters^2_poly,HeightInMeters_PhysicalHealthDays_poly,HeightInMeters_MentalHealthDays_poly,HeightInMeters_SleepHours_poly,PhysicalHealthDays^2_poly,PhysicalHealthDays_MentalHealthDays_poly,PhysicalHealthDays_SleepHours_poly,MentalHealthDays^2_poly,MentalHealthDays_SleepHours_poly,SleepHours^2_poly
29135,CA,0,0,20.0,20.0,3,1,7.0,3,0,...,2.7225,33.0,33.0,11.55,400.0,400.0,140.0,400.0,140.0,49.0
160008,MA,0,3,0.0,0.0,0,1,8.0,0,0,...,2.6569,0.0,0.0,13.04,0.0,0.0,0.0,0.0,0.0,64.0
156191,MD,0,2,0.0,0.0,3,0,9.0,0,0,...,2.4025,0.0,0.0,13.95,0.0,0.0,0.0,0.0,0.0,81.0
374829,WA,1,3,0.0,5.0,3,1,7.0,1,0,...,3.0625,0.0,8.75,12.25,0.0,0.0,0.0,25.0,35.0,49.0
134789,ME,1,3,0.0,2.0,3,0,7.0,1,0,...,2.8224,0.0,3.36,11.76,0.0,0.0,0.0,4.0,14.0,49.0


### visualizing interaction terms

In [5]:
# import seaborn as sns
# import matplotlib.pyplot as plt


# # 定義特徵列表和目標變數
# features = df.drop(columns="HadHeartAttack")
# target = 'HadHeartAttack'

# # 選擇一個主要分類變數作為交互基準
# interaction_base = 'BMI_Category'

# # 定義連續變數和分類變數
# continuous_vars = ['PhysicalHealthDays', 'MentalHealthDays', 'HeightInMeters', 'WeightInKilograms']
# binning_vars = ["SleepHours_Category"]

# for feature in continuous_vars:
#     # 將連續變數分組為 4 個區間
#     df[f'{feature}_bin'] = pd.cut(df[feature], bins=4, labels=['Q1', 'Q2', 'Q3', 'Q4'])
    
#     # 計算每個區間和 BMI_Category 的心臟病風險比例
#     grouped = df.groupby([f'{feature}_bin', interaction_base])[target].mean().reset_index()
    
#     # 繪製長條圖
#     plt.figure(figsize=(10, 6))
#     sns.barplot(data=grouped, x=f'{feature}_bin', y=target, hue=interaction_base, errorbar='sd', palette='Set2')
#     plt.title(f'Heart Attack Risk by {feature} (Binned) and {interaction_base}', fontsize=14)
#     plt.xlabel(f'{feature} (Binned)', fontsize=12)
#     plt.ylabel('Heart Attack Risk (Proportion)', fontsize=12)
#     plt.legend(title=interaction_base)
#     plt.tight_layout()
#     plt.show()
    
#     df = df.drop(columns=f'{feature}_bin')

# for feature in binning_vars: 
#     # 計算每個類別和 BMI_Category 的心臟病風險比例
#     grouped = df.groupby([feature, interaction_base])[target].mean().reset_index()

#     # 繪製長條圖
#     plt.figure(figsize=(10, 6))
#     sns.barplot(data=grouped, x=feature, y=target, hue=interaction_base, errorbar='sd', palette='Set2')
#     plt.title(f'Heart Attack Risk by {feature} and {interaction_base}', fontsize=14)
#     plt.xlabel(feature, fontsize=12)
#     plt.ylabel('Heart Attack Risk (Proportion)', fontsize=12)
#     plt.xticks(rotation=45, ha='right')
#     plt.legend(title=interaction_base)
#     plt.tight_layout()
#     plt.show()

### 2. Feature Scaling

In [None]:
# Define binary columns
binary_cols = [
    'Sex', 'PhysicalActivities', 'HadAngina', 'HadStroke', 'HadAsthma', 'HadSkinCancer',
    'HadCOPD', 'HadDepressiveDisorder', 'HadKidneyDisease', 'HadArthritis',
    'DeafOrHardOfHearing', 'BlindOrVisionDifficulty', 'DifficultyConcentrating',
    'DifficultyWalking', 'DifficultyDressingBathing', 'DifficultyErrands',
    'AlcoholDrinkers', 'HIVTesting', 'FluVaxLast12', 'PneumoVaxEver',
    'ChestScan', 'HighRiskLastYear'
]

# Select numeric columns and exclude binary ones
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
columns_to_scale = [col for col in numerical_columns if col not in binary_cols]

# Scale only non-binary numeric features
scaler = StandardScaler()
df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])

df.sample(5)

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters^2_poly,HeightInMeters_PhysicalHealthDays_poly,HeightInMeters_MentalHealthDays_poly,HeightInMeters_SleepHours_poly,PhysicalHealthDays^2_poly,PhysicalHealthDays_MentalHealthDays_poly,PhysicalHealthDays_SleepHours_poly,MentalHealthDays^2_poly,MentalHealthDays_SleepHours_poly,SleepHours^2_poly
350784,VT,1,-0.428104,-0.253499,-0.272239,0.434949,1,0.66384,-0.775133,-0.242639,...,2.034335,-0.22266,-0.240513,1.282244,-0.352995,-0.236947,-0.204283,-0.359848,-0.215425,0.541774
400904,WI,1,0.533014,-0.489368,0.336565,-2.165654,1,-0.700087,-0.775133,-0.242639,...,1.413254,-0.4883,0.414847,-0.331621,-0.369584,-0.264944,-0.46604,-0.160897,0.242078,-0.674034
240338,NM,1,1.494132,-0.489368,-0.515761,0.434949,1,0.66384,0.370562,-0.242639,...,0.230557,-0.4883,-0.514716,0.726272,-0.369584,-0.264944,-0.46604,-0.377532,-0.496965,0.541774
68199,FL,0,1.494132,-0.253499,-0.150478,0.434949,1,0.66384,-0.775133,-0.242639,...,-1.223037,-0.269947,-0.176628,0.232074,-0.352995,-0.222948,-0.204283,-0.337742,-0.074655,0.541774
41580,CO,0,1.494132,-0.489368,-0.150478,0.434949,1,-0.700087,-0.775133,-0.242639,...,-0.694458,-0.4883,-0.163707,-0.841263,-0.369584,-0.264944,-0.46604,-0.337742,-0.180233,-0.674034


### 3. Save the Engineered Dataset

In [7]:
output_path = '../data/processed/heart_2022_processed_02.csv'
df.to_csv(output_path, index=False)
print(f"Saved engineered dataset to {output_path}")

Saved engineered dataset to ../data/processed/heart_2022_processed_02.csv
