# This notebook will contain feature engineering steps:

- Encoding categorical variables

- Feature scaling (e.g., standardization, normalization)

- Feature selection (optional)

- Creating new features (if applicable)

In [50]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

### Input 01-notebook file

In [51]:
path = '../data/cleaned/heart_2022_cleaned_01.csv'

In [52]:
# read data
df = pd.read_csv(path ,sep=',',encoding='utf-8')
df.sample(5)

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos,HadHeartAttack_bin,HeartAttackBinary
35759,California,Female,Good,0.0,2.0,Within past year (anytime less than 12 months ...,Yes,6.0,None of them,No,...,39.16,No,No,No,No,"Yes, received Tdap",No,No,0,0
31170,California,Male,Fair,2.0,0.0,Within past year (anytime less than 12 months ...,Yes,8.0,All,No,...,26.5,No,No,No,No,"No, did not receive any tetanus shot in the pa...",No,No,0,0
10555,Arizona,Male,Good,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,8.0,None of them,No,...,20.89,Yes,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No,0,0
38217,Colorado,Male,Good,0.0,4.0,Within past 5 years (2 years but less than 5 y...,Yes,8.0,None of them,No,...,23.4,Yes,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No,0,0
57445,Delaware,Female,Good,3.0,30.0,Within past year (anytime less than 12 months ...,Yes,6.0,1 to 5,No,...,36.39,No,No,Yes,No,"No, did not receive any tetanus shot in the pa...",No,No,0,0


In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59903 entries, 0 to 59902
Data columns (total 42 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   State                      59903 non-null  object 
 1   Sex                        59903 non-null  object 
 2   GeneralHealth              59903 non-null  object 
 3   PhysicalHealthDays         59903 non-null  float64
 4   MentalHealthDays           59903 non-null  float64
 5   LastCheckupTime            59903 non-null  object 
 6   PhysicalActivities         59903 non-null  object 
 7   SleepHours                 59903 non-null  float64
 8   RemovedTeeth               59903 non-null  object 
 9   HadHeartAttack             59903 non-null  object 
 10  HadAngina                  59903 non-null  object 
 11  HadStroke                  59903 non-null  object 
 12  HadAsthma                  59903 non-null  object 
 13  HadSkinCancer              59903 non-null  obj

### Features testing

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import pointbiserialr, chi2_contingency

def cramers_v(x, y):
    confusion_matrix = pd.crosstab(x, y)
    if confusion_matrix.shape[0] < 2 or confusion_matrix.shape[1] < 2:
        return 0.0
    chi2, _, _, _ = chi2_contingency(confusion_matrix)
    n = confusion_matrix.sum().sum()
    r, k = confusion_matrix.shape
    phi2 = chi2 / n
    return np.sqrt(phi2 / min(k - 1, r - 1)) if min(k - 1, r - 1) > 0 else 0.0

# 預設 target 已經 LabelEncoded 成 0/1
target = "HadHeartAttack"

cat_cols = df.select_dtypes(include=["object", "category"]).columns.drop(target, errors="ignore").tolist()
num_cols = df.select_dtypes(include=["int64", "float64"]).columns.tolist()

df["HadHeartAttack_bin"] = df["HadHeartAttack"].map({"Yes": 1, "No": 0})
# === 類別型欄位：Cramér's V ===
cat_corr = {}
for col in cat_cols:
    valid = df[[col, target]]
    if not valid.empty:
        cat_corr[col] = cramers_v(valid[col], valid[target])

# === 數值型欄位：Point Biserial Correlation ===
num_corr = {}
for col in num_cols:
    valid = df[[col, "HadHeartAttack_bin"]]
    try:
        x = valid[col].astype(float)
        y = valid["HadHeartAttack_bin"].astype(int)
        corr, _ = pointbiserialr(x, y)
        if np.isscalar(corr) and not np.isnan(corr):  # 確保是數值且不是 NaN
            num_corr[col] = abs(corr)
    except Exception as e:
        print(f"{col} 計算錯誤：{e}")

# === 排序與輸出 ===
sorted_cat = sorted(cat_corr.items(), key=lambda x: float(x[1]), reverse=True)
sorted_num = sorted(num_corr.items(), key=lambda x: float(x[1]), reverse=True)

print("\n📊 數值型特徵與目標變數的相關係數（Point Biserial）:")
for col, score in sorted_num:
    print(f"{col}: {score:.4f}")
print("📊 類別型特徵與目標變數的相關係數（Cramér’s V）:")
for col, score in sorted_cat:
    print(f"{col}: {score:.4f}")


0

📊 數值型特徵與目標變數的相關係數（Point Biserial）:
HeartAttackBinary: 1.0000
PhysicalHealthDays: 0.1421
WeightInKilograms: 0.0337
MentalHealthDays: 0.0263
BMI: 0.0260
HeightInMeters: 0.0257
SleepHours: 0.0010
📊 類別型特徵與目標變數的相關係數（Cramér’s V）:
HadAngina: 0.4158
GeneralHealth: 0.1911
HadStroke: 0.1848
AgeCategory: 0.1823
ChestScan: 0.1732
RemovedTeeth: 0.1665
DifficultyWalking: 0.1627
HadDiabetes: 0.1478
HadCOPD: 0.1458
PneumoVaxEver: 0.1323
HadArthritis: 0.1220
HadKidneyDisease: 0.1196
DeafOrHardOfHearing: 0.1011
SmokerStatus: 0.0937
DifficultyErrands: 0.0915
PhysicalActivities: 0.0823
AlcoholDrinkers: 0.0800
DifficultyDressingBathing: 0.0769
BlindOrVisionDifficulty: 0.0734
State: 0.0662
LastCheckupTime: 0.0661
Sex: 0.0649
HadSkinCancer: 0.0563
TetanusLast10Tdap: 0.0559
DifficultyConcentrating: 0.0492
FluVaxLast12: 0.0476
RaceEthnicityCategory: 0.0403
CovidPos: 0.0310
HadDepressiveDisorder: 0.0310
HadAsthma: 0.0287
HIVTesting: 0.0239
ECigaretteUsage: 0.0234
HighRiskLastYear: 0.0187


### Encoding

In [70]:
df.drop(["HadHeartAttack_bin", "HeartAttackBinary"], axis=1, inplace=True)
col_obj = df.select_dtypes('object').columns

le = LabelEncoder()

for col in col_obj:
    df[col] = le.fit_transform(df[col])

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59903 entries, 0 to 59902
Data columns (total 40 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   State                      59903 non-null  int64  
 1   Sex                        59903 non-null  int64  
 2   GeneralHealth              59903 non-null  int64  
 3   PhysicalHealthDays         59903 non-null  float64
 4   MentalHealthDays           59903 non-null  float64
 5   LastCheckupTime            59903 non-null  int64  
 6   PhysicalActivities         59903 non-null  int64  
 7   SleepHours                 59903 non-null  float64
 8   RemovedTeeth               59903 non-null  int64  
 9   HadHeartAttack             59903 non-null  int64  
 10  HadAngina                  59903 non-null  int64  
 11  HadStroke                  59903 non-null  int64  
 12  HadAsthma                  59903 non-null  int64  
 13  HadSkinCancer              59903 non-null  int

In [71]:
df.sample(5)

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
42915,5,1,2,0.0,2.0,2,1,7.0,3,0,...,1.73,70.31,23.57,1,1,0,0,0,0,2
5334,1,1,4,1.0,0.0,3,1,8.0,3,0,...,1.73,77.11,25.85,1,0,1,1,2,0,0
56465,7,0,1,0.0,0.0,3,1,7.0,0,0,...,1.73,77.11,25.85,1,0,1,1,0,0,0
8489,1,1,4,0.0,0.0,3,1,8.0,3,0,...,1.85,92.99,27.05,0,0,1,1,2,0,0
54901,6,1,2,0.0,0.0,3,0,7.0,3,0,...,1.78,124.74,29.47,0,0,1,0,0,0,0


### Construct interaction terms (try)

In [None]:
from sklearn.ensemble import RandomForestClassifier
df["BMI_PhysicalActivities"] = df["BMI"] * df["PhysicalActivities"]
df["PhysicalHealthDays_GeneralHealth"] = df["PhysicalHealthDays"] * df["GeneralHealth"]
df["AgeCategory_HadAngina"] = df["AgeCategory"] * df["HadAngina"]
df["SmokerStatus_HadCOPD"] = df["SmokerStatus"] * df["HadCOPD"]
df["DifficultyWalking_GeneralHealth"] = df["DifficultyWalking"] * df["GeneralHealth"]
df["BMI_DifficultyWalking"] = df["BMI"] * df["DifficultyWalking"]
df["HadStroke_AgeCategory"] = df["HadStroke"] * df["AgeCategory"]
df["HadDiabetes_PhysicalActivities"] = df["HadDiabetes"] * df["PhysicalActivities"]
df["HadKidneyDisease_PneumoVaxEver"] = df["HadKidneyDisease"] * df["PneumoVaxEver"]
df["AlcoholDrinkers_HadAngina"] = df["AlcoholDrinkers"] * df["HadAngina"]
df["Weight_diabetes"] = df["WeightInKilograms"] * df["HadDiabetes"]


# diabetes 
# It is noticed that in age group 41–50 and 51–60, females are more prone than males
# Weight and diabetes

all_terms = list(df.columns.drop("HadHeartAttack"))

X = df[all_terms]
y = df["HadHeartAttack"] 

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)

# 取得 Feature Importance 分數
importance_df = pd.DataFrame({
    "Term": all_terms,
    "Importance": rf.feature_importances_
}).sort_values(by="Importance", ascending=False)

# 顯示結果
display(importance_df)
condition = importance_df["Importance"] > (importance_df["Importance"].sum() / len(importance_df)) 
display(condition)

Unnamed: 0,Term,Importance
41,AgeCategory_HadAngina,0.07923
31,BMI,0.066835
9,HadAngina,0.063806
30,WeightInKilograms,0.060228
29,HeightInMeters,0.050756
39,BMI_PhysicalActivities,0.050701
7,SleepHours,0.039957
28,AgeCategory,0.038576
0,State,0.038489
40,PhysicalHealthDays_GeneralHealth,0.029305


41     True
31     True
9      True
30     True
29     True
39     True
7      True
28     True
0      True
40     True
3      True
44     True
4      True
36     True
8      True
2      True
24    False
45    False
48    False
27    False
26    False
17    False
43    False
1     False
38    False
25    False
34    False
16    False
32    False
35    False
33    False
10    False
14    False
5     False
11    False
18    False
42    False
12    False
6     False
13    False
20    False
46    False
15    False
19    False
23    False
21    False
47    False
22    False
37    False
Name: Importance, dtype: bool

### 利用 GAM 模型去檢查交互作用項的合理性

### Features Selection (ToDo)

### Creating new features (ToDo)

### Output new final data

In [16]:
output_path = '../data/cleaned/heart_2022_cleaned_02.csv'
df.to_csv(output_path, index=False)