# This notebook will contain feature engineering steps:

- Encoding categorical variables

- Feature scaling (e.g., standardization, normalization)

- Feature selection (optional)

- Creating new features (if applicable)

In [85]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

### Input 01-notebook file

In [86]:
path = '../data/cleaned/heart_2022_cleaned_01.csv'

In [87]:
# read data
df = pd.read_csv(path ,sep=',',encoding='utf-8')
df.sample(5)

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos,HadHeartAttack_bin,HeartAttackBinary
44691,Colorado,Male,Good,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,6.0,None of them,No,...,19.37,Yes,No,Yes,Yes,"Yes, received tetanus shot but not sure what type",No,No,0,0
36955,Colorado,Female,Good,0.0,5.0,Within past year (anytime less than 12 months ...,Yes,7.0,None of them,No,...,27.46,No,No,Yes,Yes,"Yes, received Tdap",No,No,0,0
11967,Arizona,Male,Very good,0.0,5.0,Within past year (anytime less than 12 months ...,Yes,6.0,None of them,No,...,26.54,No,No,Yes,No,"Yes, received Tdap",No,No,0,0
35387,California,Male,Good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,None of them,No,...,23.57,No,Yes,Yes,No,"Yes, received Tdap",No,Yes,0,0
27032,California,Female,Very good,5.0,5.0,Within past year (anytime less than 12 months ...,Yes,8.0,None of them,No,...,22.6,Yes,No,Yes,Yes,"Yes, received Tdap",No,No,0,0


In [88]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59903 entries, 0 to 59902
Data columns (total 42 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   State                      59903 non-null  object 
 1   Sex                        59903 non-null  object 
 2   GeneralHealth              59903 non-null  object 
 3   PhysicalHealthDays         59903 non-null  float64
 4   MentalHealthDays           59903 non-null  float64
 5   LastCheckupTime            59903 non-null  object 
 6   PhysicalActivities         59903 non-null  object 
 7   SleepHours                 59903 non-null  float64
 8   RemovedTeeth               59903 non-null  object 
 9   HadHeartAttack             59903 non-null  object 
 10  HadAngina                  59903 non-null  object 
 11  HadStroke                  59903 non-null  object 
 12  HadAsthma                  59903 non-null  object 
 13  HadSkinCancer              59903 non-null  obj

### Features testing

In [89]:
from scipy.stats import pointbiserialr
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import mutual_info_classif


# Convert binary target variable to numeric
df["HadHeartAttack_bin"] = df["HadHeartAttack"].map({"Yes": 1, "No": 0})
print(df["HadHeartAttack"])
# Identify categorical and numerical features
categorical_cols = df.select_dtypes(include="object").columns.drop("HadHeartAttack")
numerical_cols = df.select_dtypes(include=["float64", "int64"]).columns.drop("HadHeartAttack_bin")

results = []

# Evaluate numerical features using point-biserial correlation and AUC
for col in numerical_cols:
    try:
        temp = df[[col, "HadHeartAttack_bin"]].dropna()
        if temp[col].nunique() > 1:
            r, p_val = pointbiserialr(temp["HadHeartAttack_bin"], temp[col])
            auc = roc_auc_score(temp["HadHeartAttack_bin"], temp[col])
        else:
            r, p_val, auc = np.nan, np.nan, np.nan
    except:
        r, p_val, auc = np.nan, np.nan, np.nan
    results.append({
        "Feature": col,
        "Type": "Numerical",
        "Test": "Point-Biserial",
        "p-value": p_val,
        "Effect Size": r,
        "AUC": auc
    })

# One-hot encode categorical variables
df_encoded = pd.get_dummies(df[categorical_cols], drop_first=True)
mi_scores = mutual_info_classif(df_encoded, df["HadHeartAttack_bin"], discrete_features=True)

# Store mutual information scores with corresponding original feature names
mi_summary = pd.DataFrame({
    "OneHotFeature": df_encoded.columns,
    "Mutual Information": mi_scores
})
mi_summary["OriginalFeature"] = mi_summary["OneHotFeature"].apply(lambda x: x.split("_")[0])

# Aggregate mutual information scores by original feature
mi_aggregated = mi_summary.groupby("OriginalFeature").agg({
    "Mutual Information": ["max", "mean"]
}).reset_index()
mi_aggregated.columns = ["Feature", "Max Mutual Info", "Avg Mutual Info"]
mi_aggregated = mi_aggregated.sort_values("Max Mutual Info", ascending=False)

# Append mutual information results to the results list
for _, row in mi_aggregated.iterrows():
    results.append({
        "Feature": row["Feature"],
        "Type": "Categorical",
        "Test": "Mutual Information",
        "p-value": np.nan,
        "Effect Size": row["Max Mutual Info"],
        "AUC": np.nan
    })

# Create final dataframe
results_df = pd.DataFrame(results)

# Determine significance
def determine_significance(row):
    if row["Type"] == "Numerical":
        return row["p-value"] < 0.05 if not pd.isna(row["p-value"]) else False
    elif row["Type"] == "Categorical":
        return row["Effect Size"] > 0.01
    else:
        return False

results_df["Significant"] = results_df.apply(determine_significance, axis=1)


display(results_df)


0         No
1         No
2         No
3         No
4         No
        ... 
59898     No
59899     No
59900     No
59901    Yes
59902     No
Name: HadHeartAttack, Length: 59903, dtype: object


Unnamed: 0,Feature,Type,Test,p-value,Effect Size,AUC,Significant
0,PhysicalHealthDays,Numerical,Point-Biserial,1.1916270000000001e-267,0.142086,0.625658,True
1,MentalHealthDays,Numerical,Point-Biserial,1.134651e-10,0.02634,0.498889,True
2,SleepHours,Numerical,Point-Biserial,0.8123487,0.00097,0.498958,False
3,HeightInMeters,Numerical,Point-Biserial,2.923344e-10,0.025748,0.537945,True
4,WeightInKilograms,Numerical,Point-Biserial,1.486017e-16,0.03373,0.552272,True
5,BMI,Numerical,Point-Biserial,1.957167e-10,0.026001,0.539121,True
6,HeartAttackBinary,Numerical,Point-Biserial,0.0,1.0,1.0,True
7,HadAngina,Categorical,Mutual Information,,0.04109,,True
8,ChestScan,Categorical,Mutual Information,,0.015096,,True
9,DifficultyWalking,Categorical,Mutual Information,,0.010206,,True


### Encoding

In [90]:
df.drop(["HadHeartAttack_bin", "HeartAttackBinary"], axis=1, inplace=True)
col_obj = df.select_dtypes('object').columns

le = LabelEncoder()

for col in col_obj:
    df[col] = le.fit_transform(df[col])

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59903 entries, 0 to 59902
Data columns (total 40 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   State                      59903 non-null  int64  
 1   Sex                        59903 non-null  int64  
 2   GeneralHealth              59903 non-null  int64  
 3   PhysicalHealthDays         59903 non-null  float64
 4   MentalHealthDays           59903 non-null  float64
 5   LastCheckupTime            59903 non-null  int64  
 6   PhysicalActivities         59903 non-null  int64  
 7   SleepHours                 59903 non-null  float64
 8   RemovedTeeth               59903 non-null  int64  
 9   HadHeartAttack             59903 non-null  int64  
 10  HadAngina                  59903 non-null  int64  
 11  HadStroke                  59903 non-null  int64  
 12  HadAsthma                  59903 non-null  int64  
 13  HadSkinCancer              59903 non-null  int

In [91]:
df.sample(5)

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
27397,4,0,0,1.0,3.0,1,1,8.0,3,0,...,1.7,77.11,26.63,0,0,0,0,0,0,0
9354,1,0,4,0.0,2.0,3,1,7.0,0,0,...,1.57,57.61,23.23,0,1,1,1,3,0,1
40959,5,0,0,0.0,0.0,3,1,8.0,3,0,...,1.68,60.78,21.63,1,0,0,0,2,0,2
52406,6,0,4,2.0,8.0,3,1,9.0,3,0,...,1.57,52.16,21.03,1,0,1,0,3,0,2
31389,4,1,1,20.0,14.0,2,1,6.0,0,0,...,1.6,72.57,28.34,1,1,0,1,2,0,0


### Construct interaction terms

In [None]:
from sklearn.ensemble import RandomForestClassifier
df["BMI_PhysicalActivities"] = df["BMI"] * df["PhysicalActivities"]
df["PhysicalHealthDays_GeneralHealth"] = df["PhysicalHealthDays"] * df["GeneralHealth"]
df["AgeCategory_HadAngina"] = df["AgeCategory"] * df["HadAngina"]
df["SmokerStatus_HadCOPD"] = df["SmokerStatus"] * df["HadCOPD"]
df["DifficultyWalking_GeneralHealth"] = df["DifficultyWalking"] * df["GeneralHealth"]
df["BMI_DifficultyWalking"] = df["BMI"] * df["DifficultyWalking"]
df["HadStroke_AgeCategory"] = df["HadStroke"] * df["AgeCategory"]
df["HadDiabetes_PhysicalActivities"] = df["HadDiabetes"] * df["PhysicalActivities"]
df["HadKidneyDisease_PneumoVaxEver"] = df["HadKidneyDisease"] * df["PneumoVaxEver"]
df["AlcoholDrinkers_HadAngina"] = df["AlcoholDrinkers"] * df["HadAngina"]


all_terms = list(df.columns.drop("HadHeartAttack"))

X = df[all_terms]
y = df["HadHeartAttack"] 

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)

# 取得 Feature Importance 分數
importance_df = pd.DataFrame({
    "Term": all_terms,
    "Importance": rf.feature_importances_
}).sort_values(by="Importance", ascending=False)

# 顯示結果
display(importance_df)
condition = importance_df["Importance"] > (importance_df["Importance"].sum() / len(importance_df)) 
display(condition)

Unnamed: 0,Term,Importance
41,AgeCategory_HadAngina,0.07923
31,BMI,0.066835
9,HadAngina,0.063806
30,WeightInKilograms,0.060228
29,HeightInMeters,0.050756
39,BMI_PhysicalActivities,0.050701
7,SleepHours,0.039957
28,AgeCategory,0.038576
0,State,0.038489
40,PhysicalHealthDays_GeneralHealth,0.029305


49


41     True
31     True
9      True
30     True
29     True
39     True
7      True
28     True
0      True
40     True
3      True
44     True
4      True
36     True
8      True
2      True
24    False
45    False
48    False
27    False
26    False
17    False
43    False
1     False
38    False
25    False
34    False
16    False
32    False
35    False
33    False
10    False
14    False
5     False
11    False
18    False
42    False
12    False
6     False
13    False
20    False
46    False
15    False
19    False
23    False
21    False
47    False
22    False
37    False
Name: Importance, dtype: bool

### Features Selection (ToDo)

### Creating new features (ToDo)

### Output new final data

In [16]:
output_path = '../data/cleaned/heart_2022_cleaned_02.csv'
df.to_csv(output_path, index=False)