# This notebook will contain feature engineering steps:

- Encoding categorical variables

- Feature scaling (e.g., standardization, normalization)

- Feature selection (optional)

- Creating new features (if applicable)

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

### Input 01-notebook file

In [2]:
path = '../data/cleaned/heart_2022_cleaned_01.csv'

In [3]:
# read data
df = pd.read_csv(path ,sep=',',encoding='utf-8')
df.sample(5)

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos,HadHeartAttack_bin,HeartAttackBinary
20092,Arizona,Female,Very good,2.0,5.0,Within past year (anytime less than 12 months ...,No,7.0,None of them,No,...,25.84,No,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,No,0,0
20699,Arkansas,Female,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,24.0,All,No,...,17.75,No,No,No,No,"No, did not receive any tetanus shot in the pa...",No,No,0,0
24384,Arkansas,Male,Fair,20.0,5.0,Within past year (anytime less than 12 months ...,No,7.0,1 to 5,No,...,32.28,Yes,No,Yes,Yes,"Yes, received tetanus shot but not sure what type",No,No,0,0
53390,Connecticut,Female,Good,0.0,3.0,Within past year (anytime less than 12 months ...,No,8.0,None of them,No,...,35.67,No,No,No,No,"No, did not receive any tetanus shot in the pa...",No,Yes,0,0
15314,Arizona,Female,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,7.0,None of them,No,...,34.46,Yes,Yes,No,No,"Yes, received tetanus shot, but not Tdap",No,No,0,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59903 entries, 0 to 59902
Data columns (total 42 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   State                      59903 non-null  object 
 1   Sex                        59903 non-null  object 
 2   GeneralHealth              59903 non-null  object 
 3   PhysicalHealthDays         59903 non-null  float64
 4   MentalHealthDays           59903 non-null  float64
 5   LastCheckupTime            59903 non-null  object 
 6   PhysicalActivities         59903 non-null  object 
 7   SleepHours                 59903 non-null  float64
 8   RemovedTeeth               59903 non-null  object 
 9   HadHeartAttack             59903 non-null  object 
 10  HadAngina                  59903 non-null  object 
 11  HadStroke                  59903 non-null  object 
 12  HadAsthma                  59903 non-null  object 
 13  HadSkinCancer              59903 non-null  obj

### Features testing

In [5]:
from scipy.stats import pointbiserialr
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import mutual_info_classif


# Convert binary target variable to numeric
df["HadHeartAttack_bin"] = df["HadHeartAttack"].map({"Yes": 1, "No": 0})

# Identify categorical and numerical features
categorical_cols = df.select_dtypes(include="object").columns.drop("HadHeartAttack")
numerical_cols = df.select_dtypes(include=["float64", "int64"]).columns.drop("HadHeartAttack_bin")

results = []

# Evaluate numerical features using point-biserial correlation and AUC
for col in numerical_cols:
    try:
        temp = df[[col, "HadHeartAttack_bin"]].dropna()
        if temp[col].nunique() > 1:
            r, p_val = pointbiserialr(temp["HadHeartAttack_bin"], temp[col])
            auc = roc_auc_score(temp["HadHeartAttack_bin"], temp[col])
        else:
            r, p_val, auc = np.nan, np.nan, np.nan
    except:
        r, p_val, auc = np.nan, np.nan, np.nan
    results.append({
        "Feature": col,
        "Type": "Numerical",
        "Test": "Point-Biserial",
        "p-value": p_val,
        "Effect Size": r,
        "AUC": auc
    })

# One-hot encode categorical variables
df_encoded = pd.get_dummies(df[categorical_cols], drop_first=True)
mi_scores = mutual_info_classif(df_encoded, df["HadHeartAttack_bin"], discrete_features=True)

# Store mutual information scores with corresponding original feature names
mi_summary = pd.DataFrame({
    "OneHotFeature": df_encoded.columns,
    "Mutual Information": mi_scores
})
mi_summary["OriginalFeature"] = mi_summary["OneHotFeature"].apply(lambda x: x.split("_")[0])

# Aggregate mutual information scores by original feature
mi_aggregated = mi_summary.groupby("OriginalFeature").agg({
    "Mutual Information": ["max", "mean"]
}).reset_index()
mi_aggregated.columns = ["Feature", "Max Mutual Info", "Avg Mutual Info"]
mi_aggregated = mi_aggregated.sort_values("Max Mutual Info", ascending=False)

# Append mutual information results to the results list
for _, row in mi_aggregated.iterrows():
    results.append({
        "Feature": row["Feature"],
        "Type": "Categorical",
        "Test": "Mutual Information",
        "p-value": np.nan,
        "Effect Size": row["Max Mutual Info"],
        "AUC": np.nan
    })

# Create final dataframe
results_df = pd.DataFrame(results)

# Determine significance
def determine_significance(row):
    if row["Type"] == "Numerical":
        return row["p-value"] < 0.05 if not pd.isna(row["p-value"]) else False
    elif row["Type"] == "Categorical":
        return row["Effect Size"] > 0.01
    else:
        return False

results_df["Significant"] = results_df.apply(determine_significance, axis=1)


display(results_df)


Unnamed: 0,Feature,Type,Test,p-value,Effect Size,AUC,Significant
0,PhysicalHealthDays,Numerical,Point-Biserial,1.1916270000000001e-267,0.142086,0.625658,True
1,MentalHealthDays,Numerical,Point-Biserial,1.134651e-10,0.02634,0.498889,True
2,SleepHours,Numerical,Point-Biserial,0.8123487,0.00097,0.498958,False
3,HeightInMeters,Numerical,Point-Biserial,2.923344e-10,0.025748,0.537945,True
4,WeightInKilograms,Numerical,Point-Biserial,1.486017e-16,0.03373,0.552272,True
5,BMI,Numerical,Point-Biserial,1.957167e-10,0.026001,0.539121,True
6,HeartAttackBinary,Numerical,Point-Biserial,0.0,1.0,1.0,True
7,HadAngina,Categorical,Mutual Information,,0.04109,,True
8,ChestScan,Categorical,Mutual Information,,0.015096,,True
9,DifficultyWalking,Categorical,Mutual Information,,0.010206,,True


### Encoding

In [None]:
col_obj = df.select_dtypes('object').columns

le = LabelEncoder()

for col in col_obj:
    df[col] = le.fit_transform(df[col])

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442067 entries, 0 to 442066
Data columns (total 43 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   State                      442067 non-null  int64  
 1   Sex                        442067 non-null  int64  
 2   GeneralHealth              442067 non-null  int64  
 3   PhysicalHealthDays         442067 non-null  float64
 4   MentalHealthDays           442067 non-null  float64
 5   LastCheckupTime            442067 non-null  int64  
 6   PhysicalActivities         442067 non-null  int64  
 7   SleepHours                 442067 non-null  float64
 8   RemovedTeeth               442067 non-null  int64  
 9   HadHeartAttack             442067 non-null  int64  
 10  HadAngina                  442067 non-null  int64  
 11  HadStroke                  442067 non-null  int64  
 12  HadAsthma                  442067 non-null  int64  
 13  HadSkinCancer              44

In [None]:
df.sample(5)

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos,BMI_Category,SleepHours_Category,HeartAttackBinary
320496,41,1,2,0.0,0.0,3,1,6.0,2,0,...,1,0,1,0,0,0,0,3,2,0
69406,9,1,1,5.0,0.0,2,1,6.0,2,1,...,0,1,0,1,3,0,2,3,2,1
347257,45,1,2,30.0,30.0,3,1,6.0,3,0,...,1,0,1,0,1,0,2,3,2,0
43689,5,0,0,0.0,0.0,3,1,9.0,0,0,...,0,0,1,1,0,0,0,3,0,0
160205,21,0,1,0.0,6.0,3,0,4.0,1,1,...,0,1,0,1,0,0,0,0,4,1


### Features Selection (ToDo)

### Creating new features (ToDo)

### Output new final data

In [16]:
output_path = '../data/cleaned/heart_2022_cleaned_02.csv'
df.to_csv(output_path, index=False)