In [78]:
import kagglehub

# Download the latest version of the dataset through kagglehub. `path` is the
# local directory that holds the downloaded files (printed below).
path = kagglehub.dataset_download("mahad049/heart-health-stats-dataset")

print("Path to dataset files:", path)

Path to dataset files: /home/morning/.cache/kagglehub/datasets/mahad049/heart-health-stats-dataset/versions/1


In [79]:
import os
# List the downloaded files to find the CSV that should be loaded.
os.listdir(path)

['Heart_health.csv']

In [80]:
import pandas as pd
import fairtl_statisticaltest as fl
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [81]:
# Load the only CSV in the dataset directory and preview the first rows.
df = pd.read_csv(f'{path}/Heart_health.csv')
df.head()

Unnamed: 0,ID,Name,Age,Gender,Height(cm),Weight(kg),Blood Pressure(mmHg),Cholesterol(mg/dL),Glucose(mg/dL),Smoker,Exercise(hours/week),Heart Attack
0,1,John Doe,45,Male,175,80,120/80,200,90,No,3,0
1,2,Jane Smith,35,Female,160,65,110/70,180,80,No,2,0
2,3,Michael Johnson,55,Male,180,85,130/85,220,95,Yes,4,1
3,4,Sarah Brown,40,Female,165,70,115/75,190,85,No,3,0
4,5,David Lee,50,Male,170,75,125/80,210,92,Yes,2,1


In [82]:
# (rows, columns) of the raw data, before duplicates are removed.
df.shape

(724, 12)

In [83]:
# Remove rows that are exact duplicates and renumber the index from 0.
df = df.drop_duplicates().reset_index(drop=True)

In [84]:
# (rows, columns) after de-duplication, to see how many rows were removed.
df.shape

(714, 12)

In [85]:
# Column dtypes and non-null counts: shows which columns still need encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 714 entries, 0 to 713
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   ID                    714 non-null    int64 
 1   Name                  714 non-null    object
 2   Age                   714 non-null    int64 
 3   Gender                714 non-null    object
 4   Height(cm)            714 non-null    int64 
 5   Weight(kg)            714 non-null    int64 
 6   Blood Pressure(mmHg)  714 non-null    object
 7   Cholesterol(mg/dL)    714 non-null    int64 
 8   Glucose(mg/dL)        714 non-null    int64 
 9   Smoker                714 non-null    object
 10  Exercise(hours/week)  714 non-null    int64 
 11  Heart Attack          714 non-null    int64 
dtypes: int64(8), object(4)
memory usage: 67.1+ KB


In [86]:
# Preview the de-duplicated data before the feature preparation steps.
df.head()

Unnamed: 0,ID,Name,Age,Gender,Height(cm),Weight(kg),Blood Pressure(mmHg),Cholesterol(mg/dL),Glucose(mg/dL),Smoker,Exercise(hours/week),Heart Attack
0,1,John Doe,45,Male,175,80,120/80,200,90,No,3,0
1,2,Jane Smith,35,Female,160,65,110/70,180,80,No,2,0
2,3,Michael Johnson,55,Male,180,85,130/85,220,95,Yes,4,1
3,4,Sarah Brown,40,Female,165,70,115/75,190,85,No,3,0
4,5,David Lee,50,Male,170,75,125/80,210,92,Yes,2,1


In [87]:
# ID and Name are per-person identifiers rather than clinical measurements, so
# they are removed before modelling.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second execution is a no-op, not a KeyError.
df = df.drop(columns=['ID', 'Name'], errors='ignore')

In [88]:
# Binary-encode the two yes/no style text columns.
# Each entry is: column name -> {original label: integer code}.
binary_encodings = {
    'Gender': {'Female': 0, 'Male': 1},
    'Smoker': {'No': 0, 'Yes': 1},
}
for encoded_column, value_codes in binary_encodings.items():
    df[encoded_column] = df[encoded_column].map(value_codes)

In [89]:
# Inspect the distinct 'systolic/diastolic' strings before bucketing them.
df['Blood Pressure(mmHg)'].unique()

array(['120/80', '110/70', '130/85', '115/75', '125/80', '105/65',
       '135/85', '118/72', '128/82', '120/78', '122/80', '108/68',
       '132/84', '116/74', '123/78', '130/83', '117/73', '119/75',
       '133/85', '112/72', '124/78', '121/77', '126/82', '109/69',
       '130/82', '122/78', '113/71', '124/79', '118/75', '115/70',
       '124/80', '120/75', '123/79', '119/78', '125/78', '118/76',
       '121/78', '117/74', '123/80', '116/73'], dtype=object)

In [90]:
def classify_blood_pressure(bp):
    """Bucket a 'systolic/diastolic' reading into a blood-pressure category.

    Categories are tested from most to least severe, so a reading whose two
    numbers fall in different bands is assigned the higher band. For example
    '150/85' is 'Hypertension Stage 2' because of its systolic value, even
    though the diastolic value on its own would be Stage 1.

    Parameters
    ----------
    bp : str
        Reading formatted as '<systolic>/<diastolic>' with integer values,
        e.g. '120/80'.

    Returns
    -------
    str
        One of 'Normal', 'Elevated', 'Hypertension Stage 1',
        'Hypertension Stage 2' or 'Hypertensive Crisis'.

    Raises
    ------
    ValueError
        If `bp` does not consist of exactly two '/'-separated integers.
    """
    systolic, diastolic = map(int, bp.split('/'))

    # Most severe first: either number alone is enough to reach a band.
    if systolic >= 180 or diastolic >= 120:
        return 'Hypertensive Crisis'
    if systolic >= 140 or diastolic >= 90:
        return 'Hypertension Stage 2'
    if systolic >= 130 or diastolic >= 80:
        return 'Hypertension Stage 1'
    # From here on diastolic < 80, so only the systolic value matters.
    if systolic >= 120:
        return 'Elevated'
    return 'Normal'

In [91]:
# Replace each raw 'systolic/diastolic' string with its category label.
df['Blood Pressure(mmHg)'] = df['Blood Pressure(mmHg)'].apply(classify_blood_pressure)

In [92]:
# Check which categories actually occur in this dataset.
df['Blood Pressure(mmHg)'].unique()

array(['Hypertension Stage 1', 'Normal', 'Elevated'], dtype=object)

In [93]:
# Ordinal-encode the blood-pressure categories by increasing severity.
# All five categories named by classify_blood_pressure are listed (only the
# first three occur in this dataset), so more severe readings in other data are
# encoded instead of silently turning into NaN.
bp_severity_codes = {
    'Normal': 0,
    'Elevated': 1,
    'Hypertension Stage 1': 2,
    'Hypertension Stage 2': 3,
    'Hypertensive Crisis': 4,
}
df['Blood Pressure(mmHg)'] = df['Blood Pressure(mmHg)'].map(bp_severity_codes)

In [94]:
# Preview the fully numeric frame after all encodings.
df.head()

Unnamed: 0,Age,Gender,Height(cm),Weight(kg),Blood Pressure(mmHg),Cholesterol(mg/dL),Glucose(mg/dL),Smoker,Exercise(hours/week),Heart Attack
0,45,1,175,80,2,200,90,0,3,0
1,35,0,160,65,0,180,80,0,2,0
2,55,1,180,85,2,220,95,1,4,1
3,40,0,165,70,0,190,85,0,3,0
4,50,1,170,75,2,210,92,1,2,1


In [95]:
# Separate the feature matrix from the label column.
target_column = 'Heart Attack'
X = df.drop(columns=[target_column])    # every column except the label
y = df[target_column].to_numpy()        # label as a plain NumPy array

In [96]:
# Drop near-constant features (variance <= 0.1). The shape is printed before and
# after to show how many columns were removed.
print(X.shape)
selector = VarianceThreshold(threshold=0.1)
# fit_transform returns an array, so X no longer carries column names afterwards.
X = selector.fit_transform(X)
print(X.shape)

(714, 9)
(714, 9)


In [97]:
# Boolean row masks per gender code (0 = Female, 1 = Male, as encoded earlier).
# They are built from `df`, which has the same row order as X and y.
gender_0_mask = df['Gender'] == 0
gender_1_mask = df['Gender'] == 1

# Group sizes, printed to check how balanced the two groups are.
count_gender_0 = gender_0_mask.sum()
count_gender_1 = gender_1_mask.sum()

print("Female: ", count_gender_0)
print("Male: ", count_gender_1)

# Standardise all features using statistics from the full dataset.
# NOTE(review): the scaler is fitted on every row before the K-fold split below,
# so held-out folds influence the scaling — confirm whether fl.run_experiment
# re-scales per fold or whether this leakage is acceptable.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Per-gender feature matrices and label vectors.
# NOTE(review): Gender is still one of the columns of X, so it is constant
# within each subset — confirm this is intended.
X_scaled_Gender_0 = X_scaled[gender_0_mask]
X_scaled_Gender_1 = X_scaled[gender_1_mask]
y_Gender_0 = y[gender_0_mask]
y_Gender_1 = y[gender_1_mask]

Female:  357
Male:  357


In [98]:
# Single seed shared by the model factory and the fold shuffling.
seed = 42
# NOTE(review): fl.build_models lives in fairtl_statisticaltest; judging by the
# training log it provides SVM, LR, KNN, RF, DT, ANN and NB models — confirm.
models = fl.build_models(seed)

# Passed to fl.run_experiment for each group and concatenated afterwards.
results_list = []

# 5-fold cross-validation with seeded shuffling so the splits are repeatable.
kf = KFold(n_splits=5, shuffle=True, random_state=seed)

In [99]:
# Start from an empty collector so that re-running this cell does not add a
# second copy of every result to the list (it is already empty on a fresh
# top-to-bottom run, so this changes nothing there).
# NOTE(review): assumes fl.run_experiment adds its per-group results to
# `results_list`, as the pd.concat below suggests — confirm.
results_list.clear()

print("Starting experiments for Female(0)")
fl.run_experiment(kf, models, X_scaled_Gender_0, y_Gender_0, 'Female', results_list)

print("Starting experiments for Male(1)")
fl.run_experiment(kf, models, X_scaled_Gender_1, y_Gender_1, 'Male', results_list)

# One frame holding the results of both groups.
final_results_df = pd.concat(results_list, ignore_index=True)
print(final_results_df)

Starting experiments for Female(0)
Processing fold 1 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 2 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 3 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 4 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KN

In [100]:
# Combine all collected results into one frame for export. This is the same
# concatenation that produced final_results_df in the previous cell.
results_df = pd.concat(results_list, ignore_index=True)

In [101]:
result_path = './results/K21_result.xlsx'
# DataFrame.to_excel does not create missing parent directories, so make sure
# ./results exists; otherwise the write fails on a fresh checkout.
os.makedirs(os.path.dirname(result_path), exist_ok=True)
results_df.to_excel(result_path, index=False)

In [102]:
# Read the saved results back from disk for the statistical tests.
# NOTE: this rebinds `df`; from here on it holds the per-fold results table,
# not the patient-level data used earlier in the notebook.
df = pd.read_excel(result_path)
df.head()

Unnamed: 0,Fold,Group,SVM_TPR,SVM_TNR,SVM_FPR,SVM_FNR,SVM_TP,SVM_TN,SVM_FP,SVM_FN,...,ANN_FP,ANN_FN,NB_TPR,NB_TNR,NB_FPR,NB_FNR,NB_TP,NB_TN,NB_FP,NB_FN
0,1,Female,0.952381,1.0,0.0,0.047619,20,51,0,1,...,0,1,0.952381,1.0,0.0,0.047619,20,51,0,1
1,2,Female,1.0,1.0,0.0,0.0,23,49,0,0,...,0,0,1.0,1.0,0.0,0.0,23,49,0,0
2,3,Female,1.0,1.0,0.0,0.0,25,46,0,0,...,0,0,1.0,1.0,0.0,0.0,25,46,0,0
3,4,Female,1.0,1.0,0.0,0.0,21,50,0,0,...,0,0,1.0,1.0,0.0,0.0,21,50,0,0
4,5,Female,1.0,1.0,0.0,0.0,25,46,0,0,...,0,0,1.0,1.0,0.0,0.0,25,46,0,0


In [103]:
label = 'Female'

# Run the same test for every model, in the same order as the original seven
# copy-pasted calls. fl.perform_t_tests prints its own results (see the output
# below), so nothing needs to be collected here.
model_names = ['SVM', 'DT', 'RF', 'LR', 'KNN', 'ANN', 'NB']
for model_name in model_names:
    fl.perform_t_tests(df, model_name, label)


SVM -TPR: MannwhitneyuResult(statistic=np.float64(10.0), pvalue=np.float64(0.4237107971667934))
SVM - FPR: MannwhitneyuResult(statistic=np.float64(10.0), pvalue=np.float64(0.4237107971667934))
SVM - FN/FP: MannwhitneyuResult(statistic=np.float64(15.0), pvalue=np.float64(0.4237107971667934))
DT -TPR: MannwhitneyuResult(statistic=np.float64(10.0), pvalue=np.float64(0.4237107971667934))
DT - FPR: MannwhitneyuResult(statistic=np.float64(10.0), pvalue=np.float64(0.4237107971667934))
DT - FN/FP: MannwhitneyuResult(statistic=np.float64(15.0), pvalue=np.float64(0.4237107971667934))
RF -TPR: MannwhitneyuResult(statistic=np.float64(10.0), pvalue=np.float64(0.4237107971667934))
RF - FPR: MannwhitneyuResult(statistic=np.float64(10.0), pvalue=np.float64(0.4237107971667934))
RF - FN/FP: MannwhitneyuResult(statistic=np.float64(15.0), pvalue=np.float64(0.4237107971667934))
LR -TPR: MannwhitneyuResult(statistic=np.float64(10.0), pvalue=np.float64(0.4237107971667934))
LR - FPR: MannwhitneyuResult(statis