In [11]:
import kagglehub

# Fetch the latest published version of the Kaggle dataset. The returned
# value is the local directory holding the downloaded files.
path = kagglehub.dataset_download(
    "alexteboul/heart-disease-health-indicators-dataset"
)

print("Path to dataset files:", path)

Path to dataset files: /home/morning/.cache/kagglehub/datasets/alexteboul/heart-disease-health-indicators-dataset/versions/3


In [12]:
import os
# Show the contents of the download directory to find the data file's name.
os.listdir(path)

['heart_disease_health_indicators_BRFSS2015.csv']

In [13]:
import pandas as pd
import fairtl as fl
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

In [14]:
# Build the CSV location with os.path.join instead of interpolating a "/"
# into the string, so the path separator is handled by the standard library.
csv_path = os.path.join(path, 'heart_disease_health_indicators_BRFSS2015.csv')

# Load the raw survey table and preview its first rows.
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,HeartDiseaseorAttack,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [None]:
# (rows, columns) of the frame as loaded, before duplicate rows are removed.
df.shape

In [None]:
# Remove exact duplicate rows; ignore_index=True renumbers the surviving rows
# 0..n-1 in the same step instead of a separate reset_index call.
df = df.drop_duplicates(ignore_index=True)

In [15]:
# (rows, columns) of the frame used from here on.
# NOTE(review): the deduplication cell above carries no execution count, and
# the saved output of this cell reports the same 253680 rows as df.info()'s
# RangeIndex — confirm whether duplicates were actually dropped in the run
# that produced the stored results, since a fresh Run All would execute it.
df.shape

(253680, 22)

In [16]:
# Per-column dtype and non-null count, to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   HeartDiseaseorAttack  253680 non-null  float64
 1   HighBP                253680 non-null  float64
 2   HighChol              253680 non-null  float64
 3   CholCheck             253680 non-null  float64
 4   BMI                   253680 non-null  float64
 5   Smoker                253680 non-null  float64
 6   Stroke                253680 non-null  float64
 7   Diabetes              253680 non-null  float64
 8   PhysActivity          253680 non-null  float64
 9   Fruits                253680 non-null  float64
 10  Veggies               253680 non-null  float64
 11  HvyAlcoholConsump     253680 non-null  float64
 12  AnyHealthcare         253680 non-null  float64
 13  NoDocbcCost           253680 non-null  float64
 14  GenHlth               253680 non-null  float64
 15  

In [17]:
# Separate the prediction target from the remaining feature columns.
target_column = 'HeartDiseaseorAttack'
X = df.drop(columns=target_column)
y = df[target_column].to_numpy()

In [18]:
# Drop features whose variance does not exceed 0.1, printing the feature
# matrix shape before and after so the number of removed columns is visible.
# NOTE(review): the transform returns a plain array, so X no longer carries
# column names after this cell — confirm nothing downstream needs them.
selector = VarianceThreshold(threshold=0.1)
print(X.shape)
X = selector.fit(X).transform(X)
print(X.shape)

(253680, 21)
(253680, 16)


In [19]:
# Standardise every retained feature using statistics from all rows.
# NOTE(review): the scaler is fitted on the full data set before the K-fold
# object is handed to fl.run_experiment, so held-out rows contribute to the
# scaling statistics — confirm this is acceptable for the study.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Boolean row selectors for the two values of the 'Sex' column.
gender_0_mask = df['Sex'].eq(0)
gender_1_mask = df['Sex'].eq(1)

# Group sizes, reported with the labels used throughout the notebook.
count_gender_0 = gender_0_mask.sum()
count_gender_1 = gender_1_mask.sum()
print("Female: ", count_gender_0)
print("Male: ", count_gender_1)

# Slice features and targets per group with the same masks so rows stay
# paired. NOTE(review): if 'Sex' survived the variance filter it is a
# constant column inside each subset — verify whether it should be removed.
X_scaled_Gender_0, y_Gender_0 = X_scaled[gender_0_mask], y[gender_0_mask]
X_scaled_Gender_1, y_Gender_1 = X_scaled[gender_1_mask], y[gender_1_mask]

Female:  141974
Male:  111706


In [20]:
# One seed is shared by model construction and by the fold shuffling.
seed = 42

# 20 shuffled folds; the same splitter object is reused for every group.
kf = KFold(n_splits=20, shuffle=True, random_state=seed)

# Models to evaluate, as built by the project's fairtl helper.
models = fl.build_models(seed)

# Accumulator passed to fl.run_experiment and concatenated afterwards.
results_list = list()

In [21]:
# Start from an empty accumulator: the list is filled in place by
# fl.run_experiment, so re-running this cell with the old list would stack a
# second copy of every row and distort the statistical tests run on the
# combined results.
results_list = []

# (group label, code in the 'Sex' column, features, targets) for each cohort.
experiment_groups = (
    ('Female', 0, X_scaled_Gender_0, y_Gender_0),
    ('Male', 1, X_scaled_Gender_1, y_Gender_1),
)

# Run the identical cross-validated experiment on each cohort with the same
# folds and models so the per-group metrics are directly comparable.
for group_name, sex_code, X_group, y_group in experiment_groups:
    print(f"Starting experiments for {group_name}({sex_code})")
    fl.run_experiment(kf, models, X_group, y_group, group_name, results_list)

# Combine the collected result frames into one table and show it.
final_results_df = pd.concat(results_list, ignore_index=True)
print(final_results_df)

Starting experiments for Female(0)
Processing fold 1 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Processing fold 2 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Processing fold 3 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Processing fold 4 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: A

In [22]:
# Stack the collected result frames into a single table with a fresh
# 0..n-1 index; this is the frame that gets written to disk.
results_df = pd.concat(objs=results_list, ignore_index=True)

In [23]:
# Destination workbook for the combined experiment results.
result_path = './results/d10_result.xlsx'

# to_excel does not create missing folders, so make sure the target directory
# exists first; exist_ok keeps the cell safe to re-run.
os.makedirs(os.path.dirname(result_path), exist_ok=True)

# index=False keeps the positional index out of the sheet.
results_df.to_excel(result_path, index=False)

In [24]:
# Read the saved results back from disk and preview them.
# NOTE(review): this rebinds `df`, which earlier held the raw survey data;
# earlier cells that read df['Sex'] will fail if re-run after this point.
df = pd.read_excel(io=result_path)
df.head()

Unnamed: 0,Fold,Group,SVM_TPR,SVM_TNR,SVM_FPR,SVM_FNR,SVM_TP,SVM_TN,SVM_FP,SVM_FN,...,DT_FP,DT_FN,ANN_TPR,ANN_TNR,ANN_FPR,ANN_FNR,ANN_TP,ANN_TN,ANN_FP,ANN_FN
0,1,Female,0.0,1.0,0.0,1.0,0,6609,0,490,...,477,383,0.042857,0.998184,0.001816,0.957143,21,6597,12,469
1,2,Female,0.0,1.0,0.0,1.0,0,6616,0,483,...,482,381,0.028986,0.996826,0.003174,0.971014,14,6595,21,469
2,3,Female,0.0,1.0,0.0,1.0,0,6579,0,520,...,505,412,0.051923,0.996352,0.003648,0.948077,27,6555,24,493
3,4,Female,0.0,1.0,0.0,1.0,0,6604,0,495,...,487,392,0.066667,0.993186,0.006814,0.933333,33,6559,45,462
4,5,Female,0.0,1.0,0.0,1.0,0,6570,0,529,...,460,406,0.060491,0.994825,0.005175,0.939509,32,6536,34,497


In [25]:
# Group label handed to every test call below.
# NOTE(review): how perform_t_tests uses the label is defined in fairtl and
# not visible here — presumably it selects the reference group; confirm.
label = 'Female'

# Run the project's significance tests once per model prefix found in the
# results columns (SVM_*, DT_*, RF_*, LR_*, KNN_*, ANN_*). The saved output
# shows TPR, FPR and FN/FP comparisons per model, reported as either a t-test
# or a Mann-Whitney U result.
# The calls are kept as separate statements (not a loop) so the cell's final
# expression, and therefore anything the notebook displays for it, is unchanged.
fl.perform_t_tests(df, 'SVM', label)
fl.perform_t_tests(df, 'DT', label)
fl.perform_t_tests(df, 'RF', label)
fl.perform_t_tests(df, 'LR', label)
fl.perform_t_tests(df, 'KNN', label)
fl.perform_t_tests(df, 'ANN', label)

SVM -TPR: MannwhitneyuResult(statistic=np.float64(0.0), pvalue=np.float64(8.006545033944715e-09))
SVM - FPR: MannwhitneyuResult(statistic=np.float64(0.0), pvalue=np.float64(8.006545033944715e-09))
SVM - FN/FP: MannwhitneyuResult(statistic=np.float64(400.0), pvalue=np.float64(6.738331748971247e-08))
DT -TPR: TtestResult(statistic=np.float64(-9.786368023371555), pvalue=np.float64(6.200550574661367e-12), df=np.float64(38.0))
DT - FPR: TtestResult(statistic=np.float64(-28.826347909033988), pvalue=np.float64(1.9647816785548686e-27), df=np.float64(38.0))
DT - FN/FP: TtestResult(statistic=np.float64(-4.349421478085459), pvalue=np.float64(9.883992678806984e-05), df=np.float64(38.0))
RF -TPR: TtestResult(statistic=np.float64(-19.694875428857742), pvalue=np.float64(1.540641939716282e-21), df=np.float64(38.0))
RF - FPR: TtestResult(statistic=np.float64(-35.34595070732021), pvalue=np.float64(1.1145730852182199e-30), df=np.float64(38.0))
RF - FN/FP: MannwhitneyuResult(statistic=np.float64(400.0), p