In [155]:
import kagglehub

# Download the latest version of the heart-failure dataset through kagglehub.
# `path` is the local directory containing the downloaded files; later cells
# list it and read the CSV from it.
path = kagglehub.dataset_download("fedesoriano/heart-failure-prediction")

print("Path to dataset files:", path)

Path to dataset files: /home/morning/.cache/kagglehub/datasets/fedesoriano/heart-failure-prediction/versions/1


In [156]:
import os
# List the downloaded dataset directory to see which files are available.
os.listdir(path)

['heart.csv']

In [157]:
import os

import fairtl as fl
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler

In [158]:
# Load heart.csv from the downloaded dataset directory and preview the first rows.
df = pd.read_csv(f'{path}/heart.csv')
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [159]:
# (rows, columns) of the freshly loaded frame.
df.shape

(918, 12)

In [160]:
# Column dtypes and non-null counts; the object-typed columns reported here
# are the ones converted to integer codes in the following cells.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [161]:
# Encode the chest-pain category strings as integer codes.
chest_pain_codes = {'ASY': 0, 'NAP': 1, 'ATA': 2, 'TA': 3}
chest_pain_encoded = df['ChestPainType'].map(chest_pain_codes)

# Series.map yields NaN for every value that is not a key of the dict. That
# includes the already-encoded integers if this cell is executed a second
# time, so validate before overwriting the column instead of silently
# passing NaN to the later cells.
if chest_pain_encoded.isna().any():
    unexpected = df.loc[chest_pain_encoded.isna(), 'ChestPainType'].unique().tolist()
    raise ValueError(
        f"Unmapped ChestPainType values: {unexpected}. "
        "If the column was already encoded, reload the dataset before re-running this cell."
    )

df['ChestPainType'] = chest_pain_encoded
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,2,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,1,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,2,130,283,0,ST,98,N,0.0,Up,0
3,48,F,0,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,1,150,195,0,Normal,122,N,0.0,Up,0


In [162]:
# Integer codes for the remaining string-valued columns.
category_codes = {
    'RestingECG': {'Normal': 0, 'ST': 1, 'LVH': 2},
    'ExerciseAngina': {'N': 0, 'Y': 1},
    'ST_Slope': {'Up': 0, 'Flat': 1, 'Down': 2},
    'Sex': {'F': 0, 'M': 1},
}

# Series.map returns NaN for any value missing from the dict (for example the
# already-encoded integers when this cell is run twice). Encode into
# temporaries and validate them all first, so a failure leaves `df` unchanged.
encoded_columns = {}
for column_name, codes in category_codes.items():
    encoded = df[column_name].map(codes)
    if encoded.isna().any():
        unexpected = df.loc[encoded.isna(), column_name].unique().tolist()
        raise ValueError(
            f"Unmapped {column_name} values: {unexpected}. "
            "If the column was already encoded, reload the dataset before re-running this cell."
        )
    encoded_columns[column_name] = encoded

for column_name, encoded in encoded_columns.items():
    df[column_name] = encoded

In [163]:
# Force Oldpeak to a numeric dtype. With errors='coerce', any value that cannot
# be parsed becomes NaN rather than raising. The df.info() output above already
# reports this column as float64 with 918 non-null values.
df['Oldpeak'] = pd.to_numeric(df['Oldpeak'], errors='coerce')

In [164]:
# Re-check (rows, columns) after the in-place encoding steps.
df.shape

(918, 12)

In [165]:
# Separate the label column from the predictors: X keeps every other column,
# y holds the HeartDisease labels as a plain array.
target_column = 'HeartDisease'
X = df.drop(columns=target_column)
y = df[target_column].values

In [166]:
# Print the feature-matrix shape before and after low-variance filtering.
print(X.shape)
# Variance filter with a cutoff of 0.1. It runs before any scaling, so the
# cutoff applies to each feature's variance in its original units.
selector = VarianceThreshold(threshold=0.1)
# X is rebound to the transformer's output; with scikit-learn's default output
# settings that is an array rather than a DataFrame, so column names are not kept.
X = selector.fit_transform(X)
print(X.shape)

(918, 11)
(918, 11)


In [167]:
# Standardise all features with one scaler fitted on the whole feature matrix.
# NOTE(review): the scaler is fitted on every row before the K-fold split used
# by the experiment cells, so held-out folds contribute to the scaling
# statistics. Confirm that this is acceptable for the reported metrics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Boolean row selectors for the two encoded Sex values (F -> 0, M -> 1).
gender_0_mask = df['Sex'].eq(0)
gender_1_mask = df['Sex'].eq(1)

# Group sizes.
count_gender_0 = gender_0_mask.sum()
count_gender_1 = gender_1_mask.sum()
print("Female: ", count_gender_0)
print("Male: ", count_gender_1)

# Per-group slices of the scaled features and the labels.
X_scaled_Gender_0, y_Gender_0 = X_scaled[gender_0_mask], y[gender_0_mask]
X_scaled_Gender_1, y_Gender_1 = X_scaled[gender_1_mask], y[gender_1_mask]

Female:  193
Male:  725


In [168]:
# One seed shared by the model factory and the fold shuffling.
seed = 42

# Shuffled 5-fold splitter, reused for both gender groups.
kf = KFold(n_splits=5, random_state=seed, shuffle=True)

# Model collection from the project helper, built with the same seed.
models = fl.build_models(seed)

# Container passed to the experiment runner and concatenated afterwards.
results_list = list()

In [169]:
# Start from an empty container so that re-running this cell does not add a
# second copy of every fold to the results. `results_list` is handed to
# fl.run_experiment and concatenated below, so it presumably receives one
# entry per fold on each call.
results_list.clear()

# Run the same cross-validated experiment once per gender group.
for group_label, group_code, X_group, y_group in (
    ('Female', 0, X_scaled_Gender_0, y_Gender_0),
    ('Male', 1, X_scaled_Gender_1, y_Gender_1),
):
    print(f"Starting experiments for {group_label}({group_code})")
    fl.run_experiment(kf, models, X_group, y_group, group_label, results_list)

# Stack the collected per-fold results into a single table.
final_results_df = pd.concat(results_list, ignore_index=True)
print(final_results_df)

Starting experiments for Female(0)
Processing fold 1 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 2 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 3 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 4 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 5 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Starting experiments for Male(1)
Processing fold 1 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 2 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 3 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 4 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 5 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
   Fold   Group   SVM_TPR   SVM_TNR   SVM_FPR   SVM_FNR  SVM_TP  SVM_TN  \
0     1  Female  0.857143  0.906250  0.093750  0.142857       6      29   
1     2  Female  0.416667  0.962963  0.037037  0.583333       5      26   
2     3  Female  0.666667  0.933333  0.066667  0.333333       6      28   
3     4  Female  0.769231  0.960000  0.040000  0.230769      10      24   
4     5  Female  0.777778  0.965517  0.034483  0.222222       7      28   
5     1    Male  0.885417  0.775510  0.224490  0.114583      85      38   
6     2    Male  0.988636  0.842105  0.157895  0.011364      87      48   
7     3    Male  0.926316  0.720000  0.280000  0.073684      88      36   
8     4    Male  



In [170]:
# Same concatenation as the one that builds `final_results_df` in the previous
# cell; `results_df` is the frame written to disk in the next cell.
results_df = pd.concat(results_list, ignore_index=True)

In [171]:
# Persist the per-fold results as an Excel workbook.
result_path = './results/k2_result.xlsx'
# to_excel does not create missing parent folders, so make sure ./results
# exists; exist_ok makes this a no-op on later runs.
os.makedirs(os.path.dirname(result_path), exist_ok=True)
results_df.to_excel(result_path, index=False)

In [172]:
# Read the saved results workbook back in and preview it.
# NOTE(review): this rebinds `df`, which until now held the heart-disease
# dataset. Earlier cells that read dataset columns from `df` (e.g. df['Sex'])
# cannot be re-run after this cell without reloading the CSV first.
df = pd.read_excel(result_path)
df.head()

Unnamed: 0,Fold,Group,SVM_TPR,SVM_TNR,SVM_FPR,SVM_FNR,SVM_TP,SVM_TN,SVM_FP,SVM_FN,...,ANN_FP,ANN_FN,NB_TPR,NB_TNR,NB_FPR,NB_FNR,NB_TP,NB_TN,NB_FP,NB_FN
0,1,Female,0.857143,0.90625,0.09375,0.142857,6,29,3,1,...,3,2,1.0,0.84375,0.15625,0.0,7,27,5,0
1,2,Female,0.416667,0.962963,0.037037,0.583333,5,26,1,7,...,1,8,0.75,0.814815,0.185185,0.25,9,22,5,3
2,3,Female,0.666667,0.933333,0.066667,0.333333,6,28,2,3,...,1,4,0.777778,0.866667,0.133333,0.222222,7,26,4,2
3,4,Female,0.769231,0.96,0.04,0.230769,10,24,1,3,...,1,3,0.846154,0.84,0.16,0.153846,11,21,4,2
4,5,Female,0.777778,0.965517,0.034483,0.222222,7,28,1,2,...,3,1,0.777778,0.896552,0.103448,0.222222,7,26,3,2


In [173]:
label = 'Female'

# Run the project's significance tests once per model, in the same order as
# the reported output.
for model_name in ('SVM', 'DT', 'RF', 'LR', 'KNN', 'ANN', 'NB'):
    fl.perform_t_tests(df, model_name, label)


SVM -TPR: TtestResult(statistic=np.float64(-2.859667130221249), pvalue=np.float64(0.02116223157223505), df=np.float64(8.0))
SVM - FPR: TtestResult(statistic=np.float64(-7.168248756347442), pvalue=np.float64(9.536869322529326e-05), df=np.float64(8.0))
SVM - FN/FP: TtestResult(statistic=np.float64(1.9270470543417777), pvalue=np.float64(0.09013041696153655), df=np.float64(8.0))
DT -TPR: TtestResult(statistic=np.float64(-1.604120225593932), pvalue=np.float64(0.14735509789681267), df=np.float64(8.0))
DT - FPR: MannwhitneyuResult(statistic=np.float64(6.0), pvalue=np.float64(0.2222222222222222))
DT - FN/FP: MannwhitneyuResult(statistic=np.float64(6.0), pvalue=np.float64(0.2222222222222222))
RF -TPR: TtestResult(statistic=np.float64(-2.356517709762286), pvalue=np.float64(0.046208721748669565), df=np.float64(8.0))
RF - FPR: MannwhitneyuResult(statistic=np.float64(0.0), pvalue=np.float64(0.007936507936507936))
RF - FN/FP: TtestResult(statistic=np.float64(2.0049566981551554), pvalue=np.float64(0.