In [35]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("abdmental01/heart-disease-dataset")

print("Path to dataset files:", path)

Path to dataset files: /home/morning/.cache/kagglehub/datasets/abdmental01/heart-disease-dataset/versions/1


In [36]:
import os
os.listdir(path)

['heart_disease_uci.csv', 'heart_disease_cleaned.csv']

In [37]:
import pandas as pd
import fairtl_statisticaltest as fl
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [38]:
df = pd.read_csv("/home/morning/.cache/kagglehub/datasets/abdmental01/heart-disease-dataset/versions/1/heart_disease_cleaned.csv")
df.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [39]:
df.shape

(919, 16)

In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 919 entries, 0 to 918
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        919 non-null    int64  
 1   age       919 non-null    int64  
 2   sex       919 non-null    object 
 3   dataset   919 non-null    object 
 4   cp        919 non-null    object 
 5   trestbps  919 non-null    float64
 6   chol      919 non-null    float64
 7   fbs       919 non-null    bool   
 8   restecg   919 non-null    object 
 9   thalch    919 non-null    float64
 10  exang     919 non-null    bool   
 11  oldpeak   919 non-null    float64
 12  slope     919 non-null    object 
 13  ca        919 non-null    float64
 14  thal      919 non-null    object 
 15  num       919 non-null    int64  
dtypes: bool(2), float64(5), int64(3), object(6)
memory usage: 102.4+ KB


In [41]:
df['sex'] = df['sex'].map({'Female': 0, 'Male': 1})

In [42]:
enc = OneHotEncoder(categories='auto')
dataset = df['dataset'].values.reshape(-1, 1)
enc.fit(dataset)
new_features = enc.get_feature_names_out()
print(new_features)
new_dataset = pd.DataFrame(enc.transform(dataset).toarray())

['x0_Cleveland' 'x0_Hungary' 'x0_Switzerland' 'x0_VA Long Beach']


In [43]:
new_dataset.columns = ['Cleveland', 'Hungary', 'Switzerland', 'VA Long Beach']

In [44]:
df = pd.concat([df, new_dataset], axis=1)
df.drop('dataset', axis=1, inplace=True)

In [45]:
df['cp'].unique()

array(['typical angina', 'asymptomatic', 'non-anginal', 'atypical angina'],
      dtype=object)

In [46]:
df['cp'] = df['cp'].map({'asymptomatic': 0, 'non-anginal': 1, 'atypical angina': 2, 'typical angina': 3})

In [47]:
df['fbs'].unique()

array([ True, False])

In [48]:
a = df['fbs'].unique()

In [49]:
df['fbs'] = df['fbs'].map({a[1]: 0, a[0]: 1})
df['fbs'].unique()

array([1, 0])

In [50]:
df['restecg'].unique()

array(['lv hypertrophy', 'normal', 'st-t abnormality'], dtype=object)

In [51]:
df['restecg'] = df['restecg'].map({'normal': 0, 'st-t abnormality': 1, 'lv hypertrophy': 2})

In [52]:
b = df['exang'].unique()
b

array([False,  True])

In [53]:
df['exang'] = df['exang'].map({b[0]: 0, b[1]: 1})
df['exang'].unique()

array([0, 1])

In [54]:
df.head()

Unnamed: 0,id,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num,Cleveland,Hungary,Switzerland,VA Long Beach
0,1,63,1,3,145.0,233.0,1,2,150.0,0,2.3,downsloping,0.0,fixed defect,0,1.0,0.0,0.0,0.0
1,2,67,1,0,160.0,286.0,0,2,108.0,1,1.5,flat,3.0,normal,2,1.0,0.0,0.0,0.0
2,3,67,1,0,120.0,229.0,0,2,129.0,1,2.6,flat,2.0,reversable defect,1,1.0,0.0,0.0,0.0
3,4,37,1,1,130.0,250.0,0,0,187.0,0,3.5,downsloping,0.0,normal,0,1.0,0.0,0.0,0.0
4,5,41,0,2,130.0,204.0,0,2,172.0,0,1.4,upsloping,0.0,normal,0,1.0,0.0,0.0,0.0


In [55]:
df['slope'].unique()

array(['downsloping', 'flat', 'upsloping'], dtype=object)

In [56]:
df['slope'] = df['slope'].map({'upsloping': 0, 'flat': 1, 'downsloping': 2})

In [57]:
df['thal'] = df['thal'].map({'normal': 0, 'reversable defect': 1, 'fixed defect': 2})

In [58]:
df.head()

Unnamed: 0,id,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num,Cleveland,Hungary,Switzerland,VA Long Beach
0,1,63,1,3,145.0,233.0,1,2,150.0,0,2.3,2,0.0,2,0,1.0,0.0,0.0,0.0
1,2,67,1,0,160.0,286.0,0,2,108.0,1,1.5,1,3.0,0,2,1.0,0.0,0.0,0.0
2,3,67,1,0,120.0,229.0,0,2,129.0,1,2.6,1,2.0,1,1,1.0,0.0,0.0,0.0
3,4,37,1,1,130.0,250.0,0,0,187.0,0,3.5,2,0.0,0,0,1.0,0.0,0.0,0.0
4,5,41,0,2,130.0,204.0,0,2,172.0,0,1.4,0,0.0,0,0,1.0,0.0,0.0,0.0


In [59]:
df['num'].unique()

array([0, 2, 1, 3, 4])

In [60]:
y = df['num'].values
X = df.drop('num', axis=1)

In [61]:
print(X.shape)
selector = VarianceThreshold(threshold=0.1)
X = selector.fit_transform(X)
print(X.shape)

(919, 18)
(919, 18)


In [62]:
# build mask
gender_0_mask = df['sex'] == 0
gender_1_mask = df['sex'] == 1

count_gender_0 = gender_0_mask.sum()
count_gender_1 = gender_1_mask.sum()

print("Female: ", count_gender_0)
print("Male: ", count_gender_1)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_scaled_Gender_0 = X_scaled[gender_0_mask]
X_scaled_Gender_1 = X_scaled[gender_1_mask]
y_Gender_0 = y[gender_0_mask]
y_Gender_1 = y[gender_1_mask]

Female:  194
Male:  725


In [63]:
seed = 42
models = fl.build_models(seed)

results_list = []

kf = KFold(n_splits=5, shuffle=True, random_state=seed)

In [64]:
print("Starting experiments for Female(0)")
fl.run_experiment(kf, models, X_scaled_Gender_0, y_Gender_0, 'Female', results_list)

print("Starting experiments for Male(1)")
fl.run_experiment(kf, models, X_scaled_Gender_1, y_Gender_1, 'Male', results_list)

final_results_df = pd.concat(results_list, ignore_index=True)
print(final_results_df)

Starting experiments for Female(0)
Processing fold 1 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 2 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 3 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 4 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 5 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Starting experiments for Male(1)
Processing fold 1 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 2 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 3 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 4 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 5 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
   Fold   Group   SVM_TPR   SVM_TNR   SVM_FPR   SVM_FNR  SVM_TP  SVM_TN  \
0     1  Female  0.400000  0.967742  0.032258  0.600000       2      30   
1     2  Female  0.400000  1.000000  0.000000  0.600000       4      27   
2     3  Female  0.000000  0.933333  0.066667  1.000000       0      28   
3     4  Female  0.375000  0.965517  0.034483  0.625000       3      28   
4     5  Female  0.833333  0.962963  0.037037  0.166667       5      26   
5     1    Male  0.808511  0.850000  0.150000  0.191489      38      51   
6     2    Male  0.846154  0.818182  0.181818  0.153846      33      45   
7     3    Male  0.705882  0.822581  0.177419  0.294118      24      51   
8     4    Male  



In [65]:
results_df = pd.concat(results_list, ignore_index=True)

In [66]:
result_path = './results/K64_result.xlsx'
results_df.to_excel(result_path, index=False)

In [67]:
df = pd.read_excel(result_path)
df.head()

Unnamed: 0,Fold,Group,SVM_TPR,SVM_TNR,SVM_FPR,SVM_FNR,SVM_TP,SVM_TN,SVM_FP,SVM_FN,...,ANN_FP,ANN_FN,NB_TPR,NB_TNR,NB_FPR,NB_FNR,NB_TP,NB_TN,NB_FP,NB_FN
0,1,Female,0.4,0.967742,0.032258,0.6,2,30,1,3,...,2,3,0.0,0.944444,0.055556,1.0,0,17,1,3
1,2,Female,0.4,1.0,0.0,0.6,4,27,0,6,...,0,4,0.333333,1.0,0.0,0.666667,1,16,0,2
2,3,Female,0.0,0.933333,0.066667,1.0,0,28,2,1,...,1,1,0.0,0.961538,0.038462,1.0,0,25,1,1
3,4,Female,0.375,0.965517,0.034483,0.625,3,28,1,5,...,3,3,0.166667,0.947368,0.052632,0.833333,1,18,1,5
4,5,Female,0.833333,0.962963,0.037037,0.166667,5,26,1,1,...,3,1,0.5,1.0,0.0,0.5,1,18,0,1


In [68]:
label = 'Female'

fl.perform_t_tests(df, 'SVM', label)
fl.perform_t_tests(df, 'DT', label)
fl.perform_t_tests(df, 'RF', label)
fl.perform_t_tests(df, 'LR', label)
fl.perform_t_tests(df, 'KNN', label)
fl.perform_t_tests(df, 'ANN', label)
fl.perform_t_tests(df, 'NB', label)


SVM -TPR: TtestResult(statistic=np.float64(-2.8528634810792766), pvalue=np.float64(0.02138470462186272), df=np.float64(8.0))
SVM - FPR: TtestResult(statistic=np.float64(-7.596057366427978), pvalue=np.float64(6.328301281014061e-05), df=np.float64(8.0))
SVM - FN/FP: MannwhitneyuResult(statistic=np.float64(17.0), pvalue=np.float64(0.42063492063492064))
DT -TPR: MannwhitneyuResult(statistic=np.float64(5.0), pvalue=np.float64(0.14245669739409875))
DT - FPR: TtestResult(statistic=np.float64(-3.1510668485986955), pvalue=np.float64(0.013576611989903689), df=np.float64(8.0))
DT - FN/FP: MannwhitneyuResult(statistic=np.float64(16.0), pvalue=np.float64(0.5476190476190477))
RF -TPR: TtestResult(statistic=np.float64(-1.4722801199584987), pvalue=np.float64(0.1791608697324971), df=np.float64(8.0))
RF - FPR: TtestResult(statistic=np.float64(-3.9377034066430108), pvalue=np.float64(0.004309776337556291), df=np.float64(8.0))
RF - FN/FP: MannwhitneyuResult(statistic=np.float64(8.0), pvalue=np.float64(0.40