In [1]:
import kagglehub

# Fetch the latest published version of the dataset; the call returns the
# local directory that holds the downloaded files.
DATASET_HANDLE = "redwankarimsony/heart-disease-data"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /home/morning/.cache/kagglehub/datasets/redwankarimsony/heart-disease-data/versions/6


In [2]:
import os
# Show which files the download placed in the dataset directory.
os.listdir(path)

['heart_disease_uci.csv']

In [3]:
import pandas as pd
import fairtl_statisticaltest as fl
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [4]:
# Load the single CSV shipped with the dataset and preview its first rows.
csv_file = f'{path}/heart_disease_uci.csv'
df = pd.read_csv(csv_file)
df.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [5]:
# (rows, columns) of the raw frame as loaded.
df.shape

(920, 16)

In [6]:
# Dtypes and non-null counts per column, to see where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB


In [7]:
# Remove the row identifier and the two columns with the fewest non-null
# values (ca, thal) before any row-wise filtering.
df = df.drop(columns=['id', 'ca', 'thal'])


In [8]:
# Keep only complete rows and renumber them 0..n-1 so that later positional
# operations (concat on axis=1, boolean masks) line up with the index.
df = df.dropna().reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 531 entries, 0 to 530
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       531 non-null    int64  
 1   sex       531 non-null    object 
 2   dataset   531 non-null    object 
 3   cp        531 non-null    object 
 4   trestbps  531 non-null    float64
 5   chol      531 non-null    float64
 6   fbs       531 non-null    object 
 7   restecg   531 non-null    object 
 8   thalch    531 non-null    float64
 9   exang     531 non-null    object 
 10  oldpeak   531 non-null    float64
 11  slope     531 non-null    object 
 12  num       531 non-null    int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 54.1+ KB


In [9]:
# Preview after dropping id/ca/thal and every row with a missing value.
df.head()

Unnamed: 0,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,num
0,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0
1,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,2
2,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,1
3,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0
4,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0


In [10]:
# Distinct values of the `dataset` column (presumably the collection site of
# each record) that remain after filtering.
df['dataset'].unique()

array(['Cleveland', 'Hungary', 'Switzerland', 'VA Long Beach'],
      dtype=object)

In [11]:
# Binary-encode sex: Female -> 0, Male -> 1.
sex_codes = {'Female': 0, 'Male': 1}
df['sex'] = df['sex'].map(sex_codes)

In [12]:
# One-hot encode the `dataset` column. The encoder needs a 2-D input, hence
# the reshape of the column values to a single-column array.
dataset = df['dataset'].values.reshape(-1, 1)
enc = OneHotEncoder(categories='auto').fit(dataset)

# Names the encoder generated for its output columns.
new_features = enc.get_feature_names_out()
print(new_features)

# transform() yields a sparse matrix; densify it before building the frame.
site_indicators = enc.transform(dataset).toarray()
new_dataset = pd.DataFrame(site_indicators)

['x0_Cleveland' 'x0_Hungary' 'x0_Switzerland' 'x0_VA Long Beach']


In [13]:
# Name the indicator columns after the categories the encoder actually
# learned. enc.categories_[0] lists them in the same order as the columns
# produced by enc.transform, so the labels cannot drift out of sync with the
# data the way a hand-written list can (e.g. if a category disappears after
# filtering, or the ordering changes).
new_dataset.columns = list(enc.categories_[0])

In [14]:
# Swap the original `dataset` string column for its one-hot indicator columns.
# Both frames carry a 0..n-1 index, so the column-wise concat aligns row by row.
df = pd.concat([df.drop(columns='dataset'), new_dataset], axis=1)

In [15]:
# Preview with the one-hot indicator columns appended and `dataset` removed.
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,num,Cleveland,Hungary,Switzerland,VA Long Beach
0,63,1,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0,1.0,0.0,0.0,0.0
1,67,1,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,2,1.0,0.0,0.0,0.0
2,67,1,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,1,1.0,0.0,0.0,0.0
3,37,1,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0,1.0,0.0,0.0,0.0
4,41,0,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0,1.0,0.0,0.0,0.0


In [16]:
# Distinct cp categories, needed to write the integer mapping that follows.
df['cp'].unique()

array(['typical angina', 'asymptomatic', 'non-anginal', 'atypical angina'],
      dtype=object)

In [17]:
# Replace the cp category strings with integer codes.
cp_codes = {
    'asymptomatic': 0,
    'non-anginal': 1,
    'atypical angina': 2,
    'typical angina': 3,
}
df['cp'] = df['cp'].map(cp_codes)

In [18]:
# Preview after cp has been converted to integer codes.
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,num,Cleveland,Hungary,Switzerland,VA Long Beach
0,63,1,3,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0,1.0,0.0,0.0,0.0
1,67,1,0,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,2,1.0,0.0,0.0,0.0
2,67,1,0,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,1,1.0,0.0,0.0,0.0
3,37,1,1,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0,1.0,0.0,0.0,0.0
4,41,0,2,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0,1.0,0.0,0.0,0.0


In [19]:
# Distinct fbs values before encoding (boolean values in an object column).
df['fbs'].unique()

array([True, False], dtype=object)

In [20]:
# Distinct fbs values; unique() returns them in order of first appearance,
# so the positions in `a` depend on the row order of df.
a = df['fbs'].unique()

In [21]:
# Encode fbs as 0/1 with an explicit False/True mapping. Keying the mapping on
# positions of unique() would tie the encoding to whichever value happens to
# appear first in the data and could silently invert it on a different row
# order; spelling out the keys gives the same result here (True -> 1,
# False -> 0) without that dependency.
df['fbs'] = df['fbs'].map({False: 0, True: 1})
df['fbs'].unique()

array([1, 0])

In [22]:
# Distinct restecg categories, needed to write the integer mapping that follows.
df['restecg'].unique()

array(['lv hypertrophy', 'normal', 'st-t abnormality'], dtype=object)

In [23]:
# Replace the restecg category strings with integer codes.
restecg_codes = {
    'normal': 0,
    'st-t abnormality': 1,
    'lv hypertrophy': 2,
}
df['restecg'] = df['restecg'].map(restecg_codes)

In [24]:
# Distinct exang values; unique() returns them in order of first appearance,
# so the positions in `b` depend on the row order of df.
b = df['exang'].unique()
b

array([False, True], dtype=object)

In [25]:
# Encode exang as 0/1 with an explicit False/True mapping. Keying the mapping
# on positions of unique() would make the encoding depend on which value shows
# up first in the data; the explicit keys produce the same result here
# (False -> 0, True -> 1) regardless of row order.
df['exang'] = df['exang'].map({False: 0, True: 1})
df['exang'].unique()

array([0, 1])

In [26]:
# Preview after fbs, restecg and exang have been converted to integer codes.
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,num,Cleveland,Hungary,Switzerland,VA Long Beach
0,63,1,3,145.0,233.0,1,2,150.0,0,2.3,downsloping,0,1.0,0.0,0.0,0.0
1,67,1,0,160.0,286.0,0,2,108.0,1,1.5,flat,2,1.0,0.0,0.0,0.0
2,67,1,0,120.0,229.0,0,2,129.0,1,2.6,flat,1,1.0,0.0,0.0,0.0
3,37,1,1,130.0,250.0,0,0,187.0,0,3.5,downsloping,0,1.0,0.0,0.0,0.0
4,41,0,2,130.0,204.0,0,2,172.0,0,1.4,upsloping,0,1.0,0.0,0.0,0.0


In [27]:
# Distinct slope categories, needed to write the integer mapping that follows.
df['slope'].unique()

array(['downsloping', 'flat', 'upsloping'], dtype=object)

In [28]:
# Replace the slope category strings with integer codes.
slope_codes = {
    'upsloping': 0,
    'flat': 1,
    'downsloping': 2,
}
df['slope'] = df['slope'].map(slope_codes)

In [29]:
# Preview of the fully numeric frame, after slope has been encoded.
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,num,Cleveland,Hungary,Switzerland,VA Long Beach
0,63,1,3,145.0,233.0,1,2,150.0,0,2.3,2,0,1.0,0.0,0.0,0.0
1,67,1,0,160.0,286.0,0,2,108.0,1,1.5,1,2,1.0,0.0,0.0,0.0
2,67,1,0,120.0,229.0,0,2,129.0,1,2.6,1,1,1.0,0.0,0.0,0.0
3,37,1,1,130.0,250.0,0,0,187.0,0,3.5,2,0,1.0,0.0,0.0,0.0
4,41,0,2,130.0,204.0,0,2,172.0,0,1.4,0,0,1.0,0.0,0.0,0.0


In [30]:
# Separate the features from the target column `num`.
X = df.drop(columns='num')
y = df['num'].to_numpy()

In [31]:
# Filter out low-variance features (threshold 0.1) and report the feature
# matrix shape before and after the filter.
print(X.shape)
selector = VarianceThreshold(threshold=0.1).fit(X)
X = selector.transform(X)
print(X.shape)

(531, 15)
(531, 14)


In [32]:
# Standardise the retained features using statistics from the whole cohort.
# NOTE(review): the scaler is fit on all rows before the K-fold splitting
# below, so held-out folds contribute to the scaling statistics unless
# fl.run_experiment rescales internally - confirm.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Boolean row selectors per sex code (0 = Female, 1 = Male). They are built
# from df, whose rows are in the same order as X_scaled and y.
gender_0_mask = df['sex'].eq(0)
gender_1_mask = df['sex'].eq(1)

count_gender_0, count_gender_1 = gender_0_mask.sum(), gender_1_mask.sum()

print("Female: ", count_gender_0)
print("Male: ", count_gender_1)

# Per-group feature matrices and targets.
X_scaled_Gender_0, y_Gender_0 = X_scaled[gender_0_mask], y[gender_0_mask]
X_scaled_Gender_1, y_Gender_1 = X_scaled[gender_1_mask], y[gender_1_mask]

Female:  127
Male:  404


In [33]:
# Shared experiment configuration: one seed drives both the fold shuffling
# and the model construction.
seed = 42

# 5-fold splitter with shuffling, reused for every group.
kf = KFold(n_splits=5, shuffle=True, random_state=seed)

models = fl.build_models(seed)

# Collector that is handed to fl.run_experiment for each group.
results_list = list()

In [34]:
# Start from an empty collector so that re-running this cell does not leave a
# second copy of every result in results_list. (The list is handed to
# fl.run_experiment and concatenated afterwards, so the helper presumably
# appends to it; on a fresh run the list is already empty and this is a no-op.)
results_list.clear()

print("Starting experiments for Female(0)")
fl.run_experiment(kf, models, X_scaled_Gender_0, y_Gender_0, 'Female', results_list)

print("Starting experiments for Male(1)")
fl.run_experiment(kf, models, X_scaled_Gender_1, y_Gender_1, 'Male', results_list)

# Stack everything collected for both groups into one table.
final_results_df = pd.concat(results_list, ignore_index=True)
print(final_results_df)

Starting experiments for Female(0)
Processing fold 1 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 2 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 3 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 4 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 5 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Starting experiments for Male(1)
Processing fold 1 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 2 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 3 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 4 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 5 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
   Fold   Group   SVM_TPR   SVM_TNR   SVM_FPR   SVM_FNR  SVM_TP  SVM_TN  \
0     1  Female  0.142857  1.000000  0.000000  0.857143       1      17   
1     2  Female  0.000000  0.941176  0.058824  1.000000       0      16   
2     3  Female  0.250000  1.000000  0.000000  0.750000       1      19   
3     4  Female  0.400000  0.823529  0.176471  0.600000       2      14   
4     5  Female  0.500000  0.882353  0.117647  0.500000       2      15   
5     1    Male  0.650000  0.800000  0.200000  0.350000      13      24   
6     2    Male  0.684211  0.846154  0.153846  0.315789      13      22   
7     3    Male  0.833333  0.875000  0.125000  0.166667      20      14   
8     4    Male  



In [35]:
# Concatenate everything collected in results_list into one frame with a
# fresh 0..n-1 index.
# NOTE(review): this is the same expression that produced final_results_df in
# the previous cell; the two names hold equal content.
results_df = pd.concat(results_list, ignore_index=True)

In [36]:
# Persist the combined results as an Excel workbook.
result_path = './results/K40_result.xlsx'
# to_excel does not create missing parent folders, so make sure ./results
# exists; otherwise the write fails on a fresh checkout.
os.makedirs(os.path.dirname(result_path), exist_ok=True)
results_df.to_excel(result_path, index=False)

In [37]:
# Read the saved results workbook back and preview it.
# NOTE(review): this rebinds `df`, which until this cell held the preprocessed
# feature frame; any later cell that uses `df` now sees the results table.
df = pd.read_excel(result_path)
df.head()

Unnamed: 0,Fold,Group,SVM_TPR,SVM_TNR,SVM_FPR,SVM_FNR,SVM_TP,SVM_TN,SVM_FP,SVM_FN,...,ANN_FP,ANN_FN,NB_TPR,NB_TNR,NB_FPR,NB_FNR,NB_TP,NB_TN,NB_FP,NB_FN
0,1,Female,0.142857,1.0,0.0,0.857143,1,17,0,6,...,0,4,0.75,0.5,0.5,0.25,3,2,2,1
1,2,Female,0.0,0.941176,0.058824,1.0,0,16,1,3,...,1,3,0.0,0.75,0.25,0.0,0,3,1,0
2,3,Female,0.25,1.0,0.0,0.75,1,19,0,3,...,2,3,1.0,1.0,0.0,0.0,4,4,0,0
3,4,Female,0.4,0.823529,0.176471,0.6,2,14,3,3,...,3,3,0.5,0.727273,0.272727,0.5,2,8,3,2
4,5,Female,0.5,0.882353,0.117647,0.5,2,15,2,2,...,3,2,0.0,0.833333,0.166667,1.0,0,5,1,2


In [38]:
# Run the statistical comparison for every classifier, in the same order as
# before; each call prints its test results for that model.
label = 'Female'

for model_name in ('SVM', 'DT', 'RF', 'LR', 'KNN', 'ANN', 'NB'):
    fl.perform_t_tests(df, model_name, label)


SVM -TPR: MannwhitneyuResult(statistic=np.float64(0.0), pvalue=np.float64(0.0119252335930176))
SVM - FPR: TtestResult(statistic=np.float64(-2.854332726971024), pvalue=np.float64(0.02133645756754084), df=np.float64(8.0))
SVM - FN/FP: MannwhitneyuResult(statistic=np.float64(15.0), pvalue=np.float64(0.6751736149271245))
DT -TPR: MannwhitneyuResult(statistic=np.float64(1.0), pvalue=np.float64(0.02000818529160622))
DT - FPR: TtestResult(statistic=np.float64(-2.333610065409396), pvalue=np.float64(0.04789103278120549), df=np.float64(8.0))
DT - FN/FP: TtestResult(statistic=np.float64(0.2962832493728277), pvalue=np.float64(0.774559570194362), df=np.float64(8.0))
RF -TPR: TtestResult(statistic=np.float64(-2.7426193816838143), pvalue=np.float64(0.05022851461938338), df=np.float64(4.1116146179603135))
RF - FPR: TtestResult(statistic=np.float64(-2.632314965171132), pvalue=np.float64(0.030069913938544367), df=np.float64(8.0))
RF - FN/FP: MannwhitneyuResult(statistic=np.float64(17.0), pvalue=np.float