In [1]:
import kagglehub

# Fetch the newest published version of the dataset through kagglehub;
# the call returns the local directory that holds the downloaded files.
path = kagglehub.dataset_download(
    "kapoorprakhar/cardio-health-risk-assessment-dataset"
)

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /home/morning/.cache/kagglehub/datasets/kapoorprakhar/cardio-health-risk-assessment-dataset/versions/1


In [2]:
import os
# Show the entries found in the downloaded dataset directory.
os.listdir(path)

['Heart_Disease_Prediction.csv']

In [3]:
import pandas as pd
import fairtl_statisticaltest as fl
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [4]:
# Load the CSV shipped with the dataset. The location is assembled with
# os.path.join so the path separator is correct on every platform.
df = pd.read_csv(os.path.join(path, 'Heart_Disease_Prediction.csv'))
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,70,1,4,130,322,0,2,109,0,2.4,2,3,3,Presence
1,80,0,3,115,564,0,2,160,0,1.6,2,0,7,Absence
2,55,1,2,124,261,0,0,141,0,0.3,1,0,7,Presence
3,65,1,4,128,263,0,0,105,1,0.2,2,1,7,Absence
4,45,0,2,120,269,0,2,121,1,0.2,1,1,3,Absence


In [5]:
# Table dimensions as (rows, columns).
df.shape

(270, 14)

In [6]:
# Distinct label values present in the target column.
df['Heart Disease'].unique()

array(['Presence', 'Absence'], dtype=object)

In [7]:
# Encode the target as integers: 'Absence' -> 0, 'Presence' -> 1.
# Values that are already encoded pass through untouched, so re-running this
# cell is harmless (a plain dict .map() would turn every 0/1 into NaN).
label_codes = {'Absence': 0, 'Presence': 1}
df['Heart Disease'] = df['Heart Disease'].map(
    lambda label: label_codes.get(label, label)
)
# Fail loudly if anything other than the two known labels was present.
assert df['Heart Disease'].isin([0, 1]).all(), "unexpected value in 'Heart Disease'"

In [8]:
# Summarise column names, non-null counts and dtypes of the table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270 entries, 0 to 269
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      270 non-null    int64  
 1   Sex                      270 non-null    int64  
 2   Chest pain type          270 non-null    int64  
 3   BP                       270 non-null    int64  
 4   Cholesterol              270 non-null    int64  
 5   FBS over 120             270 non-null    int64  
 6   EKG results              270 non-null    int64  
 7   Max HR                   270 non-null    int64  
 8   Exercise angina          270 non-null    int64  
 9   ST depression            270 non-null    float64
 10  Slope of ST              270 non-null    int64  
 11  Number of vessels fluro  270 non-null    int64  
 12  Thallium                 270 non-null    int64  
 13  Heart Disease            270 non-null    int64  
dtypes: float64(1), int64(13)
m

In [9]:
# Preview the first rows again, now that the target column has been re-coded.
df.head()

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,70,1,4,130,322,0,2,109,0,2.4,2,3,3,1
1,80,0,3,115,564,0,2,160,0,1.6,2,0,7,0
2,55,1,2,124,261,0,0,141,0,0.3,1,0,7,1
3,65,1,4,128,263,0,0,105,1,0.2,2,1,7,0
4,45,0,2,120,269,0,2,121,1,0.2,1,1,3,0


In [10]:
# Separate the predictors from the label: X keeps every column except the
# target, y is the target as a plain NumPy array.
X = df.drop(columns=['Heart Disease'])
y = df['Heart Disease'].to_numpy()

In [11]:
# Report the feature-matrix shape, filter columns with VarianceThreshold
# (threshold 0.1), then report the shape again to see how many survived.
print(X.shape)
selector = VarianceThreshold(threshold=0.1)
selector.fit(X)
# X is rebound to the transformer's output from here on.
X = selector.transform(X)
print(X.shape)

(270, 13)
(270, 13)


In [12]:
# Boolean row selectors for each value of the 'Sex' column
# (0 is reported as Female and 1 as Male in the prints below).
gender_0_mask = df['Sex'].eq(0)
gender_1_mask = df['Sex'].eq(1)

# Group sizes are the number of True entries in each selector.
count_gender_0 = gender_0_mask.sum()
count_gender_1 = gender_1_mask.sum()

print("Female: ", count_gender_0)
print("Male: ", count_gender_1)

# Standardise the features using statistics computed over all rows.
# NOTE(review): the scaler is fit before the K-fold split, so held-out rows
# contribute to the scaling statistics — confirm this is intended or fit the
# scaler inside each fold instead.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Slice features and labels into the two groups with the same selectors.
X_scaled_Gender_0, y_Gender_0 = X_scaled[gender_0_mask], y[gender_0_mask]
X_scaled_Gender_1, y_Gender_1 = X_scaled[gender_1_mask], y[gender_1_mask]

Female:  87
Male:  183


In [13]:
# One seed drives both the fold shuffling and the model construction.
seed = 42

# 5-fold cross-validation with shuffling, reproducible through the seed.
kf = KFold(n_splits=5, shuffle=True, random_state=seed)

# Models to evaluate, as built by the project helper.
models = fl.build_models(seed)

# Collector passed to the experiment runs and concatenated afterwards.
results_list = list()

In [14]:
# Start from an empty collector so that re-running this cell does not add a
# second copy of every fold's results to the list created earlier.
results_list.clear()

# Each call receives the shared fold splitter, the models, one group's
# features/labels, a group label, and the collector for its results.
print("Starting experiments for Female(0)")
fl.run_experiment(kf, models, X_scaled_Gender_0, y_Gender_0, 'Female', results_list)

print("Starting experiments for Male(1)")
fl.run_experiment(kf, models, X_scaled_Gender_1, y_Gender_1, 'Male', results_list)

# Stack everything collected above into one table with a fresh 0..n-1 index.
final_results_df = pd.concat(results_list, ignore_index=True)
print(final_results_df)

Starting experiments for Female(0)
Processing fold 1 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 2 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 3 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 4 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 5 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Starting experiments for Male(1)
Processing fold 1 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 2 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 3 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 4 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 5 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
   Fold   Group   SVM_TPR   SVM_TNR   SVM_FPR   SVM_FNR  SVM_TP  SVM_TN  \
0     1  Female  0.500000  1.000000  0.000000  0.500000       1      16   
1     2  Female  0.750000  1.000000  0.000000  0.250000       3      14   
2     3  Female  0.400000  1.000000  0.000000  0.600000       2      12   
3     4  Female  1.000000  0.933333  0.066667  0.000000       2      14   
4     5  Female  0.428571  0.900000  0.100000  0.571429       3       9   
5     1    Male  0.736842  0.888889  0.111111  0.263158      14      16   
6     2    Male  0.760000  0.750000  0.250000  0.240000      19       9   
7     3    Male  0.807692  0.909091  0.090909  0.192308      21      10   
8     4    Male  0.928571  0.545455  0.454545  0.071429      13      12   
9     5    Male  0.812500  0.750000  0.250000  0.187500      13      15   

   SVM_FP  SVM_FN  ...  ANN_FP  ANN_FN    NB_TPR    NB_TNR    NB_FPR  \



In [15]:
# Stack the collected result frames row-wise into one table, renumbering the index.
results_df = pd.concat(objs=results_list, axis=0, ignore_index=True)

In [16]:
result_path = './results/K18_result.xlsx'
# to_excel does not create missing folders, so make sure the output
# directory exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(result_path), exist_ok=True)
# Write the results without the DataFrame index column.
results_df.to_excel(result_path, index=False)

In [17]:
# Read the saved results back from disk.
# NOTE: this rebinds `df`, which held the heart-disease table until now.
df = pd.read_excel(io=result_path)
# Preview the first five rows of the reloaded results.
df.head(n=5)

Unnamed: 0,Fold,Group,SVM_TPR,SVM_TNR,SVM_FPR,SVM_FNR,SVM_TP,SVM_TN,SVM_FP,SVM_FN,...,ANN_FP,ANN_FN,NB_TPR,NB_TNR,NB_FPR,NB_FNR,NB_TP,NB_TN,NB_FP,NB_FN
0,1,Female,0.5,1.0,0.0,0.5,1,16,0,1,...,1,0,1.0,0.9375,0.0625,0.0,2,15,1,0
1,2,Female,0.75,1.0,0.0,0.25,3,14,0,1,...,0,1,1.0,0.928571,0.071429,0.0,4,13,1,0
2,3,Female,0.4,1.0,0.0,0.6,2,12,0,3,...,2,0,0.8,0.916667,0.083333,0.2,4,11,1,1
3,4,Female,1.0,0.933333,0.066667,0.0,2,14,1,0,...,1,0,1.0,0.933333,0.066667,0.0,2,14,1,0
4,5,Female,0.428571,0.9,0.1,0.571429,3,9,1,4,...,1,3,0.571429,0.7,0.3,0.428571,4,7,3,3


In [18]:
label = 'Female'

# Run the same statistical tests for every model, in the original order.
# Only the helper's printed output is relied on; any return value is not used here.
for model_name in ('SVM', 'DT', 'RF', 'LR', 'KNN', 'ANN', 'NB'):
    fl.perform_t_tests(df, model_name, label)


SVM -TPR: TtestResult(statistic=np.float64(-1.6271302987475333), pvalue=np.float64(0.14236078760425624), df=np.float64(8.0))
SVM - FPR: MannwhitneyuResult(statistic=np.float64(1.0), pvalue=np.float64(0.019624414976639706))
SVM - FN/FP: TtestResult(statistic=np.float64(1.8257418582458775), pvalue=np.float64(0.10532192113422431), df=np.float64(8.0))
DT -TPR: TtestResult(statistic=np.float64(-2.4426956364529735), pvalue=np.float64(0.040394307478818905), df=np.float64(8.0))
DT - FPR: TtestResult(statistic=np.float64(-3.998938797668668), pvalue=np.float64(0.003955627458726489), df=np.float64(8.0))
DT - FN/FP: MannwhitneyuResult(statistic=np.float64(13.0), pvalue=np.float64(1.0))
RF -TPR: TtestResult(statistic=np.float64(-1.3877510905390327), pvalue=np.float64(0.20264061752396306), df=np.float64(8.0))
RF - FPR: TtestResult(statistic=np.float64(-3.5086414890706994), pvalue=np.float64(0.00797744129484177), df=np.float64(8.0))
RF - FN/FP: MannwhitneyuResult(statistic=np.float64(17.0), pvalue=np