In [1]:
import kagglehub

# Fetch the latest published version of the dataset through kagglehub and
# keep the directory it reports; later cells read the CSV from `path`.
DATASET_HANDLE = "rashadrmammadov/heart-disease-prediction"
path = kagglehub.dataset_download(DATASET_HANDLE)

print(f"Path to dataset files: {path}")

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /home/morning/.cache/kagglehub/datasets/rashadrmammadov/heart-disease-prediction/versions/1


In [2]:
import os

# os.listdir returns entries in arbitrary order; sorting keeps the displayed
# listing stable between runs and machines.
sorted(os.listdir(path))

['heart_disease_dataset.csv']

In [3]:
import pandas as pd
import fairtl_statisticaltest as fl
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [4]:
# Load the single CSV shipped with the dataset from the download directory.
csv_file = os.path.join(path, 'heart_disease_dataset.csv')
df = pd.read_csv(csv_file)

In [5]:
# Dimensions of the freshly loaded frame as (rows, columns).
df.shape

(1000, 16)

In [6]:
# Per-column non-null counts and dtypes, used to spot missing values and
# the object (string) columns that still need encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Age                      1000 non-null   int64 
 1   Gender                   1000 non-null   object
 2   Cholesterol              1000 non-null   int64 
 3   Blood Pressure           1000 non-null   int64 
 4   Heart Rate               1000 non-null   int64 
 5   Smoking                  1000 non-null   object
 6   Alcohol Intake           660 non-null    object
 7   Exercise Hours           1000 non-null   int64 
 8   Family History           1000 non-null   object
 9   Diabetes                 1000 non-null   object
 10  Obesity                  1000 non-null   object
 11  Stress Level             1000 non-null   int64 
 12  Blood Sugar              1000 non-null   int64 
 13  Exercise Induced Angina  1000 non-null   object
 14  Chest Pain Type          1000 non-null   

In [7]:
# Discard 'Alcohol Intake', the only column reported with missing values
# (660 of 1000 non-null). The drop is non-mutating and tolerates the column
# already being absent, so re-running this cell is a no-op instead of a KeyError.
# NOTE(review): if the raw CSV stores a literal "None" category here, pandas'
# default NA parsing would have read it as missing -- confirm before treating
# these rows as genuinely unknown.
df = df.drop(columns='Alcohol Intake', errors='ignore')
df.head()

Unnamed: 0,Age,Gender,Cholesterol,Blood Pressure,Heart Rate,Smoking,Exercise Hours,Family History,Diabetes,Obesity,Stress Level,Blood Sugar,Exercise Induced Angina,Chest Pain Type,Heart Disease
0,75,Female,228,119,66,Current,1,No,No,Yes,8,119,Yes,Atypical Angina,1
1,48,Male,204,165,62,Current,5,No,No,No,9,70,Yes,Typical Angina,0
2,53,Male,234,91,67,Never,3,Yes,No,Yes,5,196,Yes,Atypical Angina,1
3,69,Female,192,90,72,Current,4,No,Yes,No,7,107,Yes,Non-anginal Pain,0
4,62,Female,172,163,93,Never,6,No,Yes,No,2,183,Yes,Asymptomatic,0


In [8]:
# Re-check the schema now that 'Alcohol Intake' has been dropped.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Age                      1000 non-null   int64 
 1   Gender                   1000 non-null   object
 2   Cholesterol              1000 non-null   int64 
 3   Blood Pressure           1000 non-null   int64 
 4   Heart Rate               1000 non-null   int64 
 5   Smoking                  1000 non-null   object
 6   Exercise Hours           1000 non-null   int64 
 7   Family History           1000 non-null   object
 8   Diabetes                 1000 non-null   object
 9   Obesity                  1000 non-null   object
 10  Stress Level             1000 non-null   int64 
 11  Blood Sugar              1000 non-null   int64 
 12  Exercise Induced Angina  1000 non-null   object
 13  Chest Pain Type          1000 non-null   object
 14  Heart Disease            1000 non-null   

In [9]:
# Preview the first rows while the categorical columns still hold strings.
df.head()

Unnamed: 0,Age,Gender,Cholesterol,Blood Pressure,Heart Rate,Smoking,Exercise Hours,Family History,Diabetes,Obesity,Stress Level,Blood Sugar,Exercise Induced Angina,Chest Pain Type,Heart Disease
0,75,Female,228,119,66,Current,1,No,No,Yes,8,119,Yes,Atypical Angina,1
1,48,Male,204,165,62,Current,5,No,No,No,9,70,Yes,Typical Angina,0
2,53,Male,234,91,67,Never,3,Yes,No,Yes,5,196,Yes,Atypical Angina,1
3,69,Female,192,90,72,Current,4,No,Yes,No,7,107,Yes,Non-anginal Pain,0
4,62,Female,172,163,93,Never,6,No,Yes,No,2,183,Yes,Asymptomatic,0


In [10]:
# Distinct chest-pain labels present in the data; each one needs an integer
# code in the encoding step that follows.
df['Chest Pain Type'].unique()

array(['Atypical Angina', 'Typical Angina', 'Non-anginal Pain',
       'Asymptomatic'], dtype=object)

In [11]:
# Integer codes for every string-valued column, keyed by column name.
CATEGORY_CODES = {
    'Gender': {'Female': 0, 'Male': 1},
    'Smoking': {'Never': 0, 'Former': 1, 'Current': 2},
    'Family History': {'No': 0, 'Yes': 1},
    'Diabetes': {'No': 0, 'Yes': 1},
    'Obesity': {'No': 0, 'Yes': 1},
    'Exercise Induced Angina': {'No': 0, 'Yes': 1},
    'Chest Pain Type': {
        'Asymptomatic': 0,
        'Non-anginal Pain': 1,
        'Atypical Angina': 2,
        'Typical Angina': 3,
    },
}

# Series.map yields NaN for any value missing from the mapping, which would
# silently corrupt a column (including when this cell is re-run on data that
# is already encoded). Validate every column first and only then assign, so
# a failure never leaves `df` half-encoded.
encoded_columns = {}
for column, codes in CATEGORY_CODES.items():
    encoded = df[column].map(codes)
    unmapped = encoded.isna()
    if unmapped.any():
        raise ValueError(
            f"Column {column!r} contains values with no integer code: "
            f"{df.loc[unmapped, column].unique().tolist()}"
        )
    encoded_columns[column] = encoded.astype('int64')

# Existing columns are replaced in place, so the column order is unchanged.
df = df.assign(**encoded_columns)

In [12]:
# Preview the first rows to confirm the categorical columns now hold integer codes.
df.head()

Unnamed: 0,Age,Gender,Cholesterol,Blood Pressure,Heart Rate,Smoking,Exercise Hours,Family History,Diabetes,Obesity,Stress Level,Blood Sugar,Exercise Induced Angina,Chest Pain Type,Heart Disease
0,75,0,228,119,66,2,1,0,0,1,8,119,1,2,1
1,48,1,204,165,62,2,5,0,0,0,9,70,1,3,0
2,53,1,234,91,67,0,3,1,0,1,5,196,1,2,1
3,69,0,192,90,72,2,4,0,1,0,7,107,1,1,0
4,62,0,172,163,93,0,6,0,1,0,2,183,1,0,0


In [13]:
# Confirm that every column is numeric and fully populated after encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype
---  ------                   --------------  -----
 0   Age                      1000 non-null   int64
 1   Gender                   1000 non-null   int64
 2   Cholesterol              1000 non-null   int64
 3   Blood Pressure           1000 non-null   int64
 4   Heart Rate               1000 non-null   int64
 5   Smoking                  1000 non-null   int64
 6   Exercise Hours           1000 non-null   int64
 7   Family History           1000 non-null   int64
 8   Diabetes                 1000 non-null   int64
 9   Obesity                  1000 non-null   int64
 10  Stress Level             1000 non-null   int64
 11  Blood Sugar              1000 non-null   int64
 12  Exercise Induced Angina  1000 non-null   int64
 13  Chest Pain Type          1000 non-null   int64
 14  Heart Disease            1000 non-null   int64
dtypes: in

In [14]:
# Separate the feature table from the 'Heart Disease' target column;
# the target is kept as a plain NumPy array.
target_column = 'Heart Disease'
X = df.drop(columns=target_column)
y = df[target_column].values

In [15]:
# Apply a variance filter (threshold 0.1) to the features and report the
# shape before and after. `X` is rebound to the transformer's output.
print(X.shape)
selector = VarianceThreshold(threshold=0.1)
X = selector.fit(X).transform(X)
print(X.shape)

(1000, 14)
(1000, 14)


In [16]:
# Boolean row selectors for each gender code (0 = Female, 1 = Male).
gender_codes = df['Gender']
gender_0_mask = gender_codes == 0
gender_1_mask = gender_codes == 1

# Group sizes, reported before the data is split.
count_gender_0 = gender_0_mask.sum()
count_gender_1 = gender_1_mask.sum()
print("Female: ", count_gender_0)
print("Male: ", count_gender_1)

# Standardise all features once, over every row.
# NOTE(review): the scaler is fit on the full data before the K-fold split
# defined later; unless fl.run_experiment re-scales inside each fold, the
# held-out folds influence the scaling statistics -- confirm.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Per-gender views of the scaled features and the matching targets.
X_scaled_Gender_0, y_Gender_0 = X_scaled[gender_0_mask], y[gender_0_mask]
X_scaled_Gender_1, y_Gender_1 = X_scaled[gender_1_mask], y[gender_1_mask]

Female:  503
Male:  497


In [17]:
# One seed drives both the fold shuffling and the model construction.
seed = 42

# 10-fold cross-validation with shuffled, seeded splits.
kf = KFold(n_splits=10, shuffle=True, random_state=seed)

# Models to evaluate, built by the project helper from the same seed.
models = fl.build_models(seed)

# Accumulator handed to fl.run_experiment and concatenated afterwards.
results_list = []

In [18]:
# Start from an empty accumulator so that re-running this cell does not stack
# a second copy of every fold on top of the previous run.
# (fl.run_experiment is presumed to append its per-fold result frames to the
# list it receives, since the list is concatenated right after -- confirm.)
results_list = []

# (group label, gender code, features, targets) for each subgroup experiment.
group_runs = [
    ('Female', 0, X_scaled_Gender_0, y_Gender_0),
    ('Male', 1, X_scaled_Gender_1, y_Gender_1),
]

for group_label, gender_code, X_group, y_group in group_runs:
    print(f"Starting experiments for {group_label}({gender_code})")
    fl.run_experiment(kf, models, X_group, y_group, group_label, results_list)

# One table holding every fold of both groups, with a fresh 0..n-1 index.
final_results_df = pd.concat(results_list, ignore_index=True)
print(final_results_df)

Starting experiments for Female(0)
Processing fold 1 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 2 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 3 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 4 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 5 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 6 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 7 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 8 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 9 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 10 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Starting experiments for Male(1)
Processing fold 1 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 2 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 3 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 4 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 5 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 6 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 7 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 8 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 9 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 10 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
    Fold   Group   SVM_TPR   SVM_TNR   SVM_FPR   SVM_FNR  SVM_TP  SVM_TN  \
0      1  Female  0.812500  0.914286  0.085714  0.187500      13      32   
1      2  Female  0.827586  0.954545  0.045455  0.172414      24      21   
2      3  Female  0.842105  0.968750  0.031250  0.157895      16      31   
3      4  Female  0.833333  0.947368  0.052632  0.166667      10      36   
4      5  Female  0.857143  0.972222  0.027778  0.142857      12      35   
5      6  Female  1.000000  0.968750  0.031250  0.000000      18      31   
6      7  Female  0.708333  1.000000  0.000000  0.291667      17      26   
7      8  Female  0.705882  0.969697  0.030303  0.294118      12      32   
8      



In [19]:
# Concatenate the collected result frames into one table with a fresh
# 0..n-1 index (the same concatenation that produced `final_results_df`).
results_df = pd.concat(results_list, ignore_index=True)

In [20]:
# Destination workbook for the per-fold metrics.
result_path = './results/K22_result.xlsx'

# to_excel does not create missing parent directories; make sure the output
# folder exists so a fresh checkout does not fail on the write.
os.makedirs(os.path.dirname(result_path), exist_ok=True)

# Write the table without the positional index column.
results_df.to_excel(result_path, index=False)

In [21]:
# Read the saved results workbook back in and preview it.
# NOTE(review): this rebinds `df`, which until now held the patient dataset;
# earlier cells that read `df` will not work if re-run after this point.
df = pd.read_excel(result_path)
df.head()

Unnamed: 0,Fold,Group,SVM_TPR,SVM_TNR,SVM_FPR,SVM_FNR,SVM_TP,SVM_TN,SVM_FP,SVM_FN,...,ANN_FP,ANN_FN,NB_TPR,NB_TNR,NB_FPR,NB_FNR,NB_TP,NB_TN,NB_FP,NB_FN
0,1,Female,0.8125,0.914286,0.085714,0.1875,13,32,3,3,...,3,2,0.8125,0.971429,0.028571,0.1875,13,34,1,3
1,2,Female,0.827586,0.954545,0.045455,0.172414,24,21,1,5,...,1,4,0.758621,1.0,0.0,0.241379,22,22,0,7
2,3,Female,0.842105,0.96875,0.03125,0.157895,16,31,1,3,...,2,2,0.789474,0.96875,0.03125,0.210526,15,31,1,4
3,4,Female,0.833333,0.947368,0.052632,0.166667,10,36,2,2,...,2,0,0.833333,0.921053,0.078947,0.166667,10,35,3,2
4,5,Female,0.857143,0.972222,0.027778,0.142857,12,35,1,2,...,2,1,1.0,1.0,0.0,0.0,14,36,0,0


In [22]:
label = 'Female'

# Run the project's significance tests once per model, in the same order as
# before: SVM, DT, RF, LR, KNN, ANN, NB.
model_names = ('SVM', 'DT', 'RF', 'LR', 'KNN', 'ANN', 'NB')

last_result = None
for model_name in model_names:
    last_result = fl.perform_t_tests(df, model_name, label)

# Keep the final call's return value as the cell's last expression, so
# whatever the notebook displayed for it before is displayed unchanged.
last_result


SVM -TPR: TtestResult(statistic=np.float64(-0.4981342257369216), pvalue=np.float64(0.6244209221865927), df=np.float64(18.0))
SVM - FPR: TtestResult(statistic=np.float64(-1.6363568703572164), pvalue=np.float64(0.11912945007720623), df=np.float64(18.0))
SVM - FN/FP: MannwhitneyuResult(statistic=np.float64(52.0), pvalue=np.float64(0.9092450221573444))
DT -TPR: MannwhitneyuResult(statistic=np.float64(50.0), pvalue=np.float64(1.0))
DT - FPR: MannwhitneyuResult(statistic=np.float64(50.0), pvalue=np.float64(1.0))
DT - FN/FP: MannwhitneyuResult(statistic=np.float64(50.0), pvalue=np.float64(1.0))
RF -TPR: MannwhitneyuResult(statistic=np.float64(45.0), pvalue=np.float64(0.36812025069351895))
RF - FPR: MannwhitneyuResult(statistic=np.float64(50.0), pvalue=np.float64(1.0))
RF - FN/FP: MannwhitneyuResult(statistic=np.float64(55.0), pvalue=np.float64(0.36812025069351895))
LR -TPR: TtestResult(statistic=np.float64(-0.19835044620174758), pvalue=np.float64(0.8449938997699813), df=np.float64(18.0))
LR -