In [1]:
import kagglehub

# Fetch the latest version of the dataset through kagglehub. The returned
# value is the local directory that holds the downloaded files.
dataset_handle = "nezahatkk/heart-disease-data"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /home/morning/.cache/kagglehub/datasets/nezahatkk/heart-disease-data/versions/1


In [2]:
import os
# List the files in the downloaded dataset directory to find the CSV's name.
os.listdir(path)

['heart_disease_data_with_features.csv']

In [3]:
import pandas as pd
import fairtl_statisticaltest as fl
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [4]:
# Load the feature-engineered heart-disease CSV from the download directory.
# The file path is assembled with os.path.join instead of string formatting
# so that path separators are handled by the standard library.
df = pd.read_csv(os.path.join(path, 'heart_disease_data_with_features.csv'))
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,...,chol_trestbps_ratio,log_thalach_chol,symptom_zscore,avg_chol_by_age_group,thalach_chol_diff,symptom_severity_diff,age_chol_effect,thalach_risk_effect,age_trestbps_effect,chol_risk_ratio
0,63,1,1,145,233,1,2,150,0,2.3,...,1.59589,0.919704,-0.275764,260.024691,-83,-2.532099,14679,2419.35,9135,13.602662
1,67,1,4,160,286,0,2,108,1,1.5,...,1.776398,0.828936,0.568702,260.024691,-178,1.167901,19162,2242.296,10720,13.142174
2,67,1,4,120,229,0,2,129,1,2.6,...,1.892562,0.895083,1.572932,260.024691,-100,5.567901,15343,2134.047,8040,13.05364
3,37,1,3,130,250,0,0,187,0,3.5,...,1.908397,0.947695,1.595755,219.785714,-63,8.178571,9250,1972.85,4810,21.645022
4,41,0,2,130,204,0,2,172,0,1.4,...,1.557252,0.968116,-0.161647,235.847222,-32,0.783333,8364,1662.208,5330,19.129782


In [5]:
# Size check of the raw frame: (rows, columns).
df.shape

(303, 36)

In [6]:
# Per-column dtypes and non-null counts; used to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 36 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   age                    303 non-null    int64  
 1   sex                    303 non-null    int64  
 2   cp                     303 non-null    int64  
 3   trestbps               303 non-null    int64  
 4   chol                   303 non-null    int64  
 5   fbs                    303 non-null    int64  
 6   restecg                303 non-null    int64  
 7   thalach                303 non-null    int64  
 8   exang                  303 non-null    int64  
 9   oldpeak                303 non-null    float64
 10  slope                  303 non-null    int64  
 11  ca                     299 non-null    float64
 12  thal                   301 non-null    float64
 13  num                    303 non-null    int64  
 14  age_group              302 non-null    object 
 15  choles

In [7]:
# Discard every row that has at least one missing value, then renumber the
# index from zero so it is contiguous again (the one-hot columns are later
# joined column-wise, which aligns rows by index).
df = df.dropna().reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 293 entries, 0 to 292
Data columns (total 36 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   age                    293 non-null    int64  
 1   sex                    293 non-null    int64  
 2   cp                     293 non-null    int64  
 3   trestbps               293 non-null    int64  
 4   chol                   293 non-null    int64  
 5   fbs                    293 non-null    int64  
 6   restecg                293 non-null    int64  
 7   thalach                293 non-null    int64  
 8   exang                  293 non-null    int64  
 9   oldpeak                293 non-null    float64
 10  slope                  293 non-null    int64  
 11  ca                     293 non-null    float64
 12  thal                   293 non-null    float64
 13  num                    293 non-null    int64  
 14  age_group              293 non-null    object 
 15  choles

In [8]:
# Inspect the distinct age-bucket labels before encoding them.
df['age_group'].unique()

array(['60s', '30s', '40s', '50s', '70s'], dtype=object)

In [9]:
# One-hot encode the categorical age bucket.
# The column is reshaped to (n_rows, 1) because it is passed to the encoder
# as a single-feature 2-D array.
age = df['age_group'].values.reshape(-1, 1)
enc = OneHotEncoder(categories='auto').fit(age)

# Show the generated feature names so the column order is visible.
new_features = enc.get_feature_names_out()
print(new_features)

# Densify the encoder output before wrapping it in a DataFrame.
encoded_age = enc.transform(age)
new_age = pd.DataFrame(encoded_age.toarray())

['x0_30s' 'x0_40s' 'x0_50s' 'x0_60s' 'x0_70s']


In [10]:
# Label the one-hot columns with the categories the encoder actually learned
# rather than a hand-typed list: a hard-coded list would silently mislabel
# columns (or fail on a length mismatch) if the age groups present in the
# data ever change. categories_[0] follows the order of the encoder's output
# columns; for the current data it is ['30s', '40s', '50s', '60s', '70s'].
new_age.columns = enc.categories_[0].tolist()

In [11]:
# Append the one-hot age columns side by side with the existing features and
# remove the original string column they replace.
df = pd.concat([df, new_age], axis=1).drop(columns='age_group')

In [12]:
# Ordinal encoding of the cholesterol category: low < normal < high.
# Any label missing from the mapping becomes NaN under Series.map.
cholesterol_codes = {'low': 0, 'normal': 1, 'high': 2}
df['cholesterol_level'] = df['cholesterol_level'].map(cholesterol_codes)

In [13]:
# Inspect the distinct blood-pressure labels before encoding them.
df['bp_level'].unique()

array(['high', 'low', 'normal'], dtype=object)

In [14]:
# Ordinal encoding of the blood-pressure category: low < normal < high.
bp_codes = {'low': 0, 'normal': 1, 'high': 2}
df['bp_level'] = df['bp_level'].map(bp_codes)

In [15]:
# Check how `sex` is coded; it is used later to split the rows into two groups.
df['sex'].unique()

array([1, 0])

In [16]:
# Separate the feature matrix from the prediction target.
# NOTE(review): the target here is `cp`, while `num` stays among the
# features -- confirm this is the intended target for the experiment.
X = df.drop(columns='cp')
y = df['cp'].values

In [17]:
# Remove low-variance features (threshold 0.1) and report the feature-matrix
# shape before and after. X is rebound to the selector's output.
print(X.shape)
selector = VarianceThreshold(threshold=0.1)
selector.fit(X)
X = selector.transform(X)
print(X.shape)

(293, 39)
(293, 33)


In [18]:
# Boolean row masks for the two codes of `sex`; the printed labels below
# treat 0 as Female and 1 as Male.
sex_column = df['sex']
gender_0_mask = sex_column == 0
gender_1_mask = sex_column == 1

# Group sizes, for reference.
count_gender_0 = gender_0_mask.sum()
count_gender_1 = gender_1_mask.sum()

print("Female: ", count_gender_0)
print("Male: ", count_gender_1)

# Standardise the features.
# NOTE(review): the scaler is fit on all rows here; if the train/test folds
# are later drawn from X_scaled, held-out rows contribute to the scaling
# statistics -- confirm this is acceptable for the experiment.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Slice the scaled features and the target into the two groups.
X_scaled_Gender_0, y_Gender_0 = X_scaled[gender_0_mask], y[gender_0_mask]
X_scaled_Gender_1, y_Gender_1 = X_scaled[gender_1_mask], y[gender_1_mask]

Female:  95
Male:  198


In [19]:
# Shared experiment configuration: one seed is used for both the model
# construction and the shuffling of the 5-fold split.
seed = 42

# Collects the result frames of each experiment run.
results_list = list()

models = fl.build_models(seed)
kf = KFold(n_splits=5, shuffle=True, random_state=seed)

In [20]:
# Run the same cross-validated experiment once per sex group and combine the
# results into one frame.
#
# results_list is emptied first so that re-running this cell does not append
# a second copy of every fold's results (presumably run_experiment appends
# its result frames to the list it is given -- the list is concatenated
# below). On a fresh top-to-bottom run the list is already empty, so this is
# a no-op.
results_list.clear()

# (banner, features, target, group label) for each group, in run order.
experiment_plan = [
    ("Starting experiments for Female(0)", X_scaled_Gender_0, y_Gender_0, 'Female'),
    ("Starting experiments for Male(1)", X_scaled_Gender_1, y_Gender_1, 'Male'),
]
for banner, group_X, group_y, group_name in experiment_plan:
    print(banner)
    fl.run_experiment(kf, models, group_X, group_y, group_name, results_list)

final_results_df = pd.concat(results_list, ignore_index=True)
print(final_results_df)

Starting experiments for Female(0)
Processing fold 1 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 2 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 3 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 4 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 5 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Starting experiments for Male(1)
Processing fold 1 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 2 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 3 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 4 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 5 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
   Fold   Group  SVM_TPR  SVM_TNR  SVM_FPR  SVM_FNR  SVM_TP  SVM_TN  SVM_FP  \
0     1  Female      0.0      0.0      0.0      0.0       0       0       0   
1     2  Female      0.0      0.0      0.0      0.0       0       0       0   
2     3  Female      0.0      0.0      0.0      0.0       0       0       0   
3     4  Female      1.0      0.0      1.0      0.0       3       0       3   
4     5  Female      1.0      0.0      1.0      0.0       6       0       3   
5     1    Male      1.0      0.0      0.0      0.0       2       0       0   
6     2    Male      0.0      0.0      0.0      0.0       0       0       0   
7     3    Male      1.0      0.0      0.0      0.0       3       0       0   
8     4    Male      0.0      0.0      0.0      1.0       0       0       0   
9     5    Male      1.0      0.0      0.0      0.0       1       0       0   

   SVM_FN  ...  ANN_FP  ANN



In [21]:
# Stack the collected result frames into one table with a fresh 0..n-1 index.
results_df = pd.concat(objs=results_list, ignore_index=True)

In [22]:
# Persist the combined results as an Excel workbook (index column omitted).
result_path = './results/K29_result.xlsx'
# to_excel does not create missing parent directories, so make sure the
# output folder exists; exist_ok keeps this safe on re-runs.
os.makedirs(os.path.dirname(result_path), exist_ok=True)
results_df.to_excel(result_path, index=False)

In [23]:
# Read the exported results back from disk for the statistical tests.
# NOTE: this rebinds `df`, which previously held the heart-disease dataset;
# from here on `df` is the per-fold results table.
df = pd.read_excel(result_path)
df.head()

Unnamed: 0,Fold,Group,SVM_TPR,SVM_TNR,SVM_FPR,SVM_FNR,SVM_TP,SVM_TN,SVM_FP,SVM_FN,...,ANN_FP,ANN_FN,NB_TPR,NB_TNR,NB_FPR,NB_FNR,NB_TP,NB_TN,NB_FP,NB_FN
0,1,Female,0,0,0,0,0,0,0,0,...,0,0,1.0,1.0,0.0,0.0,4,1,0,0
1,2,Female,0,0,0,0,0,0,0,0,...,0,0,1.0,0.0,1.0,0.0,1,0,2,0
2,3,Female,0,0,0,0,0,0,0,0,...,0,0,1.0,0.0,0.0,0.0,2,0,0,0
3,4,Female,1,0,1,0,3,0,3,0,...,2,0,0.666667,0.666667,0.333333,0.333333,2,2,1,1
4,5,Female,1,0,1,0,6,0,3,0,...,2,3,0.666667,0.0,0.0,0.333333,2,0,0,1


In [24]:
# Run the group-comparison tests for every model. The group label is passed
# through to fl.perform_t_tests unchanged.
label = 'Female'

# Same model order as the original hand-written sequence of calls, so the
# printed results appear in the same order.
for model_name in ('SVM', 'DT', 'RF', 'LR', 'KNN', 'ANN', 'NB'):
    fl.perform_t_tests(df, model_name, label)


SVM -TPR: MannwhitneyuResult(statistic=np.float64(10.0), pvalue=np.float64(0.6312273930324451))
SVM - FPR: MannwhitneyuResult(statistic=np.float64(17.5), pvalue=np.float64(0.17701598287480413))
SVM - FN/FP: MannwhitneyuResult(statistic=np.float64(10.0), pvalue=np.float64(0.4237107971667934))
DT -TPR: MannwhitneyuResult(statistic=np.float64(8.0), pvalue=np.float64(0.3643461266335529))
DT - FPR: MannwhitneyuResult(statistic=np.float64(10.0), pvalue=np.float64(0.6004712601233854))
DT - FN/FP: MannwhitneyuResult(statistic=np.float64(10.5), pvalue=np.float64(0.6985353583033387))
RF -TPR: MannwhitneyuResult(statistic=np.float64(7.5), pvalue=np.float64(0.3192938744800329))
RF - FPR: MannwhitneyuResult(statistic=np.float64(13.0), pvalue=np.float64(1.0))
RF - FN/FP: MannwhitneyuResult(statistic=np.float64(11.0), pvalue=np.float64(0.7962534147376392))
LR -TPR: MannwhitneyuResult(statistic=np.float64(10.5), pvalue=np.float64(0.7373156772164182))
LR - FPR: MannwhitneyuResult(statistic=np.float64(1