In [1]:
import kagglehub

# Fetch the dataset through kagglehub; the call returns the local directory
# that holds the downloaded files, which later cells read from via `path`.
DATASET_HANDLE = "homelysmile/datacad"
path = kagglehub.dataset_download(DATASET_HANDLE)
print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /home/morning/.cache/kagglehub/datasets/homelysmile/datacad/versions/2


In [2]:
import os
# List the files in the download directory to confirm the CSV name read below.
os.listdir(path)

['DataClean-fullage.csv']

In [3]:
import pandas as pd
import fairtl_statisticaltest as fl
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [4]:
# Load the dataset's CSV into `df` and preview the first rows.
csv_file = f'{path}/DataClean-fullage.csv'
df = pd.read_csv(csv_file)
df.head()

Unnamed: 0,sno,age,gender,type,day_icu,outcome,smoking,alcohol,diabetes,hypertension,...,chest_infection,count,haemoglobin,anaemia,severe_anaemia,glucose,group_age,group_plate,group_leuk,group_ejectf
0,1,81,M,E,2,DISCHARGE,0,0,1,0,...,0,1,9.5,1,0,80.0,76-150,normal,high,d_normal
1,3,53,M,E,3,DISCHARGE,0,0,1,0,...,0,1,10.6,0,0,187.0,46-60,normal,high,d_normal
2,5,60,F,E,9,DISCHARGE,0,0,0,1,...,0,1,13.6,0,0,144.0,46-60,low,normal,d_normal
3,6,44,M,E,8,DISCHARGE,0,0,1,1,...,0,1,13.5,0,0,217.0,31-45,normal,high,d_normal
4,7,56,F,E,2,DISCHARGE,0,0,1,1,...,0,1,13.3,0,0,277.0,46-60,normal,high,d_normal


In [5]:
# (rows, columns) of the freshly loaded frame.
df.shape

(6611, 53)

In [6]:
# Column names, non-null counts and dtypes of the raw data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6611 entries, 0 to 6610
Data columns (total 53 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   sno                        6611 non-null   int64  
 1   age                        6611 non-null   int64  
 2   gender                     6611 non-null   object 
 3   type                       6611 non-null   object 
 4   day_icu                    6611 non-null   int64  
 5   outcome                    6611 non-null   object 
 6   smoking                    6611 non-null   int64  
 7   alcohol                    6611 non-null   int64  
 8   diabetes                   6611 non-null   int64  
 9   hypertension               6611 non-null   int64  
 10  cad                        6611 non-null   int64  
 11  cardiomyopathy             6611 non-null   int64  
 12  ckd                        6611 non-null   int64  
 13  leuk_count                 6611 non-null   float

In [7]:
# Total number of missing cells across the whole frame (per-column counts summed).
df.isna().sum().sum()

np.int64(0)

In [8]:
# Preview the first rows again before the preprocessing cells below modify `df`.
df.head()

Unnamed: 0,sno,age,gender,type,day_icu,outcome,smoking,alcohol,diabetes,hypertension,...,chest_infection,count,haemoglobin,anaemia,severe_anaemia,glucose,group_age,group_plate,group_leuk,group_ejectf
0,1,81,M,E,2,DISCHARGE,0,0,1,0,...,0,1,9.5,1,0,80.0,76-150,normal,high,d_normal
1,3,53,M,E,3,DISCHARGE,0,0,1,0,...,0,1,10.6,0,0,187.0,46-60,normal,high,d_normal
2,5,60,F,E,9,DISCHARGE,0,0,0,1,...,0,1,13.6,0,0,144.0,46-60,low,normal,d_normal
3,6,44,M,E,8,DISCHARGE,0,0,1,1,...,0,1,13.5,0,0,217.0,31-45,normal,high,d_normal
4,7,56,F,E,2,DISCHARGE,0,0,1,1,...,0,1,13.3,0,0,277.0,46-60,normal,high,d_normal


In [9]:
# Remove the `sno` column (presumably a record/serial number — TODO confirm)
# so it is not used as a feature. Rebinding `df` instead of mutating in place,
# and tolerating an already-missing column, lets this cell be re-run without
# raising KeyError.
df = df.drop(columns='sno', errors='ignore')

In [10]:
# Encode gender labels as integers: 'F' -> 0, 'M' -> 1.
gender_codes = {'F': 0, 'M': 1}

# Only encode while the column still holds labels. Re-running a bare `.map`
# on already-encoded 0/1 values would silently turn the whole column into NaN.
if not pd.api.types.is_numeric_dtype(df['gender']):
    gender_encoded = df['gender'].map(gender_codes)
    # Series.map yields NaN for any label missing from the mapping; fail loudly
    # instead of letting NaN flow into the gender masks built later.
    assert gender_encoded.notna().all(), "unexpected gender label(s); extend gender_codes"
    df['gender'] = gender_encoded

In [11]:
# Distinct labels in `type`, checked before one-hot encoding it.
df['type'].unique()

array(['E', 'O'], dtype=object)

In [12]:
# One-hot encode the `type` column; the fitted encoder stays available as `enc`.
enc = OneHotEncoder(categories='auto')
# Named `type_values` rather than `type` so the built-in `type()` is not
# shadowed for the rest of the notebook session.
type_values = df['type'].values.reshape(-1, 1)
enc.fit(type_values)
new_features = enc.get_feature_names_out()
print(new_features)
# Reuse df's index so the later column-wise concat lines rows up by label even
# if `df` is ever filtered or re-indexed before this cell.
new_type = pd.DataFrame(enc.transform(type_values).toarray(), index=df.index)

['x0_E' 'x0_O']


In [13]:
# Name the one-hot columns from the fitted encoder's own category list
# ('E', 'O' for the current data) rather than a hand-typed list, so the labels
# cannot drift out of step with the encoder's column order.
new_type.columns = list(enc.categories_[0])

In [14]:
# Replace the raw `type` column with its one-hot columns (appended at the end).
df = pd.concat([df.drop(columns='type'), new_type], axis=1)

In [15]:
# Check the frame after `type` was swapped for its one-hot columns.
df.head()

Unnamed: 0,age,gender,day_icu,outcome,smoking,alcohol,diabetes,hypertension,cad,cardiomyopathy,...,haemoglobin,anaemia,severe_anaemia,glucose,group_age,group_plate,group_leuk,group_ejectf,E,O
0,81,1,2,DISCHARGE,0,0,1,0,0,0,...,9.5,1,0,80.0,76-150,normal,high,d_normal,1.0,0.0
1,53,1,3,DISCHARGE,0,0,1,0,1,0,...,10.6,0,0,187.0,46-60,normal,high,d_normal,1.0,0.0
2,60,0,9,DISCHARGE,0,0,0,1,0,1,...,13.6,0,0,144.0,46-60,low,normal,d_normal,1.0,0.0
3,44,1,8,DISCHARGE,0,0,1,1,1,1,...,13.5,0,0,217.0,31-45,normal,high,d_normal,1.0,0.0
4,56,0,2,DISCHARGE,0,0,1,1,1,1,...,13.3,0,0,277.0,46-60,normal,high,d_normal,1.0,0.0


In [16]:
# Distinct labels in `outcome`, checked before one-hot encoding it.
df['outcome'].unique()

array(['DISCHARGE', 'EXPIRY', 'DAMA'], dtype=object)

In [17]:
# One-hot encode `outcome`: reshape the column to (n_rows, 1), fit the encoder
# (kept as `enc2`), print the generated feature names, and densify the result.
outcome = df['outcome'].values.reshape(-1, 1)
enc2 = OneHotEncoder(categories='auto').fit(outcome)
new_features = enc2.get_feature_names_out()
print(new_features)
outcome_onehot = enc2.transform(outcome)
new_outcome = pd.DataFrame(outcome_onehot.toarray())

['x0_DAMA' 'x0_DISCHARGE' 'x0_EXPIRY']


In [18]:
# Name the one-hot columns from the fitted encoder's own category list
# ('DAMA', 'DISCHARGE', 'EXPIRY' for the current data) rather than a hand-typed
# list, so the labels cannot drift out of step with the encoder's column order.
new_outcome.columns = list(enc2.categories_[0])

In [19]:
# Replace the raw `outcome` column with its one-hot columns (appended at the end).
df = pd.concat([df.drop(columns='outcome'), new_outcome], axis=1)

In [20]:
# Check the frame after `outcome` was swapped for its one-hot columns.
df.head()

Unnamed: 0,age,gender,day_icu,smoking,alcohol,diabetes,hypertension,cad,cardiomyopathy,ckd,...,glucose,group_age,group_plate,group_leuk,group_ejectf,E,O,DAMA,DISCHARGE,EXPIRY
0,81,1,2,0,0,1,0,0,0,0,...,80.0,76-150,normal,high,d_normal,1.0,0.0,0.0,1.0,0.0
1,53,1,3,0,0,1,0,1,0,0,...,187.0,46-60,normal,high,d_normal,1.0,0.0,0.0,1.0,0.0
2,60,0,9,0,0,0,1,0,1,0,...,144.0,46-60,low,normal,d_normal,1.0,0.0,0.0,1.0,0.0
3,44,1,8,0,0,1,1,1,1,0,...,217.0,31-45,normal,high,d_normal,1.0,0.0,0.0,1.0,0.0
4,56,0,2,0,0,1,1,1,1,0,...,277.0,46-60,normal,high,d_normal,1.0,0.0,0.0,1.0,0.0


In [21]:
# Distinct age-band labels in `group_age`, checked before one-hot encoding it.
df['group_age'].unique()

array(['76-150', '46-60', '31-45', '61-75', '0-30'], dtype=object)

In [22]:
# One-hot encode `group_age`: reshape the column to (n_rows, 1), fit the encoder
# (kept as `enc3`), print the generated feature names, and densify the result.
age = df['group_age'].values.reshape(-1, 1)
enc3 = OneHotEncoder(categories='auto').fit(age)
new_features = enc3.get_feature_names_out()
print(new_features)
age_onehot = enc3.transform(age)
new_age = pd.DataFrame(age_onehot.toarray())

['x0_0-30' 'x0_31-45' 'x0_46-60' 'x0_61-75' 'x0_76-150']


In [23]:
# Name the one-hot columns from the fitted encoder's own category list
# ('0-30' ... '76-150' for the current data) rather than a hand-typed list, so
# the labels cannot drift out of step with the encoder's column order.
new_age.columns = list(enc3.categories_[0])

In [24]:
# Replace the raw `group_age` column with its one-hot columns (appended at the
# end), then preview the result.
df = pd.concat([df.drop(columns='group_age'), new_age], axis=1)
df.head()

Unnamed: 0,age,gender,day_icu,smoking,alcohol,diabetes,hypertension,cad,cardiomyopathy,ckd,...,E,O,DAMA,DISCHARGE,EXPIRY,0-30,31-45,46-60,61-75,76-150
0,81,1,2,0,0,1,0,0,0,0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,53,1,3,0,0,1,0,1,0,0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,60,0,9,0,0,0,1,0,1,0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,44,1,8,0,0,1,1,1,1,0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,56,0,2,0,0,1,1,1,1,0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [25]:
# Distinct labels in `group_plate`, checked before mapping them to integers.
df['group_plate'].unique()

array(['normal', 'low', 'high'], dtype=object)

In [26]:
# Encode `group_plate` ordinally: low -> 0, normal -> 1, high -> 2.
plate_codes = {'low': 0, 'normal': 1, 'high': 2}

# Only encode while the column still holds labels. Re-running a bare `.map`
# on already-encoded integers would silently turn the whole column into NaN.
if not pd.api.types.is_numeric_dtype(df['group_plate']):
    plate_encoded = df['group_plate'].map(plate_codes)
    # Series.map yields NaN for any label missing from the mapping; fail loudly.
    assert plate_encoded.notna().all(), "unexpected group_plate label(s); extend plate_codes"
    df['group_plate'] = plate_encoded

In [27]:
# Distinct labels in `group_leuk`, checked before mapping them to integers.
df['group_leuk'].unique()

array(['high', 'normal', 'low'], dtype=object)

In [28]:
# Encode `group_leuk` ordinally: low -> 0, normal -> 1, high -> 2.
leuk_codes = {'low': 0, 'normal': 1, 'high': 2}

# Only encode while the column still holds labels. Re-running a bare `.map`
# on already-encoded integers would silently turn the whole column into NaN.
if not pd.api.types.is_numeric_dtype(df['group_leuk']):
    leuk_encoded = df['group_leuk'].map(leuk_codes)
    # Series.map yields NaN for any label missing from the mapping; fail loudly.
    assert leuk_encoded.notna().all(), "unexpected group_leuk label(s); extend leuk_codes"
    df['group_leuk'] = leuk_encoded

In [29]:
# Distinct labels in `group_ejectf`, checked before one-hot encoding it.
df['group_ejectf'].unique()

array(['d_normal', 'a_severeHF', 'b_mildHF', 'c_belowNormal'],
      dtype=object)

In [30]:
# One-hot encode `group_ejectf`: reshape the column to (n_rows, 1), fit the
# encoder (kept as `enc4`), print the generated feature names, and densify.
ejectf = df['group_ejectf'].values.reshape(-1, 1)
enc4 = OneHotEncoder(categories='auto').fit(ejectf)
new_features = enc4.get_feature_names_out()
print(new_features)
ejectf_onehot = enc4.transform(ejectf)
new_ejectf = pd.DataFrame(ejectf_onehot.toarray())

['x0_a_severeHF' 'x0_b_mildHF' 'x0_c_belowNormal' 'x0_d_normal']


In [31]:
# Name the one-hot columns from the fitted encoder's own category list
# ('a_severeHF' ... 'd_normal' for the current data) rather than a hand-typed
# list, so the labels cannot drift out of step with the encoder's column order.
new_ejectf.columns = list(enc4.categories_[0])

In [32]:
# Replace the raw `group_ejectf` column with its one-hot columns (appended at the end).
df = pd.concat([df.drop(columns='group_ejectf'), new_ejectf], axis=1)

In [33]:
# Column names, non-null counts and dtypes after the encoding steps above.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6611 entries, 0 to 6610
Data columns (total 62 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   age                        6611 non-null   int64  
 1   gender                     6611 non-null   int64  
 2   day_icu                    6611 non-null   int64  
 3   smoking                    6611 non-null   int64  
 4   alcohol                    6611 non-null   int64  
 5   diabetes                   6611 non-null   int64  
 6   hypertension               6611 non-null   int64  
 7   cad                        6611 non-null   int64  
 8   cardiomyopathy             6611 non-null   int64  
 9   ckd                        6611 non-null   int64  
 10  leuk_count                 6611 non-null   float64
 11  platelets                  6611 non-null   float64
 12  urea                       6611 non-null   float64
 13  creatinine                 6611 non-null   float

In [34]:
# Target is the `cad` column (as an array); features are every other column.
target_column = 'cad'
X = df.drop(columns=[target_column])
y = df[target_column].values

In [35]:
# Filter out low-variance features with a 0.1 variance threshold and rebind `X`
# to the selector's output. The shape is printed before and after so the
# number of removed columns is visible.
print(X.shape)
selector = VarianceThreshold(threshold=0.1).fit(X)
X = selector.transform(X)
print(X.shape)

(6611, 61)
(6611, 32)


In [36]:
# Boolean row masks for the two gender codes (0 is reported as female, 1 as male).
gender_0_mask = df['gender'].eq(0)
gender_1_mask = df['gender'].eq(1)

# Group sizes.
count_gender_0 = gender_0_mask.sum()
count_gender_1 = gender_1_mask.sum()
print("Female: ", count_gender_0)
print("Male: ", count_gender_1)

# Standardise all rows together, then split features and target by gender.
# NOTE(review): the scaler is fitted on every row before the K-fold split used
# later, so held-out folds contribute to the scaling statistics — confirm this
# is acceptable for the study.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

X_scaled_Gender_0, y_Gender_0 = X_scaled[gender_0_mask], y[gender_0_mask]
X_scaled_Gender_1, y_Gender_1 = X_scaled[gender_1_mask], y[gender_1_mask]

Female:  2579
Male:  4032


In [37]:
# Shared experiment setup. One seed drives both the model factory and the
# shuffled 10-fold splitter so runs are repeatable.
seed = 42
models = fl.build_models(seed)
kf = KFold(n_splits=10, shuffle=True, random_state=seed)

# Filled by the experiment cell below and concatenated into one table afterwards.
results_list = []

In [38]:
# Run the cross-validated experiment once per gender group and gather the
# results into a single table.

# Start from an empty list so that re-running this cell cannot leave a second
# copy of every fold in `results_list` (run_experiment presumably appends its
# output there — verify against fairtl_statisticaltest).
results_list.clear()

experiment_groups = [
    ('Female', 0, X_scaled_Gender_0, y_Gender_0),
    ('Male', 1, X_scaled_Gender_1, y_Gender_1),
]
for group_name, group_code, X_group, y_group in experiment_groups:
    print(f"Starting experiments for {group_name}({group_code})")
    fl.run_experiment(kf, models, X_group, y_group, group_name, results_list)

final_results_df = pd.concat(results_list, ignore_index=True)
print(final_results_df)

Starting experiments for Female(0)
Processing fold 1 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 2 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 3 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 4 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 5 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 6 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 7 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 8 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 9 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 10 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Starting experiments for Male(1)
Processing fold 1 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 2 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 3 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 4 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 5 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 6 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 7 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 8 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 9 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 10 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
    Fold   Group   SVM_TPR   SVM_TNR   SVM_FPR   SVM_FNR  SVM_TP  SVM_TN  \
0      1  Female  0.843023  0.639535  0.360465  0.156977     145      55   
1      2  Female  0.877193  0.747126  0.252874  0.122807     150      65   
2      3  Female  0.834254  0.701299  0.298701  0.165746     151      54   
3      4  Female  0.895349  0.662791  0.337209  0.104651     154      57   
4      5  Female  0.814371  0.758242  0.241758  0.185629     136      69   
5      6  Female  0.885542  0.728261  0.271739  0.114458     147      67   
6      7  Female  0.909091  0.682927  0.317073  0.090909     160      56   
7      8  Female  0.843931  0.635294  0.364706  0.156069     146      54   
8      



In [39]:
# Combine everything collected in `results_list` into one table with a fresh
# 0..n-1 index (the same concatenation that built `final_results_df` above).
results_df = pd.concat(results_list, ignore_index=True)

In [40]:
# Persist the combined results as an Excel workbook (without the index column).
result_path = './results/K36_result.xlsx'
# to_excel does not create missing folders, so make sure ./results exists;
# otherwise a fresh checkout fails here after the long experiment has finished.
os.makedirs(os.path.dirname(result_path), exist_ok=True)
results_df.to_excel(result_path, index=False)

In [41]:
# Reload the saved results from disk and preview them.
# NOTE: this rebinds `df`, which until now held the patient data, to the
# results table; the t-test cell below reads this `df`.
df = pd.read_excel(result_path)
df.head()

Unnamed: 0,Fold,Group,SVM_TPR,SVM_TNR,SVM_FPR,SVM_FNR,SVM_TP,SVM_TN,SVM_FP,SVM_FN,...,ANN_FP,ANN_FN,NB_TPR,NB_TNR,NB_FPR,NB_FNR,NB_TP,NB_TN,NB_FP,NB_FN
0,1,Female,0.843023,0.639535,0.360465,0.156977,145,55,31,27,...,25,29,0.773256,0.593023,0.406977,0.226744,133,51,35,39
1,2,Female,0.877193,0.747126,0.252874,0.122807,150,65,22,21,...,27,28,0.859649,0.770115,0.229885,0.140351,147,67,20,24
2,3,Female,0.834254,0.701299,0.298701,0.165746,151,54,23,30,...,29,27,0.839779,0.701299,0.298701,0.160221,152,54,23,29
3,4,Female,0.895349,0.662791,0.337209,0.104651,154,57,29,18,...,33,18,0.848837,0.604651,0.395349,0.151163,146,52,34,26
4,5,Female,0.814371,0.758242,0.241758,0.185629,136,69,22,31,...,26,22,0.760479,0.67033,0.32967,0.239521,127,61,30,40


In [42]:
# Run the same t-test routine for every model on the results table in `df`,
# passing 'Female' as the group label. The model order matches the original
# sequence of calls so the printed report keeps its order.
label = 'Female'

for model_name in ('SVM', 'DT', 'RF', 'LR', 'KNN', 'ANN', 'NB'):
    # perform_t_tests reports its results itself; any return value is unused
    # here (assumed to be None — TODO confirm).
    fl.perform_t_tests(df, model_name, label)


SVM -TPR: TtestResult(statistic=np.float64(-3.898004221135078), pvalue=np.float64(0.0010540942301865942), df=np.float64(18.0))
SVM - FPR: TtestResult(statistic=np.float64(-12.18852362195118), pvalue=np.float64(3.9248825520568925e-10), df=np.float64(18.0))
SVM - FN/FP: TtestResult(statistic=np.float64(5.617073262264237), pvalue=np.float64(2.4920344514049116e-05), df=np.float64(18.0))
DT -TPR: TtestResult(statistic=np.float64(2.8090981792633483), pvalue=np.float64(0.011608104510726984), df=np.float64(18.0))
DT - FPR: TtestResult(statistic=np.float64(-4.1389874381942935), pvalue=np.float64(0.000616383501842338), df=np.float64(18.0))
DT - FN/FP: TtestResult(statistic=np.float64(-1.849430479077055), pvalue=np.float64(0.08088629402780313), df=np.float64(18.0))
RF -TPR: TtestResult(statistic=np.float64(-3.381802768860907), pvalue=np.float64(0.0033227214095004824), df=np.float64(18.0))
RF - FPR: TtestResult(statistic=np.float64(-9.277805101642647), pvalue=np.float64(2.794498240935884e-08), df=