In [1]:
import kagglehub

# Kaggle handle of the heart-patient mortality dataset; the latest version is fetched.
dataset_handle = "asgharalikhan/mortality-rate-heart-patient-pakistan-hospital"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /home/morning/.cache/kagglehub/datasets/asgharalikhan/mortality-rate-heart-patient-pakistan-hospital/versions/1


In [2]:
import os
# Show which files the download placed in the dataset folder.
os.listdir(path)

['FIC.Full CSV.csv']

In [3]:
import pandas as pd
import fairtl as fl
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [4]:
# Load the CSV found in the dataset folder and preview the first rows.
csv_file = f'{path}/FIC.Full CSV.csv'
df = pd.read_csv(csv_file)
df.head()

Unnamed: 0,Age,Age.Group,Gender,Locality,Marital status,Life.Style,Sleep,Category,Depression,Hyperlipi,...,oldpeak,slope,ca,thal,num,SK,SK.React,Reaction,Mortality,Follow.Up
0,45,41-50,Female,RURAL,MARRIED,NO,NO,FREE,YES,YES,...,3.0,2,0,7,2,1,NO,0,0,60
1,51,51-60,Female,URBAN,MARRIED,NO,NO,FREE,YES,YES,...,1.2,2,0,7,2,1,NO,0,0,15
2,55,51-60,Female,RURAL,MARRIED,YES,YES,FREE,YES,YES,...,3.4,2,0,3,2,1,NO,0,0,6
3,55,51-60,Female,RURAL,MARRIED,YES,YES,FREE,YES,YES,...,2.0,2,1,7,3,1,NO,0,0,52
4,56,51-60,Female,RURAL,MARRIED,YES,NO,FREE,YES,YES,...,4.0,3,2,7,3,1,NO,0,0,34


In [5]:
# (rows, columns) of the table as loaded.
df.shape

(368, 60)

In [6]:
# Total number of missing cells across the whole table (per-column counts summed again).
df.isna().sum().sum()

np.int64(0)

In [7]:
# List the distinct age brackets before one-hot encoding them.
df['Age.Group'].unique()

array(['41-50', '51-60', '61-70', '21-30', '31-40'], dtype=object)

In [8]:
# One-hot encode the age bracket: the encoder expects a 2-D input, hence the
# reshape of the single column into shape (n_rows, 1).
age = df['Age.Group'].values.reshape(-1, 1)

enc = OneHotEncoder(categories='auto')
enc.fit(age)

# Print the generated feature names so the category order is visible.
new_features = enc.get_feature_names_out()
print(new_features)

# The encoder returns a sparse matrix; densify it before wrapping in a frame.
age_sparse = enc.transform(age)
new_age = pd.DataFrame(age_sparse.toarray())

['x0_21-30' 'x0_31-40' 'x0_41-50' 'x0_51-60' 'x0_61-70']


In [9]:
# Label the one-hot columns straight from the fitted encoder so the names always
# follow the encoder's own category order instead of a hand-typed list that could
# silently mislabel columns if the brackets in the data ever change.
new_age.columns = enc.categories_[0].tolist()

In [10]:
# Swap the raw 'Age.Group' column for its one-hot columns in a single expression.
# The drop is evaluated before the concat, so re-running this cell raises KeyError
# up front instead of first appending a duplicate set of one-hot columns to df.
df = pd.concat([df.drop(columns='Age.Group'), new_age], axis=1)

In [11]:
# Spot-check that the one-hot age-bracket columns were appended and 'Age.Group' is gone.
df.head()

Unnamed: 0,Age,Gender,Locality,Marital status,Life.Style,Sleep,Category,Depression,Hyperlipi,Smoking,...,SK,SK.React,Reaction,Mortality,Follow.Up,21-30,31-40,41-50,51-60,61-70
0,45,Female,RURAL,MARRIED,NO,NO,FREE,YES,YES,NO,...,1,NO,0,0,60,0.0,0.0,1.0,0.0,0.0
1,51,Female,URBAN,MARRIED,NO,NO,FREE,YES,YES,NO,...,1,NO,0,0,15,0.0,0.0,0.0,1.0,0.0
2,55,Female,RURAL,MARRIED,YES,YES,FREE,YES,YES,NO,...,1,NO,0,0,6,0.0,0.0,0.0,1.0,0.0
3,55,Female,RURAL,MARRIED,YES,YES,FREE,YES,YES,NO,...,1,NO,0,0,52,0.0,0.0,0.0,1.0,0.0
4,56,Female,RURAL,MARRIED,YES,NO,FREE,YES,YES,NO,...,1,NO,0,0,34,0.0,0.0,0.0,1.0,0.0


In [12]:
# Remove spaces from column names (e.g. 'Marital status' -> 'Maritalstatus').
df.columns = [column_name.replace(' ', '') for column_name in df.columns]

In [13]:
# Check which labels 'Life.Style' uses before mapping it to 0/1.
df['Life.Style'].unique()

array(['NO', 'YES'], dtype=object)

In [14]:
# Label -> 0/1 code for each two-level demographic column.
binary_codes = {
    'Gender': {'Female': 0, 'Male': 1},
    'Locality': {'RURAL': 0, 'URBAN': 1},
    'Maritalstatus': {'MARRIED': 0, 'SINGLE': 1},
    'Life.Style': {'NO': 0, 'YES': 1},
}

for column, codes in binary_codes.items():
    encoded = df[column].map(codes)
    # Series.map turns every value missing from the dict into NaN. That would
    # silently blank out unexpected labels, or the whole column if this cell is
    # run a second time on already-encoded data, so fail loudly instead.
    if encoded.isna().any():
        unexpected = [value for value in df[column].unique() if value not in codes]
        raise ValueError(f"{column}: cannot encode unexpected values {unexpected}")
    df[column] = encoded

In [15]:
# Spot-check that Gender, Locality, Maritalstatus and Life.Style now hold 0/1 codes.
df.head()


Unnamed: 0,Age,Gender,Locality,Maritalstatus,Life.Style,Sleep,Category,Depression,Hyperlipi,Smoking,...,SK,SK.React,Reaction,Mortality,Follow.Up,21-30,31-40,41-50,51-60,61-70
0,45,0,0,0,0,NO,FREE,YES,YES,NO,...,1,NO,0,0,60,0.0,0.0,1.0,0.0,0.0
1,51,0,1,0,0,NO,FREE,YES,YES,NO,...,1,NO,0,0,15,0.0,0.0,0.0,1.0,0.0
2,55,0,0,0,1,YES,FREE,YES,YES,NO,...,1,NO,0,0,6,0.0,0.0,0.0,1.0,0.0
3,55,0,0,0,1,YES,FREE,YES,YES,NO,...,1,NO,0,0,52,0.0,0.0,0.0,1.0,0.0
4,56,0,0,0,1,NO,FREE,YES,YES,NO,...,1,NO,0,0,34,0.0,0.0,0.0,1.0,0.0


In [16]:
# Check which labels 'Smoking' uses before mapping it to 0/1.
df['Smoking'].unique()

array(['NO', 'YES'], dtype=object)

In [17]:
# Label -> 0/1 code for the remaining two-level columns. All of them are NO/YES
# flags except 'Category', which is FREE/PAID.
yes_no = {'NO': 0, 'YES': 1}
binary_codes = {
    'Sleep': yes_no,
    'Category': {'FREE': 0, 'PAID': 1},
    'Depression': yes_no,
    'Hyperlipi': yes_no,
    'Smoking': yes_no,
    'Family.History': yes_no,
    'HTN': yes_no,
    'Allergies': yes_no,
    'Hypersensitivity': yes_no,
}

for column, codes in binary_codes.items():
    encoded = df[column].map(codes)
    # Series.map turns every value missing from the dict into NaN. That would
    # silently blank out unexpected labels, or the whole column if this cell is
    # run a second time on already-encoded data, so fail loudly instead.
    if encoded.isna().any():
        unexpected = [value for value in df[column].unique() if value not in codes]
        raise ValueError(f"{column}: cannot encode unexpected values {unexpected}")
    df[column] = encoded

In [18]:
# Spot-check that the remaining NO/YES and FREE/PAID columns now hold 0/1 codes.
df.head()

Unnamed: 0,Age,Gender,Locality,Maritalstatus,Life.Style,Sleep,Category,Depression,Hyperlipi,Smoking,...,SK,SK.React,Reaction,Mortality,Follow.Up,21-30,31-40,41-50,51-60,61-70
0,45,0,0,0,0,0,0,1,1,0,...,1,NO,0,0,60,0.0,0.0,1.0,0.0,0.0
1,51,0,1,0,0,0,0,1,1,0,...,1,NO,0,0,15,0.0,0.0,0.0,1.0,0.0
2,55,0,0,0,1,1,0,1,1,0,...,1,NO,0,0,6,0.0,0.0,0.0,1.0,0.0
3,55,0,0,0,1,1,0,1,1,0,...,1,NO,0,0,52,0.0,0.0,0.0,1.0,0.0
4,56,0,0,0,1,0,0,1,1,0,...,1,NO,0,0,34,0.0,0.0,0.0,1.0,0.0


In [19]:
# List the distinct 'SK.React' labels before one-hot encoding them.
df['SK.React'].unique()

array(['NO', 'COUGH.BLEEDING', 'SKIN.BLEEDING', 'LUNGS', 'BODY.PAIN',
       'NAUSEA.TEMP', 'STOMACH.BLEEDING'], dtype=object)

In [20]:
# Remove three columns that are not carried into the feature matrix.
df = df.drop(columns=['Others', 'CO', 'Diagnosis'])

In [21]:
# One-hot encode 'SK.React': the encoder expects a 2-D input, hence the
# reshape of the single column into shape (n_rows, 1).
sk = df['SK.React'].values.reshape(-1, 1)

enc2 = OneHotEncoder(categories='auto')
enc2.fit(sk)

# Print the generated feature names so the category order is visible.
new_features = enc2.get_feature_names_out()
print(new_features)

# The encoder returns a sparse matrix; densify it before wrapping in a frame.
sk_sparse = enc2.transform(sk)
new_sk = pd.DataFrame(sk_sparse.toarray())

['x0_BODY.PAIN' 'x0_COUGH.BLEEDING' 'x0_LUNGS' 'x0_NAUSEA.TEMP' 'x0_NO'
 'x0_SKIN.BLEEDING' 'x0_STOMACH.BLEEDING']


In [22]:
# Take the column names from the fitted encoder (dots replaced by underscores) so
# they always follow the encoder's own category order instead of a hand-typed list
# that could silently mislabel columns if the labels in the data ever change.
new_sk.columns = [label.replace('.', '_') for label in enc2.categories_[0]]

In [23]:
# Swap the raw 'SK.React' column for its one-hot columns in a single expression.
# The drop is evaluated before the concat, so re-running this cell raises KeyError
# up front instead of first appending a duplicate set of one-hot columns to df.
df = pd.concat([df.drop(columns='SK.React'), new_sk], axis=1)

In [24]:
# Spot-check that the one-hot 'SK.React' columns were appended.
df.head()

Unnamed: 0,Age,Gender,Locality,Maritalstatus,Life.Style,Sleep,Category,Depression,Hyperlipi,Smoking,...,41-50,51-60,61-70,BODY_PAIN,COUGH_BLEEDING,LUNGS,NAUSEA_TEMP,NO,SKIN_BLEEDING,STOMACH_BLEEDING
0,45,0,0,0,0,0,0,1,1,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,51,0,1,0,0,0,0,1,1,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,55,0,0,0,1,1,0,1,1,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,55,0,0,0,1,1,0,1,1,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,56,0,0,0,1,0,0,1,1,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [25]:
# Column names, non-null counts and dtypes after all encoding steps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 368 entries, 0 to 367
Data columns (total 67 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Age               368 non-null    int64  
 1   Gender            368 non-null    int64  
 2   Locality          368 non-null    int64  
 3   Maritalstatus     368 non-null    int64  
 4   Life.Style        368 non-null    int64  
 5   Sleep             368 non-null    int64  
 6   Category          368 non-null    int64  
 7   Depression        368 non-null    int64  
 8   Hyperlipi         368 non-null    int64  
 9   Smoking           368 non-null    int64  
 10  Family.History    368 non-null    int64  
 11  F.History         368 non-null    int64  
 12  Diabetes          368 non-null    int64  
 13  HTN               368 non-null    int64  
 14  Allergies         368 non-null    int64  
 15  BP                368 non-null    float64
 16  Thrombolysis      368 non-null    int64  
 1

In [26]:
# Separate the predictors from the 'Mortality' target; y is kept as a plain array.
X = df.drop(columns='Mortality')
y = df['Mortality'].values

In [27]:
# Drop features whose variance does not exceed this cut-off. The filter is applied
# here to the features as they are, before any scaling, and it returns a plain
# array, so column names are lost from this point on.
variance_cutoff = 0.1

print(X.shape)
selector = VarianceThreshold(threshold=variance_cutoff)
X = selector.fit_transform(X)
print(X.shape)

(368, 66)
(368, 48)


In [28]:
# Boolean row masks for the two gender groups (0 = Female, 1 = Male).
gender = df['Gender']
gender_0_mask = gender == 0
gender_1_mask = gender == 1

# Group sizes, reported before any modelling.
count_gender_0 = gender_0_mask.sum()
count_gender_1 = gender_1_mask.sum()
print("Female: ", count_gender_0)
print("Male: ", count_gender_1)

# NOTE(review): the scaler is fit on every row of X (both groups together) before
# any cross-validation split. If run_experiment splits these arrays with KFold,
# held-out rows have already influenced the scaling — consider fitting the scaler
# inside each training fold instead.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Slice features and target per group with the same masks so rows stay aligned.
X_scaled_Gender_0, y_Gender_0 = X_scaled[gender_0_mask], y[gender_0_mask]
X_scaled_Gender_1, y_Gender_1 = X_scaled[gender_1_mask], y[gender_1_mask]

Female:  83
Male:  285


In [29]:
# Shared experiment setup: one seed is used for both the model factory and the
# fold shuffling.
seed = 42

# Accumulator handed to the experiment runs; it is concatenated into one table later.
results_list = []

# 5-fold cross-validation with shuffling.
kf = KFold(n_splits=5, shuffle=True, random_state=seed)

models = fl.build_models(seed)

In [30]:
# Start from an empty list so that re-running this cell does not stack a second
# copy of every fold's results on top of those collected by a previous run.
results_list = []

# (group label, Gender code, features, target) for each subgroup experiment.
group_runs = [
    ('Female', 0, X_scaled_Gender_0, y_Gender_0),
    ('Male', 1, X_scaled_Gender_1, y_Gender_1),
]

for group_name, gender_code, X_group, y_group in group_runs:
    print(f"Starting experiments for {group_name}({gender_code})")
    # presumably run_experiment appends its result frame(s) to results_list —
    # TODO confirm against the fairtl source.
    fl.run_experiment(kf, models, X_group, y_group, group_name, results_list)

# Combine everything the runs collected into a single table.
final_results_df = pd.concat(results_list, ignore_index=True)
print(final_results_df)

Starting experiments for Female(0)
Processing fold 1 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 2 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 3 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 4 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 5 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Starting experiments for Male(1)
Processing fold 1 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 2 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 3 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 4 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 5 for group Male
Training and evaluating model: SVM
Trainin

In [31]:
# Combine the collected per-run frames into one table with a fresh 0..n-1 index.
# NOTE(review): this is the same concat that already produced final_results_df.
results_df = pd.concat(results_list, ignore_index=True)

In [32]:
result_path = './results/k44_result.xlsx'

# to_excel does not create missing parent folders, so make sure ./results exists
# before writing; exist_ok keeps this safe when the folder is already there.
os.makedirs(os.path.dirname(result_path), exist_ok=True)
results_df.to_excel(result_path, index=False)

In [33]:
# Read the saved results back from disk and preview them.
# NOTE(review): this rebinds `df`, which until now held the patient table. Earlier
# cells that read `df` (e.g. the gender masks) will not work after this point
# unless the notebook is re-run from the top.
df = pd.read_excel(result_path)
df.head()

Unnamed: 0,Fold,Group,SVM_TPR,SVM_TNR,SVM_FPR,SVM_FNR,SVM_TP,SVM_TN,SVM_FP,SVM_FN,...,ANN_FP,ANN_FN,NB_TPR,NB_TNR,NB_FPR,NB_FNR,NB_TP,NB_TN,NB_FP,NB_FN
0,1,Female,0.857143,0.9,0.1,0.142857,6,9,1,1,...,1,1,0.0,0.9,0.1,1.0,0,9,1,7
1,2,Female,0.75,1.0,0.0,0.25,6,9,0,2,...,0,1,0.0,1.0,0.0,1.0,0,9,0,8
2,3,Female,1.0,1.0,0.0,0.0,8,9,0,0,...,2,0,0.125,0.888889,0.111111,0.875,1,8,1,7
3,4,Female,0.8,1.0,0.0,0.2,4,11,0,1,...,0,1,0.8,0.272727,0.727273,0.2,4,3,8,1
4,5,Female,1.0,1.0,0.0,0.0,6,10,0,0,...,1,0,0.0,1.0,0.0,1.0,0,10,0,6


In [34]:
# Run fairtl's significance tests on the results table once per model.
# `label` is passed through unchanged to every call.
label = 'Female'
model_names = ('SVM', 'DT', 'RF', 'LR', 'KNN', 'ANN', 'NB')

for model_name in model_names:
    fl.perform_t_tests(df, model_name, label)


SVM -TPR: TtestResult(statistic=np.float64(0.1587762282710117), pvalue=np.float64(0.877780217168789), df=np.float64(8.0))
SVM - FPR: MannwhitneyuResult(statistic=np.float64(15.0), pvalue=np.float64(0.4237107971667934))
SVM - FN/FP: MannwhitneyuResult(statistic=np.float64(11.0), pvalue=np.float64(0.8271467921464003))
DT -TPR: MannwhitneyuResult(statistic=np.float64(9.0), pvalue=np.float64(0.4801768899060772))
DT - FPR: MannwhitneyuResult(statistic=np.float64(20.5), pvalue=np.float64(0.11384629800665805))
DT - FN/FP: MannwhitneyuResult(statistic=np.float64(17.0), pvalue=np.float64(0.34652171170610724))
RF -TPR: TtestResult(statistic=np.float64(0.8086841771874304), pvalue=np.float64(0.442080192386353), df=np.float64(8.0))
RF - FPR: MannwhitneyuResult(statistic=np.float64(20.0), pvalue=np.float64(0.07070114486598297))
RF - FN/FP: MannwhitneyuResult(statistic=np.float64(9.0), pvalue=np.float64(0.5124116946904278))
LR -TPR: MannwhitneyuResult(statistic=np.float64(10.0), pvalue=np.float64(0.6