In [1]:
import kagglehub

# Download the latest version of the `uciml/indian-liver-patient-records`
# dataset through kagglehub. `path` receives the location kagglehub returns
# and is used by the cells below to locate the CSV file.
path = kagglehub.dataset_download("uciml/indian-liver-patient-records")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /home/morning/.cache/kagglehub/datasets/uciml/indian-liver-patient-records/versions/1


In [2]:
import os
# List the files in the downloaded dataset directory to find the CSV to load.
os.listdir(path)

['indian_liver_patient.csv']

In [3]:
import pandas as pd
import fairtl_statisticaltest as fl
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [4]:
# Read the dataset's only CSV from the download directory into a DataFrame
# and preview the first rows.
csv_file = f'{path}/indian_liver_patient.csv'
df = pd.read_csv(csv_file)
df.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [5]:
# (rows, columns) of the frame as loaded, before any cleaning.
df.shape

(583, 11)

In [6]:
# Per-column dtype and non-null count: shows which columns are non-numeric
# and which ones contain missing values that must be handled before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Age                         583 non-null    int64  
 1   Gender                      583 non-null    object 
 2   Total_Bilirubin             583 non-null    float64
 3   Direct_Bilirubin            583 non-null    float64
 4   Alkaline_Phosphotase        583 non-null    int64  
 5   Alamine_Aminotransferase    583 non-null    int64  
 6   Aspartate_Aminotransferase  583 non-null    int64  
 7   Total_Protiens              583 non-null    float64
 8   Albumin                     583 non-null    float64
 9   Albumin_and_Globulin_Ratio  579 non-null    float64
 10  Dataset                     583 non-null    int64  
dtypes: float64(5), int64(5), object(1)
memory usage: 50.2+ KB


In [7]:
# Inspect the distinct Gender labels before encoding them as numbers.
df['Gender'].unique()

array(['Female', 'Male'], dtype=object)

In [8]:
# Encode Gender numerically: Female -> 0, Male -> 1.
gender_codes = {'Female': 0, 'Male': 1}

# Series.map(dict) replaces every value that is not a key of the dict with NaN.
# Re-running this cell on the already-encoded column would therefore turn the
# whole column into NaN, and the dropna in the next cell would then delete every
# row. Mapping through dict.get with the value itself as the fallback leaves
# already-encoded (or otherwise unknown) values untouched, so the cell is
# idempotent.
df['Gender'] = df['Gender'].map(lambda g: gender_codes.get(g, g))

# Fail loudly on labels the mapping does not know instead of letting them reach
# the scaler/models as raw strings. Missing values are ignored here; they are
# handled by the dropna step that follows.
unexpected_genders = set(df['Gender'].dropna().unique()) - set(gender_codes.values())
assert not unexpected_genders, f"Unexpected Gender values: {unexpected_genders}"

In [9]:
# Remove every row that contains a missing value, then renumber the index
# from 0 so it is contiguous again (the old index is discarded, not kept as
# a column). Preview the cleaned frame.
df = df.dropna().reset_index(drop=True)
df.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,0,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,1,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,1,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,1,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,1,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [10]:
# Sanity check after encoding and cleaning: list the distinct Gender values
# that remain in the frame.
df['Gender'].unique()

array([0, 1])

In [11]:
# Separate the target from the features: `Dataset` is the label column,
# every other column (including Gender) stays in the feature table.
X = df.drop(columns='Dataset')
y = df['Dataset'].to_numpy()

In [12]:
# Low-variance feature filter (threshold 0.1). The shape is printed before
# and after so it is visible how many columns, if any, were removed.
print(X.shape)
selector = VarianceThreshold(threshold=0.1)
selector.fit(X)
X = selector.transform(X)
print(X.shape)

(579, 10)
(579, 10)


In [13]:
# Boolean row masks, one per Gender code (0 = Female, 1 = Male).
gender_0_mask = df['Gender'].eq(0)
gender_1_mask = df['Gender'].eq(1)

# Group sizes = number of True entries in each mask.
count_gender_0 = gender_0_mask.sum()
count_gender_1 = gender_1_mask.sum()

print("Female: ", count_gender_0)
print("Male: ", count_gender_1)

# Standardise the features with a scaler fitted on all rows of X.
# NOTE(review): the scaler is fitted on the full dataset before the K-fold
# split used later, so held-out folds contribute to the scaling statistics.
# Confirm whether fl.run_experiment re-scales per fold; if not, this is a
# (mild) train/test leakage.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Slice the scaled features and the labels into the two gender groups.
# The masks come from `df`, whose rows are in the same order as X and y.
X_scaled_Gender_0, y_Gender_0 = X_scaled[gender_0_mask], y[gender_0_mask]
X_scaled_Gender_1, y_Gender_1 = X_scaled[gender_1_mask], y[gender_1_mask]

Female:  140
Male:  439


In [14]:
# Shared experiment configuration.
# One seed drives both the model construction and the fold shuffling.
seed = 42

# 5-fold cross-validation with shuffling, made repeatable through the seed.
kf = KFold(n_splits=5, shuffle=True, random_state=seed)

# Models to evaluate, built by the project helper from the same seed.
models = fl.build_models(seed)

# Filled by the experiment runs below; later concatenated into one frame.
results_list = []

In [15]:
# Run the cross-validated experiment once per gender group and gather the
# per-fold results into a single frame.
#
# results_list is emptied first so that re-running this cell does not append a
# second copy of every fold to the results (the list is filled by
# fl.run_experiment and would otherwise keep growing across re-runs).
results_list.clear()

# (group label, gender code, features, labels) for each group, in run order.
group_runs = (
    ('Female', 0, X_scaled_Gender_0, y_Gender_0),
    ('Male', 1, X_scaled_Gender_1, y_Gender_1),
)

for group_label, group_code, X_group, y_group in group_runs:
    print(f"Starting experiments for {group_label}({group_code})")
    fl.run_experiment(kf, models, X_group, y_group, group_label, results_list)

final_results_df = pd.concat(results_list, ignore_index=True)

# Rich display instead of print(): the frame is wide, and the plain-text
# rendering wraps its columns into several hard-to-read blocks.
final_results_df

Starting experiments for Female(0)
Processing fold 1 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 2 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 3 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 4 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 5 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Starting experiments for Male(1)
Processing fold 1 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 2 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 3 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 4 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 5 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
   Fold   Group   SVM_TPR   SVM_TNR   SVM_FPR   SVM_FNR  SVM_TP  SVM_TN  \
0     1  Female  0.000000  1.000000  0.000000  1.000000       0      19   
1     2  Female  0.000000  1.000000  0.000000  1.000000       0      16   
2     3  Female  0.000000  1.000000  0.000000  1.000000       0      19   
3     4  Female  0.000000  1.000000  0.000000  1.000000       0      18   
4     5  Female  0.000000  0.894737  0.105263  1.000000       0      17   
5     1    Male  0.074074  0.918033  0.081967  0.925926       2      56   
6     2    Male  0.000000  1.000000  0.000000  1.000000       0      63   
7     3    Male  0.000000  1.000000  0.000000  1.000000       0      64   
8     4    Male  



In [16]:
# Combine the collected result frames into one frame with a fresh 0..n-1 index.
# NOTE(review): this repeats the concat that already produced
# `final_results_df` in the previous cell; one of the two names is redundant.
results_df = pd.concat(results_list, ignore_index=True)

In [17]:
# Persist the combined per-fold results as an Excel workbook.
result_path = './results/K39_result.xlsx'

# to_excel does not create missing parent directories, so on a fresh checkout
# (no ./results folder yet) the write would fail. Create it if needed.
os.makedirs(os.path.dirname(result_path), exist_ok=True)

# index=False: the row index is just 0..n-1 and carries no information.
results_df.to_excel(result_path, index=False)

In [18]:
# Reload the saved results from disk and preview them.
# NOTE(review): this rebinds `df`, which until now held the patient data.
# After this cell, re-running any earlier cell that uses `df` (encoding,
# masks, ...) will operate on the results table instead. A distinct name
# would be safer, but the following cell reads `df`, so both must change
# together.
df = pd.read_excel(result_path)
df.head()

Unnamed: 0,Fold,Group,SVM_TPR,SVM_TNR,SVM_FPR,SVM_FNR,SVM_TP,SVM_TN,SVM_FP,SVM_FN,...,ANN_FP,ANN_FN,NB_TPR,NB_TNR,NB_FPR,NB_FNR,NB_TP,NB_TN,NB_FP,NB_FN
0,1,Female,0.0,1.0,0.0,1.0,0,19,0,9,...,6,8,0.888889,0.473684,0.526316,0.111111,8,9,10,1
1,2,Female,0.0,1.0,0.0,1.0,0,16,0,12,...,1,11,1.0,0.375,0.625,0.0,12,6,10,0
2,3,Female,0.0,1.0,0.0,1.0,0,19,0,9,...,5,6,0.777778,0.473684,0.526316,0.222222,7,9,10,2
3,4,Female,0.0,1.0,0.0,1.0,0,18,0,10,...,7,8,0.9,0.277778,0.722222,0.1,9,5,13,1
4,5,Female,0.0,0.894737,0.105263,1.0,0,17,2,9,...,5,7,0.888889,0.368421,0.631579,0.111111,8,7,12,1


In [19]:
# Group label passed to the statistical tests.
# NOTE(review): presumably selects the group compared against the other one;
# confirm against fl.perform_t_tests.
label = 'Female'

# Run the per-model statistical tests on the reloaded results. The order is
# kept as it was so the printed report reads the same; a loop replaces seven
# copy-pasted calls that differed only in the model name.
for model_name in ('SVM', 'DT', 'RF', 'LR', 'KNN', 'ANN', 'NB'):
    fl.perform_t_tests(df, model_name, label)


SVM -TPR: MannwhitneyuResult(statistic=np.float64(10.0), pvalue=np.float64(0.4237107971667934))
SVM - FPR: MannwhitneyuResult(statistic=np.float64(13.0), pvalue=np.float64(1.0))
SVM - FN/FP: MannwhitneyuResult(statistic=np.float64(4.0), pvalue=np.float64(0.0936926194932482))
DT -TPR: TtestResult(statistic=np.float64(-0.7988841209911899), pvalue=np.float64(0.44742401614012195), df=np.float64(8.0))
DT - FPR: MannwhitneyuResult(statistic=np.float64(22.0), pvalue=np.float64(0.059327060946523506))
DT - FN/FP: TtestResult(statistic=np.float64(0.41811346420058554), pvalue=np.float64(0.6868617444682084), df=np.float64(8.0))
RF -TPR: TtestResult(statistic=np.float64(-1.5399299646013664), pvalue=np.float64(0.1621422186737848), df=np.float64(8.0))
RF - FPR: TtestResult(statistic=np.float64(3.495736160851175), pvalue=np.float64(0.008129739016635565), df=np.float64(8.0))
RF - FN/FP: MannwhitneyuResult(statistic=np.float64(5.0), pvalue=np.float64(0.14245669739409875))
LR -TPR: TtestResult(statistic=