In [1]:
import kagglehub

# Kaggle handle ("owner/dataset-slug") of the liver-disease dataset used below.
DATASET_HANDLE = "rabieelkharoua/predict-liver-disease-1700-records-dataset"

# Download the latest version; `path` is the local directory that holds the
# dataset files and is reused by the cells that list and read them.
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /home/morning/.cache/kagglehub/datasets/rabieelkharoua/predict-liver-disease-1700-records-dataset/versions/1


In [2]:
import os
# Show which files the download directory contains, so the CSV name used in
# the read_csv cell can be checked against what was actually downloaded.
os.listdir(path)

['Liver_disease_data.csv']

In [3]:
import pandas as pd
import fairtl_statisticaltest as fl
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [4]:
# Load the single CSV shipped with the dataset. os.path.join builds the path
# with the platform's separator and copes with a trailing slash on `path`,
# which the hand-written f-string did not.
df = pd.read_csv(os.path.join(path, 'Liver_disease_data.csv'))
# Preview the first rows to sanity-check column names and value ranges.
df.head()

Unnamed: 0,Age,Gender,BMI,AlcoholConsumption,Smoking,GeneticRisk,PhysicalActivity,Diabetes,Hypertension,LiverFunctionTest,Diagnosis
0,58,0,35.857584,17.272828,0,1,0.65894,0,0,42.73424,1
1,71,1,30.73247,2.201266,0,1,1.670557,1,0,67.309822,1
2,48,0,19.971407,18.500944,0,0,9.928308,0,0,63.738956,0
3,34,1,16.615417,12.63287,0,0,5.630129,0,0,64.555873,1
4,62,1,16.06583,1.087815,0,1,3.566218,1,0,77.868689,1


In [5]:
# (rows, columns) of the raw dataset, target column included.
df.shape

(1700, 11)

In [6]:
# Per-column dtype and non-null count; used to check for missing values and
# to see which columns are integer-coded versus continuous.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1700 entries, 0 to 1699
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Age                 1700 non-null   int64  
 1   Gender              1700 non-null   int64  
 2   BMI                 1700 non-null   float64
 3   AlcoholConsumption  1700 non-null   float64
 4   Smoking             1700 non-null   int64  
 5   GeneticRisk         1700 non-null   int64  
 6   PhysicalActivity    1700 non-null   float64
 7   Diabetes            1700 non-null   int64  
 8   Hypertension        1700 non-null   int64  
 9   LiverFunctionTest   1700 non-null   float64
 10  Diagnosis           1700 non-null   int64  
dtypes: float64(4), int64(7)
memory usage: 146.2 KB


In [7]:
# List the distinct codes stored in Gender; the per-group split further down
# selects rows by comparing this column to 0 and 1.
# NOTE(review): the later cells label code 0 as "Female" and code 1 as "Male".
# Confirm this against the dataset's data dictionary before interpreting the
# group results -- TODO verify.
df['Gender'].unique()

array([0, 1])

In [8]:
# Target: the Diagnosis column as a plain NumPy array.
y = df['Diagnosis'].to_numpy()
# Features: every other column of the frame (Gender is still part of X here).
X = df.drop(columns=['Diagnosis'])

In [9]:
# Remove features whose variance does not exceed 0.1. The filter runs on the
# unscaled values, and `X` is rebound to the transformer's output.
selector = VarianceThreshold(threshold=0.1)

# Print the shape before and after so it is visible whether any column was
# dropped.
print(X.shape)
X = selector.fit(X).transform(X)
print(X.shape)

(1700, 10)
(1700, 10)


In [10]:
# Standardise every feature using statistics computed over ALL rows.
# NOTE(review): the scaler is fitted before the KFold split used by the
# experiment cells, so held-out folds contribute to the mean/std -- consider
# fitting the scaler inside each training fold instead.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Boolean row selectors, one per Gender code.
gender_0_mask = df['Gender'].eq(0)
gender_1_mask = df['Gender'].eq(1)

# Group sizes.
count_gender_0, count_gender_1 = gender_0_mask.sum(), gender_1_mask.sum()

# NOTE(review): these labels assume code 0 = Female and code 1 = Male;
# confirm against the dataset's data dictionary -- TODO verify.
print("Female: ", count_gender_0)
print("Male: ", count_gender_1)

# Per-group feature matrices and targets, sliced with the masks above.
X_scaled_Gender_0, y_Gender_0 = X_scaled[gender_0_mask], y[gender_0_mask]
X_scaled_Gender_1, y_Gender_1 = X_scaled[gender_1_mask], y[gender_1_mask]

Female:  843
Male:  857


In [11]:
# One seed shared by the model factory and the fold splitter.
seed = 42

# Shuffled 5-fold splitter; the same object is passed to both group runs.
kf = KFold(random_state=seed, shuffle=True, n_splits=5)

# Models to evaluate, built by the project helper from the shared seed.
models = fl.build_models(seed)

# Accumulator that the experiment runs fill and that is concatenated afterwards.
results_list = []

In [12]:
# Empty the accumulator first so that re-running this cell does not append a
# second copy of every fold's metrics (which would silently double the rows
# fed into the concat and the statistical tests). Clearing in place keeps the
# same list object that the setup cell created.
results_list.clear()

# Cross-validate every model on the rows with Gender == 0.
print("Starting experiments for Female(0)")
fl.run_experiment(kf, models, X_scaled_Gender_0, y_Gender_0, 'Female', results_list)

# Same procedure on the rows with Gender == 1.
print("Starting experiments for Male(1)")
fl.run_experiment(kf, models, X_scaled_Gender_1, y_Gender_1, 'Male', results_list)

# Stack everything collected in results_list into one table with a fresh index.
final_results_df = pd.concat(results_list, ignore_index=True)
print(final_results_df)

Starting experiments for Female(0)
Processing fold 1 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 2 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 3 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 4 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 5 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Starting experiments for Male(1)
Processing fold 1 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 2 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 3 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 4 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 5 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
   Fold   Group   SVM_TPR   SVM_TNR   SVM_FPR   SVM_FNR  SVM_TP  SVM_TN  \
0     1  Female  0.851351  0.842105  0.157895  0.148649      63      80   
1     2  Female  0.779221  0.913043  0.086957  0.220779      60      84   
2     3  Female  0.835616  0.895833  0.104167  0.164384      61      86   
3     4  Female  0.810127  0.797753  0.202247  0.189873      64      71   
4     5  Female  0.790123  0.873563  0.126437  0.209877      64      76   
5     1    Male  0.963636  0.725806  0.274194  0.036364     106      45   
6     2    Male  0.913793  0.678571  0.321429  0.086207     106      38   
7     3    Male  0.865385  0.701493  0.298507  0.134615      90      47   
8     4    Male  



In [13]:
# Stack the collected results into one table with a fresh 0..n-1 index.
# NOTE(review): this repeats the concat already stored in final_results_df by
# the previous cell; one of the two names could be dropped.
results_df = pd.concat(results_list, ignore_index=True)

In [14]:
# Persist the per-fold metrics so the statistical tests can be re-run from the
# saved file without retraining.
result_path = './results/K25_result.xlsx'
# to_excel does not create missing parent directories, so make sure ./results
# exists before writing (a no-op when it is already there).
os.makedirs(os.path.dirname(result_path), exist_ok=True)
results_df.to_excel(result_path, index=False)

In [15]:
# Read the saved metrics back from disk and preview them.
# NOTE(review): this rebinds `df`, which until now held the raw liver dataset.
# After this cell, re-running any earlier cell that reads df['Gender'] or
# df['Diagnosis'] will fail because the results table has no such columns;
# consider a distinct name for the reloaded results.
df = pd.read_excel(result_path)
df.head()

Unnamed: 0,Fold,Group,SVM_TPR,SVM_TNR,SVM_FPR,SVM_FNR,SVM_TP,SVM_TN,SVM_FP,SVM_FN,...,ANN_FP,ANN_FN,NB_TPR,NB_TNR,NB_FPR,NB_FNR,NB_TP,NB_TN,NB_FP,NB_FN
0,1,Female,0.851351,0.842105,0.157895,0.148649,63,80,15,11,...,16,10,0.810811,0.821053,0.178947,0.189189,60,78,17,14
1,2,Female,0.779221,0.913043,0.086957,0.220779,60,84,8,17,...,5,18,0.779221,0.869565,0.130435,0.220779,60,80,12,17
2,3,Female,0.835616,0.895833,0.104167,0.164384,61,86,10,12,...,11,14,0.712329,0.875,0.125,0.287671,52,84,12,21
3,4,Female,0.810127,0.797753,0.202247,0.189873,64,71,18,15,...,20,13,0.683544,0.797753,0.202247,0.316456,54,71,18,25
4,5,Female,0.790123,0.873563,0.126437,0.209877,64,76,11,17,...,10,17,0.777778,0.850575,0.149425,0.222222,63,74,13,18


In [16]:
# Group label handed to every test call below.
label = 'Female'

# Run the same statistical comparison for each model, in the original order.
for model_name in ('SVM', 'DT', 'RF', 'LR', 'KNN', 'ANN', 'NB'):
    fl.perform_t_tests(df, model_name, label)


SVM -TPR: TtestResult(statistic=np.float64(-4.876247289028811), pvalue=np.float64(0.0012301924748186035), df=np.float64(8.0))
SVM - FPR: TtestResult(statistic=np.float64(-5.001970209343365), pvalue=np.float64(0.001050238030800717), df=np.float64(8.0))
SVM - FN/FP: TtestResult(statistic=np.float64(2.6777071113791857), pvalue=np.float64(0.028025418882360428), df=np.float64(8.0))
DT -TPR: TtestResult(statistic=np.float64(-2.341908835128447), pvalue=np.float64(0.04727460998425441), df=np.float64(8.0))
DT - FPR: TtestResult(statistic=np.float64(-2.041620271035891), pvalue=np.float64(0.07548377118988663), df=np.float64(8.0))
DT - FN/FP: MannwhitneyuResult(statistic=np.float64(12.5), pvalue=np.float64(1.0))
RF -TPR: TtestResult(statistic=np.float64(-2.933511109422888), pvalue=np.float64(0.018896327914488847), df=np.float64(8.0))
RF - FPR: TtestResult(statistic=np.float64(-2.887370401915957), pvalue=np.float64(0.020280818929884484), df=np.float64(8.0))
RF - FN/FP: TtestResult(statistic=np.floa