In [1]:
import kagglehub

# Fetch the latest published version of the cirrhosis dataset via kagglehub.
dataset_handle = "fedesoriano/cirrhosis-prediction-dataset"
path = kagglehub.dataset_download(dataset_handle)

# Echo the download location so the following cells can be checked against it.
print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /home/morning/.cache/kagglehub/datasets/fedesoriano/cirrhosis-prediction-dataset/versions/2


In [2]:
import os
# List the downloaded files to find the CSV name used when loading the data.
os.listdir(path)

['cirrhosis.csv']

In [3]:
import pandas as pd
import fairtl_statisticaltest as fl
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [4]:
# Load the single CSV found in the download directory into a DataFrame.
csv_path = f'{path}/cirrhosis.csv'
df = pd.read_csv(csv_path)

# Preview the first rows.
df.head()

Unnamed: 0,ID,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,1,400,D,D-penicillamine,21464,F,Y,Y,Y,Y,14.5,261.0,2.6,156.0,1718.0,137.95,172.0,190.0,12.2,4.0
1,2,4500,C,D-penicillamine,20617,F,N,Y,Y,N,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,3.0
2,3,1012,D,D-penicillamine,25594,M,N,N,N,S,1.4,176.0,3.48,210.0,516.0,96.1,55.0,151.0,12.0,4.0
3,4,1925,D,D-penicillamine,19994,F,N,Y,Y,S,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,4.0
4,5,1504,CL,Placebo,13918,F,N,Y,Y,N,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,3.0


In [5]:
# Quick size check of the raw table as (rows, columns).
df.shape

(418, 20)

In [6]:
# Inspect dtypes and non-null counts to see which columns contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ID             418 non-null    int64  
 1   N_Days         418 non-null    int64  
 2   Status         418 non-null    object 
 3   Drug           312 non-null    object 
 4   Age            418 non-null    int64  
 5   Sex            418 non-null    object 
 6   Ascites        312 non-null    object 
 7   Hepatomegaly   312 non-null    object 
 8   Spiders        312 non-null    object 
 9   Edema          418 non-null    object 
 10  Bilirubin      418 non-null    float64
 11  Cholesterol    284 non-null    float64
 12  Albumin        418 non-null    float64
 13  Copper         310 non-null    float64
 14  Alk_Phos       312 non-null    float64
 15  SGOT           312 non-null    float64
 16  Tryglicerides  282 non-null    float64
 17  Platelets      407 non-null    float64
 18  Prothrombi

In [7]:
# ID looks like a plain row counter (1, 2, 3, ... in the preview), so it is
# removed before modelling.
# Rebinding with errors='ignore' (instead of inplace=True) keeps this cell safe
# to re-run: a second execution no longer raises KeyError for the missing column.
df = df.drop(columns='ID', errors='ignore')

In [8]:
# Keep only complete cases, then renumber the rows from 0 so that frames built
# later with a default RangeIndex line up when concatenated column-wise.
df = df.dropna().reset_index(drop=True)

# Preview the cleaned table.
df.head()

Unnamed: 0,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,400,D,D-penicillamine,21464,F,Y,Y,Y,Y,14.5,261.0,2.6,156.0,1718.0,137.95,172.0,190.0,12.2,4.0
1,4500,C,D-penicillamine,20617,F,N,Y,Y,N,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,3.0
2,1012,D,D-penicillamine,25594,M,N,N,N,S,1.4,176.0,3.48,210.0,516.0,96.1,55.0,151.0,12.0,4.0
3,1925,D,D-penicillamine,19994,F,N,Y,Y,S,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,4.0
4,1504,CL,Placebo,13918,F,N,Y,Y,N,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,3.0


In [9]:
# Re-check the frame after dropping incomplete rows: every column should now be fully non-null.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 276 entries, 0 to 275
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   N_Days         276 non-null    int64  
 1   Status         276 non-null    object 
 2   Drug           276 non-null    object 
 3   Age            276 non-null    int64  
 4   Sex            276 non-null    object 
 5   Ascites        276 non-null    object 
 6   Hepatomegaly   276 non-null    object 
 7   Spiders        276 non-null    object 
 8   Edema          276 non-null    object 
 9   Bilirubin      276 non-null    float64
 10  Cholesterol    276 non-null    float64
 11  Albumin        276 non-null    float64
 12  Copper         276 non-null    float64
 13  Alk_Phos       276 non-null    float64
 14  SGOT           276 non-null    float64
 15  Tryglicerides  276 non-null    float64
 16  Platelets      276 non-null    float64
 17  Prothrombin    276 non-null    float64
 18  Stage     

In [10]:
# Check the distinct Status labels before encoding them.
df['Status'].unique()

array(['D', 'C', 'CL'], dtype=object)

In [11]:
# One-hot encode the Status column.
status = df['Status'].values.reshape(-1, 1)  # the encoder works on a 2-D (n_samples, 1) array
enc = OneHotEncoder(categories='auto')
enc.fit(status)

# Show the generated feature names (one per learned category).
new_features = enc.get_feature_names_out()
print(new_features)

# Build the dummy frame on df's own index so the later column-wise concat
# aligns row-for-row even if df's index is not a fresh 0..n-1 range
# (a default RangeIndex would silently produce NaN-padded rows in that case).
new_status = pd.DataFrame(enc.transform(status).toarray(), index=df.index)

['x0_C' 'x0_CL' 'x0_D']


In [12]:
# Name the dummy columns after the categories the encoder actually learned, so
# the labels cannot drift out of order (or length) if the Status values change.
new_status.columns = list(enc.categories_[0])

In [13]:
# Swap the raw Status column for its one-hot columns.
# The guard makes the cell re-runnable: once Status is gone, a second run would
# otherwise append duplicate dummy columns and then fail on the missing column.
if 'Status' in df.columns:
    df = pd.concat([df.drop(columns='Status'), new_status], axis=1)

In [14]:
# Confirm Status has been replaced by the C / CL / D indicator columns.
df.head()

Unnamed: 0,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,...,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,C,CL,D
0,400,D-penicillamine,21464,F,Y,Y,Y,Y,14.5,261.0,...,156.0,1718.0,137.95,172.0,190.0,12.2,4.0,0.0,0.0,1.0
1,4500,D-penicillamine,20617,F,N,Y,Y,N,1.1,302.0,...,54.0,7394.8,113.52,88.0,221.0,10.6,3.0,1.0,0.0,0.0
2,1012,D-penicillamine,25594,M,N,N,N,S,1.4,176.0,...,210.0,516.0,96.1,55.0,151.0,12.0,4.0,0.0,0.0,1.0
3,1925,D-penicillamine,19994,F,N,Y,Y,S,1.8,244.0,...,64.0,6121.8,60.63,92.0,183.0,10.3,4.0,0.0,0.0,1.0
4,1504,Placebo,13918,F,N,Y,Y,N,3.4,279.0,...,143.0,671.0,113.15,72.0,136.0,10.9,3.0,0.0,1.0,0.0


In [15]:
# Check the distinct Drug labels before encoding them.
df['Drug'].unique()

array(['D-penicillamine', 'Placebo'], dtype=object)

In [16]:
# One-hot encode the Drug column.
drug = df['Drug'].values.reshape(-1, 1)  # the encoder works on a 2-D (n_samples, 1) array
enc2 = OneHotEncoder(categories='auto')
enc2.fit(drug)

# Show the generated feature names (one per learned category).
new_features = enc2.get_feature_names_out()
print(new_features)

# Build the dummy frame on df's own index so the later column-wise concat
# aligns row-for-row even if df's index is not a fresh 0..n-1 range.
new_drug = pd.DataFrame(enc2.transform(drug).toarray(), index=df.index)

['x0_D-penicillamine' 'x0_Placebo']


In [17]:
# Name the dummy columns after the categories the encoder actually learned, so
# the labels cannot drift out of order (or length) if the Drug values change.
new_drug.columns = list(enc2.categories_[0])

In [18]:
# Swap the raw Drug column for its one-hot columns.
# The guard makes the cell re-runnable: once Drug is gone, a second run would
# otherwise append duplicate dummy columns and then fail on the missing column.
if 'Drug' in df.columns:
    df = pd.concat([df.drop(columns='Drug'), new_drug], axis=1)

In [19]:
# Confirm Drug has been replaced by its two indicator columns.
df.head()

Unnamed: 0,N_Days,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,...,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,C,CL,D,D-penicillamine,Placebo
0,400,21464,F,Y,Y,Y,Y,14.5,261.0,2.6,...,137.95,172.0,190.0,12.2,4.0,0.0,0.0,1.0,1.0,0.0
1,4500,20617,F,N,Y,Y,N,1.1,302.0,4.14,...,113.52,88.0,221.0,10.6,3.0,1.0,0.0,0.0,1.0,0.0
2,1012,25594,M,N,N,N,S,1.4,176.0,3.48,...,96.1,55.0,151.0,12.0,4.0,0.0,0.0,1.0,1.0,0.0
3,1925,19994,F,N,Y,Y,S,1.8,244.0,2.54,...,60.63,92.0,183.0,10.3,4.0,0.0,0.0,1.0,1.0,0.0
4,1504,13918,F,N,Y,Y,N,3.4,279.0,3.53,...,113.15,72.0,136.0,10.9,3.0,0.0,1.0,0.0,0.0,1.0


In [20]:
# Check the distinct Edema labels before choosing integer codes for them.
df['Edema'].unique()

array(['Y', 'N', 'S'], dtype=object)

In [21]:
# Integer codes for the remaining categorical columns.
category_codes = {
    'Sex': {'F': 0, 'M': 1},
    'Ascites': {'N': 0, 'Y': 1},
    'Hepatomegaly': {'N': 0, 'Y': 1},
    'Spiders': {'N': 0, 'Y': 1},
    'Edema': {'N': 0, 'S': 1, 'Y': 2},
}

for column, codes in category_codes.items():
    # Already encoded (cell re-run): mapping again would turn every value into NaN.
    if df[column].isin(list(codes.values())).all():
        continue
    encoded = df[column].map(codes)
    # Surface unexpected labels instead of letting them become NaN silently.
    if encoded.isna().any():
        unknown = sorted(set(df.loc[encoded.isna(), column].astype(str)))
        raise ValueError(f"Unmapped values in {column!r}: {unknown}")
    df[column] = encoded


In [22]:
# Verify the categorical columns now hold integer codes.
df.head()

Unnamed: 0,N_Days,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,...,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,C,CL,D,D-penicillamine,Placebo
0,400,21464,0,1,1,1,2,14.5,261.0,2.6,...,137.95,172.0,190.0,12.2,4.0,0.0,0.0,1.0,1.0,0.0
1,4500,20617,0,0,1,1,0,1.1,302.0,4.14,...,113.52,88.0,221.0,10.6,3.0,1.0,0.0,0.0,1.0,0.0
2,1012,25594,1,0,0,0,1,1.4,176.0,3.48,...,96.1,55.0,151.0,12.0,4.0,0.0,0.0,1.0,1.0,0.0
3,1925,19994,0,0,1,1,1,1.8,244.0,2.54,...,60.63,92.0,183.0,10.3,4.0,0.0,0.0,1.0,1.0,0.0
4,1504,13918,0,0,1,1,0,3.4,279.0,3.53,...,113.15,72.0,136.0,10.9,3.0,0.0,1.0,0.0,0.0,1.0


In [23]:
# Stage is the target; every remaining column (including Sex, which is later
# used to split the groups) stays in the feature matrix.
X = df.drop(columns='Stage')
y = df['Stage'].to_numpy()

In [24]:
# Feature count before filtering.
print(X.shape)

# Drop near-constant features (variance below 0.1).
# NOTE(review): the threshold is applied to unscaled columns (scaling happens in
# a later cell), so it mostly affects the 0/1 indicator columns — confirm intended.
selector = VarianceThreshold(threshold=0.1).fit(X)
X = selector.transform(X)

# Feature count after filtering.
print(X.shape)

(276, 21)
(276, 19)


In [25]:
# Standardise the whole feature matrix once.
# NOTE(review): the scaler is fit on all rows before the K-fold split, so the
# held-out folds contribute to the scaling statistics — consider scaling inside
# each fold if fl.run_experiment allows it.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# build mask (Sex was encoded earlier as F -> 0, M -> 1)
gender_0_mask = df['Sex'].eq(0)
gender_1_mask = df['Sex'].eq(1)

# Group sizes, reported so the imbalance between the groups is visible.
count_gender_0, count_gender_1 = gender_0_mask.sum(), gender_1_mask.sum()
print("Female: ", count_gender_0)
print("Male: ", count_gender_1)

# Row-wise split of features and targets into the two groups.
X_scaled_Gender_0, y_Gender_0 = X_scaled[gender_0_mask], y[gender_0_mask]
X_scaled_Gender_1, y_Gender_1 = X_scaled[gender_1_mask], y[gender_1_mask]

Female:  242
Male:  34


In [26]:
# One seed shared by the model factory and the fold splitter.
seed = 42

# 5-fold shuffled CV; the same splitter object is reused for both gender groups.
kf = KFold(random_state=seed, shuffle=True, n_splits=5)

# Classifier collection supplied by the project helper module.
models = fl.build_models(seed)

# Collector handed to fl.run_experiment and concatenated afterwards.
results_list = []

In [27]:
# Start from an empty collector so re-running this cell does not stack a second
# copy of every fold's results on top of the first.
# (presumably fl.run_experiment appends to results_list — it is passed in and
# concatenated below; confirm in fl.)
results_list.clear()

# Run the identical experiment for each gender group with the shared splitter and models.
for code, group_name, X_group, y_group in (
    (0, 'Female', X_scaled_Gender_0, y_Gender_0),
    (1, 'Male', X_scaled_Gender_1, y_Gender_1),
):
    print(f"Starting experiments for {group_name}({code})")
    fl.run_experiment(kf, models, X_group, y_group, group_name, results_list)

# NOTE(review): in the printed table some folds report confusion counts that sum
# to only a handful of samples (e.g. SVM, Female, fold 1), far fewer than a test
# fold should hold. Stage appears to be multi-valued (3.0, 4.0, ... in the
# preview) while the metrics are binary TPR/TNR — verify how fl derives
# TP/TN/FP/FN.
final_results_df = pd.concat(results_list, ignore_index=True)
print(final_results_df)

Starting experiments for Female(0)
Processing fold 1 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 2 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 3 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 4 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 5 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Starting experiments for Male(1)
Processing fold 1 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 2 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 3 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 4 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 5 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
   Fold   Group  SVM_TPR  SVM_TNR  SVM_FPR  SVM_FNR  SVM_TP  SVM_TN  SVM_FP  \
0     1  Female    1.000     0.00     0.00    0.000       1       0       0   
1     2  Female    0.875     0.10     0.90    0.125      14       1       9   
2     3  Female    1.000     0.00     1.00    0.000       1       0       1   
3     4  Female    1.000     0.00     1.00    0.000       3       0       1   
4     5  Female    1.000     0.00     1.00    0.000       2       0       1   
5     1    Male    0.000     0.00     1.00    0.000       0       0       1   
6     2    Male    0.000     0.00     0.00    0.000       0       0       0   
7     3    Male    0.000     0.00     1.00    0.000       0       0       1   
8     4    Male    0.000     0.00     0.00    0.000       0       0       0   
9     5    Male    1.000     0.25     0.75    0.000       2       1       3   

   SVM_FN  ...  ANN_FP  ANN

In [28]:
# Combine the collected result frames into one table for export.
# NOTE(review): this repeats the concat already stored in final_results_df by the previous cell.
results_df = pd.concat(results_list, ignore_index=True)

In [29]:
# Persist the per-fold metrics to an Excel workbook.
result_path = './results/K38_result.xlsx'

# Create the target folder first so the write does not fail on a fresh checkout
# where ./results does not exist yet.
os.makedirs(os.path.dirname(result_path), exist_ok=True)
results_df.to_excel(result_path, index=False)

In [30]:
# Read the exported workbook back in and preview it.
# NOTE(review): `df` is rebound here from the patient table to the results
# table; any later cell that uses `df` sees the results, not the dataset.
df = pd.read_excel(result_path)
df.head()

Unnamed: 0,Fold,Group,SVM_TPR,SVM_TNR,SVM_FPR,SVM_FNR,SVM_TP,SVM_TN,SVM_FP,SVM_FN,...,ANN_FP,ANN_FN,NB_TPR,NB_TNR,NB_FPR,NB_FNR,NB_TP,NB_TN,NB_FP,NB_FN
0,1,Female,1.0,0.0,0.0,0.0,1,0,0,0,...,1,0,0.1,1,0,0.9,1,3,0,9
1,2,Female,0.875,0.1,0.9,0.125,14,1,9,2,...,11,5,0.4,0,0,0.6,4,0,0,6
2,3,Female,1.0,0.0,1.0,0.0,1,0,1,0,...,1,0,0.428571,1,0,0.571429,3,2,0,4
3,4,Female,1.0,0.0,1.0,0.0,3,0,1,0,...,1,1,0.375,1,0,0.625,3,2,0,5
4,5,Female,1.0,0.0,1.0,0.0,2,0,1,0,...,2,0,0.2,1,0,0.8,1,3,0,4


In [31]:
# Group label passed to the test helper.
# (presumably the group compared against the other one — confirm in fl.)
label = 'Female'

# Run the comparison once per classifier; the order is fixed so the printed
# report is stable. Note that the printed results are MannwhitneyuResult
# objects, despite the helper being named perform_t_tests.
for model_name in ('SVM', 'DT', 'RF', 'LR', 'KNN', 'ANN', 'NB'):
    fl.perform_t_tests(df, model_name, label)


SVM -TPR: MannwhitneyuResult(statistic=np.float64(22.0), pvalue=np.float64(0.03766692222862868))
SVM - FPR: MannwhitneyuResult(statistic=np.float64(16.0), pvalue=np.float64(0.49782291323157524))
SVM - FN/FP: MannwhitneyuResult(statistic=np.float64(15.0), pvalue=np.float64(0.4237107971667934))
DT -TPR: MannwhitneyuResult(statistic=np.float64(19.5), pvalue=np.float64(0.15028184023327876))
DT - FPR: MannwhitneyuResult(statistic=np.float64(22.5), pvalue=np.float64(0.019964453305216043))
DT - FN/FP: MannwhitneyuResult(statistic=np.float64(12.0), pvalue=np.float64(1.0))
RF -TPR: MannwhitneyuResult(statistic=np.float64(19.0), pvalue=np.float64(0.18122053638510982))
RF - FPR: MannwhitneyuResult(statistic=np.float64(14.0), pvalue=np.float64(0.8173613313851769))
RF - FN/FP: MannwhitneyuResult(statistic=np.float64(12.0), pvalue=np.float64(1.0))
LR -TPR: MannwhitneyuResult(statistic=np.float64(25.0), pvalue=np.float64(0.00878400037247741))
LR - FPR: MannwhitneyuResult(statistic=np.float64(22.5), p