In [1]:
from ucimlrepo import fetch_ucirepo 
  
# Download the UCI "HCV data" dataset (repository id 571).
hcv_data = fetch_ucirepo(id=571)

# Feature and target tables (as pandas dataframes).
X, y = hcv_data.data.features, hcv_data.data.targets

# Print the dataset-level metadata first, then the per-variable information.
for info in (hcv_data.metadata, hcv_data.variables):
    print(info)


{'uci_id': 571, 'name': 'HCV data', 'repository_url': 'https://archive.ics.uci.edu/dataset/571/hcv+data', 'data_url': 'https://archive.ics.uci.edu/static/public/571/data.csv', 'abstract': 'The data set contains laboratory values of blood donors and Hepatitis C patients and demographic values like age.', 'area': 'Health and Medicine', 'tasks': ['Classification', 'Clustering'], 'characteristics': ['Multivariate'], 'num_instances': 615, 'num_features': 12, 'feature_types': ['Integer', 'Real'], 'demographics': ['Age', 'Sex'], 'target_col': ['Category'], 'index_col': ['ID'], 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 2020, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C5D612', 'creators': ['Ralf Lichtinghagen', 'Frank Klawonn', 'Georg Hoffmann'], 'intro_paper': {'ID': 237, 'type': 'NATIVE', 'title': 'Using machine learning techniques to generate laboratory diagnostic pathways—a case study', 'authors': 'Georg F. Hoffmann, A. Bietenb

In [2]:
import pandas as pd

# Combine the feature and target tables column-wise into one working frame.
feature = pd.DataFrame(X)
target = pd.DataFrame(y)
df = pd.concat([feature, target], axis=1)

# Preview the first rows. Only the last expression of a cell is rendered, so
# the former mid-cell `feature.head()` (whose result was discarded) is removed.
df.head()

Unnamed: 0,Age,Sex,ALB,ALP,AST,BIL,CHE,CHOL,CREA,CGT,PROT,ALT,Category
0,32,m,38.5,52.5,22.1,7.5,6.93,3.23,106.0,12.1,69.0,7.7,0=Blood Donor
1,32,m,38.5,70.3,24.7,3.9,11.17,4.8,74.0,15.6,76.5,18.0,0=Blood Donor
2,32,m,46.9,74.7,52.6,6.1,8.84,5.2,86.0,33.2,79.3,36.2,0=Blood Donor
3,32,m,43.2,52.0,22.6,18.9,7.33,4.74,80.0,33.8,75.7,30.6,0=Blood Donor
4,32,m,39.2,74.1,24.8,9.6,9.15,4.32,76.0,29.9,68.7,32.6,0=Blood Donor


In [3]:

import os

import fairtl as fl
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler

In [4]:
# Dimensions of the combined frame as (rows, columns).
df.shape

(615, 13)

In [5]:
# Distinct raw class labels, inspected before they are encoded as integers.
df['Category'].unique()

array(['0=Blood Donor', '0s=suspect Blood Donor', '1=Hepatitis',
       '2=Fibrosis', '3=Cirrhosis'], dtype=object)

In [6]:
# Integer codes for the five diagnosis labels present in the raw `Category` column.
category_codes = {
    '0s=suspect Blood Donor': 0,
    '0=Blood Donor': 1,
    '1=Hepatitis': 2,
    '2=Fibrosis': 3,
    '3=Cirrhosis': 4,
}

# Series.map yields NaN for values that are not keys of the dict, so running
# this cell a second time on the already-encoded integers would wipe the
# column out. Encode only while the column still holds the raw string labels.
if not pd.api.types.is_numeric_dtype(df['Category']):
    df['Category'] = df['Category'].map(category_codes)

In [7]:
# Binary codes for the `Sex` column: 'f' -> 0, 'm' -> 1.
sex_codes = {'f': 0, 'm': 1}

# Series.map yields NaN for values that are not keys of the dict, so a second
# run on the already-encoded column would turn every entry into NaN. Encode
# only while the column still holds the raw 'f'/'m' strings.
if not pd.api.types.is_numeric_dtype(df['Sex']):
    df['Sex'] = df['Sex'].map(sex_codes)

In [8]:
# Sanity check after encoding: the distinct values should now be integer codes.
df['Category'].unique()

array([1, 0, 2, 3, 4])

In [9]:
# Per-column dtype and non-null count; used to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 615 entries, 0 to 614
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       615 non-null    int64  
 1   Sex       615 non-null    int64  
 2   ALB       614 non-null    float64
 3   ALP       597 non-null    float64
 4   AST       615 non-null    float64
 5   BIL       615 non-null    float64
 6   CHE       615 non-null    float64
 7   CHOL      605 non-null    float64
 8   CREA      615 non-null    float64
 9   CGT       615 non-null    float64
 10  PROT      614 non-null    float64
 11  ALT       614 non-null    float64
 12  Category  615 non-null    int64  
dtypes: float64(10), int64(3)
memory usage: 62.6 KB


In [10]:
# Keep only rows without missing values and renumber them 0..n-1. The fresh
# positional index matters because later cells use boolean masks built from
# `df` to select rows of NumPy arrays.
# Written as a chained expression instead of `inplace=True` mutation.
df = df.dropna().reset_index(drop=True)

In [11]:
# Split the cleaned frame: every column except the label forms the feature
# table, and the encoded label column becomes a NumPy target vector.
X = df.drop(columns='Category')
y = df['Category'].to_numpy()

In [12]:
# Shape before filtering.
print(X.shape)

# Fit a variance filter with threshold 0.1 on the (unscaled) features, then
# apply it. `X` is rebound to the transformer's output.
selector = VarianceThreshold(threshold=0.1).fit(X)
X = selector.transform(X)

# Shape after filtering (the recorded run kept all 12 columns).
print(X.shape)

(589, 12)
(589, 12)


In [13]:
# Boolean row selectors for each encoded sex value (0 = 'f', 1 = 'm').
sex = df['Sex']
gender_0_mask, gender_1_mask = sex.eq(0), sex.eq(1)

# Group sizes.
count_gender_0, count_gender_1 = gender_0_mask.sum(), gender_1_mask.sum()
print("Female: ", count_gender_0)
print("Male: ", count_gender_1)

# Standardise the features once, using statistics from the full cleaned dataset.
# NOTE(review): the scaler is fitted on all rows before any cross-validation
# split, so held-out folds contribute to the scaling statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Per-group features and targets.
# NOTE(review): `Sex` is still one of the feature columns and is constant
# inside each group - confirm this is intended.
X_scaled_Gender_0, y_Gender_0 = X_scaled[gender_0_mask], y[gender_0_mask]
X_scaled_Gender_1, y_Gender_1 = X_scaled[gender_1_mask], y[gender_1_mask]

Female:  226
Male:  363


In [14]:
# One seed shared by the model factory and the fold shuffling.
seed = 42

# Model collection supplied by the project's `fairtl` helper module.
models = fl.build_models(seed)

# Accumulator handed to the experiment runs; starts out empty.
results_list = list()

# 5-fold cross-validation splitter with seeded shuffling.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=seed,
)

In [15]:
# `results_list` is empty before this cell and is concatenated right after the
# runs, i.e. the runs add their per-fold frames to it. Empty it first so that
# re-running this cell does not stack a duplicate set of rows onto the
# previous run's results.
results_list.clear()

print("Starting experiments for Female(0)")
fl.run_experiment(kf, models, X_scaled_Gender_0, y_Gender_0, 'Female', results_list)

print("Starting experiments for Male(1)")
fl.run_experiment(kf, models, X_scaled_Gender_1, y_Gender_1, 'Male', results_list)

# One table with every fold of both groups; shown via the notebook's rich
# display as the cell's last expression rather than a plain-text print.
final_results_df = pd.concat(results_list, ignore_index=True)
final_results_df

Starting experiments for Female(0)
Processing fold 1 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 2 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 3 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 4 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 5 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Starting experiments for Male(1)
Processing fold 1 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 2 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 3 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 4 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 5 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
   Fold   Group   SVM_TPR  SVM_TNR  SVM_FPR   SVM_FNR  SVM_TP  SVM_TN  SVM_FP  \
0     1  Female  0.000000      1.0      0.0  1.000000       0      42       0   
1     2  Female  1.000000      0.0      0.0  0.000000      43       0       0   
2     3  Female  0.000000      1.0      0.0  1.000000       0      41       0   
3     4  Female  1.000000      1.0      0.0  0.000000       1      44       0   
4     5  Female  0.000000      1.0      0.0  1.000000       0      39       0   
5     1    Male  0.333333      1.0      0.0  0.666667       1      64       0   
6     2    Male  1.000000      0.0      1.0  0.000000      64       0       3   
7     3    Male  1.000000      0.0      0.0  0.000000      62       0       0   
8     4    Male  1.000000      0.0      0.0  0.000000      64       0       0   
9     5    Male  1.000000      0.0      1.0  0.000000      61       0       1   

   SV



In [16]:
# Stack all collected result frames into one table with a fresh 0..n-1 index.
# NOTE(review): this is the same expression that builds `final_results_df` in
# the previous cell.
results_df = pd.concat(results_list, ignore_index=True)

In [17]:
# Destination workbook for the per-fold results.
result_path = './results/u13_result.xlsx'

# DataFrame.to_excel does not create missing parent folders, so make sure the
# output directory exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(result_path), exist_ok=True)

# Write without the positional index column.
results_df.to_excel(result_path, index=False)

In [18]:
# Read the saved results workbook back from disk and preview the first rows.
# NOTE(review): this rebinds `df`, which the earlier cells use for the HCV
# dataset; re-running those cells afterwards requires rebuilding the dataset.
df = pd.read_excel(result_path)
df.head()

Unnamed: 0,Fold,Group,SVM_TPR,SVM_TNR,SVM_FPR,SVM_FNR,SVM_TP,SVM_TN,SVM_FP,SVM_FN,...,ANN_FP,ANN_FN,NB_TPR,NB_TNR,NB_FPR,NB_FNR,NB_TP,NB_TN,NB_FP,NB_FN
0,1,Female,0.0,1,0,1.0,0,42,0,1,...,0,0,0.0,1.0,0.0,0.0,0,42,0,0
1,2,Female,1.0,0,0,0.0,43,0,0,0,...,1,0,1.0,0.0,0.0,0.0,42,0,0,0
2,3,Female,0.0,1,0,1.0,0,41,0,1,...,0,1,0.0,1.0,0.0,1.0,0,40,0,1
3,4,Female,1.0,1,0,0.0,1,44,0,0,...,0,0,1.0,0.977273,0.022727,0.0,1,43,1,0
4,5,Female,0.0,1,0,1.0,0,39,0,2,...,0,2,0.0,1.0,0.0,1.0,0,39,0,2


In [19]:
# Group label handed to the test helper.
label = 'Female'

# Run the project's per-model significance tests on the reloaded results
# table, one model prefix at a time, in the same order as before. The recorded
# output shows Mann-Whitney U results for TPR, FPR and FN/FP per model.
for model_name in ('SVM', 'DT', 'RF', 'LR', 'KNN', 'ANN', 'NB'):
    fl.perform_t_tests(df, model_name, label)


SVM -TPR: MannwhitneyuResult(statistic=np.float64(6.0), pvalue=np.float64(0.15149399240422012))
SVM - FPR: MannwhitneyuResult(statistic=np.float64(7.5), pvalue=np.float64(0.17701598287480413))
SVM - FN/FP: MannwhitneyuResult(statistic=np.float64(16.5), pvalue=np.float64(0.40648344235657985))
DT -TPR: MannwhitneyuResult(statistic=np.float64(6.0), pvalue=np.float64(0.15149399240422012))
DT - FPR: MannwhitneyuResult(statistic=np.float64(12.0), pvalue=np.float64(1.0))
DT - FN/FP: MannwhitneyuResult(statistic=np.float64(12.5), pvalue=np.float64(1.0))
RF -TPR: MannwhitneyuResult(statistic=np.float64(6.0), pvalue=np.float64(0.15149399240422012))
RF - FPR: MannwhitneyuResult(statistic=np.float64(2.5), pvalue=np.float64(0.02315095198524634))
RF - FN/FP: MannwhitneyuResult(statistic=np.float64(15.5), pvalue=np.float64(0.5186050164287256))
LR -TPR: MannwhitneyuResult(statistic=np.float64(3.0), pvalue=np.float64(0.03766692222862868))
LR - FPR: MannwhitneyuResult(statistic=np.float64(13.0), pvalue=