In [120]:
import kagglehub

# Download latest version
# NOTE(review): no dataset version is pinned here, so a later release on Kaggle
# could change the inputs; the run recorded below resolved to `versions/1`.
path = kagglehub.dataset_download("pkdarabi/diabetes-dataset-with-18-features")

# Show the local directory that now holds the dataset files.
print("Path to dataset files:", path)

Path to dataset files: /home/morning/.cache/kagglehub/datasets/pkdarabi/diabetes-dataset-with-18-features/versions/1


In [121]:
import os
# List the files contained in the downloaded dataset directory.
os.listdir(path)

['diabetes.csv']

In [122]:
import pandas as pd
import fairtl_statisticaltest as fl
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold

In [123]:
# Load the dataset's CSV from the download directory and preview the first rows.
csv_path = f'{path}/diabetes.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Age,Gender,BMI,SBP,DBP,FPG,Chol,Tri,HDL,LDL,ALT,BUN,CCR,FFPG,smoking,drinking,family_histroy,Diabetes
0,26,1,20.1,119,81,5.8,4.36,0.86,0.9,2.43,12.0,5.4,63.8,5.4,3.0,3.0,0,0
1,40,1,17.7,97,54,4.6,3.7,1.02,1.5,2.04,9.2,3.7,70.3,4.1,1.0,1.0,0,0
2,40,2,19.7,85,53,5.3,5.87,1.29,1.75,3.37,10.1,4.1,61.1,4.85,3.0,3.0,0,0
3,43,1,23.1,111,71,4.5,4.05,0.74,1.27,2.6,36.5,4.38,73.4,5.3,2.0,3.0,0,0
4,36,1,26.5,130,82,5.54,6.69,3.49,0.91,3.64,69.3,3.86,67.5,5.53,3.0,3.0,0,0


In [124]:
# Total count of missing values over all columns.
df.isna().sum().sum()

np.int64(0)

In [125]:
# (rows, columns) of the loaded table.
df.shape

(4303, 18)

In [126]:
# Column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4303 entries, 0 to 4302
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             4303 non-null   int64  
 1   Gender          4303 non-null   int64  
 2   BMI             4303 non-null   float64
 3   SBP             4303 non-null   int64  
 4   DBP             4303 non-null   int64  
 5   FPG             4303 non-null   float64
 6   Chol            4303 non-null   float64
 7   Tri             4303 non-null   float64
 8   HDL             4303 non-null   float64
 9   LDL             4303 non-null   float64
 10  ALT             4303 non-null   float64
 11  BUN             4303 non-null   float64
 12  CCR             4303 non-null   float64
 13  FFPG            4303 non-null   float64
 14  smoking         4303 non-null   float64
 15  drinking        4303 non-null   float64
 16  family_histroy  4303 non-null   int64  
 17  Diabetes        4303 non-null   i

In [127]:
# Number of rows per value of the target column.
df['Diabetes'].value_counts()

Diabetes
0    3000
1    1303
Name: count, dtype: int64

In [128]:
# Preview the first rows of the table.
df.head()

Unnamed: 0,Age,Gender,BMI,SBP,DBP,FPG,Chol,Tri,HDL,LDL,ALT,BUN,CCR,FFPG,smoking,drinking,family_histroy,Diabetes
0,26,1,20.1,119,81,5.8,4.36,0.86,0.9,2.43,12.0,5.4,63.8,5.4,3.0,3.0,0,0
1,40,1,17.7,97,54,4.6,3.7,1.02,1.5,2.04,9.2,3.7,70.3,4.1,1.0,1.0,0,0
2,40,2,19.7,85,53,5.3,5.87,1.29,1.75,3.37,10.1,4.1,61.1,4.85,3.0,3.0,0,0
3,43,1,23.1,111,71,4.5,4.05,0.74,1.27,2.6,36.5,4.38,73.4,5.3,2.0,3.0,0,0
4,36,1,26.5,130,82,5.54,6.69,3.49,0.91,3.64,69.3,3.86,67.5,5.53,3.0,3.0,0,0


In [129]:
# Distinct codes present in the Gender column.
df['Gender'].unique()

array([1, 2])

In [130]:
# Recode Gender to a 0/1 indicator with an explicit mapping: raw code 2 becomes 0
# and raw code 1 stays 1 (later cells label group 0 "Female" and group 1 "Male").
# The earlier form mapped "first value seen" -> 1 and "second value seen" -> 0, so
# the encoding silently depended on CSV row order. `replace` leaves values that
# are already 0/1 untouched, which makes re-running this cell harmless.
df['Gender'] = df['Gender'].replace({2: 0})

# Fail loudly if the column ever contains a code other than the expected ones,
# instead of letting it leak into the group masks built later.
assert set(df['Gender'].unique()) <= {0, 1}, "unexpected Gender codes"

In [131]:
# Distinct Gender codes after the recoding step.
df['Gender'].unique()

array([1, 0])

In [132]:
# Separate the feature table from the target vector (target as a NumPy array).
X = df.drop(columns='Diabetes')
y = df['Diabetes'].to_numpy()

In [133]:
# Filter out low-variance features (threshold 0.1) and print the feature-matrix
# shape before and after; the filtered result replaces X.
print(X.shape)
selector = VarianceThreshold(threshold=0.1).fit(X)
X = selector.transform(X)
print(X.shape)

(4303, 17)
(4303, 16)


In [134]:
# Standardize every feature using statistics computed from all rows.
# NOTE(review): the scaler is fit before `kf` is handed to fl.run_experiment, so
# presumably held-out folds contribute to the mean/std — confirm this is intended.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Boolean row masks, one per Gender code.
gender_0_mask = df['Gender'].eq(0)
gender_1_mask = df['Gender'].eq(1)

# Group sizes (number of True entries in each mask).
count_gender_0 = gender_0_mask.sum()
count_gender_1 = gender_1_mask.sum()
print("Female: ", count_gender_0)
print("Male: ", count_gender_1)

# Slice the scaled features and the targets with the same mask per group.
X_scaled_Gender_0, y_Gender_0 = X_scaled[gender_0_mask], y[gender_0_mask]
X_scaled_Gender_1, y_Gender_1 = X_scaled[gender_1_mask], y[gender_1_mask]

Female:  1513
Male:  2790


In [135]:
# Show which target values occur in each gender group.
for group_labels in (y_Gender_0, y_Gender_1):
    print(set(group_labels))

{np.int64(0), np.int64(1)}
{np.int64(0), np.int64(1)}


In [136]:
# Shared experiment configuration: the same seed is passed to the model factory
# and to the shuffled 10-fold stratified splitter.
seed = 42
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
models = fl.build_models(seed)

# Collector that is handed to the experiment runs and concatenated afterwards.
results_list = []

In [137]:
# Start from an empty collector so that re-running this cell does not stack a
# second copy of every fold's results on top of the first one (which would feed
# duplicated rows into the statistics computed from these results).
# `results_list` is filled as a side effect of fl.run_experiment.
results_list.clear()

print("Starting experiments for Female(0)")
fl.run_experiment(kf, models, X_scaled_Gender_0, y_Gender_0, 'Female', results_list)

print("Starting experiments for Male(1)")
fl.run_experiment(kf, models, X_scaled_Gender_1, y_Gender_1, 'Male', results_list)

# Combine the collected frames into one table with a fresh 0..n-1 index.
final_results_df = pd.concat(results_list, ignore_index=True)
print(final_results_df)

Starting experiments for Female(0)
Processing fold 1 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 2 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 3 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 4 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 5 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 6 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 7 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 8 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 9 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 10 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Starting experiments for Male(1)
Processing fold 1 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 2 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 3 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 4 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 5 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 6 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 7 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 8 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 9 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 10 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
    Fold   Group   SVM_TPR   SVM_TNR   SVM_FPR   SVM_FNR  SVM_TP  SVM_TN  \
0      1  Female  0.808511  1.000000  0.000000  0.191489      38     105   
1      2  Female  0.893617  0.990476  0.009524  0.106383      42     104   
2      3  Female  0.893617  0.990476  0.009524  0.106383      42     104   
3      4  Female  0.847826  1.000000  0.000000  0.152174      39     105   
4      5  Female  0.869565  1.000000  0.000000  0.130435      40     105   
5      6  Female  0.847826  0.990476  0.009524  0.152174      39     104   
6      7  Female  0.913043  0.980952  0.019048  0.086957      42     103   
7      8  Female  0.913043  0.990476  0.009524  0.086957      42     104   
8      



In [138]:
# Same concatenation as `final_results_df` in the experiment cell, bound to the
# name that the export step uses.
results_df = pd.concat(results_list, ignore_index=True)

In [139]:
# Persist the per-fold metrics to Excel. The target folder is created first
# because `to_excel` does not create missing parent directories, so a fresh
# checkout without ./results would otherwise fail here.
result_path = './results/K43_result.xlsx'
os.makedirs(os.path.dirname(result_path), exist_ok=True)
results_df.to_excel(result_path, index=False)

In [140]:
# Reload the saved metrics from disk and preview them.
# NOTE: this rebinds `df`, which until now held the raw diabetes table.
df = pd.read_excel(result_path)
df.head()

Unnamed: 0,Fold,Group,SVM_TPR,SVM_TNR,SVM_FPR,SVM_FNR,SVM_TP,SVM_TN,SVM_FP,SVM_FN,...,ANN_FP,ANN_FN,NB_TPR,NB_TNR,NB_FPR,NB_FNR,NB_TP,NB_TN,NB_FP,NB_FN
0,1,Female,0.808511,1.0,0.0,0.191489,38,105,0,9,...,2,5,0.893617,0.971429,0.028571,0.106383,42,102,3,5
1,2,Female,0.893617,0.990476,0.009524,0.106383,42,104,1,5,...,1,2,0.914894,0.942857,0.057143,0.085106,43,99,6,4
2,3,Female,0.893617,0.990476,0.009524,0.106383,42,104,1,5,...,1,6,0.957447,0.961905,0.038095,0.042553,45,101,4,2
3,4,Female,0.847826,1.0,0.0,0.152174,39,105,0,7,...,0,7,0.869565,0.961905,0.038095,0.130435,40,101,4,6
4,5,Female,0.869565,1.0,0.0,0.130435,40,105,0,6,...,3,5,0.891304,0.961905,0.038095,0.108696,41,101,4,5


In [141]:
# Run the statistical tests for every model, in the same order as before
# (the recorded output shows t-test / Mann-Whitney U results per metric).
label = 'Female'

for model_name in ('SVM', 'DT', 'RF', 'LR', 'KNN', 'ANN', 'NB'):
    fl.perform_t_tests(df, model_name, label)

SVM -TPR: TtestResult(statistic=np.float64(0.6650214996026382), pvalue=np.float64(0.5144678620386383), df=np.float64(18.0))
SVM - FPR: MannwhitneyuResult(statistic=np.float64(14.0), pvalue=np.float64(0.006619981416021894))
SVM - FN/FP: MannwhitneyuResult(statistic=np.float64(70.0), pvalue=np.float64(0.1401665724324623))
DT -TPR: TtestResult(statistic=np.float64(3.07301381876741), pvalue=np.float64(0.006554776152348277), df=np.float64(18.0))
DT - FPR: TtestResult(statistic=np.float64(-0.9378595324394087), pvalue=np.float64(0.3607313457500003), df=np.float64(18.0))
DT - FN/FP: MannwhitneyuResult(statistic=np.float64(40.0), pvalue=np.float64(0.472509068324386))
RF -TPR: TtestResult(statistic=np.float64(1.642811889145048), pvalue=np.float64(0.11777573982655325), df=np.float64(18.0))
RF - FPR: TtestResult(statistic=np.float64(-1.7947068563571063), pvalue=np.float64(0.0895124808226588), df=np.float64(18.0))
RF - FN/FP: MannwhitneyuResult(statistic=np.float64(58.0), pvalue=np.float64(0.570460