In [26]:
import kagglehub

# Fetch the latest published version of the dataset; `path` is whatever
# location kagglehub reports for the downloaded files.
DATASET_HANDLE = "houcembenmansour/predict-diabetes-based-on-diagnostic-measures"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Path to dataset files: /home/morning/.cache/kagglehub/datasets/houcembenmansour/predict-diabetes-based-on-diagnostic-measures/versions/1


In [27]:
import os
# List the entries found in the download directory returned above.
os.listdir(path)

['diabetes.csv']

In [28]:
import pandas as pd
import fairtl_statisticaltest as fl
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [29]:
# Load the CSV that ships with the dataset and preview the first rows.
csv_file = os.path.join(path, 'diabetes.csv')
df = pd.read_csv(csv_file)
df.head()

Unnamed: 0,patient_number,cholesterol,glucose,hdl_chol,chol_hdl_ratio,age,gender,height,weight,bmi,systolic_bp,diastolic_bp,waist,hip,waist_hip_ratio,diabetes
0,1,193,77,49,39,19,female,61,119,225,118,70,32,38,84,No diabetes
1,2,146,79,41,36,19,female,60,135,264,108,58,33,40,83,No diabetes
2,3,217,75,54,4,20,female,67,187,293,110,72,40,45,89,No diabetes
3,4,226,97,70,32,20,female,64,114,196,122,64,31,39,79,No diabetes
4,5,164,91,67,24,20,female,70,141,202,122,86,32,39,82,No diabetes


In [30]:
# (rows, columns) of the table as loaded.
df.shape

(390, 16)

In [31]:
# Column dtypes and non-null counts of the table as loaded.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 390 entries, 0 to 389
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   patient_number   390 non-null    int64 
 1   cholesterol      390 non-null    int64 
 2   glucose          390 non-null    int64 
 3   hdl_chol         390 non-null    int64 
 4   chol_hdl_ratio   390 non-null    object
 5   age              390 non-null    int64 
 6   gender           390 non-null    object
 7   height           390 non-null    int64 
 8   weight           390 non-null    int64 
 9   bmi              390 non-null    object
 10  systolic_bp      390 non-null    int64 
 11  diastolic_bp     390 non-null    int64 
 12  waist            390 non-null    int64 
 13  hip              390 non-null    int64 
 14  waist_hip_ratio  390 non-null    object
 15  diabetes         390 non-null    object
dtypes: int64(11), object(5)
memory usage: 48.9+ KB


In [32]:
# Remove the row identifier so it is not used as a feature. Rebinding (instead
# of inplace=True) together with errors='ignore' keeps the cell safe to re-run;
# the in-place version raised KeyError on a second execution.
df = df.drop(columns='patient_number', errors='ignore')

In [33]:
# Inspect which distinct values occur in the gender column.
df['gender'].unique()

array(['female', 'male'], dtype=object)

In [34]:
# Encode gender as 0 = female, 1 = male. Series.map() turns every value that
# is not a key of the dict into NaN, so running the plain map twice would wipe
# the column; only map while the column still holds the raw text labels.
if not pd.api.types.is_numeric_dtype(df['gender']):
    df['gender'] = df['gender'].map({'female': 0, 'male': 1})

# Fail loudly if a label was not covered by the mapping (it would be NaN now).
assert df['gender'].isin([0, 1]).all(), "unexpected value in 'gender'"

In [35]:
# Inspect which distinct values occur in the target column.
df['diabetes'].unique()

array(['No diabetes', 'Diabetes'], dtype=object)

In [36]:
# Encode the target as 0 = 'No diabetes', 1 = 'Diabetes'. Series.map() turns
# values that are not keys of the dict into NaN, so running the plain map twice
# would wipe the column; only map while it still holds the raw text labels.
if not pd.api.types.is_numeric_dtype(df['diabetes']):
    df['diabetes'] = df['diabetes'].map({'No diabetes': 0, 'Diabetes': 1})

# Fail loudly if a label was not covered by the mapping (it would be NaN now).
assert df['diabetes'].isin([0, 1]).all(), "unexpected value in 'diabetes'"

In [37]:
# Preview the first rows of the table in its current state.
df.head()

Unnamed: 0,cholesterol,glucose,hdl_chol,chol_hdl_ratio,age,gender,height,weight,bmi,systolic_bp,diastolic_bp,waist,hip,waist_hip_ratio,diabetes
0,193,77,49,39,19,0,61,119,225,118,70,32,38,84,0
1,146,79,41,36,19,0,60,135,264,108,58,33,40,83,0
2,217,75,54,4,20,0,67,187,293,110,72,40,45,89,0
3,226,97,70,32,20,0,64,114,196,122,64,31,39,79,0
4,164,91,67,24,20,0,70,141,202,122,86,32,39,82,0


In [38]:
# Re-check column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 390 entries, 0 to 389
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   cholesterol      390 non-null    int64 
 1   glucose          390 non-null    int64 
 2   hdl_chol         390 non-null    int64 
 3   chol_hdl_ratio   390 non-null    object
 4   age              390 non-null    int64 
 5   gender           390 non-null    int64 
 6   height           390 non-null    int64 
 7   weight           390 non-null    int64 
 8   bmi              390 non-null    object
 9   systolic_bp      390 non-null    int64 
 10  diastolic_bp     390 non-null    int64 
 11  waist            390 non-null    int64 
 12  hip              390 non-null    int64 
 13  waist_hip_ratio  390 non-null    object
 14  diabetes         390 non-null    int64 
dtypes: int64(12), object(3)
memory usage: 45.8+ KB


In [39]:
# chol_hdl_ratio is read as text because it uses a decimal comma. Swap the
# comma for a dot AND convert to a numeric dtype, so the column is no longer
# left as strings. astype(str) keeps the cell re-runnable once the column is
# already numeric.
df['chol_hdl_ratio'] = pd.to_numeric(
    df['chol_hdl_ratio'].astype(str).str.replace(',', '.', regex=False)
)
df.head()


Unnamed: 0,cholesterol,glucose,hdl_chol,chol_hdl_ratio,age,gender,height,weight,bmi,systolic_bp,diastolic_bp,waist,hip,waist_hip_ratio,diabetes
0,193,77,49,3.9,19,0,61,119,225,118,70,32,38,84,0
1,146,79,41,3.6,19,0,60,135,264,108,58,33,40,83,0
2,217,75,54,4.0,20,0,67,187,293,110,72,40,45,89,0
3,226,97,70,3.2,20,0,64,114,196,122,64,31,39,79,0
4,164,91,67,2.4,20,0,70,141,202,122,86,32,39,82,0


In [40]:
# bmi is read as text because it uses a decimal comma. Swap the comma for a
# dot AND convert to a numeric dtype, so the column is no longer left as
# strings. astype(str) keeps the cell re-runnable once the column is already
# numeric.
df['bmi'] = pd.to_numeric(
    df['bmi'].astype(str).str.replace(',', '.', regex=False)
)
df.head()


Unnamed: 0,cholesterol,glucose,hdl_chol,chol_hdl_ratio,age,gender,height,weight,bmi,systolic_bp,diastolic_bp,waist,hip,waist_hip_ratio,diabetes
0,193,77,49,3.9,19,0,61,119,22.5,118,70,32,38,84,0
1,146,79,41,3.6,19,0,60,135,26.4,108,58,33,40,83,0
2,217,75,54,4.0,20,0,67,187,29.3,110,72,40,45,89,0
3,226,97,70,3.2,20,0,64,114,19.6,122,64,31,39,79,0
4,164,91,67,2.4,20,0,70,141,20.2,122,86,32,39,82,0


In [41]:
# waist_hip_ratio is read as text because it uses a decimal comma. Swap the
# comma for a dot AND convert to a numeric dtype, so the column is no longer
# left as strings. astype(str) keeps the cell re-runnable once the column is
# already numeric.
df['waist_hip_ratio'] = pd.to_numeric(
    df['waist_hip_ratio'].astype(str).str.replace(',', '.', regex=False)
)
df.head()


Unnamed: 0,cholesterol,glucose,hdl_chol,chol_hdl_ratio,age,gender,height,weight,bmi,systolic_bp,diastolic_bp,waist,hip,waist_hip_ratio,diabetes
0,193,77,49,3.9,19,0,61,119,22.5,118,70,32,38,0.84,0
1,146,79,41,3.6,19,0,60,135,26.4,108,58,33,40,0.83,0
2,217,75,54,4.0,20,0,67,187,29.3,110,72,40,45,0.89,0
3,226,97,70,3.2,20,0,64,114,19.6,122,64,31,39,0.79,0
4,164,91,67,2.4,20,0,70,141,20.2,122,86,32,39,0.82,0


In [42]:
# Inspect the distinct values currently stored in the gender column.
df['gender'].unique()

array([0, 1])

In [43]:
# Separate the target column (as a NumPy array) from the feature table.
target_col = 'diabetes'
X = df.drop(columns=target_col)
y = df[target_col].to_numpy()

In [44]:
# build mask
# Standardise every feature column over the full dataset.
# NOTE(review): the scaler is fitted on all rows before the K-fold split is
# applied, so rows later used for evaluation presumably influence the scaling
# statistics - confirm this is intended.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Boolean row selectors for the two gender codes.
gender_0_mask = df['gender'].eq(0)
gender_1_mask = df['gender'].eq(1)

# Group sizes, reported under the labels used for the experiments.
count_gender_0 = gender_0_mask.sum()
count_gender_1 = gender_1_mask.sum()
print("Female: ", count_gender_0)
print("Male: ", count_gender_1)

# Slice the scaled features and the labels per group.
X_scaled_Gender_0, y_Gender_0 = X_scaled[gender_0_mask], y[gender_0_mask]
X_scaled_Gender_1, y_Gender_1 = X_scaled[gender_1_mask], y[gender_1_mask]

Female:  228
Male:  162


In [45]:
# Shared experiment configuration: the same seed is passed to the model
# builder and to the shuffled 5-fold splitter.
seed = 42
kf = KFold(n_splits=5, shuffle=True, random_state=seed)
models = fl.build_models(seed)

# Handed to fl.run_experiment and later concatenated into one results table.
results_list = []

In [46]:
# Start from an empty list so that re-running this cell does not leave a
# second copy of every row in the results. clear() empties the existing list
# object, so other references to `results_list` stay valid.
results_list.clear()

# (display label, gender code, scaled features, labels) for each group.
experiment_groups = [
    ('Female', 0, X_scaled_Gender_0, y_Gender_0),
    ('Male', 1, X_scaled_Gender_1, y_Gender_1),
]
for group_name, group_code, X_group, y_group in experiment_groups:
    print(f"Starting experiments for {group_name}({group_code})")
    fl.run_experiment(kf, models, X_group, y_group, group_name, results_list)

# Stack everything collected in results_list into one table and show it.
final_results_df = pd.concat(results_list, ignore_index=True)
print(final_results_df)

Starting experiments for Female(0)
Processing fold 1 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 2 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 3 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 4 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 5 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Starting experiments for Male(1)
Processing fold 1 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 2 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 3 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 4 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 5 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
   Fold   Group   SVM_TPR   SVM_TNR   SVM_FPR   SVM_FNR  SVM_TP  SVM_TN  \
0     1  Female  0.571429  1.000000  0.000000  0.428571       4      39   
1     2  Female  0.500000  1.000000  0.000000  0.500000       4      38   
2     3  Female  0.250000  1.000000  0.000000  0.750000       1      42   
3     4  Female  0.333333  0.974359  0.025641  0.666667       2      38   
4     5  Female  0.444444  1.000000  0.000000  0.555556       4      36   
5     1    Male  0.200000  1.000000  0.000000  0.800000       1      28   
6     2    Male  0.400000  1.000000  0.000000  0.600000       2      28   
7     3    Male  0.600000  0.962963  0.037037  0.400000       3      26   
8     4    Male  0.333333  0.965517  0.034483  0.666667       1      28   
9     5    Male  0.375000  1.000000  0.000000  0.625000       3      24   

   SVM_FP  SVM_FN  ...  ANN_FP  ANN_FN    NB_TPR    NB_TNR    NB_FPR  \



In [47]:
# Stack the collected result frames into one table with a fresh 0..n-1 index.
results_df = pd.concat(objs=results_list, ignore_index=True)

In [48]:
# Write the combined results to an Excel workbook (without the index column).
# to_excel() does not create missing parent folders, so make sure the target
# directory exists first; exist_ok=True makes this a no-op on later runs.
result_path = './results/K44_result.xlsx'
os.makedirs(os.path.dirname(result_path), exist_ok=True)
results_df.to_excel(result_path, index=False)

In [49]:
# Read the saved results workbook back from disk and preview it.
# Note: this rebinds `df`, which until this point referred to the diabetes
# dataset, to the results table.
df = pd.read_excel(result_path)
df.head()

Unnamed: 0,Fold,Group,SVM_TPR,SVM_TNR,SVM_FPR,SVM_FNR,SVM_TP,SVM_TN,SVM_FP,SVM_FN,...,ANN_FP,ANN_FN,NB_TPR,NB_TNR,NB_FPR,NB_FNR,NB_TP,NB_TN,NB_FP,NB_FN
0,1,Female,0.571429,1.0,0.0,0.428571,4,39,0,3,...,4,3,0.857143,0.897436,0.102564,0.142857,6,35,4,1
1,2,Female,0.5,1.0,0.0,0.5,4,38,0,4,...,0,3,0.875,0.947368,0.052632,0.125,7,36,2,1
2,3,Female,0.25,1.0,0.0,0.75,1,42,0,3,...,1,3,0.5,0.97619,0.02381,0.5,2,41,1,2
3,4,Female,0.333333,0.974359,0.025641,0.666667,2,38,1,4,...,1,5,0.5,0.897436,0.102564,0.5,3,35,4,3
4,5,Female,0.444444,1.0,0.0,0.555556,4,36,0,5,...,0,3,0.777778,1.0,0.0,0.222222,7,36,0,2


In [50]:
# Run the statistical tests for every classifier against the results table.
# One loop replaces seven copy-pasted calls; the order of the models is the
# same as in the original call sequence.
label = 'Female'
model_names = ['SVM', 'DT', 'RF', 'LR', 'KNN', 'ANN', 'NB']

for model_name in model_names:
    fl.perform_t_tests(df, model_name, label)


SVM -TPR: TtestResult(statistic=np.float64(0.4412025282959616), pvalue=np.float64(0.670747261900752), df=np.float64(8.0))
SVM - FPR: MannwhitneyuResult(statistic=np.float64(9.0), pvalue=np.float64(0.440686016488678))
SVM - FN/FP: TtestResult(statistic=np.float64(0.4522670168943982), pvalue=np.float64(0.6630881703967537), df=np.float64(8.0))
DT -TPR: TtestResult(statistic=np.float64(0.899893475191568), pvalue=np.float64(0.39445758468010383), df=np.float64(8.0))
DT - FPR: TtestResult(statistic=np.float64(-0.5539830513292591), pvalue=np.float64(0.5947321246126831), df=np.float64(8.0))
DT - FN/FP: MannwhitneyuResult(statistic=np.float64(11.5), pvalue=np.float64(0.9155299733767719))
RF -TPR: TtestResult(statistic=np.float64(-0.0014849549199781446), pvalue=np.float64(0.9988515392473816), df=np.float64(8.0))
RF - FPR: TtestResult(statistic=np.float64(-1.0993171128133319), pvalue=np.float64(0.30360806946047214), df=np.float64(8.0))
RF - FN/FP: MannwhitneyuResult(statistic=np.float64(17.0), pva