In [25]:
import kagglehub

# Fetch the latest version of the dataset; `path` is whatever location
# kagglehub reports for the downloaded files.
DATASET_HANDLE = "priyamchoksi/100000-diabetes-clinical-dataset"
path = kagglehub.dataset_download(DATASET_HANDLE)

print(f"Path to dataset files: {path}")

In [26]:
import os

# os.listdir returns entries in arbitrary order; sort them so the displayed
# listing is stable across runs and platforms.
sorted(os.listdir(path))

In [27]:
import pandas as pd
import fairtl as fl
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [29]:
# Load the CSV from the downloaded dataset directory and preview the first rows.
csv_file = f'{path}/diabetes_dataset.csv'
df = pd.read_csv(csv_file)
df.head()

In [30]:
# Dimensions of the freshly loaded frame as (rows, columns).
df.shape

(100000, 16)

In [31]:
# Drop the 'location' column. Rebinding `df` (instead of mutating in place)
# together with errors='ignore' lets this cell be re-run without raising
# KeyError once the column is already gone.
df = df.drop(columns='location', errors='ignore')

In [32]:
# Column names, non-null counts and dtypes of the frame after 'location' was dropped.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 15 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   year                  100000 non-null  int64  
 1   gender                100000 non-null  object 
 2   age                   100000 non-null  float64
 3   race:AfricanAmerican  100000 non-null  int64  
 4   race:Asian            100000 non-null  int64  
 5   race:Caucasian        100000 non-null  int64  
 6   race:Hispanic         100000 non-null  int64  
 7   race:Other            100000 non-null  int64  
 8   hypertension          100000 non-null  int64  
 9   heart_disease         100000 non-null  int64  
 10  smoking_history       100000 non-null  object 
 11  bmi                   100000 non-null  float64
 12  hbA1c_level           100000 non-null  float64
 13  blood_glucose_level   100000 non-null  int64  
 14  diabetes              100000 non-null  int64  
dtypes

In [33]:
# Preview the first rows of the remaining columns.
df.head()

Unnamed: 0,year,gender,age,race:AfricanAmerican,race:Asian,race:Caucasian,race:Hispanic,race:Other,hypertension,heart_disease,smoking_history,bmi,hbA1c_level,blood_glucose_level,diabetes
0,2020,Female,32.0,0,0,0,0,1,0,0,never,27.32,5.0,100,0
1,2015,Female,29.0,0,1,0,0,0,0,0,never,19.95,5.0,90,0
2,2015,Male,18.0,0,0,0,0,1,0,0,never,23.76,4.8,160,0
3,2015,Male,41.0,0,0,1,0,0,0,0,never,27.32,4.0,159,0
4,2016,Female,52.0,1,0,0,0,0,0,0,never,23.75,6.5,90,0


In [34]:
# Distinct values present in the gender column.
df['gender'].unique()

array(['Female', 'Male', 'Other'], dtype=object)

In [35]:
# Number of rows per gender value.
df['gender'].value_counts()

gender
Female    58552
Male      41430
Other        18
Name: count, dtype: int64

In [36]:
# Keep only rows whose gender is not 'Other'. The explicit .copy() makes the
# result an independent frame, so the column assignments in the following cells
# do not write into a slice of the original frame (which triggers pandas'
# SettingWithCopyWarning).
df = df.loc[df['gender'] != 'Other'].copy()
df.head()

Unnamed: 0,year,gender,age,race:AfricanAmerican,race:Asian,race:Caucasian,race:Hispanic,race:Other,hypertension,heart_disease,smoking_history,bmi,hbA1c_level,blood_glucose_level,diabetes
0,2020,Female,32.0,0,0,0,0,1,0,0,never,27.32,5.0,100,0
1,2015,Female,29.0,0,1,0,0,0,0,0,never,19.95,5.0,90,0
2,2015,Male,18.0,0,0,0,0,1,0,0,never,23.76,4.8,160,0
3,2015,Male,41.0,0,0,1,0,0,0,0,never,27.32,4.0,159,0
4,2016,Female,52.0,1,0,0,0,0,0,0,never,23.75,6.5,90,0


In [37]:
# Encode gender as 0 (Female) / 1 (Male). Series.map turns every value that is
# missing from the mapping into NaN, so fail loudly instead of silently writing
# NaNs (for example when this cell is re-run on the already encoded column).
gender_codes = {'Female': 0, 'Male': 1}
gender_encoded = df['gender'].map(gender_codes)
if gender_encoded.isna().any():
    raise ValueError("Unexpected values in 'gender'; expected only 'Female' or 'Male'.")
df['gender'] = gender_encoded

In [38]:
# Encode smoking_history as integer codes 0-5. Series.map turns every value that
# is missing from the mapping into NaN, so fail loudly instead of silently
# writing NaNs (for example when this cell is re-run on the encoded column).
smoking_codes = {'No Info': 0, 'never': 1, 'not current': 2, 'ever': 3, 'former': 4, 'current': 5}
smoking_encoded = df['smoking_history'].map(smoking_codes)
if smoking_encoded.isna().any():
    raise ValueError("Unexpected values in 'smoking_history'; not covered by the code mapping.")
df['smoking_history'] = smoking_encoded

In [39]:
# Sanity check: distinct codes present after the encoding step.
df['smoking_history'].unique()

array([1, 2, 5, 0, 3, 4])

In [40]:
# Separate the target vector y ('diabetes') from the feature table X.
target_column = 'diabetes'
X = df.drop(columns=target_column)
y = df[target_column].to_numpy()

In [41]:
# Filter out low-variance features (threshold 0.1); the shape is printed before
# and after so the number of removed columns is visible.
print(X.shape)
selector = VarianceThreshold(threshold=0.1)
X = selector.fit(X).transform(X)
print(X.shape)

(99982, 14)
(99982, 12)


In [42]:
# Boolean row masks for each encoded gender value (0 = Female, 1 = Male).
gender_0_mask = df['gender'] == 0
gender_1_mask = df['gender'] == 1

count_gender_0 = gender_0_mask.sum()
count_gender_1 = gender_1_mask.sum()

print("Female: ", count_gender_0)
print("Male: ", count_gender_1)

# Standardise every feature, fitting the scaler on all rows of X.
# NOTE(review): the scaler is fit before the K-fold split, so it sees rows that
# later land in held-out folds - confirm whether fl.run_experiment re-scales
# per fold or whether this leakage is acceptable.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Plain positional boolean arrays for indexing the NumPy feature/target arrays.
female_rows = gender_0_mask.to_numpy()
male_rows = gender_1_mask.to_numpy()

X_scaled_Gender_0, y_Gender_0 = X_scaled[female_rows], y[female_rows]
X_scaled_Gender_1, y_Gender_1 = X_scaled[male_rows], y[male_rows]

Female:  58552
Male:  41430


In [43]:
# One seed shared by the model factory and the fold splitter.
seed = 42

# 20 shuffled folds, seeded for repeatable splits.
kf = KFold(n_splits=20, shuffle=True, random_state=seed)
models = fl.build_models(seed)

# Handed to fl.run_experiment in the next cell and concatenated afterwards.
results_list = []

In [44]:
# results_list is created empty and concatenated after the runs, so the runs are
# what populate it. Empty it first: re-running this cell would otherwise stack a
# second copy of every result on top of the previous run.
results_list.clear()

# (group name, encoded gender value, features, targets) for each subgroup.
group_inputs = (
    ('Female', 0, X_scaled_Gender_0, y_Gender_0),
    ('Male', 1, X_scaled_Gender_1, y_Gender_1),
)
for group_name, group_code, X_group, y_group in group_inputs:
    print(f"Starting experiments for {group_name}({group_code})")
    fl.run_experiment(kf, models, X_group, y_group, group_name, results_list)

final_results_df = pd.concat(results_list, ignore_index=True)
# Leave the frame as the last expression so the notebook renders it as a table
# instead of printing its plain-text repr.
final_results_df

Starting experiments for Female(0)
Processing fold 1 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 2 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 3 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 4 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KN

In [45]:
# Combine the collected result frames into one table with a fresh 0..n-1 index.
results_df = pd.concat(results_list, ignore_index=True)

In [46]:
result_path = './results/k14_result.xlsx'
# to_excel does not create missing parent directories, so make sure the target
# folder exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(result_path), exist_ok=True)
results_df.to_excel(result_path, index=False)

In [47]:
# Read the saved results back from disk and preview them.
# Note: this rebinds `df`, which up to this point held the diabetes dataset.
df = pd.read_excel(result_path)
df.head()

Unnamed: 0,Fold,Group,SVM_TPR,SVM_TNR,SVM_FPR,SVM_FNR,SVM_TP,SVM_TN,SVM_FP,SVM_FN,...,ANN_FP,ANN_FN,NB_TPR,NB_TNR,NB_FPR,NB_FNR,NB_TP,NB_TN,NB_FP,NB_FN
0,1,Female,0.558442,0.999629,0.000371,0.441558,129,2696,1,102,...,6,71,0.593074,0.985169,0.014831,0.406926,137,2657,40,94
1,2,Female,0.530233,0.999631,0.000369,0.469767,114,2712,1,101,...,6,78,0.581395,0.990417,0.009583,0.418605,125,2687,26,90
2,3,Female,0.634855,1.0,0.0,0.365145,153,2687,0,88,...,2,61,0.676349,0.990696,0.009304,0.323651,163,2662,25,78
3,4,Female,0.608333,1.0,0.0,0.391667,146,2688,0,94,...,2,63,0.616667,0.988095,0.011905,0.383333,148,2656,32,92
4,5,Female,0.512563,1.0,0.0,0.487437,102,2729,0,97,...,1,70,0.557789,0.993038,0.006962,0.442211,111,2710,19,88


In [48]:
label = 'Female'

# Run the same test routine once per model instead of repeating the call by
# hand; the tuple keeps the original execution (and output) order.
model_names = ('SVM', 'DT', 'RF', 'LR', 'KNN', 'ANN', 'NB')
for model_name in model_names:
    fl.perform_t_tests(df, model_name, label)


SVM -TPR: TtestResult(statistic=np.float64(-0.5055428502854329), pvalue=np.float64(0.6160975993845232), df=np.float64(38.0))
SVM - FPR: MannwhitneyuResult(statistic=np.float64(67.0), pvalue=np.float64(0.0002651965147012101))
SVM - FN/FP: MannwhitneyuResult(statistic=np.float64(327.0), pvalue=np.float64(0.0006176025824289028))
DT -TPR: TtestResult(statistic=np.float64(0.8338206072746462), pvalue=np.float64(0.40959302046795354), df=np.float64(38.0))
DT - FPR: TtestResult(statistic=np.float64(-4.355248649874621), pvalue=np.float64(9.71050587196425e-05), df=np.float64(38.0))
DT - FN/FP: TtestResult(statistic=np.float64(-2.286051754425564), pvalue=np.float64(0.027915971153941125), df=np.float64(38.0))
RF -TPR: TtestResult(statistic=np.float64(0.8428208350064238), pvalue=np.float64(0.40460252180452594), df=np.float64(38.0))
RF - FPR: TtestResult(statistic=np.float64(-2.5867638147759715), pvalue=np.float64(0.013641919385428419), df=np.float64(38.0))
RF - FN/FP: MannwhitneyuResult(statistic=np