In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("iammustafatz/diabetes-prediction-dataset")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /home/morning/.cache/kagglehub/datasets/iammustafatz/diabetes-prediction-dataset/versions/1


In [2]:
import os
os.listdir(path)

['diabetes_prediction_dataset.csv']

In [3]:
import pandas as pd
import fairtl as fl
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

In [4]:
df = pd.read_csv(f'{path}/diabetes_prediction_dataset.csv')
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [6]:
df.shape

(100000, 9)

In [7]:
df['smoking_history'].unique()

array(['never', 'No Info', 'current', 'former', 'ever', 'not current'],
      dtype=object)

In [8]:
df['smoking_history'] = df['smoking_history'].map({'never': 0, 'No Info': 0, 'former': 1, 'not current': 2, 'ever': 3, 'current': 4})
df['gender'] = df['gender'].map({'Female': 0, "Male": 1})

In [9]:
y = df['diabetes'].values
X = df.drop('diabetes', axis=1)

In [10]:
print(X.shape)
selector = VarianceThreshold(threshold=0.1)
X = selector.fit_transform(X)
print(X.shape)

(100000, 8)
(100000, 6)


In [11]:
# build mask
gender_0_mask = df['gender'] == 0
gender_1_mask = df['gender'] == 1

count_gender_0 = gender_0_mask.sum()
count_gender_1 = gender_1_mask.sum()

print("Female: ", count_gender_0)
print("Male: ", count_gender_1)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_scaled_Gender_0 = X_scaled[gender_0_mask]
X_scaled_Gender_1 = X_scaled[gender_1_mask]
y_Gender_0 = y[gender_0_mask]
y_Gender_1 = y[gender_1_mask]

Female:  58552
Male:  41430


In [12]:
seed = 42
models = fl.build_models(seed)

results_list = []

kf = KFold(n_splits=20, shuffle=True, random_state=seed)

In [13]:
print("Starting experiments for Female(0)")
fl.run_experiment(kf, models, X_scaled_Gender_0, y_Gender_0, 'Female', results_list)

print("Starting experiments for Male(1)")
fl.run_experiment(kf, models, X_scaled_Gender_1, y_Gender_1, 'Male', results_list)

final_results_df = pd.concat(results_list, ignore_index=True)
print(final_results_df)

Starting experiments for Female(0)
Processing fold 1 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 2 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 3 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 4 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KN

In [14]:
results_df = pd.concat(results_list, ignore_index=True)

In [15]:
result_path = './results/k7_result.xlsx'
results_df.to_excel(result_path, index=False)

In [16]:
df = pd.read_excel(result_path)
df.head()

Unnamed: 0,Fold,Group,SVM_TPR,SVM_TNR,SVM_FPR,SVM_FNR,SVM_TP,SVM_TN,SVM_FP,SVM_FN,...,ANN_FP,ANN_FN,NB_TPR,NB_TNR,NB_FPR,NB_FNR,NB_TP,NB_TN,NB_FP,NB_FN
0,1,Female,0.616114,1.0,0.0,0.383886,130,2717,0,81,...,0,60,0.625592,0.989326,0.010674,0.374408,132,2688,29,79
1,2,Female,0.593886,1.0,0.0,0.406114,136,2699,0,93,...,1,72,0.598253,0.991478,0.008522,0.401747,137,2676,23,92
2,3,Female,0.585714,1.0,0.0,0.414286,123,2718,0,87,...,1,70,0.571429,0.99117,0.00883,0.428571,120,2694,24,90
3,4,Female,0.58216,1.0,0.0,0.41784,124,2715,0,89,...,2,70,0.577465,0.988214,0.011786,0.422535,123,2683,32,90
4,5,Female,0.598214,0.99963,0.00037,0.401786,134,2703,1,90,...,1,74,0.571429,0.990385,0.009615,0.428571,128,2678,26,96


In [17]:
label = 'Female'

# fl.perform_t_tests(df, 'SVM', label)
# fl.perform_t_tests(df, 'DT', label)
# fl.perform_t_tests(df, 'RF', label)
# fl.perform_t_tests(df, 'LR', label)
# fl.perform_t_tests(df, 'KNN', label)
# fl.perform_t_tests(df, 'ANN', label)
fl.perform_t_tests(df, 'NB', label)


NB -TPR: TtestResult(statistic=np.float64(-1.7681181283212213), pvalue=np.float64(0.0850695795958085), df=np.float64(38.0))
NB - FPR: TtestResult(statistic=np.float64(-4.322235557029687), pvalue=np.float64(0.0001675281069840318), df=np.float64(28.82201737223724))
NB - FN/FP: TtestResult(statistic=np.float64(0.958411505234668), pvalue=np.float64(0.34391378059283884), df=np.float64(38.0))
