In [1]:
import kagglehub

# Fetch the latest published version of the Kaggle dataset; the call returns
# the local directory that holds the downloaded files.
dataset_handle = "imtkaggleteam/diabetes"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /home/morning/.cache/kagglehub/datasets/imtkaggleteam/diabetes/versions/1


In [2]:
import os
# Show which files the download placed in the dataset directory.
os.listdir(path)

['diabetes.csv']

In [3]:
import pandas as pd
import fairtl as fl
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [4]:
# Load the single CSV shipped with the dataset and preview the first rows.
raw_csv = f'{path}/diabetes.csv'
df = pd.read_csv(raw_csv)
df.head()

Unnamed: 0,id,chol,stab.glu,hdl,ratio,glyhb,location,age,gender,height,weight,frame,bp.1s,bp.1d,bp.2s,bp.2d,waist,hip,time.ppn
0,1000,203.0,82,56.0,3.6,4.31,Buckingham,46,female,62.0,121.0,medium,118.0,59.0,,,29.0,38.0,720.0
1,1001,165.0,97,24.0,6.9,4.44,Buckingham,29,female,64.0,218.0,large,112.0,68.0,,,46.0,48.0,360.0
2,1002,228.0,92,37.0,6.2,4.64,Buckingham,58,female,61.0,256.0,large,190.0,92.0,185.0,92.0,49.0,57.0,180.0
3,1003,78.0,93,12.0,6.5,4.63,Buckingham,67,male,67.0,119.0,large,110.0,50.0,,,33.0,38.0,480.0
4,1005,249.0,90,28.0,8.9,7.72,Buckingham,64,male,68.0,183.0,medium,138.0,80.0,,,44.0,41.0,300.0


In [5]:
# (rows, columns) of the raw table.
df.shape

(403, 19)

In [6]:
# Per-column dtype and non-null count, used to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 403 entries, 0 to 402
Data columns (total 19 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        403 non-null    int64  
 1   chol      402 non-null    float64
 2   stab.glu  403 non-null    int64  
 3   hdl       402 non-null    float64
 4   ratio     402 non-null    float64
 5   glyhb     390 non-null    float64
 6   location  403 non-null    object 
 7   age       403 non-null    int64  
 8   gender    403 non-null    object 
 9   height    398 non-null    float64
 10  weight    402 non-null    float64
 11  frame     391 non-null    object 
 12  bp.1s     398 non-null    float64
 13  bp.1d     398 non-null    float64
 14  bp.2s     141 non-null    float64
 15  bp.2d     141 non-null    float64
 16  waist     401 non-null    float64
 17  hip       401 non-null    float64
 18  time.ppn  400 non-null    float64
dtypes: float64(13), int64(3), object(3)
memory usage: 59.9+ KB


In [7]:
# `id` looks like a plain row identifier, and the second blood-pressure
# readings (`bp.2s`, `bp.2d`) are filled in for only 141 of the 403 rows,
# so none of the three is kept.
df = df.drop(columns=['id', 'bp.2s', 'bp.2d'])

In [8]:
# Keep only fully populated rows, then renumber them from 0 so that frames
# built later with a default RangeIndex line up when concatenated column-wise.
df = df.dropna().reset_index(drop=True)

In [9]:
# Re-inspect after dropping columns and incomplete rows: every remaining
# column should now report the same non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   chol      366 non-null    float64
 1   stab.glu  366 non-null    int64  
 2   hdl       366 non-null    float64
 3   ratio     366 non-null    float64
 4   glyhb     366 non-null    float64
 5   location  366 non-null    object 
 6   age       366 non-null    int64  
 7   gender    366 non-null    object 
 8   height    366 non-null    float64
 9   weight    366 non-null    float64
 10  frame     366 non-null    object 
 11  bp.1s     366 non-null    float64
 12  bp.1d     366 non-null    float64
 13  waist     366 non-null    float64
 14  hip       366 non-null    float64
 15  time.ppn  366 non-null    float64
dtypes: float64(11), int64(2), object(3)
memory usage: 45.9+ KB


In [10]:
# List the distinct location values before choosing an encoding.
df['location'].unique()

array(['Buckingham', 'Louisa'], dtype=object)

In [11]:
# One-hot encode `location`.
# The encoder expects a 2-D (n_samples, 1) input, hence the reshape.
location = df['location'].to_numpy().reshape(-1, 1)

enc = OneHotEncoder(categories='auto')
encoded_location = enc.fit_transform(location)  # sparse; densified below

# Generated feature names, printed to confirm the column order.
new_features = enc.get_feature_names_out()
print(new_features)

new_location = pd.DataFrame(encoded_location.toarray())

['x0_Buckingham' 'x0_Louisa']


In [12]:
# Label the one-hot columns with the category names the encoder learned,
# instead of a hand-typed list, so each name always matches its column even
# if the set or order of locations changes.
new_location.columns = list(enc.categories_[0])

In [13]:
# Replace the text `location` column with its one-hot columns, appended on
# the right. Rows are matched by index label.
df = pd.concat([df.drop(columns='location'), new_location], axis=1)

In [14]:
# Check that the one-hot columns replaced `location`.
df.head()

Unnamed: 0,chol,stab.glu,hdl,ratio,glyhb,age,gender,height,weight,frame,bp.1s,bp.1d,waist,hip,time.ppn,Buckingham,Louisa
0,203.0,82,56.0,3.6,4.31,46,female,62.0,121.0,medium,118.0,59.0,29.0,38.0,720.0,1.0,0.0
1,165.0,97,24.0,6.9,4.44,29,female,64.0,218.0,large,112.0,68.0,46.0,48.0,360.0,1.0,0.0
2,228.0,92,37.0,6.2,4.64,58,female,61.0,256.0,large,190.0,92.0,49.0,57.0,180.0,1.0,0.0
3,78.0,93,12.0,6.5,4.63,67,male,67.0,119.0,large,110.0,50.0,33.0,38.0,480.0,1.0,0.0
4,249.0,90,28.0,8.9,7.72,64,male,68.0,183.0,medium,138.0,80.0,44.0,41.0,300.0,1.0,0.0


In [15]:
# Distinct gender labels, needed to build the integer mapping.
df['gender'].unique()

array(['female', 'male'], dtype=object)

In [16]:
# Binary-encode gender. Any label missing from the mapping would become NaN.
gender_codes = {'female': 0, 'male': 1}
df['gender'] = df['gender'].map(gender_codes)

In [17]:
# Distinct body-frame labels before they are encoded.
df['frame'].unique()

array(['medium', 'large', 'small'], dtype=object)

In [18]:
# Preview with gender stored as integer codes.
df.head()

Unnamed: 0,chol,stab.glu,hdl,ratio,glyhb,age,gender,height,weight,frame,bp.1s,bp.1d,waist,hip,time.ppn,Buckingham,Louisa
0,203.0,82,56.0,3.6,4.31,46,0,62.0,121.0,medium,118.0,59.0,29.0,38.0,720.0,1.0,0.0
1,165.0,97,24.0,6.9,4.44,29,0,64.0,218.0,large,112.0,68.0,46.0,48.0,360.0,1.0,0.0
2,228.0,92,37.0,6.2,4.64,58,0,61.0,256.0,large,190.0,92.0,49.0,57.0,180.0,1.0,0.0
3,78.0,93,12.0,6.5,4.63,67,1,67.0,119.0,large,110.0,50.0,33.0,38.0,480.0,1.0,0.0
4,249.0,90,28.0,8.9,7.72,64,1,68.0,183.0,medium,138.0,80.0,44.0,41.0,300.0,1.0,0.0


In [19]:
# Encode body frame as ordered integers: small < medium < large.
frame_codes = {'small': 0, 'medium': 1, 'large': 2}
df['frame'] = df['frame'].map(frame_codes)

In [20]:
# Sanity check: gender should now contain only the integer codes.
df['gender'].unique()

array([0, 1])

In [21]:
# Preview with `frame` stored as integer codes.
df.head()

Unnamed: 0,chol,stab.glu,hdl,ratio,glyhb,age,gender,height,weight,frame,bp.1s,bp.1d,waist,hip,time.ppn,Buckingham,Louisa
0,203.0,82,56.0,3.6,4.31,46,0,62.0,121.0,1,118.0,59.0,29.0,38.0,720.0,1.0,0.0
1,165.0,97,24.0,6.9,4.44,29,0,64.0,218.0,2,112.0,68.0,46.0,48.0,360.0,1.0,0.0
2,228.0,92,37.0,6.2,4.64,58,0,61.0,256.0,2,190.0,92.0,49.0,57.0,180.0,1.0,0.0
3,78.0,93,12.0,6.5,4.63,67,1,67.0,119.0,2,110.0,50.0,33.0,38.0,480.0,1.0,0.0
4,249.0,90,28.0,8.9,7.72,64,1,68.0,183.0,1,138.0,80.0,44.0,41.0,300.0,1.0,0.0


In [22]:
# `frame` (integer-coded) is the prediction target; every other column,
# including `gender`, is used as a feature.
X = df.drop(columns='frame')
y = df['frame'].to_numpy()

In [23]:
# Filter out low-variance features (threshold 0.1). Variance is measured on
# the raw, unscaled values because scaling only happens in a later cell.
# The shape is printed before and after to show how many columns survive.
print(X.shape)
selector = VarianceThreshold(threshold=0.1)
selector.fit(X)
X = selector.transform(X)
print(X.shape)

(366, 16)
(366, 16)


In [24]:
# Standardize all features once, on the complete data set.
# NOTE(review): the scaler is fitted before the K-fold split, so held-out
# folds contribute to the mean/std used for scaling. Consider fitting the
# scaler inside each fold if strict train/test separation matters.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Boolean row selectors, one per gender code (0 = female, 1 = male).
gender_0_mask = df['gender'] == 0
gender_1_mask = df['gender'] == 1

# Report the group sizes.
count_gender_0 = gender_0_mask.sum()
count_gender_1 = gender_1_mask.sum()
print("Female: ", count_gender_0)
print("Male: ", count_gender_1)

# Per-group feature matrices and targets.
X_scaled_Gender_0, y_Gender_0 = X_scaled[gender_0_mask], y[gender_0_mask]
X_scaled_Gender_1, y_Gender_1 = X_scaled[gender_1_mask], y[gender_1_mask]

Female:  214
Male:  152


In [25]:
# One seed shared by the model builder and the fold splitter.
seed = 42

# Shuffled 5-fold splitter; the fixed seed keeps fold membership reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=seed)

# Models supplied by the project's `fairtl` helper, built with the same seed.
models = fl.build_models(seed)

# Accumulates the result frames produced by each experiment run.
results_list = []

In [26]:
# Run the cross-validated experiments separately for each gender group and
# combine the per-group results into one table.
#
# `results_list` is created in an earlier cell and filled by
# `fl.run_experiment`. Empty it first so that re-running this cell does not
# duplicate every fold row, which would distort the later statistics.
results_list.clear()

print("Starting experiments for Female(0)")
fl.run_experiment(kf, models, X_scaled_Gender_0, y_Gender_0, 'Female', results_list)

print("Starting experiments for Male(1)")
fl.run_experiment(kf, models, X_scaled_Gender_1, y_Gender_1, 'Male', results_list)

# One row per (fold, group); ignore_index gives a clean 0..n-1 index.
final_results_df = pd.concat(results_list, ignore_index=True)
print(final_results_df)

Starting experiments for Female(0)
Processing fold 1 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 2 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 3 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 4 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 5 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Starting experiments for Male(1)
Processing fold 1 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 2 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 3 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 4 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 5 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
   Fold   Group   SVM_TPR   SVM_TNR   SVM_FPR   SVM_FNR  SVM_TP  SVM_TN  \
0     1  Female  0.950000  0.375000  0.625000  0.050000      19       6   
1     2  Female  0.750000  0.500000  0.500000  0.250000      15       6   
2     3  Female  0.916667  0.333333  0.666667  0.083333      22       3   
3     4  Female  0.950000  0.421053  0.578947  0.050000      19       8   
4     5  Female  0.880000  0.444444  0.555556  0.120000      22       4   
5     1    Male  1.000000  0.250000  0.750000  0.000000       8       2   
6     2    Male  0.857143  0.250000  0.750000  0.142857       6       2   
7     3    Male  0.833333  0.750000  0.250000  0.166667      10       6   
8     4    Male  1.000000  0.000000  1.000000  0.000000       5       0   
9     5    Male  0.583333  0.750000  0.250000  0.416667       7       3   

   SVM_FP  SVM_FN  ...  ANN_FP  ANN_FN    NB_TPR    NB_TNR    NB_FPR  \



In [27]:
# Combine all collected result frames into the table that is saved to disk.
# This repeats the concatenation that produced `final_results_df` in the
# previous cell.
results_df = pd.concat(results_list, ignore_index=True)

In [28]:
# Persist the per-fold metrics as an Excel workbook.
result_path = './results/k92_result.xlsx'

# `to_excel` does not create missing parent directories, so make sure the
# output folder exists before writing (no-op if it is already there).
os.makedirs(os.path.dirname(result_path), exist_ok=True)
results_df.to_excel(result_path, index=False)

In [29]:
# Read the saved metrics back from disk.
# NOTE: from here on `df` refers to the results table, no longer to the
# patient data loaded at the top of the notebook.
df = pd.read_excel(result_path)
df.head()

Unnamed: 0,Fold,Group,SVM_TPR,SVM_TNR,SVM_FPR,SVM_FNR,SVM_TP,SVM_TN,SVM_FP,SVM_FN,...,ANN_FP,ANN_FN,NB_TPR,NB_TNR,NB_FPR,NB_FNR,NB_TP,NB_TN,NB_FP,NB_FN
0,1,Female,0.95,0.375,0.625,0.05,19,6,10,1,...,11,4,0.736842,0.625,0.375,0.263158,14,10,6,5
1,2,Female,0.75,0.5,0.5,0.25,15,6,6,5,...,7,8,0.5625,0.727273,0.272727,0.4375,9,8,3,7
2,3,Female,0.916667,0.333333,0.666667,0.083333,22,3,6,2,...,6,6,0.619048,0.444444,0.555556,0.380952,13,4,5,8
3,4,Female,0.95,0.421053,0.578947,0.05,19,8,11,1,...,8,2,0.5625,0.666667,0.333333,0.4375,9,10,5,7
4,5,Female,0.88,0.444444,0.555556,0.12,22,4,5,3,...,5,6,0.636364,0.555556,0.444444,0.363636,14,5,4,8


In [30]:
# Run the group-comparison tests once per model, in the same order as before.
label = 'Female'

for model_name in ('SVM', 'DT', 'RF', 'LR', 'KNN', 'ANN', 'NB'):
    fl.perform_t_tests(df, model_name, label)


SVM -TPR: TtestResult(statistic=np.float64(0.4075195845299958), pvalue=np.float64(0.6943135618746935), df=np.float64(8.0))
SVM - FPR: TtestResult(statistic=np.float64(-0.09669455796260747), pvalue=np.float64(0.9253474681835686), df=np.float64(8.0))
SVM - FN/FP: MannwhitneyuResult(statistic=np.float64(13.0), pvalue=np.float64(1.0))
DT -TPR: TtestResult(statistic=np.float64(-0.4543876294939884), pvalue=np.float64(0.6616249989760133), df=np.float64(8.0))
DT - FPR: MannwhitneyuResult(statistic=np.float64(7.5), pvalue=np.float64(0.33978297435581883))
DT - FN/FP: MannwhitneyuResult(statistic=np.float64(18.0), pvalue=np.float64(0.30952380952380953))
RF -TPR: TtestResult(statistic=np.float64(0.7831449775259245), pvalue=np.float64(0.45609788149198827), df=np.float64(8.0))
RF - FPR: TtestResult(statistic=np.float64(1.8529745437176297), pvalue=np.float64(0.10101441124545682), df=np.float64(8.0))
RF - FN/FP: TtestResult(statistic=np.float64(-0.8013797315552177), pvalue=np.float64(0.446059039518693