In [1]:
from ucimlrepo import fetch_ucirepo

# Download UCI ML Repository dataset id 529 ("Early Stage Diabetes Risk Prediction").
early_stage_diabetes_risk_prediction = fetch_ucirepo(id=529)

# Feature table and target table (pandas DataFrames).
X, y = (
    early_stage_diabetes_risk_prediction.data.features,
    early_stage_diabetes_risk_prediction.data.targets,
)

# Dataset-level metadata first, then the per-variable information, one per line.
print(
    early_stage_diabetes_risk_prediction.metadata,
    early_stage_diabetes_risk_prediction.variables,
    sep="\n",
)


{'uci_id': 529, 'name': 'Early Stage Diabetes Risk Prediction', 'repository_url': 'https://archive.ics.uci.edu/dataset/529/early+stage+diabetes+risk+prediction+dataset', 'data_url': 'https://archive.ics.uci.edu/static/public/529/data.csv', 'abstract': 'This dataset contains the sign and symptpom data of newly diabetic or would be diabetic patient. ', 'area': 'Computer Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 520, 'num_features': 16, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Gender'], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2020, 'last_updated': 'Mon Mar 04 2024', 'dataset_doi': '10.24432/C5VG8H', 'creators': [], 'intro_paper': {'ID': 397, 'type': 'NATIVE', 'title': 'Likelihood Prediction of Diabetes at Early Stage Using Data Mining Techniques', 'authors': 'M. M. F. Islam, Rahatara Ferdousi, Sadikur Rahman, Humayra Yas

In [2]:
import pandas as pd

# Wrap the fetched feature and target tables and join them column-wise so that
# every row carries its symptoms together with its `class` label.
feature = pd.DataFrame(X)
target = pd.DataFrame(y)
df = pd.concat([feature, target], axis=1)

# Only the last expression of a cell is rendered, so the combined frame is the
# single preview shown here (the former mid-cell `feature.head()` displayed nothing).
df.head()

Unnamed: 0,age,gender,polyuria,polydipsia,sudden_weight_loss,weakness,polyphagia,genital_thrush,visual_blurring,itching,irritability,delayed_healing,partial_paresis,muscle_stiffness,alopecia,obesity,class
0,40,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,58,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,Positive
2,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive
3,45,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No,Positive
4,60,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Positive


In [3]:

import fairtl_statisticaltest as fl
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [4]:
# Preview the first five rows of the combined feature/target frame before encoding.
df.head()

Unnamed: 0,age,gender,polyuria,polydipsia,sudden_weight_loss,weakness,polyphagia,genital_thrush,visual_blurring,itching,irritability,delayed_healing,partial_paresis,muscle_stiffness,alopecia,obesity,class
0,40,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,58,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,Positive
2,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive
3,45,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No,Positive
4,60,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Positive


In [5]:
# (rows, columns) of the combined frame: the feature columns plus the `class` target.
df.shape

(520, 17)

In [6]:
# Per-column dtype and non-null count; shows which columns are still string-typed
# (object) and therefore need numeric encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 520 entries, 0 to 519
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   age                 520 non-null    int64 
 1   gender              520 non-null    object
 2   polyuria            520 non-null    object
 3   polydipsia          520 non-null    object
 4   sudden_weight_loss  520 non-null    object
 5   weakness            520 non-null    object
 6   polyphagia          520 non-null    object
 7   genital_thrush      520 non-null    object
 8   visual_blurring     520 non-null    object
 9   itching             520 non-null    object
 10  irritability        520 non-null    object
 11  delayed_healing     520 non-null    object
 12  partial_paresis     520 non-null    object
 13  muscle_stiffness    520 non-null    object
 14  alopecia            520 non-null    object
 15  obesity             520 non-null    object
 16  class               520 no

In [7]:
# Integer-encode every categorical column of `df` in place.
# `gender` and `class` have their own vocabularies; the fourteen symptom columns
# all share the No/Yes vocabulary.
yes_no_codes = {'No': 0, 'Yes': 1}
symptom_columns = [
    'polyuria', 'polydipsia', 'sudden_weight_loss', 'weakness', 'polyphagia',
    'genital_thrush', 'visual_blurring', 'itching', 'irritability',
    'delayed_healing', 'partial_paresis', 'muscle_stiffness', 'alopecia', 'obesity',
]
column_codes = {
    'gender': {'Female': 0, 'Male': 1},
    **{name: yes_no_codes for name in symptom_columns},
    'class': {'Negative': 0, 'Positive': 1},
}

for column, codes in column_codes.items():
    # Re-running this cell must be a no-op: Series.map would otherwise turn the
    # already-encoded 0/1 integers into NaN because they are not keys of `codes`.
    if df[column].isin(list(codes.values())).all():
        continue

    encoded = df[column].map(codes)

    # Series.map yields NaN for any value missing from the mapping; fail loudly
    # instead of letting NaN flow silently into the models.
    if encoded.isna().any():
        unexpected = sorted(set(df.loc[encoded.isna(), column].astype(str)))
        raise ValueError(
            f"Column {column!r} contains values outside {list(codes)}: {unexpected}"
        )

    df[column] = encoded.astype('int64')




In [8]:
# Feature frame: every column except the label.
X = df.drop(columns='class')
# Label vector as a plain NumPy array.
y = df['class'].to_numpy()

In [9]:
# Drop features whose variance does not exceed 0.1, printing the feature-matrix
# shape before and after so any removed columns are visible.
# Note: X is rebound to the transformer's output from here on.
selector = VarianceThreshold(threshold=0.1)
print(X.shape)
X = selector.fit(X).transform(X)
print(X.shape)

(520, 16)
(520, 16)


In [10]:
# Boolean row selectors for the two gender groups (encoded 0 = Female, 1 = Male).
gender_codes = df['gender']
gender_0_mask = gender_codes == 0
gender_1_mask = gender_codes == 1

# Group sizes.
count_gender_0, count_gender_1 = gender_0_mask.sum(), gender_1_mask.sum()
print("Female: ", count_gender_0)
print("Male: ", count_gender_1)

# Standardize all features using statistics of the full dataset.
# NOTE(review): the scaler is fitted before the K-fold split, so each fold's test
# rows contribute to the mean/std applied to its training rows. Whether
# fl.run_experiment rescales per fold is not visible here -- confirm.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# Split the scaled features and labels by gender. The masks come from `df`, so
# they rely on X / y still being in df's row order.
# NOTE(review): the gender column itself is still among the features, so it is
# constant inside each subgroup.
X_scaled_Gender_0, y_Gender_0 = X_scaled[gender_0_mask], y[gender_0_mask]
X_scaled_Gender_1, y_Gender_1 = X_scaled[gender_1_mask], y[gender_1_mask]

Female:  192
Male:  328


In [11]:
# One seed shared by the model factory and the fold shuffling.
seed = 42

# 5-fold splitter with shuffling, seeded so the folds are the same on every run.
kf = KFold(n_splits=5, shuffle=True, random_state=seed)

# Classifiers supplied by the project helper module.
models = fl.build_models(seed)

# Collector handed to fl.run_experiment; concatenated into one table afterwards.
results_list = list()

In [12]:
# Run the cross-validated experiment once per gender group and collect the
# per-group result frames in `results_list`.

# Start from an empty collector so that re-running this cell does not stack a
# second copy of every row on top of the previous run. Cleared in place so the
# list object created in the setup cell stays the one in use.
# (Presumably fl.run_experiment appends to the list it is given -- it is
# concatenated right below; confirm against the helper module.)
results_list.clear()

experiment_groups = [
    ("Starting experiments for Female(0)", X_scaled_Gender_0, y_Gender_0, 'Female'),
    ("Starting experiments for Male(1)", X_scaled_Gender_1, y_Gender_1, 'Male'),
]
for banner, X_group, y_group, group_name in experiment_groups:
    print(banner)
    fl.run_experiment(kf, models, X_group, y_group, group_name, results_list)

# One table with every fold of both groups, re-indexed from 0.
final_results_df = pd.concat(results_list, ignore_index=True)
print(final_results_df)

Starting experiments for Female(0)
Processing fold 1 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 2 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 3 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 4 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 5 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Starting experiments for Male(1)
Processing fold 1 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 2 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 3 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 4 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN




Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 5 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
   Fold   Group   SVM_TPR   SVM_TNR   SVM_FPR   SVM_FNR  SVM_TP  SVM_TN  \
0     1  Female  1.000000  0.666667  0.333333  0.000000      36       2   
1     2  Female  1.000000  0.600000  0.400000  0.000000      34       3   
2     3  Female  1.000000  0.500000  0.500000  0.000000      34       2   
3     4  Female  1.000000  0.666667  0.333333  0.000000      35       2   
4     5  Female  1.000000  1.000000  0.000000  0.000000      34       4   
5     1    Male  0.962963  0.974359  0.025641  0.037037      26      38   
6     2    Male  0.970588  1.000000  0.000000  0.029412      33      32   
7     3    Male  1.000000  0.975000  0.025000  0.000000      26      39   
8     4    Male  



In [13]:
# Combine the collected result frames into one table with a fresh 0..n-1 index.
# NOTE(review): this repeats the concat that already produced `final_results_df`
# in the experiment cell; the two names hold equal content.
results_df = pd.concat(results_list, ignore_index=True)

In [14]:
from pathlib import Path

# Destination workbook for the per-fold metrics (kept as a plain string because
# later cells pass `result_path` straight to pandas).
result_path = './results/U9_result.xlsx'

# to_excel does not create missing folders; make sure ./results exists so the
# notebook also runs on a fresh checkout.
Path(result_path).parent.mkdir(parents=True, exist_ok=True)

# index=False: the positional index carries no information worth persisting.
results_df.to_excel(result_path, index=False)

In [15]:
# Read the results workbook back from disk and preview its first rows.
# NOTE(review): this rebinds `df`, which the earlier cells use for the diabetes
# dataset. After this point, cells that read df['gender'] or df['class'] cannot
# be re-run without rebuilding the dataset frame first.
df = pd.read_excel(result_path)
df.head()

Unnamed: 0,Fold,Group,SVM_TPR,SVM_TNR,SVM_FPR,SVM_FNR,SVM_TP,SVM_TN,SVM_FP,SVM_FN,...,ANN_FP,ANN_FN,NB_TPR,NB_TNR,NB_FPR,NB_FNR,NB_TP,NB_TN,NB_FP,NB_FN
0,1,Female,1.0,0.666667,0.333333,0.0,36,2,1,0,...,1,0,0.777778,1.0,0.0,0.222222,28,3,0,8
1,2,Female,1.0,0.6,0.4,0.0,34,3,2,0,...,2,1,0.911765,0.8,0.2,0.088235,31,4,1,3
2,3,Female,1.0,0.5,0.5,0.0,34,2,2,0,...,1,0,0.794118,0.75,0.25,0.205882,27,3,1,7
3,4,Female,1.0,0.666667,0.333333,0.0,35,2,1,0,...,1,0,0.942857,0.666667,0.333333,0.057143,33,2,1,2
4,5,Female,1.0,1.0,0.0,0.0,34,4,0,0,...,0,0,0.852941,1.0,0.0,0.147059,29,4,0,5


In [16]:
# Run the project's group-comparison tests on the reloaded results table for
# each classifier, passing 'Female' as the group label.
label = 'Female'
model_names = ['SVM', 'DT', 'RF', 'LR', 'KNN', 'ANN', 'NB']

*leading_models, last_model = model_names
for model_name in leading_models:
    fl.perform_t_tests(df, model_name, label)

# Kept as the cell's trailing expression so that anything the helper returns is
# still rendered exactly as before.
fl.perform_t_tests(df, last_model, label)


SVM -TPR: MannwhitneyuResult(statistic=np.float64(22.5), pvalue=np.float64(0.025369859822053694))
SVM - FPR: MannwhitneyuResult(statistic=np.float64(21.5), pvalue=np.float64(0.06607531108982745))
SVM - FN/FP: MannwhitneyuResult(statistic=np.float64(2.5), pvalue=np.float64(0.02480842454780953))
DT -TPR: MannwhitneyuResult(statistic=np.float64(18.5), pvalue=np.float64(0.2358613103342675))
DT - FPR: TtestResult(statistic=np.float64(1.7506706258586853), pvalue=np.float64(0.15319887406091715), df=np.float64(4.09682356726053))
DT - FN/FP: MannwhitneyuResult(statistic=np.float64(7.0), pvalue=np.float64(0.27964153403730163))
RF -TPR: MannwhitneyuResult(statistic=np.float64(21.5), pvalue=np.float64(0.057346851901366395))
RF - FPR: MannwhitneyuResult(statistic=np.float64(14.5), pvalue=np.float64(0.7240816609153895))
RF - FN/FP: MannwhitneyuResult(statistic=np.float64(5.5), pvalue=np.float64(0.14610046596342238))
LR -TPR: MannwhitneyuResult(statistic=np.float64(24.5), pvalue=np.float64(0.01470685