In [23]:
# Import libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, accuracy_score, f1_score
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier, StackingClassifier

In [2]:
# Load the processed dataset
df = pd.read_csv('../data/preprocessed/with_diabetes_status/dataset_with_diabetes_status.csv')

In [3]:
# Data inspection: view first 10 rows and shape
print("First ten rows of the dataset:")
display(df.head(10))

print("\nDataset dimensions:")
print(f"Number of rows: {df.shape[0]}, Number of columns: {df.shape[1]}")

First ten rows of the dataset:


Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes,diabetes_status
0,Female,80.0,0,1,never,25.19,6.6,140,0,stress induced type 2 diabetic
1,Female,54.0,0,0,No Info,27.32,6.6,80,0,stress induced type 2 diabetic
2,Male,28.0,0,0,never,27.32,5.7,158,0,stress induced prediabetic
3,Female,36.0,0,0,current,23.45,5.0,155,0,non diabetic
4,Male,76.0,1,1,current,20.14,4.8,155,0,non diabetic
5,Female,20.0,0,0,never,27.32,6.6,85,0,stress induced type 2 diabetic
6,Female,44.0,0,0,never,19.31,6.5,200,1,diabetic
7,Female,79.0,0,0,No Info,23.86,5.7,85,0,stress induced prediabetic
8,Male,42.0,0,0,never,33.64,4.8,145,0,non diabetic
9,Female,32.0,0,0,never,27.32,5.0,100,0,non diabetic



Dataset dimensions:
Number of rows: 100000, Number of columns: 10


In [4]:
# Data inspection: data information
print("\nDataset information:")
df.info()


Dataset information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 10 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
 9   diabetes_status      100000 non-null  object 
dtypes: float64(3), int64(4), object(3)
memory usage: 7.6+ MB


In [5]:
# Data inspection: statistical summary
print("\nStatistical summary:")
display(df.describe())


Statistical summary:


Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,41.885856,0.07485,0.03942,27.320767,5.527507,138.05806,0.085
std,22.51684,0.26315,0.194593,6.636783,1.070672,40.708136,0.278883
min,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,24.0,0.0,0.0,23.63,4.8,100.0,0.0
50%,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,60.0,0.0,0.0,29.58,6.2,159.0,0.0
max,80.0,1.0,1.0,95.69,9.0,300.0,1.0


In [6]:
# Data inspection: unique values and counts
for column in df.columns:
    print(f"Unique values in '{column}':")
    print(df[column].unique())
    print(f"\nUnique value counts in '{column}':")
    print(df[column].value_counts())
    print("\n")

Unique values in 'gender':
['Female' 'Male' 'Other']

Unique value counts in 'gender':
gender
Female    58552
Male      41430
Other        18
Name: count, dtype: int64


Unique values in 'age':
[80.   54.   28.   36.   76.   20.   44.   79.   42.   32.   53.   78.
 67.   15.   37.   40.    5.   69.   72.    4.   30.   45.   43.   50.
 41.   26.   34.   73.   77.   66.   29.   60.   38.    3.   57.   74.
 19.   46.   21.   59.   27.   13.   56.    2.    7.   11.    6.   55.
  9.   62.   47.   12.   68.   75.   22.   58.   18.   24.   17.   25.
  0.08 33.   16.   61.   31.    8.   49.   39.   65.   14.   70.    0.56
 48.   51.   71.    0.88 64.   63.   52.    0.16 10.   35.   23.    0.64
  1.16  1.64  0.72  1.88  1.32  0.8   1.24  1.    1.8   0.48  1.56  1.08
  0.24  1.4   0.4   0.32  1.72  1.48]

Unique value counts in 'age':
age
80.00    5621
51.00    1619
47.00    1574
48.00    1568
53.00    1542
         ... 
0.48       83
1.00       83
0.40       66
0.16       59
0.08       36
Name:

In [7]:
# Drop the diabetes and hba1c columns
df = df.drop('diabetes', axis=1)

In [8]:
# Drop records where gender is 'Other'
df = df[df['gender'] != 'Other']

In [9]:
# One-hot encode categorical features
df = pd.get_dummies(df, columns=['gender', 'smoking_history'], drop_first=False)

In [10]:
# Define the order of categories for diabetes_status
status_order = ['non diabetic', 'stress induced prediabetic', 'stress induced type 2 diabetic', 'prediabetic', 'diabetic']

# Create a mapping for the specified order
status_mapping = {status: i for i, status in enumerate(status_order)}

# Map 'diabetes_status' to the numeric encoding
df['diabetes_status'] = df['diabetes_status'].map(status_mapping)

In [11]:
# Move the encoded 'diabetes_status' to the right
cols = df.columns.tolist()
cols.append(cols.pop(cols.index('diabetes_status')))
df = df[cols]

In [12]:
# Data re inspection: view first 10 rows and shape
print("First ten rows of the dataset:")
display(df.head(10))

print("\nDataset dimensions:")
print(f"Number of rows: {df.shape[0]}, Number of columns: {df.shape[1]}")

First ten rows of the dataset:


Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,gender_Female,gender_Male,smoking_history_No Info,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current,diabetes_status
0,80.0,0,1,25.19,6.6,140,True,False,False,False,False,False,True,False,2
1,54.0,0,0,27.32,6.6,80,True,False,True,False,False,False,False,False,2
2,28.0,0,0,27.32,5.7,158,False,True,False,False,False,False,True,False,1
3,36.0,0,0,23.45,5.0,155,True,False,False,True,False,False,False,False,0
4,76.0,1,1,20.14,4.8,155,False,True,False,True,False,False,False,False,0
5,20.0,0,0,27.32,6.6,85,True,False,False,False,False,False,True,False,2
6,44.0,0,0,19.31,6.5,200,True,False,False,False,False,False,True,False,4
7,79.0,0,0,23.86,5.7,85,True,False,True,False,False,False,False,False,1
8,42.0,0,0,33.64,4.8,145,False,True,False,False,False,False,True,False,0
9,32.0,0,0,27.32,5.0,100,True,False,False,False,False,False,True,False,0



Dataset dimensions:
Number of rows: 99982, Number of columns: 15


In [13]:
# Data re inspection: data information
print("\nDataset information:")
df.info()


Dataset information:
<class 'pandas.core.frame.DataFrame'>
Index: 99982 entries, 0 to 99999
Data columns (total 15 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   age                          99982 non-null  float64
 1   hypertension                 99982 non-null  int64  
 2   heart_disease                99982 non-null  int64  
 3   bmi                          99982 non-null  float64
 4   HbA1c_level                  99982 non-null  float64
 5   blood_glucose_level          99982 non-null  int64  
 6   gender_Female                99982 non-null  bool   
 7   gender_Male                  99982 non-null  bool   
 8   smoking_history_No Info      99982 non-null  bool   
 9   smoking_history_current      99982 non-null  bool   
 10  smoking_history_ever         99982 non-null  bool   
 11  smoking_history_former       99982 non-null  bool   
 12  smoking_history_never        99982 non-null  bool   
 13 

In [14]:
# Data re inspection: statistical summary
print("\nStatistical summary:")
display(df.describe())


Statistical summary:


Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes_status
count,99982.0,99982.0,99982.0,99982.0,99982.0,99982.0,99982.0
mean,41.888076,0.074863,0.039427,27.320757,5.527529,138.05781,0.99944
std,22.517206,0.263172,0.19461,6.636853,1.070665,40.709469,1.065033
min,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,24.0,0.0,0.0,23.63,4.8,100.0,0.0
50%,43.0,0.0,0.0,27.32,5.8,140.0,1.0
75%,60.0,0.0,0.0,29.58,6.2,159.0,1.0
max,80.0,1.0,1.0,95.69,9.0,300.0,4.0


In [15]:
# Data re inspection: unique values and counts
for column in df.columns:
    print(f"Unique values in '{column}':")
    print(df[column].unique())
    print(f"\nUnique value counts in '{column}':")
    print(df[column].value_counts())
    print("\n")

Unique values in 'age':
[80.   54.   28.   36.   76.   20.   44.   79.   42.   32.   53.   78.
 67.   15.   37.   40.    5.   69.   72.    4.   30.   45.   43.   50.
 41.   26.   34.   73.   77.   66.   29.   60.   38.    3.   57.   74.
 19.   46.   21.   59.   27.   13.   56.    2.    7.   11.    6.   55.
  9.   62.   47.   12.   68.   75.   22.   58.   18.   24.   17.   25.
  0.08 33.   16.   61.   31.    8.   49.   39.   65.   14.   70.    0.56
 48.   51.   71.    0.88 64.   63.   52.    0.16 10.   35.   23.    0.64
  1.16  1.64  0.72  1.88  1.32  0.8   1.24  1.    1.8   0.48  1.56  1.08
  0.24  1.4   0.4   0.32  1.72  1.48]

Unique value counts in 'age':
age
80.00    5621
51.00    1619
47.00    1572
48.00    1568
49.00    1541
         ... 
0.48       83
1.00       83
0.40       66
0.16       59
0.08       36
Name: count, Length: 102, dtype: int64


Unique values in 'hypertension':
[0 1]

Unique value counts in 'hypertension':
hypertension
0    92497
1     7485
Name: count, dtype: 

In [17]:
# Separate features and target
X = df.drop(columns=['diabetes_status'])
y = df['diabetes_status']

In [18]:
# Initialize SMOTE
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

In [19]:
# Initialize KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [25]:
# List of models to evaluate
models = [
    AdaBoostClassifier(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    VotingClassifier(estimators=[
        ('knn', KNeighborsClassifier()),
        ('rf', RandomForestClassifier())
    ], voting='hard'),
    StackingClassifier(estimators=[
        ('knn', KNeighborsClassifier()),
        ('rf', RandomForestClassifier())
    ], final_estimator=LogisticRegression()),
]

In [27]:
def evaluate_models(models, X, y, kf):
    results = {}
    
    for model in models:
        model_name = type(model).__name__
        print(f"Evaluating {model_name}...")
        
        fold_accuracies = []
        fold_f1_scores = []
        fold_reports = []
        
        for fold, (train_index, test_index) in enumerate(kf.split(X)):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            
            accuracy = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred, average='weighted')
            report = classification_report(y_test, y_pred)
            
            fold_accuracies.append(accuracy)
            fold_f1_scores.append(f1)
            fold_reports.append(report)
            
            print(f"Fold {fold + 1} - Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}")
            print(f"Classification Report for Fold {fold + 1}:\n{report}\n")
        
        mean_accuracy = np.mean(fold_accuracies)
        std_accuracy = np.std(fold_accuracies)
        mean_f1 = np.mean(fold_f1_scores)
        std_f1 = np.std(fold_f1_scores)
        
        print(f"\n{model_name} - Mean Accuracy: {mean_accuracy:.4f} (± {std_accuracy:.4f}), Mean F1 Score: {mean_f1:.4f} (± {std_f1:.4f})\n")
        
        results[model_name] = {
            'mean_accuracy': mean_accuracy,
            'std_accuracy': std_accuracy,
            'mean_f1': mean_f1,
            'std_f1': std_f1,
            'reports': fold_reports
        }
    
    return results

In [29]:
# Call the function with the updated list of models, features, target, and KFold object
results = evaluate_models(models, X_res, y_res, kf)

Evaluating AdaBoostClassifier...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fold 1 - Accuracy: 0.5935, F1 Score: 0.4594
Classification Report for Fold 1:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7583
           1       0.00      0.00      0.00      7703
           2       0.49      1.00      0.66      7362
           3       0.50      1.00      0.66      7626
           4       0.00      0.00      0.00      7755

    accuracy                           0.59     38029
   macro avg       0.40      0.60      0.46     38029
weighted avg       0.39      0.59      0.46     38029




  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fold 2 - Accuracy: 0.5974, F1 Score: 0.4640
Classification Report for Fold 2:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7650
           1       0.50      1.00      0.66      7541
           2       0.00      0.00      0.00      7678
           3       0.00      0.00      0.00      7631
           4       0.50      1.00      0.66      7529

    accuracy                           0.60     38029
   macro avg       0.40      0.60      0.47     38029
weighted avg       0.40      0.60      0.46     38029




  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fold 3 - Accuracy: 0.5977, F1 Score: 0.4644
Classification Report for Fold 3:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7683
           1       0.00      0.00      0.00      7585
           2       0.00      0.00      0.00      7714
           3       0.50      1.00      0.67      7544
           4       0.49      1.00      0.66      7503

    accuracy                           0.60     38029
   macro avg       0.40      0.60      0.47     38029
weighted avg       0.40      0.60      0.46     38029




  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fold 4 - Accuracy: 0.5967, F1 Score: 0.4626
Classification Report for Fold 4:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7479
           1       0.00      0.00      0.00      7676
           2       0.00      0.00      0.00      7662
           3       0.50      1.00      0.66      7556
           4       0.50      1.00      0.67      7656

    accuracy                           0.60     38029
   macro avg       0.40      0.60      0.47     38029
weighted avg       0.40      0.60      0.46     38029




  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fold 5 - Accuracy: 0.5981, F1 Score: 0.4646
Classification Report for Fold 5:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7634
           1       0.50      1.00      0.66      7524
           2       0.00      0.00      0.00      7613
           3       0.00      0.00      0.00      7672
           4       0.50      1.00      0.67      7586

    accuracy                           0.60     38029
   macro avg       0.40      0.60      0.47     38029
weighted avg       0.40      0.60      0.46     38029



AdaBoostClassifier - Mean Accuracy: 0.5967 (± 0.0016), Mean F1 Score: 0.4630 (± 0.0019)

Evaluating KNeighborsClassifier...
Fold 1 - Accuracy: 0.8481, F1 Score: 0.8470
Classification Report for Fold 1:
              precision    recall  f1-score   support

           0       0.95      0.88      0.91      7583
           1       0.72      0.65      0.68      7703
           2       0.75      0.81      0.78      7362
           3