In [1]:
# Import libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, accuracy_score, f1_score

from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, PassiveAggressiveClassifier, Perceptron, RidgeClassifier, RidgeClassifierCV, SGDClassifier

In [2]:
# Load the processed dataset
df = pd.read_csv('../data/preprocessed/with_diabetes_status/dataset_with_diabetes_status.csv')

In [3]:
# Drop the diabetes and hba1c columns
df = df.drop('diabetes', axis=1)

In [4]:
# Drop records where gender is 'Other'
df = df[df['gender'] != 'Other']

In [5]:
# One-hot encode categorical features
df = pd.get_dummies(df, columns=['gender', 'smoking_history'], drop_first=False)

In [6]:
# Define the order of categories for diabetes_status
status_order = ['non diabetic', 'stress induced prediabetic', 'stress induced type 2 diabetic', 'prediabetic', 'diabetic']

# Create a mapping for the specified order
status_mapping = {status: i for i, status in enumerate(status_order)}

# Map 'diabetes_status' to the numeric encoding
df['diabetes_status'] = df['diabetes_status'].map(status_mapping)

In [7]:
# Move the encoded 'diabetes_status' to the right
cols = df.columns.tolist()
cols.append(cols.pop(cols.index('diabetes_status')))
df = df[cols]

In [8]:
# Separate features and target
X = df.drop(columns=['diabetes_status'])
y = df['diabetes_status']

In [9]:
# Initialize SMOTE
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

In [10]:
# Initialize KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [11]:
# List of models to evaluate
models = [
    LogisticRegression(),
    LogisticRegressionCV(),
    PassiveAggressiveClassifier(),
    Perceptron(),
    RidgeClassifier(),
    RidgeClassifierCV(),
    SGDClassifier(),
]

In [12]:
def evaluate_models(models, X, y, kf):
    results = {}
    
    for model in models:
        model_name = type(model).__name__
        print(f"Evaluating {model_name}...")
        
        fold_accuracies = []
        fold_f1_scores = []
        fold_reports = []
        
        for fold, (train_index, test_index) in enumerate(kf.split(X)):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            
            accuracy = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred, average='weighted')
            report = classification_report(y_test, y_pred)
            
            fold_accuracies.append(accuracy)
            fold_f1_scores.append(f1)
            fold_reports.append(report)
            
            print(f"Fold {fold + 1} - Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}")
            print(f"Classification Report for Fold {fold + 1}:\n{report}\n")
        
        mean_accuracy = np.mean(fold_accuracies)
        std_accuracy = np.std(fold_accuracies)
        mean_f1 = np.mean(fold_f1_scores)
        std_f1 = np.std(fold_f1_scores)
        
        print(f"\n{model_name} - Mean Accuracy: {mean_accuracy:.4f} (± {std_accuracy:.4f}), Mean F1 Score: {mean_f1:.4f} (± {std_f1:.4f})\n")
        
        results[model_name] = {
            'mean_accuracy': mean_accuracy,
            'std_accuracy': std_accuracy,
            'mean_f1': mean_f1,
            'std_f1': std_f1,
            'reports': fold_reports
        }
    
    return results

In [13]:
# Call the function with the updated list of models, features, target, and KFold object
results = evaluate_models(models, X_res, y_res, kf)

Evaluating LogisticRegression...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 1 - Accuracy: 0.4738, F1 Score: 0.4741
Classification Report for Fold 1:
              precision    recall  f1-score   support

           0       0.64      0.58      0.60      7583
           1       0.39      0.37      0.38      7703
           2       0.38      0.34      0.36      7362
           3       0.57      0.56      0.57      7626
           4       0.41      0.51      0.46      7755

    accuracy                           0.47     38029
   macro avg       0.48      0.47      0.47     38029
weighted avg       0.48      0.47      0.47     38029




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 2 - Accuracy: 0.4760, F1 Score: 0.4650
Classification Report for Fold 2:
              precision    recall  f1-score   support

           0       0.68      0.51      0.58      7650
           1       0.36      0.17      0.23      7541
           2       0.42      0.52      0.47      7678
           3       0.53      0.60      0.57      7631
           4       0.40      0.57      0.47      7529

    accuracy                           0.48     38029
   macro avg       0.48      0.48      0.46     38029
weighted avg       0.48      0.48      0.46     38029




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 3 - Accuracy: 0.4502, F1 Score: 0.4505
Classification Report for Fold 3:
              precision    recall  f1-score   support

           0       0.64      0.44      0.52      7683
           1       0.36      0.29      0.32      7585
           2       0.46      0.46      0.46      7714
           3       0.50      0.57      0.53      7544
           4       0.35      0.50      0.41      7503

    accuracy                           0.45     38029
   macro avg       0.47      0.45      0.45     38029
weighted avg       0.47      0.45      0.45     38029




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 4 - Accuracy: 0.4304, F1 Score: 0.4262
Classification Report for Fold 4:
              precision    recall  f1-score   support

           0       0.55      0.45      0.50      7479
           1       0.36      0.22      0.27      7676
           2       0.46      0.54      0.50      7662
           3       0.49      0.50      0.49      7556
           4       0.32      0.44      0.37      7656

    accuracy                           0.43     38029
   macro avg       0.44      0.43      0.43     38029
weighted avg       0.44      0.43      0.43     38029




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 5 - Accuracy: 0.4748, F1 Score: 0.4675
Classification Report for Fold 5:
              precision    recall  f1-score   support

           0       0.63      0.56      0.59      7634
           1       0.29      0.19      0.23      7524
           2       0.42      0.52      0.46      7613
           3       0.58      0.61      0.60      7672
           4       0.42      0.49      0.45      7586

    accuracy                           0.47     38029
   macro avg       0.47      0.47      0.47     38029
weighted avg       0.47      0.47      0.47     38029



LogisticRegression - Mean Accuracy: 0.4610 (± 0.0181), Mean F1 Score: 0.4567 (± 0.0170)

Evaluating LogisticRegressionCV...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Fold 1 - Accuracy: 0.7046, F1 Score: 0.6863
Classification Report for Fold 1:
              precision    recall  f1-score   support

           0       0.91      0.88      0.90      7583
           1       0.54      0.79      0.64      7703
           2       0.63      0.25      0.36      7362
           3       0.73      0.81      0.77      7626
           4       0.75      0.77      0.76      7755

    accuracy                           0.70     38029
   macro avg       0.71      0.70      0.68     38029
weighted avg       0.71      0.70      0.69     38029




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Fold 2 - Accuracy: 0.7440, F1 Score: 0.7429
Classification Report for Fold 2:
              precision    recall  f1-score   support

           0       0.96      0.92      0.94      7650
           1       0.61      0.70      0.65      7541
           2       0.64      0.53      0.58      7678
           3       0.78      0.82      0.80      7631
           4       0.74      0.74      0.74      7529

    accuracy                           0.74     38029
   macro avg       0.74      0.74      0.74     38029
weighted avg       0.75      0.74      0.74     38029




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Fold 3 - Accuracy: 0.7368, F1 Score: 0.7355
Classification Report for Fold 3:
              precision    recall  f1-score   support

           0       0.94      0.90      0.92      7683
           1       0.62      0.68      0.65      7585
           2       0.65      0.54      0.59      7714
           3       0.75      0.80      0.77      7544
           4       0.73      0.77      0.75      7503

    accuracy                           0.74     38029
   macro avg       0.74      0.74      0.74     38029
weighted avg       0.74      0.74      0.74     38029




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Fold 4 - Accuracy: 0.7440, F1 Score: 0.7435
Classification Report for Fold 4:
              precision    recall  f1-score   support

           0       0.97      0.90      0.94      7479
           1       0.61      0.70      0.65      7676
           2       0.66      0.53      0.59      7662
           3       0.77      0.83      0.80      7556
           4       0.74      0.76      0.75      7656

    accuracy                           0.74     38029
   macro avg       0.75      0.75      0.74     38029
weighted avg       0.75      0.74      0.74     38029




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Fold 5 - Accuracy: 0.7376, F1 Score: 0.7342
Classification Report for Fold 5:
              precision    recall  f1-score   support

           0       0.95      0.90      0.93      7634
           1       0.59      0.81      0.68      7524
           2       0.68      0.46      0.55      7613
           3       0.79      0.82      0.81      7672
           4       0.71      0.69      0.70      7586

    accuracy                           0.74     38029
   macro avg       0.75      0.74      0.73     38029
weighted avg       0.75      0.74      0.73     38029



LogisticRegressionCV - Mean Accuracy: 0.7334 (± 0.0147), Mean F1 Score: 0.7285 (± 0.0214)

Evaluating PassiveAggressiveClassifier...
Fold 1 - Accuracy: 0.6900, F1 Score: 0.6705
Classification Report for Fold 1:
              precision    recall  f1-score   support

           0       0.88      1.00      0.93      7583
           1       0.54      0.73      0.62      7703
           2       0.51      0.60      0.55      7362
   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fold 5 - Accuracy: 0.5360, F1 Score: 0.4391
Classification Report for Fold 5:
              precision    recall  f1-score   support

           0       0.46      1.00      0.63      7634
           1       0.55      0.51      0.53      7524
           2       0.70      0.19      0.29      7613
           3       0.00      0.00      0.00      7672
           4       0.60      0.99      0.75      7586

    accuracy                           0.54     38029
   macro avg       0.46      0.54      0.44     38029
weighted avg       0.46      0.54      0.44     38029



Perceptron - Mean Accuracy: 0.5698 (± 0.0510), Mean F1 Score: 0.5050 (± 0.0729)

Evaluating RidgeClassifier...
Fold 1 - Accuracy: 0.7426, F1 Score: 0.7329
Classification Report for Fold 1:
              precision    recall  f1-score   support

           0       0.79      1.00      0.88      7583
           1       0.69      0.50      0.58      7703
           2       0.65      0.60      0.63      7362
           3       0.76  



Fold 1 - Accuracy: 0.7473, F1 Score: 0.7537
Classification Report for Fold 1:
              precision    recall  f1-score   support

           0       0.98      0.97      0.98      7583
           1       0.58      0.71      0.64      7703
           2       0.50      0.63      0.56      7362
           3       0.86      0.53      0.66      7626
           4       0.99      0.88      0.93      7755

    accuracy                           0.75     38029
   macro avg       0.78      0.75      0.75     38029
weighted avg       0.78      0.75      0.75     38029


Fold 2 - Accuracy: 0.7617, F1 Score: 0.7660
Classification Report for Fold 2:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      7650
           1       0.58      0.72      0.64      7541
           2       0.54      0.62      0.58      7678
           3       0.87      0.58      0.70      7631
           4       0.96      0.90      0.93      7529

    accuracy                



Fold 4 - Accuracy: 0.7159, F1 Score: 0.7108
Classification Report for Fold 4:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      7479
           1       0.59      0.41      0.48      7676
           2       0.46      0.85      0.59      7662
           3       0.94      0.41      0.57      7556
           4       0.95      0.92      0.94      7656

    accuracy                           0.72     38029
   macro avg       0.78      0.72      0.71     38029
weighted avg       0.78      0.72      0.71     38029






Fold 5 - Accuracy: 0.7690, F1 Score: 0.7715
Classification Report for Fold 5:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      7634
           1       0.58      0.78      0.66      7524
           2       0.57      0.59      0.58      7613
           3       0.89      0.58      0.70      7672
           4       0.96      0.91      0.93      7586

    accuracy                           0.77     38029
   macro avg       0.79      0.77      0.77     38029
weighted avg       0.79      0.77      0.77     38029



SGDClassifier - Mean Accuracy: 0.7353 (± 0.0321), Mean F1 Score: 0.7278 (± 0.0501)

