# Diabetes Prediction Model with ML Classifiers

This script performs **feature scaling, class balancing (SMOTE), model evaluation, and hyperparameter tuning** for diabetes prediction. It evaluates multiple classifiers (Logistic Regression, Decision Tree, KNN, Naive Bayes, Neural Network) and tunes **Random Forest, SVM, and XGBoost** for improved performance.


In [None]:
# Updated imports with consistent organization
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier

# Load dataset
df = pd.read_csv('/wins_encoded_data.csv')

# Show the class distribution so the degree of imbalance is visible up front
class_proportion = df['Diabetic'].value_counts(normalize=True)
print("Class Proportion:\n", class_proportion)

# Split features and target variable
X = df.drop('Diabetic', axis=1)  # Features
y = df['Diabetic']  # Target variable

# Split into training (70%), validation (15%), and testing (15%) sets BEFORE any
# scaling or resampling. Preprocessing the full dataset first leaks held-out
# statistics into training and fills the validation/test sets with synthetic
# SMOTE samples, which inflates the reported scores.
# Stratifying keeps the rare classes represented in every split.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

# Scale features: fit on the training split only, then apply the same
# transformation to the validation and test splits
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Balance the training split only using SMOTE; validation and test data keep
# their real class distribution and contain no synthetic rows
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Function to check for overfitting
def check_overfitting(train_acc, val_acc, threshold=0.05):
    """Describe whether the train/validation accuracy gap indicates overfitting.

    Args:
        train_acc: Accuracy measured on the training data.
        val_acc: Accuracy measured on the validation data.
        threshold: Largest train-minus-validation gap still treated as
            acceptable; a gap strictly greater than this is flagged.

    Returns:
        A human-readable message containing the gap formatted to four
        decimal places (and the threshold when overfitting is flagged).
    """
    gap = train_acc - val_acc
    if gap > threshold:
        return f"Overfitting detected: Train-Val Accuracy Difference = {gap:.4f} > Threshold = {threshold}"
    return f"No significant overfitting: Train-Val Accuracy Difference = {gap:.4f}"

# Baseline classifiers, fitted with fixed (untuned) settings
models = {
    "Logistic Regression": LogisticRegression(random_state=42, class_weight='balanced'),
    "Decision Tree": DecisionTreeClassifier(max_depth=10, class_weight='balanced', random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Neural Network": MLPClassifier(
        hidden_layer_sizes=(100,), max_iter=300, early_stopping=True,
        validation_fraction=0.1, random_state=42,
    ),
}

print("\nEvaluating Non-Tuned Models:")
for model_name, model in models.items():
    print(f"\nEvaluating {model_name}...")
    model.fit(X_train, y_train)

    # Predictions for the train, validation, and test splits (in that order)
    y_train_pred, y_val_pred, y_test_pred = (
        model.predict(features) for features in (X_train, X_val, X_test)
    )

    # Accuracy on each split
    train_accuracy = accuracy_score(y_train, y_train_pred)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    # Emit the summary as one write; the text is the same as printing line by line
    summary_lines = [
        f"Train Accuracy: {train_accuracy:.4f}",
        f"Validation Accuracy: {val_accuracy:.4f}",
        f"Test Accuracy: {test_accuracy:.4f}",
        f"Overfitting Check: {check_overfitting(train_accuracy, val_accuracy)}",
        f"Classification Report for {model_name}:\n{classification_report(y_test, y_test_pred)}",
    ]
    print("\n".join(summary_lines))

# Hyperparameter tuning for advanced models
param_grids = {
    "Random Forest": {
        'n_estimators': [50, 100, 200],
        'max_depth': [5, 10, 20, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'class_weight': ['balanced']
    },
    "SVM (RBF Kernel)": {
        'C': [0.1, 1, 10],
        'gamma': ['scale', 'auto'],
        'class_weight': ['balanced']
    },
    "XGBoost": {
        # scale_pos_weight is intentionally not searched: it only affects
        # binary objectives, and the Diabetic target has four classes, so it
        # would only add redundant candidates to the search.
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2]
    }
}

# One factory per grid entry. A grid key without a matching factory raises
# KeyError instead of silently reusing the estimator from a previous iteration.
estimator_factories = {
    "Random Forest": lambda: RandomForestClassifier(random_state=42),
    "SVM (RBF Kernel)": lambda: SVC(kernel='rbf', random_state=42),
    # use_label_encoder is deprecated in XGBoost and no longer passed
    "XGBoost": lambda: XGBClassifier(eval_metric='mlogloss', random_state=42),
}

MAX_SEARCH_ITERATIONS = 20

print("\nTuning Models:")
for model_name, param_grid in param_grids.items():
    print(f"\nTuning {model_name}...")
    model = estimator_factories[model_name]()

    # Never request more iterations than the grid has combinations
    # (e.g. the SVM grid has only 6); otherwise scikit-learn emits a warning.
    n_candidates = 1
    for values in param_grid.values():
        n_candidates *= len(values)

    search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_grid,
        n_iter=min(MAX_SEARCH_ITERATIONS, n_candidates),
        cv=5,
        scoring='accuracy',
        random_state=42,
        n_jobs=-1
    )
    search.fit(X_train, y_train)

    # Evaluate the best estimator (refit on the full training split by the search)
    best_model = search.best_estimator_
    y_train_pred = best_model.predict(X_train)
    y_val_pred = best_model.predict(X_val)
    y_test_pred = best_model.predict(X_test)

    train_accuracy = accuracy_score(y_train, y_train_pred)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    print(f"Best Parameters for {model_name}: {search.best_params_}")
    print(f"Train Accuracy: {train_accuracy:.4f}")
    print(f"Validation Accuracy: {val_accuracy:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")
    print(f"Overfitting Check: {check_overfitting(train_accuracy, val_accuracy)}")
    print(f"Classification Report for {model_name}:\n{classification_report(y_test, y_test_pred)}")


Class Proportion:
 Diabetic
0    0.762663
2    0.206338
1    0.025479
3    0.005519
Name: proportion, dtype: float64





Evaluating Non-Tuned Models:

Evaluating Logistic Regression...
Train Accuracy: 0.4975
Validation Accuracy: 0.5020
Test Accuracy: 0.5001
Overfitting Check: No significant overfitting: Train-Val Accuracy Difference = -0.0045
Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       0.52      0.33      0.41      6704
           1       0.39      0.22      0.28      6752
           2       0.48      0.59      0.53      6792
           3       0.54      0.86      0.66      6782

    accuracy                           0.50     27030
   macro avg       0.49      0.50      0.47     27030
weighted avg       0.49      0.50      0.47     27030


Evaluating Decision Tree...
Train Accuracy: 0.6632
Validation Accuracy: 0.6515
Test Accuracy: 0.6542
Overfitting Check: No significant overfitting: Train-Val Accuracy Difference = 0.0117
Classification Report for Decision Tree:
              precision    recall  f1-score   support

         



# Conclusion

The best-performing model is **Random Forest (Tuned)**, with a test accuracy of **91.26%**, though it shows signs of overfitting. **K-Nearest Neighbors** performed well without overfitting, achieving a test accuracy of **84.96%**. **Naive Bayes** had the lowest performance, with a test accuracy of **45.79%**.
