# Binary Classification - Diabetes Prediction

This notebook implements binary classification for diabetes prediction with confidence estimation.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
import warnings
# Suppress every warning category for the rest of the session (presumably to
# keep cell output uncluttered).
# NOTE(review): a blanket 'ignore' also hides any warnings the libraries raise
# during fitting or metric computation -- consider narrowing the filter.
warnings.filterwarnings('ignore')
# Simple marker that the import cell ran to completion.
print('Libraries loaded!')

## Load Data

In [None]:
# Read the processed diabetes table from disk.
df = pd.read_csv('../data/processed/diabetes_binary.csv')

# Sanity checks: table dimensions and how often each 'Outcome' value occurs.
dataset_shape = df.shape
outcome_counts = df['Outcome'].value_counts()
print(f'Dataset shape: {dataset_shape}')
print(f'Target distribution:\n{outcome_counts}')

# Last expression of the cell: the notebook renders the first five rows.
df.head()

## Data Preparation

In [None]:
# Separate the 'Outcome' target from the remaining feature columns.
y = df['Outcome']
X = df.drop(columns='Outcome')

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardise the features. The scaler is fitted on the training partition
# only and then applied unchanged to the test partition, so no test-set
# statistics leak into the transformation.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

for split_label, split_frame in (('Training set', X_train), ('Test set', X_test)):
    print(f'{split_label}: {split_frame.shape}')

## Model Training and Evaluation

In [None]:
# Candidate classifiers, keyed by the display name used in tables and plots.
# SVC needs probability=True so that predict_proba is available below.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(kernel='rbf', probability=True, random_state=42),
}

# Metrics computed from hard label predictions. Insertion order matters: it
# determines the key order of each result entry.
label_metrics = {
    'accuracy': accuracy_score,
    'precision': precision_score,
    'recall': recall_score,
    'f1': f1_score,
}

# results[name] holds the fitted estimator, its test-set metrics, and the raw
# test-set predictions that later cells reuse for plotting.
results = {}
for model_name, estimator in models.items():
    print(f'\nTraining {model_name}...')
    estimator.fit(X_train_scaled, y_train)

    predicted_labels = estimator.predict(X_test_scaled)
    # Column 1 of predict_proba: probability of the second class in
    # estimator.classes_.
    positive_proba = estimator.predict_proba(X_test_scaled)[:, 1]

    entry = {'model': estimator}
    for metric_name, metric_fn in label_metrics.items():
        entry[metric_name] = metric_fn(y_test, predicted_labels)
    # ROC AUC is threshold-free, so it is scored on probabilities, not labels.
    entry['roc_auc'] = roc_auc_score(y_test, positive_proba)
    entry['y_pred'] = predicted_labels
    entry['y_proba'] = positive_proba
    results[model_name] = entry

print('\nTraining complete!')

In [None]:
# Build a models-by-metrics table from `results`, leaving out the entries that
# are not scalar scores (the fitted estimator and the prediction arrays).
non_metric_keys = ('model', 'y_pred', 'y_proba')
scores_by_model = {}
for model_name, model_result in results.items():
    scores_by_model[model_name] = {
        key: value
        for key, value in model_result.items()
        if key not in non_metric_keys
    }

# The dict-of-dicts constructor puts models in columns; transpose so each
# model becomes a row.
perf_df = pd.DataFrame(scores_by_model).T
print('\nModel Performance Comparison:')
print(perf_df.round(4))

In [None]:
# One bar chart per metric on a 2x2 grid, with one bar per model.
metrics = ['accuracy', 'precision', 'recall', 'f1']
model_names = list(models)
bar_colors = ['#3498db', '#2ecc71', '#f39c12', '#e74c3c']

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
# axes.flat walks the grid row by row, matching the order of `metrics`.
for ax, metric in zip(axes.flat, metrics):
    metric_title = metric.capitalize()
    values = [results[model_name][metric] for model_name in model_names]

    ax.bar(model_names, values, color=bar_colors)
    ax.set_title(f'{metric_title} Comparison', fontsize=12, fontweight='bold')
    ax.set_ylabel(metric_title)
    ax.set_ylim([0, 1])
    ax.tick_params(axis='x', rotation=45)

    # Print each score just above the top of its bar.
    for bar_position, score in enumerate(values):
        ax.text(bar_position, score + 0.02, f'{score:.3f}', ha='center')

plt.tight_layout()
plt.show()

## Confusion Matrices

In [None]:
# Draw one confusion-matrix heatmap per trained model, two per row.
# The grid is sized from the number of entries in `results` instead of being
# fixed at 2x2, so adding or removing a model neither raises an IndexError nor
# leaves empty frames. With four models the figure is the same 2x2, 12x10 one.
n_cols = 2
n_rows = max(1, (len(results) + n_cols - 1) // n_cols)  # ceiling division
# squeeze=False keeps `axes` two-dimensional even when there is a single row.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 5 * n_rows), squeeze=False)
panels = axes.ravel()

for ax, (name, data) in zip(panels, results.items()):
    # Rows of the matrix are true labels, columns are predicted labels.
    cm = confusion_matrix(y_test, data['y_pred'])
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f'{name}\nConfusion Matrix')
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')

# Hide any panel left over when the model count does not fill the last row.
for unused_ax in panels[len(results):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

## ROC Curves

In [None]:
# Overlay the test-set ROC curve of every model on a single set of axes.
fig, ax = plt.subplots(figsize=(10, 8))
for name, data in results.items():
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, data['y_proba'])
    auc_value = data['roc_auc']
    ax.plot(
        false_pos_rate,
        true_pos_rate,
        label=f'{name} (AUC = {auc_value:.3f})',
        linewidth=2,
    )

# Diagonal reference line for comparison with the model curves.
ax.plot([0, 1], [0, 1], 'k--', label='Random Classifier')

ax.set_xlabel('False Positive Rate', fontsize=12)
ax.set_ylabel('True Positive Rate', fontsize=12)
ax.set_title('ROC Curves - Binary Classification', fontsize=14, fontweight='bold')
ax.legend(loc='lower right')
ax.grid(alpha=0.3)
plt.show()

## Confidence Estimation Techniques

### 1. Prediction Probabilities

In [None]:
# Choose the model with the highest ROC AUC recorded in `results`.
# NOTE(review): those scores were computed on the test split, so the model is
# selected on the same data later used to report its performance -- consider
# selecting via cross-validation on the training split instead.
best_model_name, best_entry = max(
    results.items(),
    key=lambda item: item[1]['roc_auc'],
)
best_model = best_entry['model']
print(f'Best model: {best_model_name}')

# Confidence of a prediction = the largest class probability for that sample.
proba = best_model.predict_proba(X_test_scaled)
confidence = proba.max(axis=1)

fig, (ax_hist, ax_scatter) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: distribution of confidence values across the test samples.
ax_hist.hist(confidence, bins=30, edgecolor='black')
ax_hist.set_xlabel('Confidence')
ax_hist.set_ylabel('Frequency')
ax_hist.set_title(f'{best_model_name}\nPrediction Confidence Distribution')

# Right panel: confidence per test sample, coloured by the true label.
sample_positions = range(len(confidence))
points = ax_scatter.scatter(
    sample_positions,
    confidence,
    c=y_test,
    cmap='RdYlGn',
    alpha=0.6,
)
ax_scatter.set_xlabel('Sample Index')
ax_scatter.set_ylabel('Confidence')
ax_scatter.set_title('Confidence by Sample')
fig.colorbar(points, ax=ax_scatter, label='True Label')

plt.tight_layout()
plt.show()

### 2. Calibration Curves

In [None]:
# Draw one calibration (reliability) curve per trained model, two per row.
# The grid is sized from the number of entries in `results` instead of being
# fixed at 2x2, so adding or removing a model neither raises an IndexError nor
# leaves empty frames. With four models the figure is the same 2x2, 14x12 one.
n_cols = 2
n_rows = max(1, (len(results) + n_cols - 1) // n_cols)  # ceiling division
# squeeze=False keeps `axes` two-dimensional even when there is a single row.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, 6 * n_rows), squeeze=False)
panels = axes.ravel()

for ax, (name, data) in zip(panels, results.items()):
    # calibration_curve bins the predicted probabilities (10 bins) and returns
    # the observed positive fraction and the mean prediction of each bin.
    prob_true, prob_pred = calibration_curve(y_test, data['y_proba'], n_bins=10)
    ax.plot(prob_pred, prob_true, marker='o', linewidth=2, label=name)
    # A perfectly calibrated model would lie on the diagonal.
    ax.plot([0, 1], [0, 1], 'k--', label='Perfect Calibration')
    ax.set_xlabel('Predicted Probability')
    ax.set_ylabel('True Probability')
    ax.set_title(f'{name}\nCalibration Curve')
    ax.legend()
    ax.grid(alpha=0.3)

# Hide any panel left over when the model count does not fill the last row.
for unused_ax in panels[len(results):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

### 3. Calibrated Predictions

In [None]:
# Apply calibration
calibrated_model = CalibratedClassifierCV(best_model, method='sigmoid', cv=5)
calibrated_model.fit(X_train_scaled, y_train)

y_proba_calibrated = calibrated_model.predict_proba(X_test_scaled)[:, 1]

# Compare calibration
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Before calibration
prob_true, prob_pred = calibration_curve(y_test, results[best_model_name]['y_proba'], n_bins=10)
axes[0].plot(prob_pred, prob_true, marker='o', linewidth=2, label='Before Calibration')
axes[0].plot([0, 1], [0, 1], 'k--', label='Perfect')
axes[0].set_xlabel('Predicted Probability')
axes[0].set_ylabel('True Probability')
axes[0].set_title('Before Calibration')
axes[0].legend()
axes[0].grid(alpha=0.3)

# After calibration
prob_true_cal, prob_pred_cal = calibration_curve(y_test, y_proba_calibrated, n_bins=10)
axes[1].plot(prob_pred_cal, prob_true_cal, marker='o', linewidth=2, label='After Calibration', color='green')
axes[1].plot([0, 1], [0, 1], 'k--', label='Perfect')
axes[1].set_xlabel('Predicted Probability')
axes[1].set_ylabel('True Probability')
axes[1].set_title('After Calibration')
axes[1].legend()
axes[1].grid(alpha=0.3)

plt.tight_layout()
plt.show()

### 4. Bootstrap Confidence Intervals

In [None]:
from sklearn.utils import resample

# Bootstrap the test set to estimate how much the best model's accuracy would
# vary across test samples of the same size.
# 1000 replicates (previously 100): the 2.5th/97.5th percentiles are tail
# statistics, and with only 100 replicates each bound rests on a couple of
# samples.
n_iterations = 1000

# The fitted model does not change between replicates, so predict once and
# resample the per-sample "was this prediction correct?" flags. For a given
# seed `resample` draws the same row indices regardless of which arrays it is
# given, so each score equals the accuracy of re-predicting on the resampled
# rows -- without calling predict on every iteration.
y_pred_test = best_model.predict(X_test_scaled)
is_correct = np.asarray(y_pred_test) == np.asarray(y_test)

# Seeding each replicate with its index keeps the whole run reproducible.
scores = [
    float(resample(is_correct, random_state=seed).mean())
    for seed in range(n_iterations)
]

# Percentile bootstrap: the central 95% of the replicate accuracies.
mean_score = np.mean(scores)
ci_lower = np.percentile(scores, 2.5)
ci_upper = np.percentile(scores, 97.5)

print(f'Bootstrap Accuracy: {mean_score:.4f}')
print(f'95% Confidence Interval: [{ci_lower:.4f}, {ci_upper:.4f}]')

# Histogram of replicate accuracies with the mean and interval bounds marked.
plt.figure(figsize=(10, 5))
plt.hist(scores, bins=20, edgecolor='black', alpha=0.7)
plt.axvline(mean_score, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_score:.4f}')
plt.axvline(ci_lower, color='green', linestyle='--', linewidth=2, label=f'CI Lower: {ci_lower:.4f}')
plt.axvline(ci_upper, color='green', linestyle='--', linewidth=2, label=f'CI Upper: {ci_upper:.4f}')
plt.xlabel('Accuracy')
plt.ylabel('Frequency')
plt.title('Bootstrap Accuracy Distribution')
plt.legend()
plt.grid(alpha=0.3)
plt.show()

## Summary

In this notebook, we:
1. Trained multiple classification models
2. Evaluated performance metrics
3. Implemented confidence estimation techniques:
   - Prediction probabilities
   - Calibration curves
   - Calibrated predictions
   - Bootstrap confidence intervals

The best-performing model (chosen by ROC AUC on the test set) can now be used for diabetes prediction, with the techniques above providing confidence estimates for its predictions.