# Corrected Diabetes Modeling Notebook

This notebook is a cleaned and corrected version of the uploaded notebook. Main fixes applied:

- Missing imports added
- Scaling and preprocessing occur _after_ train/test split using `ColumnTransformer` + `Pipeline` (no data leakage)
- `stratify` used in train/test split
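- Avoided SMOTE on one-hot encoded features; used `class_weight='balanced'` for the logistic-regression and random-forest classifiers instead (the gradient-boosting model is trained without class re-weighting; SMOTENC is a possible alternative but is not included in the cells below)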
- Correct ROC AUC computation using probabilities
- Saved the full pipeline (preprocessing + model) for inference
- Fixed undefined variables like `numeric_features`

You can run the cells top-to-bottom. Adjust hyperparameter grids as needed.

In [None]:

# Imports and settings
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, roc_curve, confusion_matrix, classification_report)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings('ignore')

# display settings
%matplotlib inline


In [None]:

# Read the raw dataset into `df`. The path is relative to the notebook's
# working directory -- edit `csv_path` if the file lives elsewhere.
csv_path = 'data/diabetes_prediction_dataset.csv'
df = pd.read_csv(csv_path)

print('Original shape:', df.shape)
df.head()


In [None]:

# Basic cleaning / feature engineering (non-leaky)
# Every step below is computed row by row and uses no statistic of the whole
# dataset, so doing it before the train/test split leaks nothing.

# Remove 'Other' gender and map to numeric (Male=1, Female=0)
df = df[df['gender'] != 'Other'].copy()
if not pd.api.types.is_numeric_dtype(df['gender']):
    # Guard: mapping an already-encoded 1/0 column would yield NaN for every
    # row, so encode only while the column still holds string labels. This
    # keeps the cell safe to re-run without restarting from the load cell.
    df['gender'] = df['gender'].map({'Male': 1, 'Female': 0})

# Replace 'No Info' with 'unknown' in smoking_history
df['smoking_history'] = df['smoking_history'].replace('No Info', 'unknown')

# Keep raw copies (for reference only) but do not use raw columns directly in model pipeline below
df['age_raw'] = df['age']
df['bmi_raw'] = df['bmi']
df['glucose_raw'] = df['blood_glucose_level']
df['HbA1c_raw'] = df['HbA1c_level']

# Derived features (these are okay -- they are deterministic transformations of existing features)
# Zero denominators are turned into NaN first so the division cannot produce inf.
df['bmi_per_age'] = df['bmi_raw'] / (df['age_raw'].replace(0, np.nan))
df['glucose_HbA1c_ratio'] = df['glucose_raw'] / (df['HbA1c_raw'].replace(0, np.nan))
df['is_elderly'] = (df['age_raw'] > 60).astype(int)
df['is_obese'] = (df['bmi_raw'] >= 30).astype(int)

# Fill any infinite / NaN results from the divisions with 0.
# Restricted to the two ratio columns on purpose: a frame-wide fillna(0)
# would also impute 0 for any missing raw value (bmi, gender, ...) without
# any warning.
ratio_cols = ['bmi_per_age', 'glucose_HbA1c_ratio']
df[ratio_cols] = df[ratio_cols].replace([np.inf, -np.inf], np.nan).fillna(0)

print('After basic cleaning:', df.shape)
df.head()


In [None]:

# Model inputs, grouped by how they are preprocessed later:
# numeric columns are standardised, categorical columns are one-hot encoded.
numeric_features = [
    'age',
    'bmi',
    'HbA1c_level',
    'blood_glucose_level',
    'bmi_per_age',
    'glucose_HbA1c_ratio',
]
categorical_features = [
    'smoking_history',
    'gender',
    'hypertension',
    'heart_disease',
    'is_elderly',
    'is_obese',
]

# Feature matrix (numeric columns first, then categorical) and integer target.
X = df.loc[:, numeric_features + categorical_features].copy()
y = df['diabetes'].astype(int).to_numpy()

# 80/20 hold-out split; stratifying on y keeps the class ratio equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=11,
    stratify=y,
)

print('Train shape:', X_train.shape, 'Test shape:', X_test.shape)


In [None]:

# Preprocessing: scale numeric, one-hot encode categorical
# Dense output is requested on the ColumnTransformer (sparse_threshold=0)
# rather than on the encoder: OneHotEncoder's `sparse` keyword was renamed to
# `sparse_output` in scikit-learn 1.2 and removed in 1.4, so either spelling
# raises TypeError on some versions. sparse_threshold=0 always returns a dense
# array and is accepted by old and new releases alike.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        # handle_unknown='ignore': categories unseen during fit encode as all zeros
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
    sparse_threshold=0,
)

# Baseline model: preprocessing + logistic regression in a single Pipeline, so
# the scaler/encoder are fitted on the training rows only.
# class_weight='balanced' counters the class imbalance without resampling
# (avoids SMOTE pitfalls).
pipe = Pipeline(steps=[
    ('preproc', preprocessor),
    (
        'clf',
        LogisticRegression(
            solver='liblinear',
            max_iter=1000,
            class_weight='balanced',
            random_state=42,
        ),
    ),
])
pipe.fit(X_train, y_train)

# Positive-class probabilities on the test set, thresholded at 0.5 for labels.
y_proba = pipe.predict_proba(X_test)[:, 1]
y_pred = (y_proba > 0.5).astype(int)

# Label-based metrics use y_pred; ROC AUC is computed from the probabilities.
print('Classification report:')
print(classification_report(y_test, y_pred))
print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))
print('ROC AUC:', roc_auc_score(y_test, y_proba))


In [None]:

def print_metrics(y_true, y_pred, y_proba=None):
    """Print classification metrics and, optionally, plot the ROC curve.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted hard labels, compared against ``y_true`` for accuracy,
        precision, recall and F1 (``zero_division=0`` is used, so an
        undefined score is reported as 0 instead of emitting a warning).
    y_proba : array-like, optional
        Positive-class scores/probabilities. When provided, the ROC AUC is
        printed and the ROC curve is drawn; when ``None`` both are skipped.

    Returns
    -------
    None
        Results are printed and plotted only.
    """
    print('Accuracy:', accuracy_score(y_true, y_pred))
    print('Precision:', precision_score(y_true, y_pred, zero_division=0))
    print('Recall:', recall_score(y_true, y_pred, zero_division=0))
    print('F1:', f1_score(y_true, y_pred, zero_division=0))
    if y_proba is None:
        return

    # Compute the AUC once and reuse it for both the printout and the legend.
    auc_value = roc_auc_score(y_true, y_proba)
    print('ROC AUC:', auc_value)

    fpr, tpr, _ = roc_curve(y_true, y_proba)
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.plot(fpr, tpr, label=f'ROC (AUC={auc_value:.3f})')
    ax.plot([0, 1], [0, 1], '--')  # diagonal = chance-level reference
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('ROC curve')
    ax.legend()
    plt.show()

# Report the test-set metrics (and ROC curve) for the current y_pred / y_proba.
print_metrics(y_test, y_pred, y_proba)


In [None]:

# Illustrative hyperparameter search over the logistic-regression step.
# The grid is deliberately tiny to keep the run short; keys use the
# `<step>__<param>` convention to reach into the pipeline.
param_grid = dict(
    clf__C=[0.01, 0.1, 1],
    clf__penalty=['l1', 'l2'],
    clf__solver=['liblinear'],
)

# 3-fold CV on the training data only, selecting by F1; the whole pipeline is
# refitted inside each fold, so preprocessing never sees the validation rows.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='f1',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_train, y_train)

best_pipe = grid.best_estimator_
print('Best params:', grid.best_params_)

# Score the tuned pipeline on the held-out test set (0.5 probability threshold).
y_proba_gs = best_pipe.predict_proba(X_test)[:, 1]
y_pred_gs = (y_proba_gs > 0.5).astype(int)
print_metrics(y_test, y_pred_gs, y_proba_gs)


In [None]:

# Train RandomForest and GradientBoosting as examples
# Each pipeline gets its own clone of the preprocessor. Pipeline.fit fits its
# steps in place, so putting the same ColumnTransformer instance into several
# pipelines would make all of them share (and overwrite) one fitted transformer.
rf_pipe = Pipeline([
    ('preproc', clone(preprocessor)),
    ('clf', RandomForestClassifier(n_estimators=200, class_weight='balanced', random_state=42))
])
rf_pipe.fit(X_train, y_train)
y_proba_rf = rf_pipe.predict_proba(X_test)[:,1]
y_pred_rf = (y_proba_rf > 0.5).astype(int)
print('\nRandom Forest metrics:')
print_metrics(y_test, y_pred_rf, y_proba_rf)

# NOTE: no class_weight is passed here (GradientBoostingClassifier does not
# take one), so this model is trained without class re-weighting, unlike the
# logistic-regression and random-forest pipelines.
gb_pipe = Pipeline([
    ('preproc', clone(preprocessor)),
    ('clf', GradientBoostingClassifier(n_estimators=200, random_state=42))
])
gb_pipe.fit(X_train, y_train)
y_proba_gb = gb_pipe.predict_proba(X_test)[:,1]
y_pred_gb = (y_proba_gb > 0.5).astype(int)
print('\nGradient Boosting metrics:')
print_metrics(y_test, y_pred_gb, y_proba_gb)


In [None]:

# Persist the full fitted pipeline (preprocessing + classifier) for inference.
# The grid-search winner is preferred; the baseline `pipe` is the fallback
# when no `grid` object exists in this session.
if 'grid' in globals():
    pipeline_to_save = grid.best_estimator_
else:
    pipeline_to_save = pipe

joblib.dump(pipeline_to_save, 'diabetes_pipeline.joblib')
print('Saved pipeline to diabetes_pipeline.joblib')


In [None]:

# Round-trip check: reload the saved pipeline and score one test row.
# The row holds unscaled feature values; scaling and encoding are applied by
# the preprocessing step inside the loaded pipeline.
# NOTE: joblib.load unpickles its input -- only load files from a trusted source.
loaded = joblib.load('diabetes_pipeline.joblib')

sample = X_test.head(1)
print('Sample raw values:\n', sample.to_dict(orient='records'))
print('Pred proba:', loaded.predict_proba(sample)[:, 1])
print('Pred label:', loaded.predict(sample))
