In [2]:

# ‚îÄ‚îÄ‚îÄ STEP 1: Load Diabetes Dataset ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (classification_report, confusion_matrix,
                              roc_auc_score, accuracy_score)
 
# Use sklearn's diabetes dataset (or load Pima dataset from file)
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
cols = ['Pregnancies','Glucose','BloodPressure','SkinThickness',
        'Insulin','BMI','DiabetesPedigree','Age','Diabetes']


In [5]:

# If no internet, create representative synthetic data:
np.random.seed(42)
n = 768
df = pd.DataFrame({
    'Glucose':          np.concatenate([np.random.normal(90, 15, 500),  np.random.normal(140, 25, 268)]),
    'BMI':              np.concatenate([np.random.normal(25, 4, 500),   np.random.normal(33, 6, 268)]),
    'Age':              np.concatenate([np.random.normal(28, 8, 500),   np.random.normal(36, 10, 268)]),
    'BloodPressure':    np.concatenate([np.random.normal(70, 10, 500),  np.random.normal(80, 12, 268)]),
    'Insulin':          np.concatenate([np.random.normal(80, 40, 500),  np.random.normal(160, 80, 268)]),
    'DiabetesPedigree': np.concatenate([np.random.normal(0.3, 0.2, 500), np.random.normal(0.6, 0.3, 268)]),
    'Diabetes':         [0]*500 + [1]*268
}).clip(0)
 
print(f"Dataset: {df.shape[0]} patients, {(df.Diabetes==1).sum()} diabetic")
print(df.groupby('Diabetes').mean().round(2))
 


Dataset: 768 patients, 268 diabetic
          Glucose    BMI    Age  BloodPressure  Insulin  DiabetesPedigree
Diabetes                                                                 
0           90.10  25.44  28.19          70.44    81.25              0.29
1          138.82  33.63  35.66          79.24   155.79              0.60


In [None]:
# ‚îÄ‚îÄ‚îÄ STEP 2: Handle Missing Values & Prepare Data ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# In clinical data, 0 values for Glucose/BMI/BP are physiologically impossible
# Replace with NaN, then impute with median (common in biomedical ML)

clinical_cols = ['Glucose', 'BloodPressure', 'BMI', 'Insulin']
for col in clinical_cols:
    if col in df.columns:
        df[col] = df[col].replace(0, np.nan)
        df[col].fillna(df[col].median(), inplace=True)
 
 
features = ['Glucose', 'BMI', 'Age', 'BloodPressure', 'Insulin', 'DiabetesPedigree']
X = df[features]
y = df['Diabetes']
 


In [6]:
X

Unnamed: 0,Glucose,BMI,Age,BloodPressure,Insulin,DiabetesPedigree
0,97.450712,22.492132,32.108801,89.956675,88.093152,0.177135
1,87.926035,32.249794,23.738393,101.099186,145.274276,0.158042
2,99.715328,27.831008,18.640665,76.067231,50.678680,0.495778
3,112.845448,22.750133,5.021902,68.168034,152.722477,0.025651
4,86.487699,27.529631,27.779881,75.345056,111.006220,0.621756
...,...,...,...,...,...,...
763,137.598503,22.389364,20.969197,80.220820,128.195379,0.993508
764,168.731833,34.939006,46.519476,82.703169,259.012301,0.272358
765,122.420589,32.114384,26.019394,88.312673,190.381411,0.523897
766,139.125288,30.203781,32.160287,64.768034,82.556339,0.496451


In [7]:
y

0      0
1      0
2      0
3      0
4      0
      ..
763    1
764    1
765    1
766    1
767    1
Name: Diabetes, Length: 768, dtype: int64

In [8]:
 
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [18]:

# ‚îÄ‚îÄ‚îÄ STEP 3: Train Decision Tree ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# Limit depth to prevent overfitting (clinically interpretable tree)
dt_model = DecisionTreeClassifier(max_depth=50, min_samples_split=10,
                                   random_state=42, class_weight='balanced')

dt_model.fit(X_train, y_train)

dt_pred = dt_model.predict(X_test)
dt_pred

array([0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1],
      dtype=int64)

In [20]:
print("=== Decision Tree Results ===")
print(f"Accuracy:    {accuracy_score(y_test, dt_pred):.3f}")
print(f"AUC-ROC:     {roc_auc_score(y_test, dt_model.predict_proba(X_test)[:,1]):.3f}")
print("\nFeature Importances (clinical relevance):")
for feat, imp in sorted(zip(features, dt_model.feature_importances_),
                         key=lambda x: x[1], reverse=True):
    bar = '‚ñà' * int(imp * 30)
    print(f"  {feat:<20}: {imp:.3f}  {bar}")

=== Decision Tree Results ===
Accuracy:    0.961
AUC-ROC:     0.960

Feature Importances (clinical relevance):
  Glucose             : 0.678  ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
  BMI                 : 0.184  ‚ñà‚ñà‚ñà‚ñà‚ñà
  DiabetesPedigree    : 0.052  ‚ñà
  Insulin             : 0.050  ‚ñà
  Age                 : 0.021  
  BloodPressure       : 0.015  


In [23]:
# ‚îÄ‚îÄ‚îÄ STEP 4: Train Random Forest ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
rf_model = RandomForestClassifier(
    n_estimators=200,        # 200 decision trees
    max_depth=10,
    min_samples_split=5,
    class_weight='balanced', # Important for imbalanced medical data!
    random_state=42,
    n_jobs=-1                # Use all CPU cores
)

rf_model.fit(X_train, y_train)
rf_pred  = rf_model.predict(X_test)
rf_proba = rf_model.predict_proba(X_test)[:,1]
 
print("=== Random Forest Results ===")
print(f"Accuracy:    {accuracy_score(y_test, rf_pred):.3f}")
print(f"AUC-ROC:     {roc_auc_score(y_test, rf_proba):.3f}")
print("\n" + classification_report(y_test, rf_pred,
      target_names=['Non-Diabetic', 'Diabetic']))


=== Random Forest Results ===
Accuracy:    0.981
AUC-ROC:     0.999

              precision    recall  f1-score   support

Non-Diabetic       0.98      0.99      0.99       100
    Diabetic       0.98      0.96      0.97        54

    accuracy                           0.98       154
   macro avg       0.98      0.98      0.98       154
weighted avg       0.98      0.98      0.98       154



In [26]:
# ‚îÄ‚îÄ‚îÄ STEP 5: Clinical Feature Importance Analysis ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
print("=== üî¨ Random Forest Feature Importance ===")
importance_df = pd.DataFrame({
    'Feature': features,
    'Importance': rf_model.feature_importances_
}).sort_values('Importance', ascending=False)
 
for _, row in importance_df.iterrows():
    bar = '‚ñà' * int(row.Importance * 40)
    print(f"  {row.Feature:<20}: {row.Importance:.3f}  {bar}")

=== üî¨ Random Forest Feature Importance ===
  Glucose             : 0.430  ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
  BMI                 : 0.211  ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
  DiabetesPedigree    : 0.141  ‚ñà‚ñà‚ñà‚ñà‚ñà
  Insulin             : 0.131  ‚ñà‚ñà‚ñà‚ñà‚ñà
  Age                 : 0.045  ‚ñà
  BloodPressure       : 0.042  ‚ñà


In [None]:
print("\nüîÆ Risk Prediction for 3 Patient Profiles:")
profiles = pd.DataFrame({
    'Glucose': [95, 155, 200],
    'BMI':     [22, 30, 38],
    'Age':     [25, 45, 55],
    'BloodPressure': [70, 82, 88],
    'Insulin': [85, 140, 250],
    'DiabetesPedigree': [0.2, 0.5, 0.9],
})

for i, (_, row) in enumerate(profiles.iterrows()):
    prob = rf_model.predict_proba(profiles.iloc[[i]])[0,1]
    risk = "HIGH RISK ‚ö†Ô∏è" if prob > 0.6 else "MODERATE ‚ö°" if prob > 0.3 else "LOW RISK ‚úì"
    print(f"  Patient {i+1}: Glucose={row.Glucose}, BMI={row.BMI:.0f}, Age={row.Age}"
          f"  ‚Üí Diabetes probability: {prob:.1%}  [{risk}]")



üîÆ Risk Prediction for 3 Patient Profiles:
  Patient 1: Glucose=95.0, BMI=22, Age=25.0  ‚Üí Diabetes probability: 0.3%  [LOW RISK ‚úì]
  Patient 2: Glucose=155.0, BMI=30, Age=45.0  ‚Üí Diabetes probability: 93.3%  [HIGH RISK ‚ö†Ô∏è]
  Patient 3: Glucose=200.0, BMI=38, Age=55.0  ‚Üí Diabetes probability: 99.5%  [HIGH RISK ‚ö†Ô∏è]


In [35]:

# ‚îÄ‚îÄ‚îÄ MODEL COMPARISON ON BIOMEDICAL DATA ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
 
# Load breast cancer dataset
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target
 
# Define models to compare
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'KNN (k=7)':           KNeighborsClassifier(n_neighbors=7),
    'Decision Tree':       DecisionTreeClassifier(max_depth=5, random_state=42),
    'Random Forest':       RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM':                 SVC(kernel='rbf', probability=True)
}
 
# Use stratified 10-fold cross-validation (standard in clinical ML)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
 
print("=== üè• Model Comparison: Breast Cancer Diagnosis ===")
print(f"{'Model':<25} {'Accuracy':>10} {'¬±Std':>8} {'AUC-ROC':>10} {'Sensitivity':>12}")
print("-" * 70)


=== üè• Model Comparison: Breast Cancer Diagnosis ===
Model                       Accuracy     ¬±Std    AUC-ROC  Sensitivity
----------------------------------------------------------------------


In [34]:
results = {}
for name, model in models.items():
    # Pipeline ensures scaling is done inside each fold (prevents data leakage!)
    pipe = Pipeline([('scaler', StandardScaler()), ('model', model)])
    scores = cross_validate(pipe, X, y, cv=cv,
                            scoring=['accuracy', 'roc_auc', 'recall'],
                            return_train_score=True)
    results[name] = scores
    print(f"{name:<25} {scores['test_accuracy'].mean():>10.3f} "
          f"{scores['test_accuracy'].std():>8.3f} "
          f"{scores['test_roc_auc'].mean():>10.3f} "
          f"{scores['test_recall'].mean():>12.3f}")

Logistic Regression            0.975    0.020      0.995        0.989
KNN (k=7)                      0.965    0.021      0.986        0.992
Decision Tree                  0.932    0.034      0.917        0.955
Random Forest                  0.956    0.024      0.989        0.969
SVM                            0.975    0.020      0.996        0.983


In [36]:
# ‚îÄ‚îÄ‚îÄ VISUALIZE AND INTERPRET RESULTS ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# Create comparison dataframe
comparison = pd.DataFrame({
    name: {
        'Mean Accuracy': scores['test_accuracy'].mean(),
        'Std Accuracy':  scores['test_accuracy'].std(),
        'AUC-ROC':       scores['test_roc_auc'].mean(),
        'Sensitivity':   scores['test_recall'].mean(),
        'Train Accuracy': scores['train_accuracy'].mean(),
    }
    for name, scores in results.items()
}).T.round(4)
 


In [37]:
print("\nüìä Overfitting Check (Train vs Test Accuracy):")
comparison['Overfit_Gap'] = comparison['Train Accuracy'] - comparison['Mean Accuracy']
print(comparison[['Mean Accuracy', 'Train Accuracy', 'Overfit_Gap']].round(4))
 



üìä Overfitting Check (Train vs Test Accuracy):
                     Mean Accuracy  Train Accuracy  Overfit_Gap
Logistic Regression         0.9754          0.9887       0.0133
KNN (k=7)                   0.9649          0.9721       0.0072
Decision Tree               0.9315          0.9924       0.0609
Random Forest               0.9561          1.0000       0.0439
SVM                         0.9754          0.9869       0.0115


In [38]:
print("\nüèÜ Best Model by AUC-ROC:")
best = comparison['AUC-ROC'].idxmax()
print(f"  ‚Üí {best}: AUC-ROC = {comparison.loc[best, 'AUC-ROC']:.3f}")
print(f"     Clinical interpretation: Excellent diagnostic discrimination")



üèÜ Best Model by AUC-ROC:
  ‚Üí SVM: AUC-ROC = 0.996
     Clinical interpretation: Excellent diagnostic discrimination


In [None]:
 
# 6.6 Hands-On Project 6: Hyperparameter Tuning for Diagnostic Models
# ‚îÄ‚îÄ‚îÄ GRID SEARCH FOR CLINICAL MODEL OPTIMIZATION ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
 
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
 
# Define hyperparameter search space
param_grid = {
    'model__n_estimators':    [50, 100, 200],
    'model__max_depth':       [5, 10, 15, None],
    'model__min_samples_split': [2, 5, 10],
    'model__class_weight':    ['balanced', None]
}
 
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier(random_state=42))
])
 
# Optimize for AUC-ROC (standard for clinical diagnostic models)
grid_search = GridSearchCV(
    pipe, param_grid, cv=5, scoring='roc_auc',
    n_jobs=-1, verbose=1
)
 
print("üîç Running Grid Search (optimizing AUC-ROC for cancer diagnosis)...")
grid_search.fit(X_train, y_train)
 
print(f"\n‚úÖ Best Parameters: {grid_search.best_params_}")
print(f"   Best CV AUC-ROC: {grid_search.best_score_:.4f}")
 
# Final evaluation on held-out test set
best_model = grid_search.best_estimator_
test_auc = roc_auc_score(y_test, best_model.predict_proba(X_test)[:,1])
print(f"\nüè• Final Test Performance:")
print(f"   AUC-ROC on held-out patients: {test_auc:.4f}")
print(classification_report(y_test, best_model.predict(X_test),
      target_names=['Malignant', 'Benign']))
 
