In [None]:
# Import libraries
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (classification_report, roc_auc_score, roc_curve, 
                            precision_recall_curve, average_precision_score, 
                            confusion_matrix, ConfusionMatrixDisplay)
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Fetch the breast-cancer dataset that ships with scikit-learn and pull out
# the feature matrix and the label vector.
data = load_breast_cancer()
X = data.data
y = data.target

# Hold out 30% of the samples for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [None]:
# Baseline classifier for the demonstration: a random forest with the
# library's default hyperparameters. Only the seed is set, so repeated runs
# build the same ensemble.
model = RandomForestClassifier(
    random_state=42,
)

In [None]:
# ======================
# 1. Cross-Validation
# ======================
print("\n=== Cross-Validation ===")

# Five shuffled, class-stratified folds drawn from the training partition
# only; the test partition is not touched here.
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# One accuracy value per fold.
cv_scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=stratified_kfold,
)

# Report the per-fold values, then their mean and standard deviation.
print(f"Fold Accuracies: {cv_scores}")
print(f"Mean CV Accuracy: {np.mean(cv_scores):.4f} (±{np.std(cv_scores):.4f})")

In [None]:
# ======================
# 2. Model Training & Evaluation
# ======================
# Fit on the whole training partition.
model.fit(X_train, y_train)

# Hard label predictions for the held-out samples.
y_pred = model.predict(X_test)

# Per-class probabilities for the held-out samples; column index 1 is kept
# as the positive-class score consumed by the ROC / precision-recall cell.
class_probabilities = model.predict_proba(X_test)
y_prob = class_probabilities[:, 1]

In [None]:
# Classification Report
# Per-class precision / recall / F1 of the hard predictions on the test set.
print("\n=== Classification Report ===")
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)

In [None]:
# Confusion Matrix
print("\n=== Confusion Matrix ===")

# Take the class names from the dataset itself instead of hard-coding them.
# In load_breast_cancer, label 0 is "malignant" and label 1 is "benign", so a
# hand-written ['Benign', 'Malignant'] list puts every name on the wrong
# row/column of the matrix.
class_names = [str(name).capitalize() for name in data.target_names]

# Pin the row/column order explicitly so index i of the matrix always lines
# up with class_names[i], even if a class were missing from y_test.
class_ids = list(range(len(class_names)))
cm = confusion_matrix(y_test, y_pred, labels=class_ids)

cm_display = ConfusionMatrixDisplay(cm, display_labels=class_names)
cm_display.plot()
# Title the axes the display drew on rather than relying on pyplot's
# "current axes" state.
cm_display.ax_.set_title("Confusion Matrix")
plt.show()

In [None]:
# ======================
# 3. ROC & Precision-Recall Curves
# ======================
# Curve coordinates and summary scores, all computed from the held-out
# labels and the positive-class probabilities.
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = roc_auc_score(y_test, y_prob)
precision, recall, _ = precision_recall_curve(y_test, y_prob)
avg_precision = average_precision_score(y_test, y_prob)

# One figure with two side-by-side panels: ROC on the left, PR on the right.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

# Left panel: ROC curve plus the diagonal chance line for reference.
ax1.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}')
ax1.plot([0, 1], [0, 1], 'k--')
ax1.set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curve',
)
ax1.legend(loc='lower right')
ax1.grid()

# Right panel: precision as a function of recall.
ax2.plot(recall, precision, label=f'AP = {avg_precision:.2f}')
ax2.set(
    xlabel='Recall',
    ylabel='Precision',
    title='Precision-Recall Curve',
)
ax2.legend(loc='upper right')
ax2.grid()

plt.tight_layout()
plt.show()

In [None]:
# ======================
# 4. Bias-Variance Analysis (Bonus)
# ======================
# Sweep the maximum tree depth of the forest and compare train vs. test
# accuracy at each setting. NOTE: the test partition is used here only to
# illustrate the trend; it should not be used to pick a depth.
print("\n=== Bias-Variance Analysis ===")
max_depths = [1, 3, 5, 10, 20, None]
train_scores, test_scores = [], []

for depth in max_depths:
    # Use a dedicated name for the per-depth estimator so the notebook-level
    # `model` fitted in the evaluation section is not overwritten; re-running
    # earlier cells after this one then still sees the baseline model.
    depth_model = RandomForestClassifier(max_depth=depth, random_state=42)
    depth_model.fit(X_train, y_train)
    train_scores.append(depth_model.score(X_train, y_train))
    test_scores.append(depth_model.score(X_test, y_test))

# max_depth=None (unlimited depth) has no numeric x coordinate, so plot every
# setting at an evenly spaced position and label the ticks with the depth
# value. This keeps the unlimited-depth result on the chart instead of
# training that model and then discarding its scores.
positions = list(range(len(max_depths)))
depth_labels = [str(depth) for depth in max_depths]

# Depth highlighted by the dashed vertical guide line.
reference_depth = 5

bv_fig, bv_ax = plt.subplots(figsize=(8, 5))
bv_ax.plot(positions, train_scores, 'o-', label='Train Accuracy')
bv_ax.plot(positions, test_scores, 'o-', label='Test Accuracy')
bv_ax.axvline(x=max_depths.index(reference_depth), color='gray',
              linestyle='--', alpha=0.5)
bv_ax.set_xticks(positions)
bv_ax.set_xticklabels(depth_labels)
bv_ax.set_xlabel('Max Tree Depth (Complexity)')
bv_ax.set_ylabel('Accuracy')
bv_ax.set_title('Bias-Variance Tradeoff')
bv_ax.legend()
bv_ax.grid()
plt.show()