# 🩺 Breast Cancer Classification — Analysis Notebook

End-to-end analysis of the **Wisconsin Breast Cancer Dataset** using four ML models:
Logistic Regression, Random Forest, SVM, and XGBoost.

---

In [None]:
import sys
from pathlib import Path

# Resolve the project root and make it importable.
ROOT = Path.cwd()
if ROOT.name == "notebooks":
    # Launched from inside notebooks/ — the project root is one level up.
    ROOT = ROOT.parent
if sys.path.count(str(ROOT)) == 0:
    # Prepend so project-local packages win over same-named installed ones.
    sys.path.insert(0, str(ROOT))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer

# Notebook-wide display settings: pandas shows up to 35 columns before
# truncating, and every seaborn/matplotlib figure shares one theme.
pd.options.display.max_columns = 35
sns.set_theme(font_scale=1.1, palette="muted", style="whitegrid")

print(f"Project root: {ROOT}")

## 1 · Load Dataset

In [None]:
from data.loader import load_config, load_data, get_feature_importances

# Read the project configuration from config/config.yaml, then build the
# data bundle from it (scaling requested, CSV export switched off).
config = load_config(ROOT / "config" / "config.yaml")
bundle = load_data(config, export_csv=False, scale=True)

# Quick summary of what the loader returned.
print(
    f"Train: {bundle.X_train.shape}  |  Test: {bundle.X_test.shape}",
    f"Features: {len(bundle.feature_names)}",
    f"Classes: {bundle.target_names}",
    sep="\n",
)
bundle.X_train.head()

## 2 · Exploratory Data Analysis

In [None]:
# Raw (unscaled) data for EDA, loaded straight from scikit-learn so the
# plots below show features in their original units.
bunch = load_breast_cancer(as_frame=True)
X_raw = bunch.data
y_raw = bunch.target
# Plain list so class names can be looked up by integer label.
target_names = list(bunch.target_names)

# Per-feature summary statistics (one row per feature). Left as the cell's
# trailing expression so Jupyter renders it as a rich table instead of the
# plain-text dump produced by print().
X_raw.describe().T.round(2)

In [None]:
# Class distribution: one bar per class, annotated with its sample count.
counts = y_raw.value_counts().sort_index()
labels = [target_names[label_idx] for label_idx in counts.index]

fig, ax = plt.subplots(figsize=(6, 4))
bars = ax.bar(
    labels,
    counts.values,
    width=0.5,
    edgecolor="white",
    color=["#e74c3c", "#2ecc71"],
)
# Put each count just above the top of its bar, centred horizontally.
for bar, count in zip(bars, counts.values):
    bar_centre = bar.get_x() + bar.get_width() / 2
    label_height = bar.get_height() + 5
    ax.text(bar_centre, label_height, str(count),
            ha="center", fontweight="bold", fontsize=13)
ax.set_title("Class Distribution", fontsize=14, fontweight="bold")
ax.set_ylabel("Count")
sns.despine()
plt.tight_layout()
plt.show()

In [None]:
# Correlation heatmap — top 15 features
# NOTE(review): assumes get_feature_importances returns a pandas object
# indexed by feature name and ordered most-important first — confirm in
# data.loader. `importance` is reused by the box-plot cell below.
importance = get_feature_importances(X_raw, y_raw)
top15 = importance.head(15).index.tolist()
corr = X_raw[top15].corr()

fig, ax = plt.subplots(figsize=(10, 8))
# Mask the upper triangle (diagonal included) so each feature pair is
# drawn only once.
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, mask=mask, annot=True, fmt=".2f", cmap="RdBu_r",
            center=0, square=True, linewidths=0.5, ax=ax)
# Title uses a real em dash; the previous text was mis-encoded and rendered
# as garbage characters in the figure.
ax.set_title("Correlation Heatmap — Top 15 Features", fontsize=14, fontweight="bold")
plt.tight_layout()
plt.show()

In [None]:
# Box plots — top 5 discriminative features
# Relies on `importance` computed in the correlation-heatmap cell.
top5 = importance.head(5).index.tolist()
plot_df = X_raw[top5].copy()
# Replace the integer labels with readable class names for the x axis.
plot_df["class"] = y_raw.map(lambda v: target_names[v])

fig, axes = plt.subplots(1, 5, figsize=(20, 5))
palette = {"malignant": "#e74c3c", "benign": "#2ecc71"}
for ax, feat in zip(axes, top5):
    # seaborn >= 0.13 deprecates `palette` without `hue`, so the x variable
    # is also assigned to `hue`. dodge=False keeps each box centred on its
    # tick in older seaborn releases that would otherwise offset hue levels.
    sns.boxplot(data=plot_df, x="class", y=feat, hue="class",
                palette=palette, dodge=False, ax=ax, width=0.5)
    # The hue legend only repeats the x tick labels; drop it if seaborn
    # drew one (whether it does depends on the installed version).
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()
    ax.set_title(feat, fontsize=11, fontweight="bold")
    ax.set_xlabel("")
fig.suptitle("Top 5 Discriminative Features", fontsize=14, fontweight="bold", y=1.02)
plt.tight_layout()
plt.show()

## 3 · Model Training & Cross-Validation

In [None]:
from pipeline.train import train_all

# Run the project's training entry point with the loaded configuration.
# It returns two values: `models` and `cv_results`.
# NOTE(review): presumably `models` holds the fitted estimators and
# `cv_results` is a DataFrame of cross-validation scores — confirm in
# pipeline.train.
models, cv_results = train_all(config)
# Trailing expression: shown as the cell output, rounded to 4 decimals.
cv_results.round(4)

## 4 · Evaluation

In [None]:
from pipeline.evaluate import evaluate_all

# Run the project's evaluation entry point with the loaded configuration.
# `eval_results` is reused by the model-comparison chart at the end of the
# notebook, which reads its "model", "accuracy", "f1" and "roc_auc" columns.
# NOTE(review): presumably this also writes the ROC plot displayed in the
# next cell — confirm in pipeline.evaluate.
eval_results = evaluate_all(config)
# Trailing expression: shown as the cell output, rounded to 4 decimals.
eval_results.round(4)

In [None]:
# Display the saved ROC curve image, if it exists.
from IPython.display import Image, display

roc_path = ROOT / "outputs" / "plots" / "roc_curves.png"
# is_file() rather than exists(): a directory at this path would pass an
# existence check and then make Image() fail.
if roc_path.is_file():
    display(Image(filename=str(roc_path), width=600))
else:
    # Message uses a real em dash; the previous text was mis-encoded and
    # printed as garbage characters.
    print("ROC curve plot not found — run evaluate_all() first.")

## 5 · Prediction on New Samples

In [None]:
from pipeline.predict import predict

# Use the first sample from the original (unscaled) dataset.
# NOTE(review): the raw feature values are passed as-is; presumably
# predict() applies the same preprocessing used in training — confirm in
# pipeline.predict.
X_orig, y_orig = load_breast_cancer(return_X_y=True)
sample = X_orig[0].tolist()

# `target_names` comes from the EDA cell above.
print(f"True label: {target_names[y_orig[0]]}")
print()

# One line per model: predicted class plus both class probabilities.
# The separator is a real arrow; the previous text was mis-encoded and
# printed as garbage characters.
for model_name in ["logistic_regression", "random_forest", "svm", "xgboost"]:
    result = predict(model_name, sample, config)
    print(f"{model_name:25s} → {result['class_label']:10s}  "
          f"P(benign)={result['probability_benign']:.4f}  "
          f"P(malig.)={result['probability_malignant']:.4f}")

## 6 · Model Comparison Summary

In [None]:
# Visual comparison: grouped bars, one group per model, one bar per metric.
# Skipped entirely when evaluation produced no rows.
if not eval_results.empty:
    metrics_to_plot = ["accuracy", "f1", "roc_auc"]
    colours = ["#3498db", "#e74c3c", "#2ecc71"]
    width = 0.25

    fig, ax = plt.subplots(figsize=(10, 5))
    x = np.arange(len(eval_results))

    for i, (metric, colour) in enumerate(zip(metrics_to_plot, colours)):
        ax.bar(x + i * width, eval_results[metric], width,
               label=metric.upper(), color=colour, edgecolor="white")

    # Centre each tick under its group for any number of metrics
    # (for three metrics this equals the original x + width).
    ax.set_xticks(x + width * (len(metrics_to_plot) - 1) / 2)
    ax.set_xticklabels(eval_results["model"], fontsize=11)

    # Zoom in on the 0.90–1.00 band by default, but lower the floor when a
    # model scores below it; a fixed 0.90 floor would hide that bar. A NaN
    # minimum falls back to 0.90 because min() keeps its first argument
    # when the comparison with NaN is false.
    lowest_score = float(eval_results[metrics_to_plot].min().min())
    y_floor = max(0.0, min(0.90, lowest_score - 0.02))
    ax.set_ylim(y_floor, 1.01)

    ax.set_ylabel("Score")
    ax.set_title("Model Comparison", fontsize=14, fontweight="bold")
    ax.legend()
    sns.despine()
    plt.tight_layout()
    plt.show()

---

**End of analysis.** All trained models, plots and reports are saved under `outputs/`.