# Baseline Supervised Classifier — Exploration

Lightweight notebook for interactive exploration.  
All reusable logic lives in `src/`. This notebook only does:

1. Config loading
2. Feature preparation
3. Training (calls `src.models.train`)
4. Evaluation
5. **Visualization** (the only unique content)

In [1]:
import sys
from pathlib import Path
# pyvips is treated as optional: a failed import is recorded in _HAS_PYVIPS
# instead of aborting the notebook.
try:
    import pyvips  # type: ignore
except Exception:
    # NOTE(review): deliberately broader than ImportError — presumably because a
    # missing native library can surface as a different exception type at
    # import time; confirm before narrowing.
    pyvips = None
    _HAS_PYVIPS = False
else:
    _HAS_PYVIPS = True

# Resolved relative to the kernel's current working directory, so this assumes
# the notebook is run from a directory directly below the project root.
PROJECT_ROOT = Path("..").resolve()

# Keep the project root first on sys.path so `src.*` imports resolve, but do not
# prepend another copy every time this cell is re-run.
if sys.path[:1] != [str(PROJECT_ROOT)]:
    sys.path.insert(0, str(PROJECT_ROOT))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

from src.config import load_paths, load_params
from src.data.features import prepare_features
from src.models.train import cross_validate, predict_external
from src.evaluation.grading import agg_from_tiles
from src.evaluation.metrics import evaluate_slide_predictions


In [2]:
# NOTE(review): this import lives outside the notebook's import cell; `run` is
# also used by the following cell's loop.
from viz.overlay_thumbnail_masks import run

# Inputs for the first slide: the image, its mask directory, and the directory
# passed to `run` as `out_dir` (reused by the next cell).
image_path = Path("../tmp/Subset3_Train_1_Akoya.tiff")
mask_dir = Path("../tmp/Subset3_Train_1_Akoya")
out_dir = Path("../tmp/img_test_outputs")

# `run` returns a pair, bound here to the thumbnail and overlay paths.
thumb_path, overlay_path = run(
    image_path=image_path,
    mask_dir=mask_dir,
    scale=0.02,
    out_dir=out_dir,
    alpha=0.35,
    prefer_backend="auto",
    with_legend=True,
)


In [3]:
# Repeat the overlay call for slides 2-4 with the same settings as slide 1.
# Relies on `run` and `out_dir` defined in the previous cell; the loop rebinds
# image_path / mask_dir / thumb_path / overlay_path on every iteration, so only
# the values for the last slide remain afterwards.
for image_num in (2, 3, 4):
    image_path = Path(f"../tmp/Subset3_Train_{image_num}_Akoya.tiff")
    mask_dir = Path(f"../tmp/Subset3_Train_{image_num}_Akoya")
    thumb_path, overlay_path = run(
        image_path=image_path,
        mask_dir=mask_dir,
        scale=0.02,
        out_dir=out_dir,
        alpha=0.35,
        prefer_backend="auto",
        with_legend=True,
    )

In [None]:
# --- Configuration ---
# Environment name handed to load_paths; change to "local" for local development.
ENV = "nscc"
# Key into params["models"]: "LR_SGD", "XGB", or "MLP".
MODEL = "XGB"
# Flags forwarded to prepare_features as combine_2048 / l2_norm.
COMBINE_2048 = True
L2_NORM = True

# Paths depend on ENV; params are loaded without arguments.
paths, params = load_paths(ENV), load_params()
print("work_root:", paths["work_root"])

In [None]:
# --- Prepare features ---
# Normalisation mode comes from the training params, defaulting to "multiscale"
# when the key is absent.
norm_mode = params["training"].get("norm_mode", "multiscale")

# prepare_features returns one feature set per cohort (AGGC first, then TCGA).
aggc_feats, tcga_feats = prepare_features(
    paths["features"], combine_2048=COMBINE_2048, l2_norm=L2_NORM, norm_mode=norm_mode
)
print(f"AGGC: {aggc_feats.shape}, TCGA: {tcga_feats.shape}")

In [None]:
# --- Load metadata ---
# AGGC tile index plus the per-tile target matrix stored alongside it.
df_aggc_idx = pd.read_csv(paths["features"]["aggc_1024"]["index_csv"])

# np.load on an archive keeps a file handle open until it is closed, so read the
# array inside a context manager. `tile_npz` stays bound afterwards but is
# closed; only `Y` (already materialised) should be used from here on.
with np.load(paths["features"]["aggc_1024"]["tile_npz"]) as tile_npz:
    Y = tile_npz["targets"]

# Majority class per tile = column index of the largest target value.
y_major = Y.argmax(axis=1)
print("y_major distribution:", np.bincount(y_major))

df_tcga_idx = pd.read_csv(paths["features"]["tcga_1024"]["index_csv"])

# TCGA ground truth, optionally restricted to rows where filter_col == filter_val.
truth_tcga_df = pd.read_csv(paths["metadata"]["tcga_truth"])
filter_col = params["evaluation"].get("tcga_filter_col")
filter_val = params["evaluation"].get("tcga_filter_val")
if filter_col:
    # NOTE(review): if tcga_filter_col is configured but tcga_filter_val is
    # missing, filter_val is None and this comparison likely matches no rows —
    # confirm the config always provides both.
    truth_tcga_df = truth_tcga_df[truth_tcga_df[filter_col] == filter_val]

truth_aggc_df = pd.read_csv(paths["metadata"]["aggc_meta"])

In [None]:
# --- Train with cross-validation ---
# Hyper-parameters for the model selected in the configuration cell.
model_params = params["models"][MODEL]

# Fold count, seed and split level all come from params["training"];
# split_level falls back to "slide" when not configured.
cv_result = cross_validate(
    aggc_feats,
    y_major,
    df_aggc_idx,
    MODEL,
    model_params,
    n_splits=params["training"]["n_splits"],
    seed=params["training"]["seed"],
    split_level=params["training"].get("split_level", "slide"),
)

# Tile-level metrics reported by cross_validate, printed to four decimals.
m = cv_result["tile_metrics"]
print(f"OOF accuracy     = {m['acc']:.4f}")
print(f"OOF balanced acc = {m['bacc']:.4f}")
print(f"OOF macro AUC    = {m['auc_macro']:.4f}")
print(f"OOF macro F1     = {m['f1_macro']:.4f}")
print(f"OOF weighted F1  = {m['f1_weighted']:.4f}")

In [None]:
# --- Confusion matrix (AGGC OOF) ---
target_names = params["grading"]["target_names"]

# Pin the label set to one index per target name. Without `labels`, a class that
# appears in neither the truth nor the predictions is dropped from the matrix,
# which then no longer lines up with `display_labels=target_names`.
# Assumes class indices in y_major follow the order of target_names, as the
# positional display labels already imply.
cm = confusion_matrix(
    y_major,
    cv_result["oof_pred_labels"],
    labels=list(range(len(target_names))),
)

fig, ax = plt.subplots(figsize=(5, 4))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=target_names)
disp.plot(cmap=plt.cm.Blues, ax=ax)
# Title and layout go through the explicit ax/fig rather than pyplot's
# "current axes" state.
ax.set_title("Confusion Matrix: Tiles AGGC (OOF)")
fig.tight_layout()
plt.show()

In [None]:
# --- Predict on TCGA and evaluate ---
# Apply the models returned by cross-validation to the TCGA features.
tcga_proba_mean, tcga_pred_labels = predict_external(cv_result["models"], tcga_feats)

# Aggregate tile predictions using each cohort's tile index: external
# predictions for TCGA, cross-validation predictions for AGGC.
out_tcga = agg_from_tiles(df_tcga_idx, tcga_pred_labels)
out_aggc = agg_from_tiles(df_aggc_idx, cv_result["oof_pred_labels"])

# Compare each aggregated result against its truth table.
print("--- TCGA slide-level ---")
results_tcga = evaluate_slide_predictions(out_tcga, truth_tcga_df)

print("\n--- AGGC slide-level (OOF) ---")
results_aggc = evaluate_slide_predictions(out_aggc, truth_aggc_df)