# Kernel Model (structured)
Refactored notebook that uses the modular `kernel_model` package instead of defining everything inline.
Set the config values (selector = MMR or abess, optional subsampling), then either run the full pipeline or use the cached plotting cells below.

In [5]:
from pathlib import Path
from kernel_model.config import (
    KernelBankConfig,
    PatchExtractionConfig,
    PipelineConfig,
    SelectionConfig,
    SubsetSearchConfig,
    TrainingConfig,
)
from kernel_model.pipeline import run_pipeline
from kernel_model.models import fit_subset_model, find_best_low_corr_pair
from kernel_model.selection import build_feature_matrix, rank_kernels
from kernel_model.plots import plot_2d_scatter, plot_3d_scatter


## Configure the pipeline
Adjust any defaults. By default patch extraction is skipped (expects patches under `data/patches`).
Set `method='abess'` to use abess selection (requires `pip install abess`).
Use `train_subset_frac` or `train_subset_size` to downsample for quicker experiments.

In [6]:
# Single configuration object consumed by `run_pipeline(cfg)` and by the cached
# plotting cells further down. Only the fields a reader is likely to tune are
# spelled out here; everything else keeps the defaults declared in
# `kernel_model.config`.
cfg = PipelineConfig(
    # Patch extraction: locations of the raw TIFF images/annotations and where
    # extracted patches are written.
    data=PatchExtractionConfig(
        healthy_dir=Path('data/TIFF Images/Normal'),
        malignant_dir=Path('data/TIFF Images/Malignant'),
        annotation_dir=Path('data/TIFF Images/malignantAnnotation'),
        output_dir=Path('data/patches'),
        patch_size=128,
        n_inside_per_image=10,
        n_outside_per_image=10,
        red_threshold=150,
        max_tries=250,
        min_nonzero_frac=0.35,
        run_extraction=False,  # set True to regenerate patches
    ),
    # Kernel bank: library defaults are used unchanged.
    bank=KernelBankConfig(),
    # Kernel selection strategy and sizes.
    selection=SelectionConfig(
        method='mmr',  # or 'abess' (requires `pip install abess`)
        topM=300,
        K=10,
    ),
    # Optional down-sampling of the training set for quicker experiments.
    training=TrainingConfig(
        train_subset_frac=None,  # e.g., 0.25 to use 25% of data
        train_subset_size=None,  # or a fixed count (overrides frac if set)
    ),
    subset=SubsetSearchConfig(
        # NOTE(review): with a threshold of 0.0 presumably no pair can qualify,
        # so the "fallback" branch reported by the cached-plot cell would always
        # be taken -- confirm this is intended.
        pair_corr_threshold=0.0,  # max |corr| allowed when searching best pair
    ),
    data_root=Path('data/patches'),
    out_dir=Path('results'),
    max_per_class=3000,
    resize_patch_size=128,
)
# Display the fully resolved configuration (including defaulted fields).
cfg


PipelineConfig(data=PatchExtractionConfig(healthy_dir=PosixPath('data/TIFF Images/Normal'), malignant_dir=PosixPath('data/TIFF Images/Malignant'), annotation_dir=PosixPath('data/TIFF Images/malignantAnnotation'), output_dir=PosixPath('data/patches'), patch_size=128, n_inside_per_image=10, n_outside_per_image=10, red_threshold=150, max_tries=250, min_pos_coverage=0.5, max_neg_coverage=0.05, near_neg_fraction=0.5, near_neg_radius=None, far_neg_radius=None, use_bbox_for_positives=True, min_nonzero_frac=0.35, min_intensity_rel=0.0, split_patients=True, train_frac=0.7, val_frac=0.15, test_frac=0.15, split_seed=42, run_extraction=False), bank=KernelBankConfig(families=('gaussian', 'anisotropic_gaussian', 'dog', 'log', 'gabor'), n_per_family=200, kernel_size=31), selection=SelectionConfig(method='mmr', response_fn='mean_abs', topM=300, K=10, lambda_mm=0.75, plot_top_kernels=20), training=TrainingConfig(epochs=60, batch_size=64, lr=0.0005, model_type='mlp', hidden_dims=(64, 32), dropout=0.2, s

## Run end-to-end
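This mirrors the original notebook flow (patch load/extraction, kernel bank, responses, selection, training, plots).

**Note:** the output recorded below comes from an earlier run with a different configuration (`method='abess'`, `topM=200`, `K=20`) than the one shown in the configuration cell above. Re-run this cell after changing the config so the log and the returned result match it.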

In [3]:
# Run the whole pipeline with the configuration defined above. This is the
# expensive step of the notebook (the recorded run below took a little over two
# minutes on CUDA). The returned mapping is kept in `result` because the cached
# plotting cell later looks up `result.get('low_corr_pair')` when it is available.
result = run_pipeline(cfg)
# Echo the returned mapping as the cell output.
result

2025-12-22 02:31:51,266 [INFO] Starting pipeline. Output: results/exp_007 Device: cuda Selection: abess


2025-12-22 02:31:52,118 [INFO] Loaded train split: Xin=860 Xout=880 resize=128 (patients in=86 out=88)
2025-12-22 02:31:52,336 [INFO] Loaded val split: Xin=190 Xout=190 resize=128 (patients in=19 out=19)
2025-12-22 02:31:52,550 [INFO] Loaded test split: Xin=180 Xout=180 resize=128 (patients in=18 out=18)
2025-12-22 02:31:52,628 [INFO] Built kernel bank: 1000 kernels
Kernels: 100%|██████████| 1000/1000 [01:32<00:00, 10.80it/s]
2025-12-22 02:33:26,556 [INFO] Patient-level grouping enabled (88 unique ids)
2025-12-22 02:33:26,583 [INFO] abess selected kernels (candidate idx): [113, 128, 131, 134, 138, 140, 147, 149, 155, 164, 175, 178, 179, 180, 181, 182, 192, 193, 197, 198]
2025-12-22 02:33:29,542 [INFO] Classifier results: AUC=0.8556 ACC=0.7722
Kernels: 100%|██████████| 1000/1000 [00:20<00:00, 47.84it/s]
2025-12-22 02:33:50,936 [INFO] Eval val: AUC=0.9149 ACC=0.8368
Kernels: 100%|██████████| 1000/1000 [00:19<00:00, 50.21it/s]
2025-12-22 02:34:10,876 [INFO] Eval test: AUC=0.8583 ACC=0.775

{'config': PipelineConfig(data=PatchExtractionConfig(healthy_dir=PosixPath('data/TIFF Images/Normal'), malignant_dir=PosixPath('data/TIFF Images/Malignant'), annotation_dir=PosixPath('data/TIFF Images/malignantAnnotation'), output_dir=PosixPath('data/patches'), patch_size=128, n_inside_per_image=10, n_outside_per_image=10, red_threshold=150, max_tries=250, min_pos_coverage=0.5, max_neg_coverage=0.05, near_neg_fraction=0.5, near_neg_radius=None, far_neg_radius=None, use_bbox_for_positives=True, min_nonzero_frac=0.35, min_intensity_rel=0.0, split_patients=True, train_frac=0.7, val_frac=0.15, test_frac=0.15, split_seed=42, run_extraction=False), bank=KernelBankConfig(families=('gaussian', 'anisotropic_gaussian', 'dog', 'log', 'gabor'), n_per_family=200, kernel_size=31), selection=SelectionConfig(method='abess', response_fn='mean_abs', topM=200, K=20, lambda_mm=0.75, plot_top_kernels=20), training=TrainingConfig(epochs=60, batch_size=64, lr=0.0005, model_type='mlp', hidden_dims=(64, 32), d

## Re-plot from cached features
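Use the `feature_cache.npz` file stored in an experiment's output directory (e.g. `results/exp_008/feature_cache.npz`) to redraw the best pair/triple without recomputing the pipeline. The first cell below points `cfg.out_dir` at the experiment to re-plot; the second cell reuses that setting.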

In [7]:
from pathlib import Path
from kernel_model.plots import plot_3d_scatter_interactive
import numpy as np

# Point the config at the experiment whose cached features should be re-plotted.
# The path is relative to the project root (like every other path in `cfg`) so
# the notebook is not tied to one user's home directory. The following cell
# reads `cfg.out_dir` too, so change the experiment name here only.
cfg.out_dir = Path("results") / "exp_008"

# NOTE(security): `allow_pickle=True` is needed because `subset_results` is
# stored as a pickled Python object. Unpickling can execute arbitrary code, so
# only load caches produced by your own pipeline runs.
# The context manager closes the underlying .npz archive once the arrays have
# been read into memory.
with np.load(cfg.out_dir / "feature_cache.npz", allow_pickle=True) as cache:
    X_candidates = cache["X_candidates"]
    y_labels = cache["y_labels"]
    # Stored as a 0-d object array; `.item()` unwraps the original mapping.
    subset_results = cache["subset_results"].item()
    candidate_kernel_idxs = cache["candidate_kernel_idxs"]

if 3 in subset_results:
    triple = subset_results[3]
    # `triple["subset"]` holds column positions in X_candidates; translate them
    # to the corresponding kernel indices for labelling.
    triple_kernel_idxs = [candidate_kernel_idxs[i] for i in triple["subset"]]
    interactive_html = cfg.out_dir / "scatter_best_triple_interactive_CancerProjected.html"
    plot_3d_scatter_interactive(
        X_candidates,
        y_labels,
        triple["subset"],
        triple_kernel_idxs,
        interactive_html,
        project_cancer_to_plane=True,
        project_healthy_to_plane=True,
    )
    # Report the file that was actually written (the previous message named a
    # different file than the one passed to the plotting function).
    print(f"Wrote interactive plot to {interactive_html}")
else:
    print("No triple subset in cache")



Wrote interactive plot to scatter_best_triple_interactive.html


In [None]:
import json
import numpy as np

# Reload the cached candidate features for the experiment in `cfg.out_dir`.
# NOTE(security): `allow_pickle=True` unpickles `subset_results`; only load
# caches written by your own pipeline runs.
# The context manager closes the .npz archive after the arrays are in memory.
with np.load(cfg.out_dir / 'feature_cache.npz', allow_pickle=True) as cache:
    X_candidates = cache['X_candidates']
    y_labels = cache['y_labels']
    # 0-d object array -> original mapping keyed by subset size.
    subset_results = cache['subset_results'].item()
    candidate_kernel_idxs = cache['candidate_kernel_idxs']

# Resolve the low-correlation pair from the cheapest available source:
#   1. the in-memory `result` of a pipeline run in this kernel session,
#   2. the `results.json` saved next to the cache,
#   3. a fresh (short, CPU-only) search over the cached features.
# NOTE(review): `result` may come from a different experiment than the one
# `cfg.out_dir` currently points at -- confirm they match before trusting it.
low_corr_pair = result.get('low_corr_pair') if 'result' in globals() else None
results_json = cfg.out_dir / 'results.json'
if low_corr_pair is None and results_json.exists():
    # Use a context manager so the file handle is closed deterministically.
    with open(results_json) as results_file:
        low_corr_pair = json.load(results_file).get('low_corr_pair')
if low_corr_pair is None:
    low_corr_pair = find_best_low_corr_pair(
        X_candidates,
        y_labels,
        candidate_kernel_idxs,
        corr_threshold=cfg.subset.pair_corr_threshold,
        epochs=10,
        batch_size=cfg.training.batch_size,
        lr=cfg.training.lr,
        device='cpu',
        model_type=cfg.training.model_type,
        hidden_dims=cfg.training.hidden_dims,
        dropout=cfg.training.dropout,
        standardize=cfg.training.standardize_features,
        refit_best=False,
    )
if low_corr_pair and low_corr_pair.get('subset') is not None:
    status = 'within threshold' if low_corr_pair.get('threshold_met') else 'fallback (no pair met threshold)'
    print(f"Low-corr pair [{status}]: kernels {low_corr_pair['kernel_idxs']} cols {low_corr_pair['subset']} corr={low_corr_pair['corr']:.4f} auc={low_corr_pair['auc']:.4f}")
else:
    print('No low-corr pair found')

# Best 2-kernel subset: refit a small model on those two columns so the scatter
# plot can be drawn with the model passed as `boundary`.
if 2 in subset_results:
    pair = subset_results[2]
    # Map column positions in X_candidates to kernel indices for labelling.
    pair_kernel_idxs = [candidate_kernel_idxs[i] for i in pair['subset']]
    boundary_model = fit_subset_model(
        X_candidates,
        y_labels,
        pair['subset'],
        epochs=20,
        batch_size=cfg.training.batch_size,
        lr=cfg.training.lr,
        device='cpu',
        model_type=cfg.training.model_type,
        hidden_dims=cfg.training.hidden_dims,
        dropout=cfg.training.dropout,
        standardize=cfg.training.standardize_features,
    )
    plot_2d_scatter(
        X_candidates,
        y_labels,
        pair['subset'],
        pair_kernel_idxs,
        cfg.out_dir / 'scatter_best_pair_cached.png',
        boundary=boundary_model,
        title='Best 2-kernel feature space (cached)',
    )

# Best 3-kernel subset: static 3D scatter written next to the cache.
if 3 in subset_results:
    triple = subset_results[3]
    triple_kernel_idxs = [candidate_kernel_idxs[i] for i in triple['subset']]
    plot_3d_scatter(
        X_candidates, y_labels, triple['subset'], triple_kernel_idxs, cfg.out_dir / 'scatter_best_triple_cached.png'
    )
