### Prepare XeniumPR1_segger data for training in /project/simmons_hts/kxu/hest/eval/data/XeniumPR1_segger/

In [37]:
# --- Standard library ---
import os
import json
import re
import shutil
from pathlib import Path
from typing import List, Dict, Tuple, Optional

# --- Third-party ---
import numpy as np
import pandas as pd
import scanpy as sc

# --- HEST ---
from hest import iter_hest
from hest.utils import get_k_genes
from hest.HESTData import create_splits


# ---------- helpers ----------
def _sanitize_tag(s: str, maxlen: int = 8) -> str:
    s2 = re.sub(r'[^A-Za-z0-9]', '', s)
    return s2.upper()[:maxlen] or "R"

def _extract_pr_number(path: Path) -> Optional[int]:
    """
    Look for 'XeniumPR<digit>' pattern in the path (case-insensitive).
    Returns int digit (1..9) or None.
    """
    m = re.search(r'XeniumPR(\d)', str(path), flags=re.IGNORECASE)
    return int(m.group(1)) if m else None

def _extract_slide_number(root: Path) -> Optional[str]:
    """
    Look for 'slideN' pattern in the root folder name and return the digit as string.
    If not found, try to infer from name like 'S1' or 's1' inside the folder name.
    """
    n = root.name.lower()
    m = re.search(r'slide[_\-]?(\d+)', n)
    if m:
        return m.group(1)
    m2 = re.search(r'\bS(\d+)\b', root.name, flags=re.IGNORECASE)
    if m2:
        return m2.group(1)
    return None

def _pick_preferred_file(directory: Path, pattern: str, preferred_name: str) -> Optional[Path]:
    """Return the file in `directory` matching `pattern`, preferring `preferred_name`; None if nothing matches."""
    if not directory.exists():
        return None
    candidates = sorted(directory.glob(pattern))
    if not candidates:
        return None
    for cand in candidates:
        if cand.name == preferred_name:
            return cand
    # No exact name match: fall back to the alphabetically first candidate.
    return candidates[0]


def _discover_samples_from_roots(
    roots: List[Path],
    ids: Optional[List[str]] = None,
) -> Dict[str, Dict[str, Path]]:
    """
    Discover samples under multiple slide roots and merge them into a single map.

    A sample is a sub-directory `<root>/<sid>/` that contains `aligned_adata.h5ad`;
    sub-directories without that file are skipped. Roots that do not exist or are
    not directories are ignored. Duplicate entries in `roots` or `ids` are
    collapsed so they cannot trigger a spurious collision error.

    Naming rule:
      new_id = '<prefix>S<slide><sid>'
        - prefix is 'XeniumPR<n>' when the root path contains 'XeniumPR<digit>',
          otherwise the sanitized root folder name.
        - slide is the number parsed from the root folder name; when no number is
          found it falls back to the first 3 sanitized characters of that name.

    Examples:
        XeniumPR1/slide1/ROI3  → XeniumPR1S1ROI3
        XeniumPR2/slide2/ROI5  → XeniumPR2S2ROI5
        XeniumPR3/slideX/ROI7  → XeniumPR3SSLIROI7   (no slide number → tag fallback)

    Args:
        roots: slide root directories to scan.
        ids: optional sample folder names to look for in every root; when None,
            every sub-directory of every root is considered.

    Returns:
        {new_id: {"adata": Path, "patch": Path or None, "vis": Path or None}}

    Raises:
        ValueError: if two different (root, sid) pairs map to the same new_id.
    """
    # De-duplicate and order roots deterministically; drop anything that is not a directory.
    unique_roots = sorted({Path(r) for r in roots}, key=str)
    unique_roots = [r for r in unique_roots if r.exists() and r.is_dir()]

    collected = []
    if ids is None:
        for r in unique_roots:
            subdirs = sorted((d for d in r.iterdir() if d.is_dir()), key=lambda d: d.name)
            collected.extend((r, d.name) for d in subdirs)
    else:
        # set() guards against the same id being listed twice by the caller.
        for sid in sorted(set(ids)):
            collected.extend((r, sid) for r in unique_roots if (r / sid).is_dir())

    samples: Dict[str, Dict[str, Path]] = {}
    for root, sid in collected:
        sdir = root / sid
        adata = sdir / "aligned_adata.h5ad"
        if not adata.exists():
            continue

        # Optional assets: prefer the file named after the sample, else the first match.
        patch_h5 = _pick_preferred_file(sdir / "patches", "*.h5", f"{sid}.h5")
        vis_png = _pick_preferred_file(sdir / "patches_vis", "*.png", f"{sid}_patch_vis.png")

        # --- Naming rule ---
        pr_num = _extract_pr_number(root)
        slide_num = _extract_slide_number(root) or _sanitize_tag(root.name, 3)

        if pr_num is not None:
            prefix = f"XeniumPR{pr_num}S{slide_num}"
        else:
            # fallback for unknown roots
            prefix = f"{_sanitize_tag(root.name)}S{slide_num}"

        new_id = f"{prefix}{sid}"
        if new_id in samples:
            raise ValueError(
                f"Duplicate renamed sample id '{new_id}' (collision between roots for sid='{sid}')."
            )

        samples[new_id] = {"adata": adata, "patch": patch_h5, "vis": vis_png}

    return samples



def _transfer(src: Optional[Path], dst: Path, label: str, symlink: bool, missing_list: list):
    if src is None or not Path(src).exists():
        missing_list.append((dst.stem, label, str(src) if src is not None else "<none>"))
        return
    dst.parent.mkdir(parents=True, exist_ok=True)
    if dst.exists():
        dst.unlink()
    if symlink:
        try:
            os.symlink(src, dst)
        except FileExistsError:
            pass
    else:
        shutil.copy(src, dst)


def write_var_k_genes_from_paths(
    adata_paths,
    k,
    criteria,
    min_cells_pct,
    var_out_path,
    all_genes_out_path=None,
    exclude_keywords=None,
    filtered_common_out_path=None,
):
    """
    Load all AnnData files, select top-k genes via HEST's get_k_genes(), and
    write two additional gene lists as JSON.

    Files written:
      - `var_out_path`: passed to get_k_genes() as `save_dir`
        (presumably written by HEST itself — confirm against get_k_genes).
      - `all_genes_out_path`: genes present in every sample, minus genes whose
        name contains any of `exclude_keywords` (no expression threshold).
      - `filtered_common_out_path`: genes that, in every sample, pass
        scanpy's filter_genes(min_cells=ceil(min_cells_pct * n_obs)), minus
        genes matching `exclude_keywords`.

    Args:
        adata_paths: iterable of paths to .h5ad files (at least one).
        k: number of genes requested from get_k_genes().
        criteria: selection criteria forwarded to get_k_genes() (e.g. "var").
        min_cells_pct: fraction of observations forwarded to get_k_genes() and
            used for the per-sample threshold; a falsy value disables the threshold.
        var_out_path: output path for the top-k gene JSON; its parent directory
            is the default location of the other two files.
        all_genes_out_path: defaults to `<parent>/all_genes.json`.
        exclude_keywords: substrings marking control/blank probes to drop;
            defaults to ["NegControl", "Codeword", "Intergenic_Region", "Control", "BLANK"].
        filtered_common_out_path: defaults to `<parent>/common_genes_<min_cells_pct>.json`.

    Returns:
        (var_k_genes, all_common_genes, filtered_common_genes)

    Raises:
        ValueError: if `adata_paths` is empty.
    """
    import json
    import warnings
    import numpy as np
    import scanpy as sc
    from pathlib import Path
    from hest.utils import get_k_genes

    if exclude_keywords is None:
        exclude_keywords = ["NegControl", "Codeword", "Intergenic_Region", "Control", "BLANK"]

    adata_paths = list(adata_paths)
    if not adata_paths:
        # Fail early with a clear message instead of an IndexError further down.
        raise ValueError("adata_paths is empty; at least one .h5ad file is required.")

    # NOTE: this installs a process-wide filter that stays active after the call.
    warnings.filterwarnings("ignore", category=FutureWarning, module="anndata")

    # ---- Load all adatas
    adata_list = []
    for p in adata_paths:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=FutureWarning)
            adata_list.append(sc.read_h5ad(str(p)))

    # ---- Top-k variable/mean genes
    var_k_genes = get_k_genes(
        adata_list,
        k,
        criteria,
        save_dir=str(var_out_path),
        min_cells_pct=min_cells_pct,
    )

    def _keep_keyword(gene: str) -> bool:
        return not any(kw in gene for kw in exclude_keywords)

    # ---- ALL common genes (keyword-filtered only)
    common_genes = set(adata_list[0].var_names)
    for ad in adata_list[1:]:
        common_genes &= set(ad.var_names)
    all_common_genes = sorted(g for g in common_genes if _keep_keyword(g))

    # ---- Filtered common genes (expression threshold per sample)
    filtered_sets = []
    for ad in adata_list:
        min_cells = int(np.ceil(min_cells_pct * ad.n_obs)) if min_cells_pct else 0
        if min_cells > 0:
            # filter_genes subsets in place, so work on a copy to keep `ad` intact.
            ad_tmp = ad.copy()
            sc.pp.filter_genes(ad_tmp, min_cells=min_cells)
            filtered_sets.append(set(ad_tmp.var_names))
        else:
            # No threshold: every gene passes, no copy needed.
            filtered_sets.append(set(ad.var_names))

    filtered_common = set.intersection(*filtered_sets)
    # Apply the same keyword rule as for all_common_genes so that a custom
    # `exclude_keywords` is honoured for both lists.
    filtered_common_genes = sorted(g for g in filtered_common if _keep_keyword(g))

    # ---- Write JSONs
    out_dir = Path(var_out_path).parent

    if all_genes_out_path is None:
        all_genes_out_path = out_dir / "all_genes.json"
    with open(all_genes_out_path, "w", encoding="utf-8") as f:
        json.dump({"genes": all_common_genes}, f)

    if filtered_common_out_path is None:
        filtered_common_out_path = out_dir / f"common_genes_{min_cells_pct}.json"
    with open(filtered_common_out_path, "w", encoding="utf-8") as f:
        json.dump(
            {"genes": filtered_common_genes, "min_cells_pct": min_cells_pct}, f
        )

    print(
        f"[INFO] Wrote {var_out_path} (top-{k}, criteria={criteria}); "
        f"{all_genes_out_path} (all_common={len(all_common_genes)}); "
        f"{filtered_common_out_path} (filtered_common={len(filtered_common_genes)}, "
        f"min_cells_pct={min_cells_pct})"
    )

    return var_k_genes, all_common_genes, filtered_common_genes


# ---------- main entry ----------

def create_benchmark_data_multislide(
    save_dir: str | Path,
    K: int,
    base_root: str | Path = "sftp://login1.molbiol.ox.ac.uk/ceph/project/simmons_hts/kxu/hest/xenium_data/XeniumPR1_segger",
    slide_subdirs: List[str] | tuple = ("slide1", "slide2"),
    ids: Optional[List[str]] = None,
    gene_k: int = 50,
    gene_criteria: str = "var",
    min_cells_pct: float = 0.10,
    symlink: bool = False,
    seed: int = 0,
):
    """
    Build a HEST benchmark package from the slide sub-folders of `base_root`,
    without relying on a prebuilt metadata DF.

    Expected layout:
        <base_root>/slide1/<sample_id>/...
        <base_root>/slide2/<sample_id>/...

    Output tree:
      <save_dir>/
        var_<gene_k>genes.json
        all_genes.json, common_genes_<min_cells_pct>.json
        splits/...
        patches/<id>.h5
        patches/vis/<id>.png
        adata/<id>.h5ad

    Args:
        save_dir: destination directory for the assembled benchmark package
        K: number of folds for HEST's create_splits
        base_root: local (mounted) base directory containing slide subfolders.
            NOTE: the default is a remote sftp:// URL, which cannot be read through
            pathlib; always pass a local path.
        slide_subdirs: which slide folders to include (defaults to ("slide1", "slide2"))
        ids: optional list of sample IDs to include (if None, auto-discovers)
        gene_k: number of variable genes to select
        gene_criteria: criteria for get_k_genes (e.g., "var")
        min_cells_pct: per-sample fraction of observations a gene must be
            expressed in; forwarded to write_var_k_genes_from_paths
        symlink: if True, symlink files instead of copying
        seed: RNG seed used to deterministically shuffle within groups before splitting

    Raises:
        ValueError: if `base_root` is a URL string, or if no sample with an
            aligned_adata.h5ad is found under any slide root.
    """

    from hest.HESTData import create_splits

    # Path() would silently mangle 'scheme://host/...' into a non-existent local
    # path, which otherwise surfaces only as "No valid samples" further down.
    if "://" in str(base_root):
        raise ValueError(
            f"base_root looks like a remote URL ({base_root!r}); "
            "pass a locally mounted directory path instead."
        )

    save_dir = Path(save_dir)
    save_dir.mkdir(parents=True, exist_ok=True)

    # 1) Build slide roots list and discover samples across them
    base_root = Path(base_root)
    roots = [base_root / sd for sd in slide_subdirs]
    print(f"[INFO] Using slide roots: {roots}")

    # Discovery ignores missing roots; report them so a typo is not silently lost.
    absent_roots = [r for r in roots if not r.is_dir()]
    if absent_roots:
        print(f"[WARN] Slide roots not found (skipped): {absent_roots}")

    samples = _discover_samples_from_roots(roots, ids=ids)
    if not samples:
        raise ValueError(
            f"No valid samples (with aligned_adata.h5ad) found under any of: {roots}."
        )
    discovered_ids = sorted(samples.keys())
    print(f"[INFO] Discovered {len(discovered_ids)} samples: {discovered_ids}")

    # 2) Minimal metadata DF for splitting (patient from prefix; dataset_title from base folder name)
    def _infer_patient(sid: str) -> str:
        # Text before the first '_' is the patient; an id without '_' is its own patient.
        return sid.split("_")[0] if "_" in sid else sid

    dataset_title = base_root.name or "xenium"
    meta = pd.DataFrame(
        {
            "id": discovered_ids,
            "patient": [_infer_patient(s) for s in discovered_ids],
            "dataset_title": [dataset_title] * len(discovered_ids),
        }
    )

    # 3) Compute var_k genes → var_<gene_k>genes.json
    adata_paths = [samples[sid]["adata"] for sid in discovered_ids]
    var_json = save_dir / f"var_{gene_k}genes.json"
    write_var_k_genes_from_paths(adata_paths, gene_k, gene_criteria, min_cells_pct, var_json)
    print(f"[INFO] Wrote {var_json}")

    # 4) K-fold splits using HEST's create_splits
    #    Group by (dataset_title, patient)
    group = meta.groupby(["dataset_title", "patient"])["id"].agg(list).to_dict()

    # Deterministic shuffle within each group (lists are shuffled in place)
    rng = np.random.RandomState(seed)
    for id_list in group.values():
        rng.shuffle(id_list)

    splits_dir = save_dir / "splits"
    splits_dir.mkdir(parents=True, exist_ok=True)
    create_splits(str(splits_dir), group, K=K)
    print(f"[INFO] Wrote {K}-fold splits to {splits_dir}")

    # 5) Copy/symlink assets
    (save_dir / "patches").mkdir(exist_ok=True, parents=True)
    (save_dir / "patches" / "vis").mkdir(exist_ok=True, parents=True)
    (save_dir / "adata").mkdir(exist_ok=True, parents=True)

    missing: List[tuple] = []
    for sid in discovered_ids:
        info = samples[sid]
        _transfer(info.get("patch"), save_dir / "patches" / f"{sid}.h5", "patch", symlink, missing)
        _transfer(info.get("vis"), save_dir / "patches" / "vis" / f"{sid}.png", "vis", symlink, missing)
        _transfer(info.get("adata"), save_dir / "adata" / f"{sid}.h5ad", "adata", symlink, missing)

    if missing:
        print("[WARN] Missing files:")
        for sid, lbl, path in missing:
            print(f"  - {sid} [{lbl}] → {path}")

    print(f"✅ Benchmark dataset created at {save_dir}")



In [24]:
# create_benchmark_data_multislide(
#     save_dir="/project/simmons_hts/kxu/hest/eval/data/XeniumPR1",
#     K=15, 
#     base_root="/project/simmons_hts/kxu/hest/xenium_data/XeniumPR1",
#     gene_k=50,
#     gene_criteria="var",
#     symlink=False,            # set True to save disk space
#     seed=0                    # controls fold assignment deterministically
# )
# NOTE: the dataset constructed by this call gives the error "Values [...], from ..., are not valid obs/ var names or indices." Workaround: copy the files from another existing dataset (e.g. XeniumPR1_broad) instead.

[INFO] Using slide roots: [PosixPath('/project/simmons_hts/kxu/hest/xenium_data/XeniumPR1/slide1'), PosixPath('/project/simmons_hts/kxu/hest/xenium_data/XeniumPR1/slide2')]
[INFO] Discovered 15 samples: ['XeniumPR1S1ROI1', 'XeniumPR1S1ROI2', 'XeniumPR1S1ROI3', 'XeniumPR1S1ROI4', 'XeniumPR1S1ROI5', 'XeniumPR1S1ROI6', 'XeniumPR1S1ROI7', 'XeniumPR1S1ROI8', 'XeniumPR1S2ROI1', 'XeniumPR1S2ROI2', 'XeniumPR1S2ROI3', 'XeniumPR1S2ROI4', 'XeniumPR1S2ROI5', 'XeniumPR1S2ROI6', 'XeniumPR1S2ROI7']
min_cells is  110.0
min_cells is  88.0
min_cells is  162.0
min_cells is  187.0
min_cells is  208.0
min_cells is  453.0
min_cells is  353.0
min_cells is  469.0
min_cells is  480.0
min_cells is  424.0
min_cells is  282.0
min_cells is  465.0
min_cells is  182.0
min_cells is  226.0
min_cells is  296.0
[32m13:44:17[0m | [1mINFO[0m | [1mFound 3065 common genes[0m


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


[32m13:44:21[0m | [1mINFO[0m | [1mselected genes ['BACH2', 'BANK1', 'BCL11A', 'BIRC5', 'BMP7', 'CA12', 'CCL19', 'CD19', 'CD22', 'CD72', 'CD79B', 'CD83', 'CHGA', 'CIDEC', 'CLEC4M', 'COL7A1', 'DTX1', 'EMP1', 'ENPEP', 'FCMR', 'G0S2', 'GBP5', 'GREM2', 'HK2', 'ITLN1', 'LCN2', 'LMO2', 'MS4A1', 'MYBL2', 'PARP15', 'PLA2G2A', 'PLAU', 'PLAUR', 'RRM2', 'SCIMP', 'SDK2', 'SELENBP1', 'SELP', 'SLC26A2', 'SLC9A3', 'SPP1', 'SYP', 'TCF7', 'TLR2', 'TNFRSF13C', 'TUBB2B', 'TXK', 'TYMS', 'UCHL1', 'WDFY4'][0m
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumPR1/var_50genes.json (top-50, criteria=var); /project/simmons_hts/kxu/hest/eval/data/XeniumPR1/union_genes.json (union=5161); /project/simmons_hts/kxu/hest/eval/data/XeniumPR1/all_genes.json (all_common=5100); /project/simmons_hts/kxu/hest/eval/data/XeniumPR1/common_genes_0.1.json (filtered_common=3065, min_cells_pct=0.1)
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumPR1/var_50genes.json
Split 0/15
train set is  ['XeniumPR1S1R

# Prepare Xenium 50 um data for evaluation

In [2]:
create_benchmark_data_multislide(
    save_dir="/project/simmons_hts/kxu/hest/eval/data/XeniumPR1_50um",
    K=15, # one fold per discovered sample (15 in the output below); presumably leave-one-sample-out — confirm
    base_root="/project/simmons_hts/kxu/hest/xenium_data/XeniumPR1_50um",
    gene_k=50,
    gene_criteria="var",
    symlink=False,            # set True to save disk space
    seed=0                    # controls fold assignment deterministically
)

[INFO] Using slide roots: [PosixPath('/project/simmons_hts/kxu/hest/xenium_data/XeniumPR1_50um/slide1'), PosixPath('/project/simmons_hts/kxu/hest/xenium_data/XeniumPR1_50um/slide2')]
[INFO] Discovered 15 samples: ['XeniumPR1S1ROI1', 'XeniumPR1S1ROI2', 'XeniumPR1S1ROI3', 'XeniumPR1S1ROI4', 'XeniumPR1S1ROI5', 'XeniumPR1S1ROI6', 'XeniumPR1S1ROI7', 'XeniumPR1S1ROI8', 'XeniumPR1S2ROI1', 'XeniumPR1S2ROI2', 'XeniumPR1S2ROI3', 'XeniumPR1S2ROI4', 'XeniumPR1S2ROI5', 'XeniumPR1S2ROI6', 'XeniumPR1S2ROI7']
min_cells is  393.0
min_cells is  305.0
min_cells is  700.0
min_cells is  814.0
min_cells is  764.0
min_cells is  1729.0
min_cells is  1393.0
min_cells is  1860.0
min_cells is  1727.0
min_cells is  1482.0
min_cells is  954.0
min_cells is  1550.0
min_cells is  599.0
min_cells is  808.0
min_cells is  972.0
[32m19:58:08[0m | [1mINFO[0m | [1mFound 1516 common genes[0m


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


[32m19:58:12[0m | [1mINFO[0m | [1mselected genes ['ADM', 'BMP2', 'CA12', 'CASP7', 'CCL21', 'CD55', 'CD79A', 'CTSH', 'CYCS', 'DGAT1', 'DNMT1', 'EMP1', 'ERN1', 'ERRFI1', 'F3', 'FLNB', 'FUCA1', 'GNA11', 'HMGA1', 'HMGCR', 'HPGD', 'IGFBP2', 'IKZF3', 'IRF8', 'ISG20', 'LPIN1', 'MAP1B', 'MAP4K2', 'MCM5', 'MXD1', 'NRN1', 'NUDT1', 'PDE4B', 'PLVAP', 'RBM38', 'RPA3', 'S100B', 'SELENBP1', 'SGK1', 'SLC12A2', 'SRI', 'STMN1', 'SYVN1', 'TENT5C', 'TFEB', 'TNFRSF21', 'TOPBP1', 'TRPM4', 'UGDH', 'XBP1'][0m
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/xeniumPR1_50um/var_50genes.json (top-50, criteria=var); /project/simmons_hts/kxu/hest/eval/data/xeniumPR1_50um/all_genes.json (all_common=5100); /project/simmons_hts/kxu/hest/eval/data/xeniumPR1_50um/filtered_genes_minpct10.json (filtered_common=1516, min_cells_pct=0.1)
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/xeniumPR1_50um/var_50genes.json
Split 0/15
train set is  ['XeniumPR1S1ROI2', 'XeniumPR1S1ROI3', 'XeniumPR1S1ROI4', 'XeniumPR1S

### prepare for XeniumPR1_50um_0.25_um_px

In [19]:
create_benchmark_data_multislide(
    save_dir="/project/simmons_hts/kxu/hest/eval/data/XeniumPR1_50um_0.25_um_px",
    K=15, # one fold per discovered sample (15 in the output below); presumably leave-one-sample-out — confirm
    base_root="/project/simmons_hts/kxu/hest/xenium_data/XeniumPR1_50um_0.25_um_px",
    gene_k=50,
    gene_criteria="var",
    symlink=False,            # set True to save disk space
    seed=0                    # controls fold assignment deterministically
)

[INFO] Using slide roots: [PosixPath('/project/simmons_hts/kxu/hest/xenium_data/XeniumPR1_50um_0.25_um_px/slide1'), PosixPath('/project/simmons_hts/kxu/hest/xenium_data/XeniumPR1_50um_0.25_um_px/slide2')]
[INFO] Discovered 15 samples: ['XeniumPR1S1ROI1', 'XeniumPR1S1ROI2', 'XeniumPR1S1ROI3', 'XeniumPR1S1ROI4', 'XeniumPR1S1ROI5', 'XeniumPR1S1ROI6', 'XeniumPR1S1ROI7', 'XeniumPR1S1ROI8', 'XeniumPR1S2ROI1', 'XeniumPR1S2ROI2', 'XeniumPR1S2ROI3', 'XeniumPR1S2ROI4', 'XeniumPR1S2ROI5', 'XeniumPR1S2ROI6', 'XeniumPR1S2ROI7']
min_cells is  393.0
min_cells is  305.0
min_cells is  700.0
min_cells is  814.0
min_cells is  764.0
min_cells is  1729.0
min_cells is  1393.0
min_cells is  1860.0
min_cells is  1727.0
min_cells is  1482.0
min_cells is  954.0
min_cells is  1550.0
min_cells is  599.0
min_cells is  808.0
min_cells is  972.0
[32m16:03:42[0m | [1mINFO[0m | [1mFound 1516 common genes[0m


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


[32m16:03:49[0m | [1mINFO[0m | [1mselected genes ['ADM', 'BMP2', 'CA12', 'CASP7', 'CCL21', 'CD55', 'CD79A', 'CTSH', 'CYCS', 'DGAT1', 'DNMT1', 'EMP1', 'ERN1', 'ERRFI1', 'F3', 'FLNB', 'FUCA1', 'GNA11', 'HMGA1', 'HMGCR', 'HPGD', 'IGFBP2', 'IKZF3', 'IRF8', 'ISG20', 'LPIN1', 'MAP1B', 'MAP4K2', 'MCM5', 'MXD1', 'NRN1', 'NUDT1', 'PDE4B', 'PLVAP', 'RBM38', 'RPA3', 'S100B', 'SELENBP1', 'SGK1', 'SLC12A2', 'SRI', 'STMN1', 'SYVN1', 'TENT5C', 'TFEB', 'TNFRSF21', 'TOPBP1', 'TRPM4', 'UGDH', 'XBP1'][0m
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumPR1_50um_0.25_um_px/var_50genes.json (top-50, criteria=var); /project/simmons_hts/kxu/hest/eval/data/XeniumPR1_50um_0.25_um_px/all_genes.json (all_common=5100); /project/simmons_hts/kxu/hest/eval/data/XeniumPR1_50um_0.25_um_px/common_genes_0.1.json (filtered_common=1516, min_cells_pct=0.1)
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumPR1_50um_0.25_um_px/var_50genes.json
Split 0/15
train set is  ['XeniumPR1S1ROI2', 'XeniumPR1S1

In [3]:
create_benchmark_data_multislide(
    save_dir="/project/simmons_hts/kxu/hest/eval/data/XeniumPR1_25um_0.125_um_px",
    K=15, # one fold per discovered sample (15 in the output below); presumably leave-one-sample-out — confirm
    base_root="/project/simmons_hts/kxu/hest/xenium_data/XeniumPR1_25um_0.125_um_px",
    gene_k=50,
    gene_criteria="var",
    symlink=False,            # set True to save disk space
    seed=0                    # controls fold assignment deterministically
)

[INFO] Using slide roots: [PosixPath('/project/simmons_hts/kxu/hest/xenium_data/XeniumPR1_25um_0.125_um_px/slide1'), PosixPath('/project/simmons_hts/kxu/hest/xenium_data/XeniumPR1_25um_0.125_um_px/slide2')]
[INFO] Discovered 15 samples: ['XeniumPR1S1ROI1', 'XeniumPR1S1ROI2', 'XeniumPR1S1ROI3', 'XeniumPR1S1ROI4', 'XeniumPR1S1ROI5', 'XeniumPR1S1ROI6', 'XeniumPR1S1ROI7', 'XeniumPR1S1ROI8', 'XeniumPR1S2ROI1', 'XeniumPR1S2ROI2', 'XeniumPR1S2ROI3', 'XeniumPR1S2ROI4', 'XeniumPR1S2ROI5', 'XeniumPR1S2ROI6', 'XeniumPR1S2ROI7']
min_cells is  1246.0
min_cells is  935.0
min_cells is  2265.0
min_cells is  2879.0
min_cells is  2520.0
min_cells is  5742.0
min_cells is  5545.0
min_cells is  7264.0
min_cells is  5697.0
min_cells is  4844.0
min_cells is  2993.0
min_cells is  4942.0
min_cells is  1858.0
min_cells is  2816.0
min_cells is  3117.0
[32m18:58:25[0m | [1mINFO[0m | [1mFound 338 common genes[0m


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


[32m18:58:29[0m | [1mINFO[0m | [1mselected genes ['ABCA1', 'ACTN4', 'ADAMTS1', 'ALDH2', 'ANKRD13A', 'ATP1A1', 'C1QBP', 'C7', 'CCT5', 'CDKN1A', 'CLPTM1L', 'COL4A1', 'COL4A2', 'COL5A1', 'COL5A2', 'CPT1A', 'CXCR4', 'DDX27', 'FGL2', 'FLOT2', 'GHITM', 'GRN', 'HADHB', 'HPCAL1', 'HSPD1', 'HYOU1', 'IL32', 'IMPDH2', 'IRF1', 'LGMN', 'MVP', 'NDRG1', 'PARP1', 'PDGFRA', 'PIK3IP1', 'PLP2', 'PRNP', 'PTPN1', 'PYGB', 'RHOC', 'SAT2', 'SH3BP5', 'SMARCA4', 'SREBF2', 'SRI', 'SSRP1', 'THBS1', 'TUFM', 'VDAC2', 'XBP1'][0m
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumPR1_25um_0.125_um_px/var_50genes.json (top-50, criteria=var); /project/simmons_hts/kxu/hest/eval/data/XeniumPR1_25um_0.125_um_px/all_genes.json (all_common=5100); /project/simmons_hts/kxu/hest/eval/data/XeniumPR1_25um_0.125_um_px/common_genes_0.1.json (filtered_common=338, min_cells_pct=0.1)
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumPR1_25um_0.125_um_px/var_50genes.json
Split 0/15
train set is  ['XeniumPR1S1ROI

In [6]:
create_benchmark_data_multislide(
    save_dir="/project/simmons_hts/kxu/hest/eval/data/XeniumPR1_25um",
    K=15, # one fold per discovered sample (15 in the output below); presumably leave-one-sample-out — confirm
    base_root="/project/simmons_hts/kxu/hest/xenium_data/XeniumPR1_25um",
    gene_k=50,
    gene_criteria="var",
    symlink=False,            # set True to save disk space
    seed=0                    # controls fold assignment deterministically
)

[INFO] Using slide roots: [PosixPath('/project/simmons_hts/kxu/hest/xenium_data/XeniumPR1_25um/slide1'), PosixPath('/project/simmons_hts/kxu/hest/xenium_data/XeniumPR1_25um/slide2')]
[INFO] Discovered 15 samples: ['XeniumPR1S1ROI1', 'XeniumPR1S1ROI2', 'XeniumPR1S1ROI3', 'XeniumPR1S1ROI4', 'XeniumPR1S1ROI5', 'XeniumPR1S1ROI6', 'XeniumPR1S1ROI7', 'XeniumPR1S1ROI8', 'XeniumPR1S2ROI1', 'XeniumPR1S2ROI2', 'XeniumPR1S2ROI3', 'XeniumPR1S2ROI4', 'XeniumPR1S2ROI5', 'XeniumPR1S2ROI6', 'XeniumPR1S2ROI7']
min_cells is  1246.0
min_cells is  935.0
min_cells is  2265.0
min_cells is  2879.0
min_cells is  2520.0
min_cells is  5742.0
min_cells is  5545.0
min_cells is  7264.0
min_cells is  5697.0
min_cells is  4844.0
min_cells is  2993.0
min_cells is  4942.0
min_cells is  1858.0
min_cells is  2816.0
min_cells is  3117.0
[32m15:40:20[0m | [1mINFO[0m | [1mFound 338 common genes[0m


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


[32m15:41:14[0m | [1mINFO[0m | [1mselected genes ['ABCA1', 'ACTN4', 'ADAMTS1', 'ALDH2', 'ANKRD13A', 'ATP1A1', 'C1QBP', 'C7', 'CCT5', 'CDKN1A', 'CLPTM1L', 'COL4A1', 'COL4A2', 'COL5A1', 'COL5A2', 'CPT1A', 'CXCR4', 'DDX27', 'FGL2', 'FLOT2', 'GHITM', 'GRN', 'HADHB', 'HPCAL1', 'HSPD1', 'HYOU1', 'IL32', 'IMPDH2', 'IRF1', 'LGMN', 'MVP', 'NDRG1', 'PARP1', 'PDGFRA', 'PIK3IP1', 'PLP2', 'PRNP', 'PTPN1', 'PYGB', 'RHOC', 'SAT2', 'SH3BP5', 'SMARCA4', 'SREBF2', 'SRI', 'SSRP1', 'THBS1', 'TUFM', 'VDAC2', 'XBP1'][0m
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumPR1_25um/var_50genes.json (top-50, criteria=var); /project/simmons_hts/kxu/hest/eval/data/XeniumPR1_25um/all_genes.json (all_common=5100); /project/simmons_hts/kxu/hest/eval/data/XeniumPR1_25um/common_genes_0.1.json (filtered_common=338, min_cells_pct=0.1)
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumPR1_25um/var_50genes.json
Split 0/15
train set is  ['XeniumPR1S1ROI2', 'XeniumPR1S1ROI3', 'XeniumPR1S1ROI4', 'Xeniu

# XeniumPR2

In [6]:
create_benchmark_data_multislide(
    save_dir="/project/simmons_hts/kxu/hest/eval/data/XeniumPR2",
    K=8, # one fold per discovered sample (8 in the output below); presumably leave-one-sample-out — confirm
    base_root="/project/simmons_hts/kxu/hest/xenium_data/XeniumPR2",
    gene_k=50,
    gene_criteria="var",
    symlink=False,            # set True to save disk space
    seed=0                    # controls fold assignment deterministically
)

[INFO] Using slide roots: [PosixPath('/project/simmons_hts/kxu/hest/xenium_data/XeniumPR2/slide1'), PosixPath('/project/simmons_hts/kxu/hest/xenium_data/XeniumPR2/slide2')]
[INFO] Discovered 8 samples: ['XeniumPR2S1ROI1', 'XeniumPR2S1ROI2', 'XeniumPR2S1ROI3', 'XeniumPR2S1ROI4', 'XeniumPR2S1ROI5', 'XeniumPR2S1ROI6', 'XeniumPR2S1ROI7', 'XeniumPR2S1ROI8']
min_cells is  463.0
min_cells is  391.0
min_cells is  293.0
min_cells is  278.0
min_cells is  465.0
min_cells is  108.0
min_cells is  120.0
min_cells is  251.0
[32m16:10:46[0m | [1mINFO[0m | [1mFound 3333 common genes[0m


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


[32m16:10:48[0m | [1mINFO[0m | [1mselected genes ['ADRB2', 'ARL4D', 'BAG3', 'CCL8', 'CCNB1', 'CD22', 'CDA', 'CEACAM6', 'CXCL8', 'CXCL9', 'DMKN', 'DUOX2', 'FOSL1', 'HBEGF', 'HCAR2', 'HCAR3', 'HES1', 'HSPA6', 'IGSF3', 'IL11', 'IL1B', 'IL1RN', 'JUP', 'LCN2', 'LGALSL', 'LTB4R', 'LTB4R2', 'MMP12', 'MS4A1', 'NDRG4', 'NECTIN1', 'NLRP3', 'OSM', 'PC', 'PTHLH', 'PTPRZ1', 'SERPINA3', 'SLC5A1', 'SLC7A5', 'SPP1', 'TFPI2', 'TIAM1', 'TMEM45A', 'TNFRSF13C', 'TPX2', 'TUBB2A', 'TUBB2B', 'TUBB3', 'UCHL1', 'UHRF1'][0m
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumPR2/var_50genes.json (top-50, criteria=var); /project/simmons_hts/kxu/hest/eval/data/XeniumPR2/all_genes.json (all_common=5100); /project/simmons_hts/kxu/hest/eval/data/XeniumPR2/common_genes_0.1.json (filtered_common=3333, min_cells_pct=0.1)
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumPR2/var_50genes.json
Split 0/8
train set is  ['XeniumPR2S1ROI2', 'XeniumPR2S1ROI3', 'XeniumPR2S1ROI4', 'XeniumPR2S1ROI5', 'Xenium

In [37]:
create_benchmark_data_multislide(
    save_dir="/project/simmons_hts/kxu/hest/eval/data/XeniumPR3",
    K=8, # one fold per discovered sample (8 in the output below); presumably leave-one-sample-out — confirm
    base_root="/project/simmons_hts/kxu/hest/xenium_data/XeniumPR3",
    gene_k=50,
    gene_criteria="var",
    symlink=False,            # set True to save disk space
    seed=0                    # controls fold assignment deterministically
)

[INFO] Using slide roots: [PosixPath('/project/simmons_hts/kxu/hest/xenium_data/XeniumPR3/slide1'), PosixPath('/project/simmons_hts/kxu/hest/xenium_data/XeniumPR3/slide2')]
[INFO] Discovered 8 samples: ['XeniumPR3S1ROI1', 'XeniumPR3S1ROI2', 'XeniumPR3S1ROI3', 'XeniumPR3S1ROI4', 'XeniumPR3S1ROI5', 'XeniumPR3S1ROI6', 'XeniumPR3S1ROI7', 'XeniumPR3S1ROI8']
min_cells is  337.0
min_cells is  115.0
min_cells is  303.0
min_cells is  329.0
min_cells is  263.0
min_cells is  306.0
min_cells is  169.0
min_cells is  224.0
[32m16:44:09[0m | [1mINFO[0m | [1mFound 2172 common genes[0m


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


[32m16:44:11[0m | [1mINFO[0m | [1mselected genes ['ABCD3', 'ACE', 'ADAMTS2', 'ANPEP', 'CCL21', 'CXCR4', 'DPT', 'EPHX2', 'FLNB', 'FSCN1', 'FUCA1', 'GPRC5C', 'GREM2', 'H19', 'HMGCR', 'HPGD', 'IGFBP2', 'IL16', 'MALL', 'MAOA', 'MAP7', 'MYH10', 'NFATC1', 'NGFR', 'NQO1', 'NTN1', 'ORAI2', 'PIK3CD', 'PIK3IP1', 'POSTN', 'PPP1R16B', 'PRKCB', 'RACGAP1', 'RGS5', 'RSPO3', 'S100B', 'SELENBP1', 'SEMA4D', 'SERPINA3', 'SGK1', 'SHROOM1', 'SLC20A2', 'SMOC2', 'SOCS1', 'TFEB', 'THBS2', 'TNFRSF25', 'TOX', 'TRAF5', 'WFDC1'][0m
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumPR3/var_50genes.json (top-50, criteria=var); /project/simmons_hts/kxu/hest/eval/data/XeniumPR3/all_genes.json (all_common=5100); /project/simmons_hts/kxu/hest/eval/data/XeniumPR3/common_genes_0.1.json (filtered_common=2172, min_cells_pct=0.1)
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumPR3/var_50genes.json
Split 0/8
train set is  ['XeniumPR3S1ROI2', 'XeniumPR3S1ROI3', 'XeniumPR3S1ROI4', 'XeniumPR3S1ROI5', '

# XeniumPR4

In [27]:
create_benchmark_data_multislide(
    save_dir="/project/simmons_hts/kxu/hest/eval/data/XeniumPR4",
    K=20, # one fold per discovered sample (20 in the output below); presumably leave-one-sample-out — confirm
    base_root="/project/simmons_hts/kxu/hest/xenium_data/XeniumPR4",
    gene_k=50,
    gene_criteria="var",
    symlink=False,            # set True to save disk space
    seed=0                    # controls fold assignment deterministically
)

[INFO] Using slide roots: [PosixPath('/project/simmons_hts/kxu/hest/xenium_data/XeniumPR4/slide1'), PosixPath('/project/simmons_hts/kxu/hest/xenium_data/XeniumPR4/slide2')]
[INFO] Discovered 20 samples: ['XeniumPR4S1ROI1', 'XeniumPR4S1ROI10', 'XeniumPR4S1ROI2', 'XeniumPR4S1ROI3', 'XeniumPR4S1ROI4', 'XeniumPR4S1ROI5', 'XeniumPR4S1ROI6', 'XeniumPR4S1ROI7', 'XeniumPR4S1ROI8', 'XeniumPR4S1ROI9', 'XeniumPR4S2ROI1', 'XeniumPR4S2ROI10', 'XeniumPR4S2ROI2', 'XeniumPR4S2ROI3', 'XeniumPR4S2ROI4', 'XeniumPR4S2ROI5', 'XeniumPR4S2ROI6', 'XeniumPR4S2ROI7', 'XeniumPR4S2ROI8', 'XeniumPR4S2ROI9']
min_cells is  220.0
min_cells is  147.0
min_cells is  183.0
min_cells is  204.0
min_cells is  221.0
min_cells is  112.0
min_cells is  122.0
min_cells is  184.0
min_cells is  177.0
min_cells is  108.0
min_cells is  141.0
min_cells is  111.0
min_cells is  210.0
min_cells is  238.0
min_cells is  203.0
min_cells is  226.0
min_cells is  202.0
min_cells is  190.0
min_cells is  177.0
min_cells is  131.0
[32m18:10:40

  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


[32m18:10:42[0m | [1mINFO[0m | [1mselected genes ['ALDH1B1', 'AXL', 'CD55', 'CHGA', 'CLDN5', 'COL18A1', 'COL4A1', 'DERL3', 'ENG', 'EPHB2', 'FADS2', 'GIP', 'GP2', 'GZMA', 'H2AFX', 'HHIP', 'HMGA1', 'IGFBP2', 'IKZF3', 'IL16', 'ISG15', 'ITGAL', 'ITGB7', 'KPNA2', 'KRT20', 'LAG3', 'MCM5', 'MRC2', 'MSI1', 'MYLK', 'NCF1', 'NKG7', 'PIM2', 'PLAT', 'PLAUR', 'POU2AF1', 'REG4', 'SCNN1A', 'SLC11A2', 'SORD', 'SOX4', 'SST', 'STAB1', 'TCF4', 'TENT5C', 'TNFAIP2', 'TNXB', 'TRAP1', 'TRBC1', 'TRPM5'][0m
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumPR4/var_50genes.json (top-50, criteria=var); /project/simmons_hts/kxu/hest/eval/data/XeniumPR4/all_genes.json (all_common=4994); /project/simmons_hts/kxu/hest/eval/data/XeniumPR4/common_genes_0.1.json (filtered_common=2002, min_cells_pct=0.1)
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumPR4/var_50genes.json
Split 0/20
train set is  ['XeniumPR4S1ROI10', 'XeniumPR4S1ROI2', 'XeniumPR4S1ROI3', 'XeniumPR4S1ROI4', 'XeniumPR4S1ROI5', 'X

✅ Benchmark dataset created at /project/simmons_hts/kxu/hest/eval/data/XeniumPR4


# XeniumPR5

In [30]:
create_benchmark_data_multislide(
    save_dir="/project/simmons_hts/kxu/hest/eval/data/XeniumPR5",
    K=21, # one fold per discovered sample (21 in the output below); the splits shown hold out a single sample each
    base_root="/project/simmons_hts/kxu/hest/xenium_data/XeniumPR5",
    gene_k=50,
    gene_criteria="var",
    symlink=False,            # set True to save disk space
    seed=0                    # controls fold assignment deterministically
)

[INFO] Using slide roots: [PosixPath('/project/simmons_hts/kxu/hest/xenium_data/XeniumPR5/slide1'), PosixPath('/project/simmons_hts/kxu/hest/xenium_data/XeniumPR5/slide2')]
[INFO] Discovered 21 samples: ['XeniumPR5S1ROI1', 'XeniumPR5S1ROI10', 'XeniumPR5S1ROI2', 'XeniumPR5S1ROI3', 'XeniumPR5S1ROI4', 'XeniumPR5S1ROI5', 'XeniumPR5S1ROI6', 'XeniumPR5S1ROI7', 'XeniumPR5S1ROI8', 'XeniumPR5S1ROI9', 'XeniumPR5S2ROI1', 'XeniumPR5S2ROI10', 'XeniumPR5S2ROI11', 'XeniumPR5S2ROI2', 'XeniumPR5S2ROI3', 'XeniumPR5S2ROI4', 'XeniumPR5S2ROI5', 'XeniumPR5S2ROI6', 'XeniumPR5S2ROI7', 'XeniumPR5S2ROI8', 'XeniumPR5S2ROI9']
min_cells is  136.0
min_cells is  190.0
min_cells is  81.0
min_cells is  191.0
min_cells is  137.0
min_cells is  235.0
min_cells is  196.0
min_cells is  116.0
min_cells is  252.0
min_cells is  172.0
min_cells is  154.0
min_cells is  151.0
min_cells is  150.0
min_cells is  151.0
min_cells is  141.0
min_cells is  167.0
min_cells is  292.0
min_cells is  136.0
min_cells is  138.0
min_cells is  9

  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


[32m16:55:54[0m | [1mINFO[0m | [1mselected genes ['ADAMDEC1', 'ADGRA2', 'AQP10', 'CCL28', 'CD4', 'CD55', 'CHGA', 'COL18A1', 'COL4A1', 'COL4A2', 'COL5A1', 'DEFA6', 'DERL3', 'ENPP2', 'EPHB2', 'FOXQ1', 'GBP1', 'GIP', 'IFI27', 'IMPDH2', 'ITGA2', 'ITGB6', 'JAK3', 'LAMA3', 'LMNB1', 'MCM5', 'MCM7', 'MSI1', 'MUC2', 'OAS3', 'PFKP', 'PIM2', 'PLA2G2A', 'PLAUR', 'PLVAP', 'REG4', 'RRM1', 'SCNN1A', 'SEMA4C', 'SLC11A2', 'SLC20A2', 'SORD', 'SST', 'STAB1', 'TCF21', 'TENT5C', 'TNFAIP2', 'TNXB', 'TRAP1', 'VEGFA'][0m
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumPR5/var_50genes.json (top-50, criteria=var); /project/simmons_hts/kxu/hest/eval/data/XeniumPR5/all_genes.json (all_common=4934); /project/simmons_hts/kxu/hest/eval/data/XeniumPR5/common_genes_0.1.json (filtered_common=1478, min_cells_pct=0.1)
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumPR5/var_50genes.json
Split 0/21
train set is  ['XeniumPR5S1ROI10', 'XeniumPR5S1ROI2', 'XeniumPR5S1ROI3', 'XeniumPR5S1ROI4', 'Xeniu

Split 18/21
train set is  ['XeniumPR5S1ROI1', 'XeniumPR5S1ROI10', 'XeniumPR5S1ROI2', 'XeniumPR5S1ROI3', 'XeniumPR5S1ROI4', 'XeniumPR5S1ROI5', 'XeniumPR5S1ROI6', 'XeniumPR5S1ROI7', 'XeniumPR5S1ROI8', 'XeniumPR5S1ROI9', 'XeniumPR5S2ROI1', 'XeniumPR5S2ROI10', 'XeniumPR5S2ROI11', 'XeniumPR5S2ROI2', 'XeniumPR5S2ROI3', 'XeniumPR5S2ROI4', 'XeniumPR5S2ROI5', 'XeniumPR5S2ROI6', 'XeniumPR5S2ROI8', 'XeniumPR5S2ROI9']

test set is  ['XeniumPR5S2ROI7']

Split 19/21
train set is  ['XeniumPR5S1ROI1', 'XeniumPR5S1ROI10', 'XeniumPR5S1ROI2', 'XeniumPR5S1ROI3', 'XeniumPR5S1ROI4', 'XeniumPR5S1ROI5', 'XeniumPR5S1ROI6', 'XeniumPR5S1ROI7', 'XeniumPR5S1ROI8', 'XeniumPR5S1ROI9', 'XeniumPR5S2ROI1', 'XeniumPR5S2ROI10', 'XeniumPR5S2ROI11', 'XeniumPR5S2ROI2', 'XeniumPR5S2ROI3', 'XeniumPR5S2ROI4', 'XeniumPR5S2ROI5', 'XeniumPR5S2ROI6', 'XeniumPR5S2ROI7', 'XeniumPR5S2ROI9']

test set is  ['XeniumPR5S2ROI8']

Split 20/21
train set is  ['XeniumPR5S1ROI1', 'XeniumPR5S1ROI10', 'XeniumPR5S1ROI2', 'XeniumPR5S1ROI3', 'Xeniu

## XeniumPR1-3_cell

In [27]:
# Build the XeniumPR1_cell benchmark package. The run log below shows that the
# 16 common "genes" of this dataset are cell-type labels (e.g. 'B-cells'), of
# which gene_k=10 are selected.
create_benchmark_data_multislide(
    save_dir="/project/simmons_hts/kxu/hest/eval/data/XeniumPR1_cell",
    K=15,                     # equals the 15 samples discovered in the log
    base_root="/project/simmons_hts/kxu/hest/xenium_data/XeniumPR1_cell",
    gene_k=10,
    gene_criteria="var",
    symlink=False,            # set True to save disk space
    seed=0,
    min_cells_pct=0           # presumably disables the minimum-cells filter — TODO confirm against create_benchmark_data_multislide
)

[INFO] Using slide roots: [PosixPath('/project/simmons_hts/kxu/hest/xenium_data/XeniumPR1_cell/slide1'), PosixPath('/project/simmons_hts/kxu/hest/xenium_data/XeniumPR1_cell/slide2')]
[INFO] Discovered 15 samples: ['XeniumPR1S1ROI1', 'XeniumPR1S1ROI2', 'XeniumPR1S1ROI3', 'XeniumPR1S1ROI4', 'XeniumPR1S1ROI5', 'XeniumPR1S1ROI6', 'XeniumPR1S1ROI7', 'XeniumPR1S1ROI8', 'XeniumPR1S2ROI1', 'XeniumPR1S2ROI2', 'XeniumPR1S2ROI3', 'XeniumPR1S2ROI4', 'XeniumPR1S2ROI5', 'XeniumPR1S2ROI6', 'XeniumPR1S2ROI7']
[32m21:25:40[0m | [1mINFO[0m | [1mFound 16 common genes[0m
[32m21:25:40[0m | [1mINFO[0m | [1mselected genes ['B-cells', 'CD4+_Tcells', 'CD8+_Tcells', 'Glia', 'Lymphatics', 'Macrophages', 'Mast', 'Muscularis', 'Pericytes', 'Plasma'][0m
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumPR1_cell/var_10genes.json (top-10, criteria=var); /project/simmons_hts/kxu/hest/eval/data/XeniumPR1_cell/all_genes.json (all_common=16); /project/simmons_hts/kxu/hest/eval/data/XeniumPR1_cell/com

  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


Split 4/15
train set is  ['XeniumPR1S1ROI1', 'XeniumPR1S1ROI2', 'XeniumPR1S1ROI3', 'XeniumPR1S1ROI4', 'XeniumPR1S1ROI6', 'XeniumPR1S1ROI7', 'XeniumPR1S1ROI8', 'XeniumPR1S2ROI1', 'XeniumPR1S2ROI2', 'XeniumPR1S2ROI3', 'XeniumPR1S2ROI4', 'XeniumPR1S2ROI5', 'XeniumPR1S2ROI6', 'XeniumPR1S2ROI7']

test set is  ['XeniumPR1S1ROI5']

Split 5/15
train set is  ['XeniumPR1S1ROI1', 'XeniumPR1S1ROI2', 'XeniumPR1S1ROI3', 'XeniumPR1S1ROI4', 'XeniumPR1S1ROI5', 'XeniumPR1S1ROI7', 'XeniumPR1S1ROI8', 'XeniumPR1S2ROI1', 'XeniumPR1S2ROI2', 'XeniumPR1S2ROI3', 'XeniumPR1S2ROI4', 'XeniumPR1S2ROI5', 'XeniumPR1S2ROI6', 'XeniumPR1S2ROI7']

test set is  ['XeniumPR1S1ROI6']

Split 6/15
train set is  ['XeniumPR1S1ROI1', 'XeniumPR1S1ROI2', 'XeniumPR1S1ROI3', 'XeniumPR1S1ROI4', 'XeniumPR1S1ROI5', 'XeniumPR1S1ROI6', 'XeniumPR1S1ROI8', 'XeniumPR1S2ROI1', 'XeniumPR1S2ROI2', 'XeniumPR1S2ROI3', 'XeniumPR1S2ROI4', 'XeniumPR1S2ROI5', 'XeniumPR1S2ROI6', 'XeniumPR1S2ROI7']

test set is  ['XeniumPR1S1ROI7']

Split 7/15
train se

In [28]:
# Build the XeniumPR2_cell benchmark package (cell-type labels as features,
# see the "selected genes" line in the run log below).
create_benchmark_data_multislide(
    save_dir="/project/simmons_hts/kxu/hest/eval/data/XeniumPR2_cell",
    K=8,                      # equals the 8 samples discovered in the log (all from slide 1)
    base_root="/project/simmons_hts/kxu/hest/xenium_data/XeniumPR2_cell",
    gene_k=7,
    gene_criteria="var",
    symlink=False,            # set True to save disk space
    seed=0,
    min_cells_pct=0           # log reports filtered_common=16 of 16 with this setting
)

[INFO] Using slide roots: [PosixPath('/project/simmons_hts/kxu/hest/xenium_data/XeniumPR2_cell/slide1'), PosixPath('/project/simmons_hts/kxu/hest/xenium_data/XeniumPR2_cell/slide2')]
[INFO] Discovered 8 samples: ['XeniumPR2S1ROI1', 'XeniumPR2S1ROI2', 'XeniumPR2S1ROI3', 'XeniumPR2S1ROI4', 'XeniumPR2S1ROI5', 'XeniumPR2S1ROI6', 'XeniumPR2S1ROI7', 'XeniumPR2S1ROI8']
[32m21:26:02[0m | [1mINFO[0m | [1mFound 16 common genes[0m
[32m21:26:02[0m | [1mINFO[0m | [1mselected genes ['B-cells', 'CD4+_Tcells', 'CD8+_Tcells', 'DCs', 'Endothelium', 'Epithelium', 'Glia'][0m
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumPR2_cell/var_7genes.json (top-7, criteria=var); /project/simmons_hts/kxu/hest/eval/data/XeniumPR2_cell/all_genes.json (all_common=16); /project/simmons_hts/kxu/hest/eval/data/XeniumPR2_cell/common_genes_0.1.json (filtered_common=16, min_cells_pct=0)
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumPR2_cell/var_7genes.json
Split 0/8
train set is  ['XeniumP

  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


Split 6/8
train set is  ['XeniumPR2S1ROI1', 'XeniumPR2S1ROI2', 'XeniumPR2S1ROI3', 'XeniumPR2S1ROI4', 'XeniumPR2S1ROI5', 'XeniumPR2S1ROI6', 'XeniumPR2S1ROI8']

test set is  ['XeniumPR2S1ROI7']

Split 7/8
train set is  ['XeniumPR2S1ROI1', 'XeniumPR2S1ROI2', 'XeniumPR2S1ROI3', 'XeniumPR2S1ROI4', 'XeniumPR2S1ROI5', 'XeniumPR2S1ROI6', 'XeniumPR2S1ROI7']

test set is  ['XeniumPR2S1ROI8']

[INFO] Wrote 8-fold splits to /project/simmons_hts/kxu/hest/eval/data/XeniumPR2_cell/splits
✅ Benchmark dataset created at /project/simmons_hts/kxu/hest/eval/data/XeniumPR2_cell


In [29]:
# Build the XeniumPR3_cell benchmark package (cell-type labels as features,
# see the "selected genes" line in the run log below).
create_benchmark_data_multislide(
    save_dir="/project/simmons_hts/kxu/hest/eval/data/XeniumPR3_cell",
    K=8,                      # equals the 8 samples discovered in the log (all from slide 1)
    base_root="/project/simmons_hts/kxu/hest/xenium_data/XeniumPR3_cell",
    gene_k=6,
    gene_criteria="var",
    symlink=False,            # set True to save disk space
    seed=0,
    min_cells_pct=0           # log reports filtered_common=16 of 16 with this setting
)

[INFO] Using slide roots: [PosixPath('/project/simmons_hts/kxu/hest/xenium_data/XeniumPR3_cell/slide1'), PosixPath('/project/simmons_hts/kxu/hest/xenium_data/XeniumPR3_cell/slide2')]
[INFO] Discovered 8 samples: ['XeniumPR3S1ROI1', 'XeniumPR3S1ROI2', 'XeniumPR3S1ROI3', 'XeniumPR3S1ROI4', 'XeniumPR3S1ROI5', 'XeniumPR3S1ROI6', 'XeniumPR3S1ROI7', 'XeniumPR3S1ROI8']
[32m21:26:14[0m | [1mINFO[0m | [1mFound 16 common genes[0m
[32m21:26:14[0m | [1mINFO[0m | [1mselected genes ['B-cells', 'CD4+_Tcells', 'CD8+_Tcells', 'DCs', 'Endothelium', 'Epithelium'][0m
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumPR3_cell/var_6genes.json (top-6, criteria=var); /project/simmons_hts/kxu/hest/eval/data/XeniumPR3_cell/all_genes.json (all_common=16); /project/simmons_hts/kxu/hest/eval/data/XeniumPR3_cell/common_genes_0.1.json (filtered_common=16, min_cells_pct=0)
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumPR3_cell/var_6genes.json
Split 0/8
train set is  ['XeniumPR3S1ROI2

  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


Split 5/8
train set is  ['XeniumPR3S1ROI1', 'XeniumPR3S1ROI2', 'XeniumPR3S1ROI3', 'XeniumPR3S1ROI4', 'XeniumPR3S1ROI5', 'XeniumPR3S1ROI7', 'XeniumPR3S1ROI8']

test set is  ['XeniumPR3S1ROI6']

Split 6/8
train set is  ['XeniumPR3S1ROI1', 'XeniumPR3S1ROI2', 'XeniumPR3S1ROI3', 'XeniumPR3S1ROI4', 'XeniumPR3S1ROI5', 'XeniumPR3S1ROI6', 'XeniumPR3S1ROI8']

test set is  ['XeniumPR3S1ROI7']

Split 7/8
train set is  ['XeniumPR3S1ROI1', 'XeniumPR3S1ROI2', 'XeniumPR3S1ROI3', 'XeniumPR3S1ROI4', 'XeniumPR3S1ROI5', 'XeniumPR3S1ROI6', 'XeniumPR3S1ROI7']

test set is  ['XeniumPR3S1ROI8']

[INFO] Wrote 8-fold splits to /project/simmons_hts/kxu/hest/eval/data/XeniumPR3_cell/splits
✅ Benchmark dataset created at /project/simmons_hts/kxu/hest/eval/data/XeniumPR3_cell


# create XeniumPR folder containing all prime runs

In [30]:
# Load the sample directory table; the preview below shows one row per sample
# with, among others, sample_id and patient_id columns.
metadata = pd.read_csv("/project/simmons_hts/kxu/hest/hest_directory.csv")
metadata

Unnamed: 0,sample_id,roi,slide,patient_id,sample_code,panel,technology,run_id,directory,wsi,...,location,sample_name,phenotype_montreal,matched_xenium,num_patches_100um,num_patches_50um,num_patches_50um_0.25_um_px,num_patches_25um,num_patches_25um_0.125_um_px,num_patches_cell_100um
0,XeniumPR1S1ROI1,1,1,CAM006,XEN_ST_5K_SLIDE 1,5k,10x Xenium,PR1,/project/simmons_hts/jpark/1_project/0_xenium/...,CAM006_Xenium5K_post_HnE.ome.tif,...,Colon,,,,684.0,2727.0,2603.0,10534.0,9950.0,608.0
1,XeniumPR1S1ROI2,2,1,TIP877,XEN_ST_5K_SLIDE 1,5k,10x Xenium,PR1,/project/simmons_hts/jpark/1_project/0_xenium/...,TIP877_Xenium5K_post_HnE.ome.tif,...,Colon,,,,482.0,1886.0,1838.0,7393.0,7130.0,429.0
2,XeniumPR1S1ROI3,3,1,GI9389,XEN_ST_5K_SLIDE 1,5k,10x Xenium,PR1,/project/simmons_hts/jpark/1_project/0_xenium/...,GI9389_Xenium5K_post_HnE.ome.tif,...,Colon,,,,1168.0,4627.0,4502.0,18007.0,17368.0,930.0
3,XeniumPR1S1ROI4,4,1,GI9077,XEN_ST_5K_SLIDE 1,5k,10x Xenium,PR1,/project/simmons_hts/jpark/1_project/0_xenium/...,GI9077_Xenium5K_post_HnE.ome.tif,...,Colon,,,,1253.0,5010.0,4903.0,19937.0,19360.0,1064.0
4,XeniumPR1S1ROI5,5,1,GI9612,XEN_ST_5K_SLIDE 1,5k,10x Xenium,PR1,/project/simmons_hts/jpark/1_project/0_xenium/...,GI9612_Xenium5K_post_HnE.ome.tif,...,Colon,,,,893.0,3520.0,3289.0,13692.0,12449.0,756.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,VisiumR6S1ROI4,4,1,JR_23234_23,R6B12,whole transcriptome,10x Visium,R6,/ceph/project/simmons_hts/shared/27_07_2023_CD...,,...,Enterocutaneous,B12,Fistulating (B3),,4213.0,,,,,
151,VisiumR6S2ROI1,1,2,JR_18076_22,R6C1,whole transcriptome,10x Visium,R6,/ceph/project/simmons_hts/shared/27_07_2023_CD...,,...,Anal,C1,Fistulating (B3) A2L2B1p,,3021.0,,,,,
152,VisiumR6S2ROI2,2,2,BAY_105338_20,R6C2,whole transcriptome,10x Visium,R6,/ceph/project/simmons_hts/shared/27_07_2023_CD...,,...,Transsphincteric,C2,Fistulating (B3),,2892.0,,,,,
153,VisiumR6S2ROI3,3,2,BAY_104603_20/JR_20291_22,R6C3,whole transcriptome,10x Visium,R6,/ceph/project/simmons_hts/shared/27_07_2023_CD...,,...,Ileocaecal/Perianal,C3,Fistulating (B3)/Fistulating (B3) A2L1B2p,,3693.0,,,,,


In [31]:
import re
import os
import json
import shutil
from pathlib import Path
from typing import List, Dict, Optional

import numpy as np
import pandas as pd
import scanpy as sc
from hest.utils import get_k_genes  # used by write_var_k_genes_from_paths; create_splits imported inside main


# ---------- helpers ----------

# def _sanitize_tag(s: str, maxlen: int = 8) -> str:
#     s2 = re.sub(r"[^A-Za-z0-9]", "", s)
#     return s2.upper()[:maxlen] or "R"

def _extract_pr_number(path: Path) -> Optional[int]:
    m = re.search(r"XeniumPR(\d+)", str(path), flags=re.IGNORECASE)
    return int(m.group(1)) if m else None

def _extract_slide_number_from_name(name: str) -> Optional[str]:
    """Extract slide number from a folder name like 'slide1', 'slide_2', 'S3', etc."""
    n = name.lower()
    m = re.search(r"slide[_\-]?(\d+)", n)
    if m:
        return m.group(1)
    m2 = re.search(r"\bS(\d+)\b", name, flags=re.IGNORECASE)
    if m2:
        return m2.group(1)
    return None

def _is_slide_like_folder(folder: Path) -> bool:
    """Return True if folder name looks like a slide folder and it contains subfolders."""
    if not folder.is_dir():
        return False
    if _extract_slide_number_from_name(folder.name) is not None:
        return True
    # also treat names starting with 'slide' case-insensitively
    if folder.name.lower().startswith("slide"):
        return True
    return False

def _expand_to_slide_paths(roots: List[Path]) -> List[Path]:
    """
    Given a list of roots, expand any PR-level root that contains slide-like subfolders
    into a list of slide paths. If a root already looks like a slide (contains aligned_adata.h5ad
    in its immediate subfolders), it's kept as-is.
    """
    expanded: List[Path] = []
    for r in roots:
        if not r.exists() or not r.is_dir():
            continue
        # find immediate subdirectories
        immediate_subdirs = sorted([d for d in r.iterdir() if d.is_dir()], key=lambda p: p.name)
        # if any immediate subdir looks like a slide and that slide has child sample folders, expand
        slide_candidates = [d for d in immediate_subdirs if _is_slide_like_folder(d)]
        if slide_candidates:
            # For each slide candidate, add it (but only if it contains sample subfolders)
            for s in slide_candidates:
                # if s contains at least one subdir with aligned_adata.h5ad, keep it
                has_sample_subdir = any((sd / "aligned_adata.h5ad").exists() for sd in sorted([d for d in s.iterdir() if d.is_dir()]))
                if has_sample_subdir:
                    expanded.append(s)
                else:
                    # if slide folder itself directly contains aligned_adata.h5ad files (uncommon), treat slide as sample root
                    if any((s / f).is_file() and f.endswith(".h5ad") for f in os.listdir(s)):
                        expanded.append(s)
        else:
            # No slide-like immediate subdirs. Check if this root itself directly contains sample subfolders (with aligned_adata.h5ad)
            has_direct_samples = any((d / "aligned_adata.h5ad").exists() for d in immediate_subdirs)
            if has_direct_samples:
                expanded.append(r)
            else:
                # fallback: if immediate_subdirs is non-empty, treat each immediate subdir as a slide candidate
                for d in immediate_subdirs:
                    if any((sd / "aligned_adata.h5ad").exists() for sd in sorted([sd for sd in d.iterdir() if sd.is_dir()])):
                        expanded.append(d)
    # dedupe while preserving order
    seen = set()
    uniq = []
    for p in expanded:
        if str(p) not in seen:
            uniq.append(p)
            seen.add(str(p))
    return uniq


def _discover_samples_from_slide_paths(
    slide_paths: List[Path],
    ids: Optional[List[str]] = None,
) -> Dict[str, Dict[str, Path]]:
    """
    Discover samples under *slide* paths (each slide path should contain sample subfolders).
    Returns mapping {new_id: {"adata": Path, "patch": Path|None, "vis": Path|None, "orig": orig_sid}}
    Naming: XeniumPR{n}S{slide}{ROI}
    """
    collected = []
    for sp in slide_paths:
        # sample subfolders are immediate children of the slide path
        for p in sorted([d for d in sp.iterdir() if d.is_dir()], key=lambda d: d.name):
            collected.append((sp, p.name))

    if ids is not None:
        # filter collected by ids list
        collected = [(sp, sid) for sp, sid in collected if sid in ids]

    samples: Dict[str, Dict[str, Path]] = {}
    for slide_path, sid in collected:
        sdir = slide_path / sid
        adata = sdir / "aligned_adata.h5ad"
        if not adata.exists():
            continue

        # find patch .h5 (optional)
        patch_h5 = None
        patches_dir = sdir / "patches"
        if patches_dir.exists():
            cands = sorted(patches_dir.glob("*.h5"))
            if cands:
                exact = [c for c in cands if c.name == f"{sid}.h5"]
                patch_h5 = exact[0] if exact else cands[0]

        # find vis png (optional)
        vis_png = None
        vis_dir = sdir / "patches_vis"
        if vis_dir.exists():
            cands = sorted(vis_dir.glob("*.png"))
            if cands:
                exact = [c for c in cands if c.name == f"{sid}_patch_vis.png"]
                vis_png = exact[0] if exact else cands[0]

        # determine PR number from slide_path or its ancestors
        pr_num = _extract_pr_number(slide_path)
        # if not present, try ancestors
        if pr_num is None:
            for ancestor in slide_path.parents:
                pr_num = _extract_pr_number(ancestor)
                if pr_num is not None:
                    break

        # slide number from slide path name (if can't find, use sanitized short tag)
        slide_num = _extract_slide_number_from_name(slide_path.name) or _sanitize_tag(slide_path.name, maxlen=3)

        if pr_num is not None:
            prefix = f"XeniumPR{pr_num}S{slide_num}"
        else:
            # fallback if no PR number found anywhere upstream
            prefix = f"{_sanitize_tag(slide_path.name, maxlen=6)}S{slide_num}"

        new_id = f"{prefix}{sid}"
        if new_id in samples:
            raise ValueError(f"Duplicate renamed sample id '{new_id}' (collision for sid='{sid}').")

        samples[new_id] = {"adata": adata, "patch": patch_h5, "vis": vis_png, "orig": sid}

    return samples


# copy directly from other eval folder
def create_benchmark_from_eval_dirs(
    save_dir: str | Path,
    K: int,
    eval_dirs: List[str | Path],
    gene_k: int = 50,
    gene_criteria: str = "var",
    min_cells_pct: float = 0.10,
    symlink: bool = False,
    seed: int = 0,
    metadata_csv: str = "/project/simmons_hts/kxu/hest/hest_directory.csv",
    dry_run: bool = False,
    exclude_ids: Optional[List[str]] = None
):
    """
    Build a merged benchmark package by copying (or symlinking) assets from one or more
    'eval' dataset folders that already contain:
        <eval_dir>/
            patches/
                *.h5
                vis/
                    *.png
            adata/
                *.h5ad

    Args:
        save_dir: destination directory to create merged dataset (will contain patches/, patches/vis/, adata/, splits/, var_*.json)
        K: number of folds (patient-level)
        eval_dirs: list of dataset root paths to copy from (e.g. XeniumPR2 eval folder)
        gene_k, gene_criteria: forwarded to get_k_genes
        symlink: if True, create symlinks instead of copying
        seed: RNG seed for deterministic fold assignment
        metadata_csv: CSV mapping sample_id -> patient_id
        dry_run: if True, only print planned actions without copying
    Returns:
        pd.DataFrame meta (columns: id, patient, dataset_title)
    """
    from hest.HESTData import create_splits

    save_dir = Path(save_dir)
    eval_dirs = [Path(x) for x in eval_dirs]
    # sanitise and check inputs
    existing = [d for d in eval_dirs if d.exists() and d.is_dir()]
    if not existing:
        raise ValueError(f"No valid eval_dirs found among: {eval_dirs}")
    print(f"[INFO] Using eval dirs: {existing}")

    # discover sample ids by scanning adata/ and patches/ for filenames
    discovered_ids = set()
    sample_sources = {}  # id -> dict(sources found)
    for d in existing:
        adata_dir = d / "adata"
        patches_dir = d / "patches"
        vis_dir = patches_dir / "vis"

        # adata
        if adata_dir.exists() and adata_dir.is_dir():
            for f in sorted(adata_dir.glob("*.h5ad")):
                sid = f.stem
                discovered_ids.add(sid)
                sample_sources.setdefault(sid, {}).setdefault("adata", []).append(f)

        # patches
        if patches_dir.exists() and patches_dir.is_dir():
            for f in sorted(patches_dir.glob("*.h5")):
                sid = f.stem
                discovered_ids.add(sid)
                sample_sources.setdefault(sid, {}).setdefault("patch", []).append(f)

            # vis images
            if vis_dir.exists() and vis_dir.is_dir():
                for f in sorted(vis_dir.glob("*.png")):
                    # allow vis file names like '<sid>_patch_vis.png' or '<sid>.png' or anything; map by stem heuristics
                    stem = f.stem
                    # normalize: if stem endswith '_patch_vis', strip it
                    stem_clean = re.sub(r"_?patch_vis$", "", stem, flags=re.IGNORECASE)
                    # sometimes vis is named '<sid>_patch_vis' or '<sid>'
                    sid = stem_clean
                    discovered_ids.add(sid)
                    sample_sources.setdefault(sid, {}).setdefault("vis", []).append(f)
                    
    # ---- Apply exclusion ----
    if exclude_ids:
        exclude_set = set(exclude_ids)
        before = len(discovered_ids)
        discovered_ids = [sid for sid in discovered_ids if sid not in exclude_set]

        missing_excludes = exclude_set - set(discovered_ids)
        if missing_excludes:
            print(f"[WARN] Some exclude_ids not found: {sorted(missing_excludes)}")

        removed = before - len(discovered_ids)
        print(f"[INFO] Excluded {removed} samples → remaining {len(discovered_ids)}")
        if removed > 0:
            for e in sorted(exclude_set & set(discovered_ids)):
                print(f"   - excluded: {e}")

    discovered_ids = sorted(discovered_ids)
    if not discovered_ids:
        raise ValueError("No samples discovered in provided eval_dirs (no *.h5ad or *.h5 files found).")
    print(f"[INFO] Discovered sample IDs ({len(discovered_ids)}): {discovered_ids}")

    # Prepare save_dir layout
    patches_out = save_dir / "patches"
    patches_vis_out = patches_out / "vis"
    adata_out = save_dir / "adata"
    for p in (patches_out, patches_vis_out, adata_out):
        if not dry_run:
            p.mkdir(parents=True, exist_ok=True)

    # Load metadata CSV mapping sample_id -> patient_id
    patient_map = {}
    meta_df_csv = None
    if Path(metadata_csv).exists():
        meta_df_csv = pd.read_csv(metadata_csv, dtype=str)
        if {"sample_id", "patient_id"}.issubset(meta_df_csv.columns):
            meta_df_csv["sample_id"] = meta_df_csv["sample_id"].astype(str).str.strip()
            meta_df_csv["patient_id"] = meta_df_csv["patient_id"].astype(str).str.strip()
            patient_map = dict(zip(meta_df_csv["sample_id"], meta_df_csv["patient_id"]))
            print(f"[INFO] Loaded {len(patient_map)} entries from {metadata_csv}")
        else:
            print(f"[WARN] metadata_csv missing columns 'sample_id'/'patient_id'; will fallback to automatic patient inference")
    else:
        print(f"[WARN] metadata_csv not found: {metadata_csv}; will fallback to automatic patient inference")

    # Copy / symlink files into save_dir using sample id as filename stem
    missing = []
    planned_actions = []
    for sid in discovered_ids:
        srcs = sample_sources.get(sid, {})
        # choose one adata: prefer first available
        adata_src = None
        if "adata" in srcs and srcs["adata"]:
            adata_src = srcs["adata"][0]
        # else fallback to none

        patch_src = None
        if "patch" in srcs and srcs["patch"]:
            patch_src = srcs["patch"][0]

        # vis: there may be multiple pngs per sample across eval_dirs — keep all but use a standardized name
        vis_srcs = srcs.get("vis", [])

        # plan copy/symlink
        if adata_src:
            dst = adata_out / f"{sid}.h5ad"
            planned_actions.append(("adata", adata_src, dst))
        else:
            # warn — adata missing for this sid
            missing.append((sid, "adata"))

        if patch_src:
            dst = patches_out / f"{sid}.h5"
            planned_actions.append(("patch", patch_src, dst))
        else:
            missing.append((sid, "patch"))

        # for vis, when multiple sources exist, copy each with a numeric suffix if needed
        for i, vs in enumerate(vis_srcs, start=1):
            # try base name '<sid>.png' then '<sid>_1.png', '<sid>_2.png'...
            if i == 1:
                dst = patches_vis_out / f"{sid}.png"
            else:
                dst = patches_vis_out / f"{sid}_{i}.png"
            planned_actions.append(("vis", vs, dst))

    # Show dry run summary
    print(f"[INFO] Planned actions: {len(planned_actions)} file operations; {len(missing)} missing types.")
    if dry_run:
        for act, src, dst in planned_actions[:200]:
            print(f"  - [{act}] {src} -> {dst}")
        if missing:
            print("[WARN] Missing items:")
            for sid, typ in missing[:50]:
                print(f"  - {sid}: missing {typ}")
        print("[INFO] dry_run=True → no files were copied.")
    else:
        # perform file ops
        for act, src, dst in planned_actions:
            try:
                _transfer(src, dst, act, symlink, [])  # we pass temporary missing list per transfer
            except Exception as e:
                print(f"[ERROR] transferring {src} -> {dst}: {e}")

    # Build metadata DataFrame: use discovered sample IDs and patient mapping (full sample id)
    patient_ids = []
    unresolved = []
    for sid in discovered_ids:
        pid = patient_map.get(sid)
        if pid is None:
            # fallback: try a stem match where original source had 'orig' info: try to find sample with full stem in filenames
            # attempt to match any filename that contains sid as suffix: useful if CSV used 'XeniumPR1S1ROI1' but discovered was 'ROI1' etc.
            # we'll try simple heuristics:
            matched = None
            if meta_df_csv is not None:
                # try find any csv sample_id that endswith sid
                candidates = [s for s in meta_df_csv["sample_id"].values if str(s).endswith(str(sid))]
                if candidates:
                    matched = candidates[0]
                    pid = patient_map.get(matched)
            if pid is None:
                # fallback to using prefix before '_' or the sid itself as patient
                pid = sid.split("_")[0] if "_" in sid else sid
                unresolved.append(sid)
        patient_ids.append(pid)

    meta = pd.DataFrame({"id": discovered_ids, "patient": patient_ids, "dataset_title": ["XeniumPR"] * len(discovered_ids)})

    print(f"[INFO] Built metadata: {len(meta)} samples, {meta['patient'].nunique()} unique patients.")
    print(meta.head(20).to_string(index=False))

    # write var_k genes (requires adata files to be present in save_dir or accessible)
    adata_paths = [adata_out / f"{sid}.h5ad" for sid in discovered_ids]
    # If dry_run, don't run get_k_genes; just return meta
    if dry_run:
        if unresolved:
            print(f"[WARN] {len(unresolved)} samples unresolved from metadata CSV (used fallback patient ids).")
        return meta

    var_json = save_dir / f"var_{gene_k}genes.json"
    write_var_k_genes_from_paths(adata_paths, gene_k, gene_criteria, min_cells_pct, var_json)
    print(f"[INFO] Wrote {var_json}")

    # patient-level splits
    group = meta.groupby(["dataset_title", "patient"])["id"].agg(list).to_dict()
    rng = np.random.RandomState(seed)
    for key, id_list in group.items():
        rng.shuffle(id_list)

    splits_dir = save_dir / "splits"
    splits_dir.mkdir(parents=True, exist_ok=True)
    create_splits(str(splits_dir), group, K=K)
    print(f"[INFO] Wrote {K}-fold patient-level splits to {splits_dir}")

    # final warnings about missing files
    if missing:
        print("[WARN] Some samples were missing adata/patch files (listing up to 50):")
        for sid, typ in missing[:50]:
            print(f"  - {sid}: missing {typ}")

    print(f"✅ Merged benchmark created at {save_dir}")
    return meta


In [21]:
# Merge the existing XeniumPR3, XeniumPR2 and XeniumPR1 eval folders into one
# package with K=10 patient-level folds.
meta = create_benchmark_from_eval_dirs(
    save_dir="/project/simmons_hts/kxu/hest/eval/data/XeniumPR1-3",
    K=10,
    eval_dirs=["/project/simmons_hts/kxu/hest/eval/data/XeniumPR3",
               "/project/simmons_hts/kxu/hest/eval/data/XeniumPR2",
               "/project/simmons_hts/kxu/hest/eval/data/XeniumPR1",
    ],
    gene_k=50,
    symlink=False,
    seed=0,
    metadata_csv="/project/simmons_hts/kxu/hest/hest_directory.csv",
)

SyntaxError: unterminated string literal (detected at line 6) (19163659.py, line 6)

In [33]:
# Merge the XeniumPR4 and XeniumPR5 eval folders into one package.
# NOTE(review): the run log below reports that K=15 does not match the number
# of patients and then prints "Split 0/21" — confirm the intended fold count.
meta = create_benchmark_from_eval_dirs(
    save_dir="/project/simmons_hts/kxu/hest/eval/data/XeniumPR4-5",
    K=15,
    eval_dirs=["/project/simmons_hts/kxu/hest/eval/data/XeniumPR4",
               "/project/simmons_hts/kxu/hest/eval/data/XeniumPR5",
    ],
    gene_k=50,
    symlink=False,
    seed=0,
    metadata_csv="/project/simmons_hts/kxu/hest/hest_directory.csv",
)

[INFO] Using eval dirs: [PosixPath('/project/simmons_hts/kxu/hest/eval/data/XeniumPR4'), PosixPath('/project/simmons_hts/kxu/hest/eval/data/XeniumPR5')]
[INFO] Discovered sample IDs (41): ['XeniumPR4S1ROI1', 'XeniumPR4S1ROI10', 'XeniumPR4S1ROI2', 'XeniumPR4S1ROI3', 'XeniumPR4S1ROI4', 'XeniumPR4S1ROI5', 'XeniumPR4S1ROI6', 'XeniumPR4S1ROI7', 'XeniumPR4S1ROI8', 'XeniumPR4S1ROI9', 'XeniumPR4S2ROI1', 'XeniumPR4S2ROI10', 'XeniumPR4S2ROI2', 'XeniumPR4S2ROI3', 'XeniumPR4S2ROI4', 'XeniumPR4S2ROI5', 'XeniumPR4S2ROI6', 'XeniumPR4S2ROI7', 'XeniumPR4S2ROI8', 'XeniumPR4S2ROI9', 'XeniumPR5S1ROI1', 'XeniumPR5S1ROI10', 'XeniumPR5S1ROI2', 'XeniumPR5S1ROI3', 'XeniumPR5S1ROI4', 'XeniumPR5S1ROI5', 'XeniumPR5S1ROI6', 'XeniumPR5S1ROI7', 'XeniumPR5S1ROI8', 'XeniumPR5S1ROI9', 'XeniumPR5S2ROI1', 'XeniumPR5S2ROI10', 'XeniumPR5S2ROI11', 'XeniumPR5S2ROI2', 'XeniumPR5S2ROI3', 'XeniumPR5S2ROI4', 'XeniumPR5S2ROI5', 'XeniumPR5S2ROI6', 'XeniumPR5S2ROI7', 'XeniumPR5S2ROI8', 'XeniumPR5S2ROI9']
[INFO] Loaded 120 entries f

  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


[32m16:57:24[0m | [1mINFO[0m | [1mselected genes ['ADAM8', 'ADAMDEC1', 'AQP10', 'CCL28', 'CD4', 'CD55', 'CHGA', 'COL18A1', 'COL4A1', 'COL4A2', 'COL5A1', 'DEFA6', 'DERL3', 'DKC1', 'EPHB2', 'GBP1', 'GIP', 'IFI27', 'IGFBP2', 'JAK3', 'LAMA3', 'LRPPRC', 'MCM3', 'MCM5', 'MSI1', 'MUC2', 'NR4A1', 'OAS3', 'PIEZO1', 'PIK3R2', 'PIM2', 'PLA2G2A', 'PLAUR', 'REG1A', 'REG4', 'RRM1', 'SCNN1A', 'SLC11A2', 'SLC12A2', 'SLC20A2', 'SORD', 'SST', 'STAB1', 'TCF21', 'TENT5C', 'TMPRSS2', 'TNFAIP2', 'TNXB', 'TRAP1', 'TSPAN8'][0m
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumPR4-5/var_50genes.json (top-50, criteria=var); /project/simmons_hts/kxu/hest/eval/data/XeniumPR4-5/all_genes.json (all_common=4891); /project/simmons_hts/kxu/hest/eval/data/XeniumPR4-5/common_genes_0.1.json (filtered_common=1473, min_cells_pct=0.1)
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumPR4-5/var_50genes.json
K=15 doesnt match the number of patients, try to distribute the patients instead
Split 0/21
tra

### Create XeniumPR external validation splits 

In [32]:
import pandas as pd
from pathlib import Path
import re

# Create one folder holding all 5 Xenium PRs,
# then manually overwrite the train and test files so that training is on coeliac
# and testing on IBD, or vice versa.
# K=2 only produces the train_0/test_0 and train_1/test_1 files that are read back below.
meta = create_benchmark_from_eval_dirs(
    save_dir="/project/simmons_hts/kxu/hest/eval/data/XeniumPR1-5_ex_val",
    K=2,
    eval_dirs=["/project/simmons_hts/kxu/hest/eval/data/XeniumPR4",
               "/project/simmons_hts/kxu/hest/eval/data/XeniumPR5",
               "/project/simmons_hts/kxu/hest/eval/data/XeniumPR3",
               "/project/simmons_hts/kxu/hest/eval/data/XeniumPR2",
               "/project/simmons_hts/kxu/hest/eval/data/XeniumPR1_segger"
    ],
    gene_k=50,
    symlink=False,
    seed=0,
    metadata_csv="/project/simmons_hts/kxu/hest/hest_directory.csv",
)

# Folder holding the train_*/test_* CSVs of the external-validation benchmark
splits_dir = Path("/project/simmons_hts/kxu/hest/eval/data/XeniumPR1-5_ex_val/splits")


def load_csv(name):
    """Return the 'sample_id' column of the split file `name` (inside `splits_dir`) as a list."""
    table = pd.read_csv(splits_dir / name)
    sample_column = table['sample_id']
    return sample_column.tolist()

# Read back the four split files that were just generated.
train0, test0, train1, test1 = (
    load_csv(fname)
    for fname in ("train_0.csv", "test_0.csv", "train_1.csv", "test_1.csv")
)

# Every sample that appears in any split, de-duplicated and ordered.
all_samples = sorted({*train0, *test0, *train1, *test1})

# Bucket each sample by the run number embedded in its ID:
# PR1-3 in one group, PR4-5 in the other, anything else in `other`.
pr1_3, pr4_5, other = [], [], []
run_re = re.compile(r"XeniumPR(\d+)", re.IGNORECASE)
for s in all_samples:
    m = run_re.search(s)
    if m and 1 <= int(m.group(1)) <= 3:
        pr1_3.append(s)
    elif m and 4 <= int(m.group(1)) <= 5:
        pr4_5.append(s)
    else:
        other.append(s)

def make_df(samples):
    """Build the 3-column split table (sample_id, patches_path, expr_path) for `samples`."""
    frame = pd.DataFrame({"sample_id": samples})
    ids = frame["sample_id"]
    # Paths are relative to the benchmark root folder.
    frame["patches_path"] = [f"patches/{sid}.h5" for sid in ids]
    frame["expr_path"] = [f"adata/{sid}.h5ad" for sid in ids]
    return frame

# Desired external-validation splits:
#   split 0: train on PR1-3, test on PR4-5
#   split 1: train on PR4-5, test on PR1-3
df_train_0, df_test_0 = make_df(pr1_3), make_df(pr4_5)
df_train_1, df_test_1 = make_df(pr4_5), make_df(pr1_3)

# Overwrite the generated split files (comma-separated, no index column).
split_tables = {
    "train_0.csv": df_train_0,
    "test_0.csv": df_test_0,
    "train_1.csv": df_train_1,
    "test_1.csv": df_test_1,
}
for fname, table in split_tables.items():
    table.to_csv(splits_dir / fname, index=False, sep=",")

print("Wrote:")
for fname, table in split_tables.items():
    # "test_*" names are one character shorter; the extra space keeps the row counts aligned.
    gap = "  " if fname.startswith("test") else " "
    print(f" - {splits_dir / fname}{gap}({len(table)} rows)")
if other:
    print(f"\nNote: {len(other)} samples did not match PR1-5 and were not included by default:\n {other}")

[INFO] Using eval dirs: [PosixPath('/project/simmons_hts/kxu/hest/eval/data/XeniumPR4'), PosixPath('/project/simmons_hts/kxu/hest/eval/data/XeniumPR5'), PosixPath('/project/simmons_hts/kxu/hest/eval/data/XeniumPR3'), PosixPath('/project/simmons_hts/kxu/hest/eval/data/XeniumPR2'), PosixPath('/project/simmons_hts/kxu/hest/eval/data/XeniumPR1_segger')]
[INFO] Discovered sample IDs (72): ['XeniumPR1S1ROI1', 'XeniumPR1S1ROI2', 'XeniumPR1S1ROI3', 'XeniumPR1S1ROI4', 'XeniumPR1S1ROI5', 'XeniumPR1S1ROI6', 'XeniumPR1S1ROI7', 'XeniumPR1S1ROI8', 'XeniumPR1S2ROI1', 'XeniumPR1S2ROI2', 'XeniumPR1S2ROI3', 'XeniumPR1S2ROI4', 'XeniumPR1S2ROI5', 'XeniumPR1S2ROI6', 'XeniumPR1S2ROI7', 'XeniumPR2S1ROI1', 'XeniumPR2S1ROI2', 'XeniumPR2S1ROI3', 'XeniumPR2S1ROI4', 'XeniumPR2S1ROI5', 'XeniumPR2S1ROI6', 'XeniumPR2S1ROI7', 'XeniumPR2S1ROI8', 'XeniumPR3S1ROI1', 'XeniumPR3S1ROI2', 'XeniumPR3S1ROI3', 'XeniumPR3S1ROI4', 'XeniumPR3S1ROI5', 'XeniumPR3S1ROI6', 'XeniumPR3S1ROI7', 'XeniumPR3S1ROI8', 'XeniumPR4S1ROI1', 'Xen

  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


[32m14:59:42[0m | [1mINFO[0m | [1mselected genes ['ACE', 'CAD', 'CAMK2D', 'CFB', 'CIITA', 'CSK', 'CTSH', 'DDR1', 'DNMT1', 'ERN1', 'F3', 'FLNB', 'FUCA1', 'GIMAP5', 'GIPC1', 'GNA11', 'HPGD', 'MCM3', 'MCM5', 'MCM7', 'MDM2', 'MEN1', 'MYO1E', 'NQO1', 'PARP1', 'PIM2', 'PLAUR', 'PPIF', 'PTCH1', 'PTPN6', 'RPA3', 'RRM1', 'SELENBP1', 'SEMA4D', 'SFXN1', 'SGK1', 'SHMT2', 'SHROOM1', 'SLC11A2', 'SLC12A2', 'SLC16A1', 'SLC20A2', 'SMAP2', 'SNHG1', 'SRI', 'TERF2', 'TNK2', 'UGDH', 'WDR74', 'WEE1'][0m
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumPR1-5_ex_val/var_50genes.json (top-50, criteria=var); /project/simmons_hts/kxu/hest/eval/data/XeniumPR1-5_ex_val/all_genes.json (all_common=4842); /project/simmons_hts/kxu/hest/eval/data/XeniumPR1-5_ex_val/common_genes_0.1.json (filtered_common=1222, min_cells_pct=0.1)
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumPR1-5_ex_val/var_50genes.json
K=2 doesnt match the number of patients, try to distribute the patients instead
Split 0/2

### Create leave-one-patient-out cross-validation

In [23]:
# Leave-one-patient-out benchmark over the XeniumPR1-3 eval directories.
# K=29 is presumably the number of distinct patients listed in metadata_csv — confirm.
# NOTE(review): the recorded output of this cell lists XeniumPR1_segger, not XeniumPR1,
# among the eval dirs — confirm which directory is intended before re-running.
meta = create_benchmark_from_eval_dirs(
    save_dir="/project/simmons_hts/kxu/hest/eval/data/XeniumPR_LOOCV",
    K=29,
    eval_dirs=["/project/simmons_hts/kxu/hest/eval/data/XeniumPR3",
               "/project/simmons_hts/kxu/hest/eval/data/XeniumPR2",
               "/project/simmons_hts/kxu/hest/eval/data/XeniumPR1",
    ],
    gene_k=50,
    symlink=False,
    seed=0,
    metadata_csv="/project/simmons_hts/kxu/hest/hest_directory.csv",
)

[INFO] Using eval dirs: [PosixPath('/project/simmons_hts/kxu/hest/eval/data/XeniumPR3'), PosixPath('/project/simmons_hts/kxu/hest/eval/data/XeniumPR2'), PosixPath('/project/simmons_hts/kxu/hest/eval/data/XeniumPR1_segger')]
[INFO] Discovered sample IDs (31): ['XeniumPR1S1ROI1', 'XeniumPR1S1ROI2', 'XeniumPR1S1ROI3', 'XeniumPR1S1ROI4', 'XeniumPR1S1ROI5', 'XeniumPR1S1ROI6', 'XeniumPR1S1ROI7', 'XeniumPR1S1ROI8', 'XeniumPR1S2ROI1', 'XeniumPR1S2ROI2', 'XeniumPR1S2ROI3', 'XeniumPR1S2ROI4', 'XeniumPR1S2ROI5', 'XeniumPR1S2ROI6', 'XeniumPR1S2ROI7', 'XeniumPR2S1ROI1', 'XeniumPR2S1ROI2', 'XeniumPR2S1ROI3', 'XeniumPR2S1ROI4', 'XeniumPR2S1ROI5', 'XeniumPR2S1ROI6', 'XeniumPR2S1ROI7', 'XeniumPR2S1ROI8', 'XeniumPR3S1ROI1', 'XeniumPR3S1ROI2', 'XeniumPR3S1ROI3', 'XeniumPR3S1ROI4', 'XeniumPR3S1ROI5', 'XeniumPR3S1ROI6', 'XeniumPR3S1ROI7', 'XeniumPR3S1ROI8']
[INFO] Loaded 79 entries from /project/simmons_hts/kxu/hest/hest_directory.csv
[INFO] Planned actions: 109 file operations; 0 missing types.
[INFO] Bui

  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


[32m18:32:16[0m | [1mINFO[0m | [1mselected genes ['ACE', 'ANXA1', 'ANXA3', 'APCDD1', 'BAG3', 'BCL6', 'CCL21', 'CDCA7L', 'DGAT1', 'DNMT1', 'EMP1', 'EPHX2', 'FSCN1', 'FUCA1', 'G0S2', 'GIMAP5', 'H19', 'LMO2', 'LPIN1', 'MALL', 'MAP4K2', 'MCM5', 'MCM6', 'NCF1', 'NFATC1', 'NPM3', 'NQO1', 'ORAI2', 'PHGDH', 'PIK3CD', 'PLAUR', 'PPP1R16B', 'PRKCB', 'PTGS2', 'PTPN6', 'PYGL', 'RACGAP1', 'RBM38', 'RFTN1', 'S100B', 'SELENBP1', 'SERPINA3', 'SLC12A2', 'SLC16A1', 'SNX29', 'SPP1', 'TOX', 'TUBA4A', 'TUBB2A', 'TWIST2'][0m
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumPR_LOOCV/var_50genes.json (top-50, criteria=var); /project/simmons_hts/kxu/hest/eval/data/XeniumPR_LOOCV/all_genes.json (all_common=5100); /project/simmons_hts/kxu/hest/eval/data/XeniumPR_LOOCV/common_genes_0.1.json (filtered_common=2155, min_cells_pct=0.1)
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumPR_LOOCV/var_50genes.json
Split 0/29
train set is  ['XeniumPR2S1ROI4', 'XeniumPR2S1ROI5', 'XeniumPR3S1ROI1', '

### create XeniumPR1-3_cell

In [41]:
# Benchmark built from the "*_cell" variants of the XeniumPR1-3 eval directories.
# Only 8 genes are requested (gene_k=8) and the per-sample expression threshold is
# lowered to 1% of cells (min_cells_pct=0.01) instead of the 0.1 used by the other runs.
meta = create_benchmark_from_eval_dirs(
    save_dir="/project/simmons_hts/kxu/hest/eval/data/XeniumPR1-3_cell",
    K=15,
    eval_dirs=["/project/simmons_hts/kxu/hest/eval/data/XeniumPR3_cell",
               "/project/simmons_hts/kxu/hest/eval/data/XeniumPR2_cell",
               "/project/simmons_hts/kxu/hest/eval/data/XeniumPR1_cell",
    ],
    gene_k=8,
    min_cells_pct = 0.01,
    symlink=False,
    seed=0,
    metadata_csv="/project/simmons_hts/kxu/hest/hest_directory.csv",
)

[INFO] Using eval dirs: [PosixPath('/project/simmons_hts/kxu/hest/eval/data/XeniumPR3_cell'), PosixPath('/project/simmons_hts/kxu/hest/eval/data/XeniumPR2_cell'), PosixPath('/project/simmons_hts/kxu/hest/eval/data/XeniumPR1_cell')]
[INFO] Discovered sample IDs (31): ['XeniumPR1S1ROI1', 'XeniumPR1S1ROI2', 'XeniumPR1S1ROI3', 'XeniumPR1S1ROI4', 'XeniumPR1S1ROI5', 'XeniumPR1S1ROI6', 'XeniumPR1S1ROI7', 'XeniumPR1S1ROI8', 'XeniumPR1S2ROI1', 'XeniumPR1S2ROI2', 'XeniumPR1S2ROI3', 'XeniumPR1S2ROI4', 'XeniumPR1S2ROI5', 'XeniumPR1S2ROI6', 'XeniumPR1S2ROI7', 'XeniumPR2S1ROI1', 'XeniumPR2S1ROI2', 'XeniumPR2S1ROI3', 'XeniumPR2S1ROI4', 'XeniumPR2S1ROI5', 'XeniumPR2S1ROI6', 'XeniumPR2S1ROI7', 'XeniumPR2S1ROI8', 'XeniumPR3S1ROI1', 'XeniumPR3S1ROI2', 'XeniumPR3S1ROI3', 'XeniumPR3S1ROI4', 'XeniumPR3S1ROI5', 'XeniumPR3S1ROI6', 'XeniumPR3S1ROI7', 'XeniumPR3S1ROI8']
[INFO] Loaded 155 entries from /project/simmons_hts/kxu/hest/hest_directory.csv
[INFO] Planned actions: 93 file operations; 0 missing types.
[I

  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumPR1-3_cell/var_8genes.json (top-8, criteria=var); /project/simmons_hts/kxu/hest/eval/data/XeniumPR1-3_cell/all_genes.json (all_common=16); /project/simmons_hts/kxu/hest/eval/data/XeniumPR1-3_cell/common_genes_0.01.json (filtered_common=8, min_cells_pct=0.01)
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumPR1-3_cell/var_8genes.json
K=15 doesnt match the number of patients, try to distribute the patients instead
Split 0/15
train set is  ['XeniumPR2S1ROI4', 'XeniumPR2S1ROI5', 'XeniumPR3S1ROI1', 'XeniumPR1S2ROI1', 'XeniumPR1S1ROI1', 'XeniumPR3S1ROI4', 'XeniumPR3S1ROI6', 'XeniumPR3S1ROI5', 'XeniumPR1S2ROI4', 'XeniumPR1S2ROI3', 'XeniumPR1S1ROI4', 'XeniumPR1S1ROI3', 'XeniumPR1S1ROI5', 'XeniumPR1S1ROI7', 'XeniumPR1S2ROI2', 'XeniumPR3S1ROI3', 'XeniumPR2S1ROI3', 'XeniumPR3S1ROI8', 'XeniumPR2S1ROI1', 'XeniumPR2S1ROI2', 'XeniumPR1S1ROI8', 'XeniumPR1S1ROI6', 'XeniumPR1S2ROI5', 'XeniumPR1S2ROI6', 'XeniumPR3S1ROI7', 'XeniumPR1S2RO

# Construct 480 panel folders

In [4]:
# --- Standard library ---
import os
import json
import re
import shutil
from pathlib import Path
from typing import List, Dict, Tuple, Optional

# --- Third-party ---
import numpy as np
import pandas as pd
import scanpy as sc

# --- HEST ---
from hest import iter_hest
from hest.utils import get_k_genes
from hest.HESTData import create_splits


# ---------- helpers ----------
def _sanitize_tag(s: str, maxlen: int = 8) -> str:
    """
    Turn an arbitrary string into a short upper-case ASCII alphanumeric tag.

    Non-alphanumeric characters are dropped, the result is truncated to
    `maxlen` characters, and "R" is returned when nothing is left.
    """
    alnum_only = re.sub(r'[^A-Za-z0-9]', '', s)
    tag = alnum_only[:maxlen].upper()
    if tag:
        return tag
    return "R"


def _extract_run_with_pattern(path: Path, pattern: str) -> Optional[str]:
    r"""
    Extract a 'run' token from `path` using the user-supplied regex `pattern`.

    The pattern is searched case-insensitively in `str(path)` and should contain a
    capturing group that yields the run label/number.

    Examples:
      pattern = r'XeniumPR(\d)'        -> matches 'XeniumPR1' and returns '1'
      pattern = r'XeniumR(\d+)'        -> matches 'XeniumR12' and returns '12'
      pattern = r'Xenium(?:PR|R)(\d+)' -> use a non-capturing group for alternatives so
                                          that the digits stay in the first capture

    Returns:
        The first capturing group that took part in the match, as a string; None if
        the pattern does not match or has no capturing group that matched.

    Raises:
        ValueError: if `pattern` is not a valid regular expression (the original
            `re.error` is attached as the cause).
    """
    try:
        # Only the regex call can raise re.error, so keep the try body minimal.
        m = re.search(pattern, str(path), flags=re.IGNORECASE)
    except re.error as e:
        raise ValueError(f"Invalid run pattern regex: {pattern!r}: {e}") from e

    if m is None:
        return None
    # Groups that did not participate in the match are None; take the first real one.
    return next((str(val) for val in m.groups() if val is not None), None)


def _extract_slide_number(root: Path) -> Optional[str]:
    """
    Return the slide number encoded in the last component of `root`, as a string.

    Two spellings are tried in order: 'slideN' (optionally 'slide_N' / 'slide-N',
    any case), then a standalone 'SN' / 'sN' word. None is returned when neither
    is present.
    """
    folder = root.name
    attempts = (
        # (pattern, text to search, regex flags)
        (r'slide[_\-]?(\d+)', folder.lower(), 0),
        (r'\bS(\d+)\b', folder, re.IGNORECASE),
    )
    for pattern, text, flags in attempts:
        hit = re.search(pattern, text, flags=flags)
        if hit is not None:
            return hit.group(1)
    return None


def _pick_preferred_file(directory: Path, glob_pattern: str, preferred_name: str) -> Optional[Path]:
    """Return the file named `preferred_name` in `directory`, else the first sorted glob match, else None."""
    if not directory.exists():
        return None
    candidates = sorted(directory.glob(glob_pattern))
    if not candidates:
        return None
    return next((c for c in candidates if c.name == preferred_name), candidates[0])


def _discover_samples_from_roots(
    roots: List[Path],
    ids: Optional[List[str]] = None,
    *,
    run_pattern: str = r'XeniumPR(\d)',
    run_prefix_format: str = "XeniumPR{run}S{slide}",
) -> Dict[str, Dict[str, Optional[Path]]]:
    r"""
    Discover samples under multiple roots and merge them into a single map.

    Each sub-directory `<root>/<sid>` that contains 'aligned_adata.h5ad' (or, failing
    that, 'adata.h5ad') is a sample. It is renamed to `<prefix><sid>` where the prefix
    is built from the run token and slide number found in the root path.

    Flexible run extraction:
      - `run_pattern` is a regex string that must contain at least one capturing group.
        The first capturing group (that is non-empty) is interpreted as the run token.
      - `run_prefix_format` is a format string used to create the canonical prefix. It is
        formatted with the keyword fields `{run}` and `{slide}` (both provided as strings).

    Defaults preserve previous behavior:
      run_pattern=r'XeniumPR(\d)'
      run_prefix_format='XeniumPR{run}S{slide}'

    Example behavior:
      root = Path("/.../XeniumPR1/slide1") -> run='1', slide='1' -> prefix 'XeniumPR1S1'
      root = Path("/.../XeniumR2/whatever") -> if run_pattern=r'XeniumR(\d)' -> run='2'

    Args:
        roots: directories to scan; entries that do not exist or are not directories
            are ignored.
        ids: if given, only these sub-directory names are looked up in every root;
            otherwise every sub-directory of every root is considered.

    Returns:
        Mapping new_id -> {"adata": Path, "patch": Path or None, "vis": Path or None}.

    Raises:
        ValueError: if `run_prefix_format` references a field other than `{run}` /
            `{slide}`, or if two samples end up with the same renamed id.
    """
    roots = [Path(r) for r in roots]
    ordered_roots = sorted((r for r in roots if r.exists() and r.is_dir()), key=str)

    # (root, original sample id) pairs, in a deterministic order.
    collected = []
    if ids is None:
        for r in ordered_roots:
            subdirs = sorted((d for d in r.iterdir() if d.is_dir()), key=lambda d: d.name)
            collected.extend((r, d.name) for d in subdirs)
    else:
        for sid in sorted(ids):
            collected.extend((r, sid) for r in ordered_roots if (r / sid).is_dir())

    samples: Dict[str, Dict[str, Optional[Path]]] = {}
    for root, sid in collected:
        sdir = root / sid
        adata = sdir / "aligned_adata.h5ad"
        if not adata.exists():
            # try alternative common filename 'adata.h5ad' (optional)
            adata = sdir / "adata.h5ad"
            if not adata.exists():
                # no expression file: not a usable sample
                continue

        # Prefer the file named after the sample; otherwise fall back to the first match.
        patch_h5 = _pick_preferred_file(sdir / "patches", "*.h5", f"{sid}.h5")
        vis_png = _pick_preferred_file(sdir / "patches_vis", "*.png", f"{sid}_patch_vis.png")

        # --- Naming rule using run_pattern & run_prefix_format ---
        run_token = _extract_run_with_pattern(root, run_pattern)
        slide_num = _extract_slide_number(root) or _sanitize_tag(root.name, 3)
        if run_token is not None:
            try:
                prefix = run_prefix_format.format(run=run_token, slide=slide_num)
            except (KeyError, IndexError) as e:
                # KeyError: unknown named field; IndexError: positional field such as '{}'
                raise ValueError(
                    f"run_prefix_format must include '{{run}}' and '{{slide}}' placeholders: {run_prefix_format!r}"
                ) from e
        else:
            # fallback for unknown roots
            prefix = f"{_sanitize_tag(root.name)}S{slide_num}"

        new_id = f"{prefix}{sid}"
        if new_id in samples:
            raise ValueError(
                f"Duplicate renamed sample id '{new_id}' (collision between roots for sid='{sid}')."
            )

        samples[new_id] = {"adata": adata, "patch": patch_h5, "vis": vis_png}

    return samples


def _transfer(src: Optional[Path], dst: Path, label: str, symlink: bool, missing_list: list) -> None:
    """
    Copy or symlink `src` to `dst`, replacing whatever is already at `dst`.

    Args:
        src: source file; None or a non-existent path is recorded as missing.
        dst: destination path; its parent directory is created if needed.
        label: asset kind (e.g. "patch", "vis", "adata"), used only in the missing record.
        symlink: create a symbolic link instead of copying the file.
        missing_list: list that receives a `(dst.stem, label, source-as-string)` tuple
            when the source is unavailable; it is mutated in place.
    """
    if src is None or not Path(src).exists():
        missing_list.append((dst.stem, label, str(src) if src is not None else "<none>"))
        return
    dst.parent.mkdir(parents=True, exist_ok=True)
    # Path.exists() follows symlinks and reports False for a dangling link, so also
    # test is_symlink(): otherwise a stale link from a previous run would survive
    # (symlink mode) or be written through (copy mode).
    if dst.exists() or dst.is_symlink():
        dst.unlink()
    if symlink:
        try:
            os.symlink(src, dst)
        except FileExistsError:
            # Can only happen if something recreated dst concurrently; keep best-effort.
            pass
    else:
        shutil.copy(src, dst)


def write_var_k_genes_from_paths(
    adata_paths,
    k,
    criteria,
    var_out_path,
    all_genes_out_path=None,
    filtered_common_out_path=None,
    exclude_keywords=None,
    min_cells_pct: float = 0.10,
):
    """
    Load all adatas, call HEST's get_k_genes() for top-k genes,
    and also save:
      - all common genes (keyword-filtered, no expression threshold)
      - filtered common genes using min_cells_pct across each sample

    Args:
        adata_paths: iterable of paths to .h5ad files (must not be empty).
        k: number of genes requested from get_k_genes.
        criteria: selection criterion forwarded to get_k_genes (e.g. "var").
        var_out_path: path handed to get_k_genes as `save_dir`; presumably the JSON
            with the top-k genes is written there by get_k_genes — confirm against HEST.
        all_genes_out_path: output JSON for all common genes; defaults to
            'all_genes.json' next to `var_out_path`.
        filtered_common_out_path: output JSON for the expression-filtered common genes;
            defaults to 'common_genes_<min_cells_pct>.json' next to `var_out_path`
            (i.e. 'common_genes_0.1.json' for the default threshold).
        exclude_keywords: substrings that disqualify a gene from the "all common" list;
            defaults to the control/blank probe markers listed below.
        min_cells_pct: fraction of a sample's cells in which a gene must be detected
            to survive the per-sample filter; a falsy value disables the filter.

    Returns:
        (var_k_genes, all_common_genes, filtered_common_genes)

    Raises:
        ValueError: if `adata_paths` is empty.
    """
    # Validate before the heavy imports / file loads so misuse fails fast and clearly.
    adata_paths = list(adata_paths)
    if not adata_paths:
        raise ValueError("adata_paths is empty: at least one .h5ad file is required.")

    import json
    import warnings

    import numpy as np
    import scanpy as sc
    from hest.utils import get_k_genes

    if exclude_keywords is None:
        exclude_keywords = ["NegControl", "Codeword", "Intergenic_Region", "Control", "BLANK"]

    # NOTE: this filter is installed process-wide and stays active after the call.
    warnings.filterwarnings("ignore", category=FutureWarning, module="anndata")

    # ---- Load all adatas
    adata_list = []
    for p in adata_paths:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=FutureWarning)
            adata_list.append(sc.read_h5ad(str(p)))

    # ---- Top-k variable/mean genes (delegates JSON writing to get_k_genes if var_out_path is a file path)
    var_k_genes = get_k_genes(adata_list, k, criteria, save_dir=str(var_out_path), min_cells_pct=min_cells_pct)

    # ---- ALL common genes (keyword-filtered only; no expression threshold)
    common_genes = set(adata_list[0].var_names)
    for ad in adata_list[1:]:
        common_genes &= set(ad.var_names)

    all_common_genes = sorted(
        g for g in common_genes if not any(kw in g for kw in exclude_keywords)
    )

    # ---- Filtered common genes (expression threshold per sample, then intersect)
    filtered_sets = []
    for ad in adata_list:
        # work on a full copy: filter_genes modifies its argument in place
        ad_tmp = ad[:, :].copy()
        min_cells = int(np.ceil(min_cells_pct * ad_tmp.n_obs)) if min_cells_pct else 0
        if min_cells > 0:
            sc.pp.filter_genes(ad_tmp, min_cells=min_cells)
        filtered_sets.append(set(ad_tmp.var_names))

    filtered_common = set.intersection(*filtered_sets) if filtered_sets else set()
    # remove BLANK/Control like in get_k_genes
    filtered_common_genes = sorted(
        g for g in filtered_common if ("BLANK" not in g and "Control" not in g)
    )

    # ---- Write JSONs
    out_dir = Path(var_out_path).parent
    if all_genes_out_path is None:
        all_genes_out_path = out_dir / "all_genes.json"
    with open(all_genes_out_path, "w") as f:
        json.dump({"genes": all_common_genes}, f)

    if filtered_common_out_path is None:
        # Name the file after the threshold actually used, so a non-default
        # min_cells_pct is not mislabelled as 0.1.
        filtered_common_out_path = out_dir / f"common_genes_{min_cells_pct}.json"
    with open(filtered_common_out_path, "w") as f:
        json.dump({"genes": filtered_common_genes, "min_cells_pct": min_cells_pct}, f)

    print(
        f"[INFO] Wrote {var_out_path} (top-{k}, criteria={criteria}); "
        f"{all_genes_out_path} (all_common={len(all_common_genes)}); "
        f"{filtered_common_out_path} (filtered_common={len(filtered_common_genes)}, min_cells_pct={min_cells_pct})"
    )

    return var_k_genes, all_common_genes, filtered_common_genes


# ---------- main entry ----------

def create_benchmark_data_multislide(
    save_dir: str | Path,
    K: Optional[int] = None,
    base_root: str | Path = "sftp://login1.molbiol.ox.ac.uk/ceph/project/simmons_hts/kxu/hest/xenium_data/XeniumPR1_segger",
    slide_subdirs: List[str] | tuple = ("slide1", "slide2"),
    ids: Optional[List[str]] = None,
    exclude_ids: Optional[List[str]] = None,
    gene_k: int = 50,
    gene_criteria: str = "var",
    symlink: bool = False,
    seed: int = 0,
    *,
    run_pattern: str = r'XeniumPR(\d)',
    run_prefix_format: str = "XeniumPR{run}S{slide}",
    min_cells_pct: float = 0.10,
):
    """
    Build a HEST benchmark package from the slide sub-folders of one run directory.

    Steps: discover and rename samples, drop excluded ones, write the gene-list JSONs,
    write K-fold split CSVs, then copy/symlink patches, visualisations and adatas into
    `save_dir`.

    Args:
        save_dir: destination for benchmark package (created if needed)
        K: number of folds; None or a value larger than the number of remaining
            samples is replaced by that number
        base_root: base dir containing slide subfolders.
            NOTE(review): the default is an sftp:// URL, which pathlib treats as an
            ordinary (relative) path — pass a local directory explicitly.
        slide_subdirs: tuple/list of slide dirs (e.g., ("slide1", "slide2"))
        ids: optional list of specific sample IDs to include (before exclusion)
        exclude_ids: optional list of **renamed** sample IDs to remove after discovery
        gene_k, gene_criteria: controls gene selection
        symlink: whether to symlink instead of copy
        seed: RNG seed for deterministic split
        run_pattern, run_prefix_format: control run extraction and naming
        min_cells_pct: per-sample expression threshold forwarded to
            write_var_k_genes_from_paths (default 0.10, the previous fixed behavior)

    Raises:
        ValueError: if K < 1, if no sample is discovered, or if every sample is excluded.

    Notes:
        - Any sample IDs in exclude_ids are dropped before gene/split creation.
        - The patient of a sample is the part of its ID before the first "_"
          (the whole ID when there is no "_").
    """
    from hest.HESTData import create_splits

    if K is not None and K < 1:
        raise ValueError(f"K must be a positive integer or None, got {K!r}.")

    save_dir = Path(save_dir)
    save_dir.mkdir(parents=True, exist_ok=True)

    base_root = Path(base_root)
    roots = [base_root / sd for sd in slide_subdirs]
    print(f"[INFO] Using slide roots: {roots}")

    # ---- Discover samples ----
    samples = _discover_samples_from_roots(
        roots,
        ids=ids,
        run_pattern=run_pattern,
        run_prefix_format=run_prefix_format,
    )
    if not samples:
        raise ValueError(f"No valid samples (with aligned_adata.h5ad) found under any of: {roots}.")

    discovered_ids = sorted(samples)
    print(f"[INFO] Discovered {len(discovered_ids)} samples: {discovered_ids}")

    # ---- Apply exclusion ----
    if exclude_ids:
        exclude_set = set(exclude_ids)
        known_ids = set(samples)
        before = len(discovered_ids)
        discovered_ids = [sid for sid in discovered_ids if sid not in exclude_set]

        missing_excludes = exclude_set - known_ids
        if missing_excludes:
            print(f"[WARN] Some exclude_ids not found: {sorted(missing_excludes)}")

        removed = before - len(discovered_ids)
        print(f"[INFO] Excluded {removed} samples → remaining {len(discovered_ids)}")
        if removed > 0:
            for e in sorted(exclude_set & known_ids):
                print(f"   - excluded: {e}")

    if not discovered_ids:
        raise ValueError("All samples excluded — nothing left to process.")

    # ---- Dynamic K ----
    if K is None or K > len(discovered_ids):
        K = len(discovered_ids)
        print(f"[INFO] Using K={K} (equal to number of remaining samples)")

    # ---- Metadata DF ----
    dataset_title = base_root.name or "xenium"
    meta = pd.DataFrame(
        {
            "id": discovered_ids,
            # str.split returns the whole string when "_" is absent
            "patient": [sid.split("_")[0] for sid in discovered_ids],
            "dataset_title": [dataset_title] * len(discovered_ids),
        }
    )

    # ---- Compute top variable genes ----
    adata_paths = [samples[sid]["adata"] for sid in discovered_ids]
    var_json = save_dir / f"var_{gene_k}genes.json"
    write_var_k_genes_from_paths(
        adata_paths, gene_k, gene_criteria, var_json, min_cells_pct=min_cells_pct
    )
    print(f"[INFO] Wrote {var_json}")

    # ---- Create splits ----
    group = meta.groupby(["dataset_title", "patient"])["id"].agg(list).to_dict()
    rng = np.random.RandomState(seed)
    for id_list in group.values():
        # in-place shuffle of each patient's sample list, seeded for reproducibility
        rng.shuffle(id_list)

    splits_dir = save_dir / "splits"
    splits_dir.mkdir(parents=True, exist_ok=True)
    create_splits(str(splits_dir), group, K=K)
    print(f"[INFO] Wrote {K}-fold splits to {splits_dir}")

    # ---- Copy/symlink assets ----
    (save_dir / "patches").mkdir(exist_ok=True, parents=True)
    (save_dir / "patches" / "vis").mkdir(exist_ok=True, parents=True)
    (save_dir / "adata").mkdir(exist_ok=True, parents=True)

    missing: List[tuple] = []
    for sid in discovered_ids:
        info = samples[sid]
        _transfer(info.get("patch"), save_dir / "patches" / f"{sid}.h5", "patch", symlink, missing)
        _transfer(info.get("vis"), save_dir / "patches" / "vis" / f"{sid}.png", "vis", symlink, missing)
        _transfer(info.get("adata"), save_dir / "adata" / f"{sid}.h5ad", "adata", symlink, missing)

    if missing:
        print("[WARN] Missing files:")
        for sid, lbl, path in missing:
            print(f"  - {sid} [{lbl}] → {path}")

    print(f"✅ Benchmark dataset created at {save_dir}")


In [5]:
# Build the XeniumR1 benchmark package from its slide1/slide2 folders (default slide_subdirs).
create_benchmark_data_multislide(
    save_dir="/project/simmons_hts/kxu/hest/eval/data/XeniumR1",
    base_root="/project/simmons_hts/kxu/hest/xenium_data/XeniumR1",
    run_pattern=r'XeniumR(\d+)',
    run_prefix_format='XeniumR{run}S{slide}',
    exclude_ids=['XeniumR1S1ROI3'], # dropped: far too few patches because of tissue detachment
    gene_k=50,
    gene_criteria="var",
    symlink=False,            # set True to save disk space
    seed=0                    # controls fold assignment deterministically
)

[INFO] Using slide roots: [PosixPath('/project/simmons_hts/kxu/hest/xenium_data/XeniumR1/slide1'), PosixPath('/project/simmons_hts/kxu/hest/xenium_data/XeniumR1/slide2')]
[INFO] Discovered 8 samples: ['XeniumR1S1ROI2', 'XeniumR1S1ROI3', 'XeniumR1S1ROI4', 'XeniumR1S1ROI5', 'XeniumR1S1ROI7', 'XeniumR1S2ROI10', 'XeniumR1S2ROI11', 'XeniumR1S2ROI12']
[INFO] Excluded 1 samples → remaining 7
   - excluded: XeniumR1S1ROI3
[INFO] Using K=7 (equal to number of remaining samples)
min_cells is  263.0
min_cells is  497.0
min_cells is  232.0
min_cells is  257.0
min_cells is  165.0
min_cells is  118.0
min_cells is  367.0
[32m13:55:58[0m | [1mINFO[0m | [1mFound 274 common genes[0m


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


[32m13:55:59[0m | [1mINFO[0m | [1mselected genes ['AREG', 'AZGP1', 'B3GNT6', 'BANK1', 'CA2', 'CA7', 'CALB2', 'CCL20', 'CCR7', 'CD177', 'CDHR5', 'CEACAM1', 'CEACAM6', 'CEACAM7', 'CLCA4', 'CMBL', 'CTSB', 'DMBT1', 'DUOX2', 'FABP2', 'FCER2', 'FOXA3', 'HES6', 'HHLA2', 'IL1B', 'ITLN1', 'KRT1', 'KRT86', 'L1TD1', 'LCN2', 'LEFTY1', 'LGR5', 'LILRA4', 'MS4A1', 'MS4A12', 'MS4A8', 'OLFM4', 'PAX5', 'PDZK1IP1', 'PI3', 'PRPH', 'RAP1GAP', 'RETNLB', 'RGS13', 'SCG2', 'SCGN', 'SERPINA1', 'SLPI', 'TFF1', 'UGT2A3'][0m
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumR1/var_50genes.json (top-50, criteria=var); /project/simmons_hts/kxu/hest/eval/data/XeniumR1/all_genes.json (all_common=322); /project/simmons_hts/kxu/hest/eval/data/XeniumR1/common_genes_0.1.json (filtered_common=274, min_cells_pct=0.1)
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumR1/var_50genes.json
Split 0/7
train set is  ['XeniumR1S1ROI4', 'XeniumR1S1ROI5', 'XeniumR1S1ROI7', 'XeniumR1S2ROI10', 'XeniumR1S2ROI11',

In [6]:
# Build the XeniumR2 benchmark package from its slide1/slide2 folders (default slide_subdirs).
create_benchmark_data_multislide(
    save_dir="/project/simmons_hts/kxu/hest/eval/data/XeniumR2",
    base_root="/project/simmons_hts/kxu/hest/xenium_data/XeniumR2",
    run_pattern=r'XeniumR(\d+)',
    run_prefix_format='XeniumR{run}S{slide}',
    # No exclude_ids for this run: every discovered ROI is kept.
    gene_k=50,
    gene_criteria="var",
    symlink=False,            # set True to save disk space
    seed=0                    # controls fold assignment deterministically
)

[INFO] Using slide roots: [PosixPath('/project/simmons_hts/kxu/hest/xenium_data/XeniumR2/slide1'), PosixPath('/project/simmons_hts/kxu/hest/xenium_data/XeniumR2/slide2')]
[INFO] Discovered 10 samples: ['XeniumR2S1ROI1', 'XeniumR2S1ROI2', 'XeniumR2S1ROI3', 'XeniumR2S1ROI4', 'XeniumR2S1ROI7', 'XeniumR2S2ROI10', 'XeniumR2S2ROI13', 'XeniumR2S2ROI14', 'XeniumR2S2ROI8', 'XeniumR2S2ROI9']
[INFO] Using K=10 (equal to number of remaining samples)
min_cells is  317.0
min_cells is  484.0
min_cells is  255.0
min_cells is  324.0
min_cells is  527.0
min_cells is  275.0
min_cells is  362.0
min_cells is  745.0
min_cells is  191.0
min_cells is  388.0
[32m14:13:23[0m | [1mINFO[0m | [1mFound 474 common genes[0m


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


[32m14:13:24[0m | [1mINFO[0m | [1mselected genes ['APOC3', 'AQP8', 'ASCL2', 'BEST4', 'CA4', 'CCL19', 'CCL21', 'CCL25', 'CEACAM7', 'CEMIP', 'CPS1', 'CR2', 'CREB3L3', 'CXCL10', 'CXCL13', 'CXCL9', 'CXCR5', 'DEFA5', 'DEFA6', 'DMBT1', 'DUOXA2', 'ENPEP', 'FCGBP', 'GJB2', 'GPX2', 'GREM1', 'GUCA2A', 'HCAR3', 'KRT17', 'LGR5', 'LYPD8', 'MEP1A', 'MMP1', 'MS4A1', 'MSLN', 'MUC6', 'NOS1', 'NOS2', 'OLFM4', 'PI3', 'REG1A', 'REG1B', 'REG4', 'SCGN', 'SLC26A3', 'SLC5A12', 'SPINK4', 'SST', 'TFF3', 'VIP'][0m
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumR2/var_50genes.json (top-50, criteria=var); /project/simmons_hts/kxu/hest/eval/data/XeniumR2/all_genes.json (all_common=480); /project/simmons_hts/kxu/hest/eval/data/XeniumR2/common_genes_0.1.json (filtered_common=474, min_cells_pct=0.1)
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumR2/var_50genes.json
Split 0/10
train set is  ['XeniumR2S1ROI2', 'XeniumR2S1ROI3', 'XeniumR2S1ROI4', 'XeniumR2S1ROI7', 'XeniumR2S2ROI10', 'XeniumR

In [7]:
# Build the XeniumR3 benchmark package from its slide1/slide2 folders (default slide_subdirs).
create_benchmark_data_multislide(
    save_dir="/project/simmons_hts/kxu/hest/eval/data/XeniumR3",
    base_root="/project/simmons_hts/kxu/hest/xenium_data/XeniumR3",
    run_pattern=r'XeniumR(\d+)',
    run_prefix_format='XeniumR{run}S{slide}',
    # No exclude_ids for this run: every discovered ROI is kept.
    gene_k=50,
    gene_criteria="var",
    symlink=False,            # set True to save disk space
    seed=0                    # controls fold assignment deterministically
)

[INFO] Using slide roots: [PosixPath('/project/simmons_hts/kxu/hest/xenium_data/XeniumR3/slide1'), PosixPath('/project/simmons_hts/kxu/hest/xenium_data/XeniumR3/slide2')]
[INFO] Discovered 7 samples: ['XeniumR3S1ROI3', 'XeniumR3S1ROI4', 'XeniumR3S1ROI5', 'XeniumR3S1ROI6', 'XeniumR3S1ROI7', 'XeniumR3S2ROI10', 'XeniumR3S2ROI11']
[INFO] Using K=7 (equal to number of remaining samples)
min_cells is  710.0
min_cells is  105.0
min_cells is  235.0
min_cells is  433.0
min_cells is  164.0
min_cells is  339.0
min_cells is  551.0
[32m14:13:59[0m | [1mINFO[0m | [1mFound 202 common genes[0m


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


[32m14:14:00[0m | [1mINFO[0m | [1mselected genes ['ACE', 'ADAMDEC1', 'ANGPTL4', 'ANXA2', 'AXIN2', 'CCL19', 'CCL21', 'CD74', 'CDH1', 'CEACAM1', 'CHGB', 'CXCL11', 'CXCL13', 'CXCL5', 'CXCR1', 'DEFA5', 'DERL3', 'DMKN', 'DUOX2', 'ECM1', 'EPCAM', 'FABP1', 'FCGBP', 'FCGR3B', 'G0S2', 'GJB2', 'GPX2', 'HHIP', 'HMGCS2', 'IL1B', 'LCN2', 'LILRA4', 'LILRB4', 'MMP9', 'MUC17', 'MUC2', 'MUC3A', 'MUC4', 'MUC5B', 'MZB1', 'NCAM1', 'NR1H4', 'OLFM4', 'PDZK1IP1', 'REG4', 'S100A10', 'S100P', 'SOX9', 'TRPA1', 'WNT2'][0m
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumR3/var_50genes.json (top-50, criteria=var); /project/simmons_hts/kxu/hest/eval/data/XeniumR3/all_genes.json (all_common=480); /project/simmons_hts/kxu/hest/eval/data/XeniumR3/common_genes_0.1.json (filtered_common=202, min_cells_pct=0.1)
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumR3/var_50genes.json
Split 0/7
train set is  ['XeniumR3S1ROI4', 'XeniumR3S1ROI5', 'XeniumR3S1ROI6', 'XeniumR3S1ROI7', 'XeniumR3S2ROI10', '

In [8]:
# Build the XeniumR5 benchmark package from its slide1/slide2 folders (default slide_subdirs).
create_benchmark_data_multislide(
    save_dir="/project/simmons_hts/kxu/hest/eval/data/XeniumR5",
    base_root="/project/simmons_hts/kxu/hest/xenium_data/XeniumR5",
    run_pattern=r'XeniumR(\d+)',
    run_prefix_format='XeniumR{run}S{slide}',
    # No exclude_ids for this run: every discovered ROI is kept.
    gene_k=50,
    gene_criteria="var",
    symlink=False,            # set True to save disk space
    seed=0                    # controls fold assignment deterministically
)

[INFO] Using slide roots: [PosixPath('/project/simmons_hts/kxu/hest/xenium_data/XeniumR5/slide1'), PosixPath('/project/simmons_hts/kxu/hest/xenium_data/XeniumR5/slide2')]
[INFO] Discovered 11 samples: ['XeniumR5S1ROI1', 'XeniumR5S1ROI2', 'XeniumR5S1ROI3', 'XeniumR5S1ROI4', 'XeniumR5S1ROI5', 'XeniumR5S1ROI6', 'XeniumR5S2ROI10', 'XeniumR5S2ROI11', 'XeniumR5S2ROI7', 'XeniumR5S2ROI8', 'XeniumR5S2ROI9']
[INFO] Using K=11 (equal to number of remaining samples)
min_cells is  733.0
min_cells is  416.0
min_cells is  158.0
min_cells is  195.0
min_cells is  619.0
min_cells is  280.0
min_cells is  529.0
min_cells is  425.0
min_cells is  655.0
min_cells is  354.0
min_cells is  157.0
[32m14:15:00[0m | [1mINFO[0m | [1mFound 377 common genes[0m


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


[32m14:15:01[0m | [1mINFO[0m | [1mselected genes ['ACE', 'ANXA1', 'CA4', 'CCL20', 'CCL22', 'CCL25', 'CDHR1', 'CEACAM1', 'CR2', 'CXCL11', 'CXCL8', 'DEFA5', 'DEFA6', 'DMBT1', 'DMKN', 'DSC2', 'DUOX2', 'ENPEP', 'EPCAM', 'FCGBP', 'GCG', 'GJB2', 'GPX2', 'GREM2', 'HHIP', 'HMGCS2', 'HOPX', 'IHH', 'LCN2', 'LGR5', 'MEP1A', 'MS4A1', 'MUC17', 'MUC5B', 'MUC6', 'OLFM4', 'PIGR', 'PLP1', 'REG1A', 'REG1B', 'REG4', 'S100P', 'SAA2', 'SERPINA3', 'SPIB', 'SPINK1', 'SPP1', 'TF', 'THBS4', 'VIP'][0m
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumR5/var_50genes.json (top-50, criteria=var); /project/simmons_hts/kxu/hest/eval/data/XeniumR5/all_genes.json (all_common=480); /project/simmons_hts/kxu/hest/eval/data/XeniumR5/common_genes_0.1.json (filtered_common=377, min_cells_pct=0.1)
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumR5/var_50genes.json
Split 0/11
train set is  ['XeniumR5S1ROI2', 'XeniumR5S1ROI3', 'XeniumR5S1ROI4', 'XeniumR5S1ROI5', 'XeniumR5S1ROI6', 'XeniumR5S2ROI10', 'X

In [10]:
# Build the per-run benchmark folder for XeniumR6 from its slide directories.
# K is not passed here; the recorded run below reports K=11, i.e. one fold
# per discovered sample.
create_benchmark_data_multislide(
    # Where the raw run lives and where the benchmark folder is written.
    base_root="/project/simmons_hts/kxu/hest/xenium_data/XeniumR6",
    save_dir="/project/simmons_hts/kxu/hest/eval/data/XeniumR6",
    # Sample naming; the recorded output shows IDs such as 'XeniumR6S1ROI1'.
    run_prefix_format="XeniumR{run}S{slide}",
    run_pattern=r"XeniumR(\d+)",
    # Gene selection: top 50 genes by the "var" criterion.
    gene_criteria="var",
    gene_k=50,
    # exclude_ids is intentionally left unset: nothing is excluded for this run.
    seed=0,  # controls fold assignment deterministically
    symlink=False,  # set True to save disk space
)

[INFO] Using slide roots: [PosixPath('/project/simmons_hts/kxu/hest/xenium_data/XeniumR6/slide1'), PosixPath('/project/simmons_hts/kxu/hest/xenium_data/XeniumR6/slide2')]
[INFO] Discovered 11 samples: ['XeniumR6S1ROI1', 'XeniumR6S1ROI2', 'XeniumR6S1ROI3', 'XeniumR6S1ROI4', 'XeniumR6S1ROI5', 'XeniumR6S1ROI6', 'XeniumR6S2ROI10', 'XeniumR6S2ROI11', 'XeniumR6S2ROI7', 'XeniumR6S2ROI8', 'XeniumR6S2ROI9']
[INFO] Using K=11 (equal to number of remaining samples)
min_cells is  220.0
min_cells is  249.0
min_cells is  157.0
min_cells is  222.0
min_cells is  807.0
min_cells is  544.0
min_cells is  112.0
min_cells is  951.0
min_cells is  568.0
min_cells is  311.0
min_cells is  486.0
[32m14:33:03[0m | [1mINFO[0m | [1mFound 231 common genes[0m


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


[32m14:33:04[0m | [1mINFO[0m | [1mselected genes ['ADH1C', 'ANXA2', 'AQP8', 'CA4', 'CD69', 'CDH1', 'CDHR1', 'CDX2', 'CEACAM1', 'CEACAM7', 'CFTR', 'CHGA', 'CXCL8', 'DEFA5', 'DMBT1', 'DMKN', 'DSC2', 'EPCAM', 'FABP1', 'FABP2', 'FCGBP', 'GCG', 'GJB2', 'GPX2', 'GUCA2A', 'GUCA2B', 'HMGCS2', 'HSPA5', 'IFI27', 'IL1B', 'JCHAIN', 'LCN2', 'LEFTY1', 'MUC12', 'MUC17', 'MUC2', 'MUC4', 'MUC5B', 'MZB1', 'OLFM4', 'PDZK1IP1', 'PI3', 'PIGR', 'PLA2G2A', 'REG4', 'SLC26A2', 'SLC26A3', 'SPINK1', 'SPINK4', 'SPP1'][0m
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumR6/var_50genes.json (top-50, criteria=var); /project/simmons_hts/kxu/hest/eval/data/XeniumR6/all_genes.json (all_common=480); /project/simmons_hts/kxu/hest/eval/data/XeniumR6/common_genes_0.1.json (filtered_common=231, min_cells_pct=0.1)
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumR6/var_50genes.json
Split 0/11
train set is  ['XeniumR6S1ROI2', 'XeniumR6S1ROI3', 'XeniumR6S1ROI4', 'XeniumR6S1ROI5', 'XeniumR6S1ROI6', 'Xe

## create an eval data folder for all 480 runs

In [24]:
# Merge the per-run eval folders (R1, R2, R3, R5, R6) into one benchmark
# folder and keep the returned value in `meta`.
meta = create_benchmark_from_eval_dirs(
    # Per-run eval folders to merge.
    eval_dirs=[
        "/project/simmons_hts/kxu/hest/eval/data/XeniumR1",
        "/project/simmons_hts/kxu/hest/eval/data/XeniumR2",
        "/project/simmons_hts/kxu/hest/eval/data/XeniumR3",
        "/project/simmons_hts/kxu/hest/eval/data/XeniumR5",
        "/project/simmons_hts/kxu/hest/eval/data/XeniumR6",
    ],
    save_dir="/project/simmons_hts/kxu/hest/eval/data/XeniumR",
    metadata_csv="/project/simmons_hts/kxu/hest/hest_directory.csv",
    # NOTE(review): the recorded run logs this ID as "not found", so it is
    # presumably already absent from the XeniumR1 eval folder -- confirm
    # whether the argument is still needed.
    exclude_ids=["XeniumR1S1ROI3"],  # too low patch count
    K=15,  # 46 samples, 43 patients
    gene_k=50,
    seed=0,
    symlink=False,
)

[INFO] Using eval dirs: [PosixPath('/project/simmons_hts/kxu/hest/eval/data/XeniumR1'), PosixPath('/project/simmons_hts/kxu/hest/eval/data/XeniumR2'), PosixPath('/project/simmons_hts/kxu/hest/eval/data/XeniumR3'), PosixPath('/project/simmons_hts/kxu/hest/eval/data/XeniumR5'), PosixPath('/project/simmons_hts/kxu/hest/eval/data/XeniumR6')]
[WARN] Some exclude_ids not found: ['XeniumR1S1ROI3']
[INFO] Excluded 1 samples → remaining 46
[INFO] Discovered sample IDs (46): ['XeniumR1S1ROI2', 'XeniumR1S1ROI4', 'XeniumR1S1ROI5', 'XeniumR1S1ROI7', 'XeniumR1S2ROI10', 'XeniumR1S2ROI11', 'XeniumR1S2ROI12', 'XeniumR2S1ROI1', 'XeniumR2S1ROI2', 'XeniumR2S1ROI3', 'XeniumR2S1ROI4', 'XeniumR2S1ROI7', 'XeniumR2S2ROI10', 'XeniumR2S2ROI13', 'XeniumR2S2ROI14', 'XeniumR2S2ROI8', 'XeniumR2S2ROI9', 'XeniumR3S1ROI3', 'XeniumR3S1ROI4', 'XeniumR3S1ROI5', 'XeniumR3S1ROI6', 'XeniumR3S1ROI7', 'XeniumR3S2ROI10', 'XeniumR3S2ROI11', 'XeniumR5S1ROI1', 'XeniumR5S1ROI2', 'XeniumR5S1ROI3', 'XeniumR5S1ROI4', 'XeniumR5S1ROI5',

  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumR/var_50genes.json (top-50, criteria=var); /project/simmons_hts/kxu/hest/eval/data/XeniumR/all_genes.json (all_common=100); /project/simmons_hts/kxu/hest/eval/data/XeniumR/common_genes_0.1.json (filtered_common=23, min_cells_pct=0.1)
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumR/var_50genes.json
K=15 doesnt match the number of patients, try to distribute the patients instead
Split 0/15
train set is  ['XeniumR2S2ROI9', 'XeniumR2S1ROI2', 'XeniumR3S1ROI6', 'XeniumR5S2ROI8', 'XeniumR2S2ROI14', 'XeniumR6S1ROI5', 'XeniumR1S1ROI4', 'XeniumR1S2ROI12', 'XeniumR1S1ROI5', 'XeniumR1S2ROI10', 'XeniumR1S1ROI2', 'XeniumR3S1ROI5', 'XeniumR5S1ROI4', 'XeniumR6S1ROI2', 'XeniumR2S2ROI8', 'XeniumR2S2ROI13', 'XeniumR6S2ROI8', 'XeniumR6S2ROI10', 'XeniumR5S1ROI1', 'XeniumR6S2ROI11', 'XeniumR3S1ROI3', 'XeniumR3S1ROI7', 'XeniumR5S2ROI11', 'XeniumR5S2ROI10', 'XeniumR2S1ROI3', 'XeniumR5S2ROI7', 'XeniumR3S2ROI10', 'XeniumR5S1ROI5', 'XeniumR2

Split 10/15
train set is  ['XeniumR2S1ROI1', 'XeniumR6S1ROI6', 'XeniumR6S1ROI4', 'XeniumR5S2ROI9', 'XeniumR2S2ROI9', 'XeniumR2S1ROI2', 'XeniumR3S1ROI6', 'XeniumR5S2ROI8', 'XeniumR2S2ROI14', 'XeniumR6S1ROI5', 'XeniumR1S1ROI4', 'XeniumR1S2ROI12', 'XeniumR1S1ROI5', 'XeniumR1S2ROI10', 'XeniumR1S1ROI2', 'XeniumR3S1ROI5', 'XeniumR5S1ROI4', 'XeniumR6S1ROI2', 'XeniumR2S2ROI8', 'XeniumR2S2ROI13', 'XeniumR6S2ROI8', 'XeniumR6S2ROI10', 'XeniumR5S1ROI1', 'XeniumR6S2ROI11', 'XeniumR3S1ROI3', 'XeniumR3S1ROI7', 'XeniumR5S2ROI11', 'XeniumR5S2ROI10', 'XeniumR2S1ROI3', 'XeniumR5S2ROI7', 'XeniumR3S2ROI10', 'XeniumR3S1ROI4', 'XeniumR5S1ROI2', 'XeniumR5S1ROI6', 'XeniumR6S2ROI9', 'XeniumR1S2ROI11', 'XeniumR1S1ROI7', 'XeniumR2S2ROI10', 'XeniumR2S1ROI4', 'XeniumR5S1ROI3', 'XeniumR6S1ROI3', 'XeniumR6S1ROI1']

test set is  ['XeniumR5S1ROI5' 'XeniumR2S1ROI7' 'XeniumR3S2ROI11' 'XeniumR6S2ROI7']

Split 11/15
train set is  ['XeniumR2S1ROI1', 'XeniumR6S1ROI6', 'XeniumR6S1ROI4', 'XeniumR5S2ROI9', 'XeniumR2S2ROI9', 'Xe

In [25]:
# Merge the per-run eval folders for R2, R3, R5 and R6 (XeniumR1 is
# deliberately left out) into one benchmark folder; the returned value is
# kept in `meta`.
meta = create_benchmark_from_eval_dirs(
    # Per-run eval folders to merge; XeniumR1 is not part of this benchmark.
    eval_dirs=[
        "/project/simmons_hts/kxu/hest/eval/data/XeniumR2",
        "/project/simmons_hts/kxu/hest/eval/data/XeniumR3",
        "/project/simmons_hts/kxu/hest/eval/data/XeniumR5",
        "/project/simmons_hts/kxu/hest/eval/data/XeniumR6",
    ],
    save_dir="/project/simmons_hts/kxu/hest/eval/data/XeniumR2-6",
    metadata_csv="/project/simmons_hts/kxu/hest/hest_directory.csv",
    # exclude_ids is left unset: the only ID excluded elsewhere belongs to
    # XeniumR1, which is not merged here.
    # 39 samples are discovered for this call. The original note said
    # "43 patients", which looks copied from the all-runs cell -- TODO confirm
    # the patient count for R2-R6.
    K=15,
    gene_k=50,
    seed=0,
    symlink=False,
)

[INFO] Using eval dirs: [PosixPath('/project/simmons_hts/kxu/hest/eval/data/XeniumR2'), PosixPath('/project/simmons_hts/kxu/hest/eval/data/XeniumR3'), PosixPath('/project/simmons_hts/kxu/hest/eval/data/XeniumR5'), PosixPath('/project/simmons_hts/kxu/hest/eval/data/XeniumR6')]
[INFO] Discovered sample IDs (39): ['XeniumR2S1ROI1', 'XeniumR2S1ROI2', 'XeniumR2S1ROI3', 'XeniumR2S1ROI4', 'XeniumR2S1ROI7', 'XeniumR2S2ROI10', 'XeniumR2S2ROI13', 'XeniumR2S2ROI14', 'XeniumR2S2ROI8', 'XeniumR2S2ROI9', 'XeniumR3S1ROI3', 'XeniumR3S1ROI4', 'XeniumR3S1ROI5', 'XeniumR3S1ROI6', 'XeniumR3S1ROI7', 'XeniumR3S2ROI10', 'XeniumR3S2ROI11', 'XeniumR5S1ROI1', 'XeniumR5S1ROI2', 'XeniumR5S1ROI3', 'XeniumR5S1ROI4', 'XeniumR5S1ROI5', 'XeniumR5S1ROI6', 'XeniumR5S2ROI10', 'XeniumR5S2ROI11', 'XeniumR5S2ROI7', 'XeniumR5S2ROI8', 'XeniumR5S2ROI9', 'XeniumR6S1ROI1', 'XeniumR6S1ROI2', 'XeniumR6S1ROI3', 'XeniumR6S1ROI4', 'XeniumR6S1ROI5', 'XeniumR6S1ROI6', 'XeniumR6S2ROI10', 'XeniumR6S2ROI11', 'XeniumR6S2ROI7', 'XeniumR6S2R

  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


[32m11:47:17[0m | [1mINFO[0m | [1mselected genes ['ADAMDEC1', 'ANO1', 'ANXA1', 'ASS1', 'C7', 'CA4', 'CCL19', 'CCL21', 'CCN2', 'CD74', 'CDH1', 'CEACAM1', 'CUBN', 'DEFA5', 'DMKN', 'EGR1', 'EMP1', 'EPCAM', 'FCGBP', 'FOSB', 'G0S2', 'GHRL', 'GJB2', 'GPX2', 'GREM1', 'HES1', 'HHIP', 'HMGCS2', 'IFI27', 'IL1B', 'IL1R2', 'IL7R', 'JCHAIN', 'LAMB3', 'LCN2', 'MAF', 'MUC17', 'MUC5B', 'MZB1', 'NR1H4', 'OLFM4', 'PIGR', 'POSTN', 'PTGDS', 'REG4', 'S100P', 'SELENBP1', 'SOX9', 'TIMP1', 'TRPA1'][0m
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumR2-6/var_50genes.json (top-50, criteria=var); /project/simmons_hts/kxu/hest/eval/data/XeniumR2-6/all_genes.json (all_common=480); /project/simmons_hts/kxu/hest/eval/data/XeniumR2-6/common_genes_0.1.json (filtered_common=144, min_cells_pct=0.1)
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumR2-6/var_50genes.json
K=15 doesnt match the number of patients, try to distribute the patients instead
Split 0/19
train set is  ['XeniumR6S1ROI4', 'X

Split 18/19
train set is  ['XeniumR2S1ROI1', 'XeniumR6S1ROI6', 'XeniumR6S1ROI4', 'XeniumR5S2ROI9', 'XeniumR2S2ROI9', 'XeniumR2S1ROI2', 'XeniumR3S1ROI6', 'XeniumR5S2ROI8', 'XeniumR2S2ROI14', 'XeniumR6S1ROI5', 'XeniumR3S1ROI5', 'XeniumR5S1ROI4', 'XeniumR6S1ROI2', 'XeniumR2S2ROI8', 'XeniumR2S2ROI13', 'XeniumR6S2ROI8', 'XeniumR6S2ROI10', 'XeniumR5S1ROI1', 'XeniumR6S2ROI11', 'XeniumR3S1ROI3', 'XeniumR3S1ROI7', 'XeniumR5S2ROI11', 'XeniumR5S2ROI10', 'XeniumR2S1ROI3', 'XeniumR5S2ROI7', 'XeniumR3S2ROI10', 'XeniumR5S1ROI5', 'XeniumR2S1ROI7', 'XeniumR3S2ROI11', 'XeniumR6S2ROI7', 'XeniumR3S1ROI4', 'XeniumR5S1ROI2', 'XeniumR5S1ROI6', 'XeniumR6S2ROI9', 'XeniumR2S2ROI10', 'XeniumR2S1ROI4', 'XeniumR5S1ROI3']

test set is  ['XeniumR6S1ROI3' 'XeniumR6S1ROI1']

[INFO] Wrote 15-fold patient-level splits to /project/simmons_hts/kxu/hest/eval/data/XeniumR2-6/splits
✅ Merged benchmark created at /project/simmons_hts/kxu/hest/eval/data/XeniumR2-6


### Leave-one-patient-out split

In [22]:
# Merge the per-run eval folders (R1, R2, R3, R5, R6) into a benchmark with
# 43 folds; the returned value is kept in `meta`.
meta = create_benchmark_from_eval_dirs(
    # Per-run eval folders to merge.
    eval_dirs=[
        "/project/simmons_hts/kxu/hest/eval/data/XeniumR1",
        "/project/simmons_hts/kxu/hest/eval/data/XeniumR2",
        "/project/simmons_hts/kxu/hest/eval/data/XeniumR3",
        "/project/simmons_hts/kxu/hest/eval/data/XeniumR5",
        "/project/simmons_hts/kxu/hest/eval/data/XeniumR6",
    ],
    save_dir="/project/simmons_hts/kxu/hest/eval/data/XeniumR_LOOCV",
    metadata_csv="/project/simmons_hts/kxu/hest/hest_directory.csv",
    # NOTE(review): the recorded run logs this ID as "not found", so it is
    # presumably already absent from the XeniumR1 eval folder -- confirm
    # whether the argument is still needed.
    exclude_ids=["XeniumR1S1ROI3"],  # too low patch count
    # Presumably K matches the 43 patients noted for the all-runs merge, so
    # each fold holds out one patient -- verify against the metadata CSV.
    K=43,
    gene_k=50,
    seed=0,
    symlink=False,
)

[INFO] Using eval dirs: [PosixPath('/project/simmons_hts/kxu/hest/eval/data/XeniumR1'), PosixPath('/project/simmons_hts/kxu/hest/eval/data/XeniumR2'), PosixPath('/project/simmons_hts/kxu/hest/eval/data/XeniumR3'), PosixPath('/project/simmons_hts/kxu/hest/eval/data/XeniumR5'), PosixPath('/project/simmons_hts/kxu/hest/eval/data/XeniumR6')]
[WARN] Some exclude_ids not found: ['XeniumR1S1ROI3']
[INFO] Excluded 1 samples → remaining 46
[INFO] Discovered sample IDs (46): ['XeniumR1S1ROI2', 'XeniumR1S1ROI4', 'XeniumR1S1ROI5', 'XeniumR1S1ROI7', 'XeniumR1S2ROI10', 'XeniumR1S2ROI11', 'XeniumR1S2ROI12', 'XeniumR2S1ROI1', 'XeniumR2S1ROI2', 'XeniumR2S1ROI3', 'XeniumR2S1ROI4', 'XeniumR2S1ROI7', 'XeniumR2S2ROI10', 'XeniumR2S2ROI13', 'XeniumR2S2ROI14', 'XeniumR2S2ROI8', 'XeniumR2S2ROI9', 'XeniumR3S1ROI3', 'XeniumR3S1ROI4', 'XeniumR3S1ROI5', 'XeniumR3S1ROI6', 'XeniumR3S1ROI7', 'XeniumR3S2ROI10', 'XeniumR3S2ROI11', 'XeniumR5S1ROI1', 'XeniumR5S1ROI2', 'XeniumR5S1ROI3', 'XeniumR5S1ROI4', 'XeniumR5S1ROI5',

  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumR_LOOCV/var_50genes.json (top-50, criteria=var); /project/simmons_hts/kxu/hest/eval/data/XeniumR_LOOCV/all_genes.json (all_common=100); /project/simmons_hts/kxu/hest/eval/data/XeniumR_LOOCV/common_genes_0.1.json (filtered_common=23, min_cells_pct=0.1)
[INFO] Wrote /project/simmons_hts/kxu/hest/eval/data/XeniumR_LOOCV/var_50genes.json
Split 0/43
train set is  ['XeniumR6S1ROI6', 'XeniumR6S1ROI4', 'XeniumR5S2ROI9', 'XeniumR2S2ROI9', 'XeniumR2S1ROI2', 'XeniumR3S1ROI6', 'XeniumR5S2ROI8', 'XeniumR2S2ROI14', 'XeniumR6S1ROI5', 'XeniumR1S1ROI4', 'XeniumR1S2ROI12', 'XeniumR1S1ROI5', 'XeniumR1S2ROI10', 'XeniumR1S1ROI2', 'XeniumR3S1ROI5', 'XeniumR5S1ROI4', 'XeniumR6S1ROI2', 'XeniumR2S2ROI8', 'XeniumR2S2ROI13', 'XeniumR6S2ROI8', 'XeniumR6S2ROI10', 'XeniumR5S1ROI1', 'XeniumR6S2ROI11', 'XeniumR3S1ROI3', 'XeniumR3S1ROI7', 'XeniumR5S2ROI11', 'XeniumR5S2ROI10', 'XeniumR2S1ROI3', 'XeniumR5S2ROI7', 'XeniumR3S2ROI10', 'XeniumR5S1ROI5', 'XeniumR2S1R