# Patient-Level Merging

Inputs
Clinical Table

- File: Clini_Table_Vorversuche_HER2.xlsx
- Column: Patho-ID
- HDF5 Feature Directory (result directory from the previous Kronos embedding step)

In [3]:
from pathlib import Path

# Clinical table listing the patients to process
# (.xls/.xlsx is read as Excel, any other extension as CSV).
patients_file = Path("/mnt/bulk-sirius/nguyenmin/multiplex/Clini_Table_Vorversuche_HER2.xlsx")

# Directory holding the raw .h5 feature files.
h5_input_dir = Path("/mnt/bulk-neptune/laura/multiplex/features")

# Directory that receives the merged per-patient .h5 files.
h5_output_dir = Path("/mnt/bulk-sirius/nguyenmin/multiplex/temp_results")

In [4]:
import pandas as pd
import numpy as np
import h5py
from pathlib import Path
from collections import defaultdict
from tqdm import tqdm

# Make sure the destination for the merged files exists before any work is done.
h5_output_dir.mkdir(parents=True, exist_ok=True)

# Column of the clinical table that holds the Patho-ID.
PATIENT_COL = "Patho-ID"

# Dataset inside each h5 file whose contents are merged.
DATASET_NAME = "patch_embeddings"

# ------------------------------------------------------------
# LOAD PATIENT LIST
# ------------------------------------------------------------
# Excel extensions go through read_excel; every other extension is read as CSV.
is_excel = patients_file.suffix.lower() in (".xls", ".xlsx")
df = pd.read_excel(patients_file) if is_excel else pd.read_csv(patients_file)

print("Loaded patient table:", df.shape)

# Raw Patho-ID strings (e.g. "E/19/10396-2B"); rows without an ID are dropped.
patho_id_column = df[PATIENT_COL]
raw_ids = patho_id_column.dropna().astype(str).tolist()


# Convert "E/19/10396-2B" → "E-19-10396"
def normalize_patient_id(pid):
    """
    Reduce a clinical Patho-ID to the patient prefix used in h5 file names.

    Everything from the first '-' onwards (e.g. the '-2B' suffix) is dropped
    and every '/' is replaced by '-', so 'E/19/10396-2B' becomes 'E-19-10396'.

    Parameters
    ----------
    pid : str
        Patho-ID as it appears in the clinical table.

    Returns
    -------
    str
        Normalized patient ID without surrounding whitespace.
    """
    # Text before the first '-'; an ID without a suffix is kept whole.
    prefix = pid.strip().split("-")[0]
    # Strip again: an ID typed as "E/19/10396 -2B" would otherwise keep a
    # trailing space and never match a file name.
    return prefix.strip().replace("/", "-")


# One normalized ID per table row; the set removes rows that share a patient.
patient_list = list(map(normalize_patient_id, raw_ids))
patient_set = set(patient_list)

print("Unique patients:", len(patient_set))


# ------------------------------------------------------------
# 4. FIND ALL H5 FILES THAT BELONG TO ANY PATIENT
# ------------------------------------------------------------
# Sorted so the files are processed in a reproducible order.
files = list(h5_input_dir.glob("*.h5"))
files.sort()

# patient ID → h5 files of that patient (filled by the loop further down)
patient_to_files = defaultdict(list)


def extract_patient_from_h5name(fname):
    """
    Return the patient ID that prefixes an h5 file name.

    Example: 'E-19-10396_[10856,55448]_component_data_Her2.h5'
    gives 'E-19-10396'. A name without '_' is returned unchanged.
    """
    patient_id, _sep, _rest = fname.partition("_")
    return patient_id


# Group the files by patient, skipping files whose prefix is not in the table.
for h5_file in files:
    owner = extract_patient_from_h5name(h5_file.stem)
    if owner not in patient_set:
        continue
    patient_to_files[owner].append(h5_file)

print("Patients with H5 files found:", len(patient_to_files))


# ------------------------------------------------------------
# 5. MERGE ALL H5 FILES PER PATIENT
# ------------------------------------------------------------
def merge_patch_embeddings(h5_list, dataset_name=DATASET_NAME):
    """
    Concatenate one dataset from several h5 files along the first axis.

    Parameters
    ----------
    h5_list : iterable of path-like
        h5 files to read, in the order their rows should appear in the result.
    dataset_name : str
        Name of the dataset to load from every file.

    Returns
    -------
    numpy.ndarray
        The loaded datasets concatenated along axis 0.

    Raises
    ------
    ValueError
        If `h5_list` is empty or a file does not contain `dataset_name`.
    """
    # Materialize first so emptiness can be checked even for generators.
    h5_list = list(h5_list)
    if not h5_list:
        # np.concatenate would also raise ValueError on an empty list, but
        # without naming the dataset or the cause.
        raise ValueError(f"No h5 files given; cannot merge {dataset_name}")

    arrays = []
    for path in h5_list:
        with h5py.File(path, "r") as f:
            if dataset_name not in f:
                raise ValueError(f"{dataset_name} not found in {path}")
            # [:] reads the dataset into memory before the file is closed.
            arrays.append(f[dataset_name][:])
    return np.concatenate(arrays, axis=0)


# ------------------------------------------------------------
# 6. SAVE MERGED FILES
# ------------------------------------------------------------
for patient, flist in tqdm(patient_to_files.items(), desc="Merging patients"):
    # flist holds every h5 file of this patient, e.g.
    # E-19-10396_[10856,55448]_component_data_Her2.h5

    # The marker name is taken from the FIRST file only: everything after the
    # second "_" of its stem, e.g. "component_data_Her2".
    # NOTE(review): all files of a patient are merged under this one name —
    # confirm that a patient never has files for more than one marker.
    first_name = flist[0].stem
    marker_part = "_".join(first_name.split("_")[2:])

    # Output filename, e.g. E-19-10396_component_data_Her2.h5
    outname = f"{patient}_{marker_part}.h5"
    outpath = h5_output_dir / outname

    # tqdm.write keeps these messages from breaking up the progress bar.
    tqdm.write(f"\nProcessing patient {patient}, {len(flist)} files")
    tqdm.write(f"→ Output: {outpath}")

    merged = merge_patch_embeddings(flist, DATASET_NAME)

    # Write to a temporary sibling and rename afterwards, so an interrupted
    # run never leaves a truncated file under the final name.
    tmp_path = outpath.with_name(outpath.name + ".tmp")
    try:
        with h5py.File(tmp_path, "w") as f:
            # dtype="f" stores the embeddings as 32-bit floats.
            f.create_dataset(DATASET_NAME, data=merged, dtype="f")
        tmp_path.replace(outpath)
    finally:
        # Still present only when the write or the rename did not complete.
        if tmp_path.exists():
            tmp_path.unlink()

    tqdm.write(f"Saved merged H5 with {merged.shape[0]} patches\n")

Loaded patient table: (81, 5)
Unique patients: 80
Patients with H5 files found: 8


Merging patients:   0%|          | 0/8 [00:00<?, ?it/s]


Processing patient E-19-10396, 72 files
→ Output: /mnt/bulk-sirius/nguyenmin/multiplex/temp_results/E-19-10396_component_data.h5
Saved merged H5 with 1080 patches


Processing patient E-19-14186, 148 files
→ Output: /mnt/bulk-sirius/nguyenmin/multiplex/temp_results/E-19-14186_component_data.h5


Merging patients:  25%|██▌       | 2/8 [00:00<00:00,  8.29it/s]

Saved merged H5 with 2220 patches


Processing patient E-19-1584, 164 files
→ Output: /mnt/bulk-sirius/nguyenmin/multiplex/temp_results/E-19-1584_component_data.h5


Merging patients:  38%|███▊      | 3/8 [00:01<00:03,  1.32it/s]

Saved merged H5 with 2460 patches


Processing patient E-19-17360, 130 files
→ Output: /mnt/bulk-sirius/nguyenmin/multiplex/temp_results/E-19-17360_component_data.h5


Merging patients:  38%|███▊      | 3/8 [00:03<00:05,  1.09s/it]


KeyboardInterrupt: 

# Slide–to–Clinical Table Mapping and CSV Generation

This notebook step constructs a slide-level mapping between multiplex feature files (.h5) and the corresponding clinical metadata rows in the clinical table. The objective is to produce a clean slides.csv file that links each processed slide to its correct Patho-ID entry for downstream modeling.

In [5]:
from pathlib import Path

# Directory whose .h5 feature files are listed in the CSV.
h5_dir = Path(
    "/mnt/bulk-sirius/nguyenmin/multiplex/features/her2_feats_patient_level"
)

# Clinical table (Excel) that provides the Patho-ID column.
clini_table = Path("/mnt/bulk-sirius/nguyenmin/multiplex/Clini_Table_Vorversuche_HER2.xlsx")

# Destination of the generated file-to-Patho-ID mapping.
output_csv = Path(
    "/mnt/bulk-sirius/nguyenmin/multiplex/slides.csv"
)

In [None]:
import pandas as pd

# -----------------------------
# LOAD CLINICAL TABLE
# -----------------------------
df_clini = pd.read_excel(clini_table)

# "Patho-ID" holds strings like "E/19/10396-2B"; rows without an ID are skipped.
raw_ids = df_clini["Patho-ID"].dropna().astype(str).tolist()

# Build mapping from the ID used in file names to the full clinical ID:
#   E-19-10396 --> E/19/10396-2B
#   E-19-1584  --> E/19/1584-2D
patient_map = {}

for pid in raw_ids:
    # Strip whitespace around the ID and around the prefix so that padded
    # cells still match the h5 file names. The stored value stays the
    # unmodified table entry.
    trimmed = pid.strip().split("-")[0].strip().replace("/", "-")
    previous = patient_map.get(trimmed)
    if previous is not None and previous != pid:
        # Two table rows share a prefix; the later row still wins, but say so.
        print(f"WARNING: {trimmed} maps to both {previous} and {pid}; using {pid}")
    patient_map[trimmed] = pid  # map trimmed → full


# -----------------------------
# Helper: extract trimmed ID from filename
# -----------------------------
def extract_trimmed_id(fname: str) -> str:
    """Return the part of `fname` before its first "_" (all of it if there is none).

    Example: "E-19-10396_[coords]_component..." gives "E-19-10396".
    """
    trimmed_id, _sep, _rest = fname.partition("_")
    return trimmed_id


# -----------------------------
# Scan h5 files and build CSV entries
# -----------------------------
# One record per h5 file that could be matched to a clinical Patho-ID.
entries = []

for h5_path in sorted(h5_dir.glob("*.h5")):
    fname = h5_path.name
    trimmed = extract_trimmed_id(h5_path.stem)  # E-19-10396

    if trimmed not in patient_map:
        # Files without a clinical row are reported and left out of the CSV.
        print(f"WARNING: No clinical Patho-ID found for {trimmed}")
        continue

    full_pid = patient_map[trimmed]  # E/19/10396-2B

    entries.append(
        {
            "Patho-ID": full_pid,
            "FILENAME": fname,
        }
    )

# -----------------------------
# Save to CSV
# -----------------------------
# Columns are given explicitly so the CSV keeps its header even when no file
# matched; a DataFrame built from an empty list would have no columns at all.
df = pd.DataFrame(entries, columns=["Patho-ID", "FILENAME"])
df.to_csv(output_csv, index=False)

print(f"Saved {len(df)} slides to {output_csv}")
df.head()