In [1]:
import random
import os
from glob import glob
import xarray as xr
import shutil
import spatialproteomics as sp

def load_zarr(sample_id, source_dir="/g/huber/users/meyerben/data/codex/BNHL/data_for_publication/tmas_unzipped"):
    ds = xr.open_zarr(f"{source_dir}/{sample_id}.zarr")
    img_raw = ds['_image_raw'].values
    markers = ds.coords['channels'].values
    ds = sp.load_image_data(img_raw, channel_coords=markers)
    return ds

In [2]:
files = glob('/g/huber/users/meyerben/data/codex/BNHL/data_for_publication/tmas_unzipped/*.zarr')

# set seed for reproducibility
random.seed(42)

# output folder
outdir = "selected_zarrs"
os.makedirs(outdir, exist_ok=True)

# only looking at FL1 and DLBCL
for entity in ['FL 1', 'DLBCL']:
    files_subset = [x for x in files if x.split('/')[-1].startswith(entity)]
    print(f"Found {len(files_subset)} files for entity {entity}")

    # parse into entity_patient_replicate
    parsed = [f.split('/')[-1].replace('.zarr', '').split('_') for f in files_subset]
    patient_to_samples = {}
    for (entity_id, patient_id, replicate_id), f in zip(parsed, files_subset):
        sample_id = f.split('/')[-1].replace('.zarr', '')
        patient_to_samples.setdefault(patient_id, []).append(sample_id)

    # keep only patients with 2 replicates
    eligible_patients = {p: s for p, s in patient_to_samples.items() if len(s) == 2}

    # randomly select 10 patients
    selected_patients = random.sample(list(eligible_patients.keys()), 10)

    # collect sample IDs
    selected_samples = [s for p in selected_patients for s in eligible_patients[p]]

    print("Selected samples:")
    for s in selected_samples:
        print(s)

        # load and re-save with your loader
        ds = load_zarr(s, source_dir="/g/huber/users/meyerben/data/codex/BNHL/data_for_publication/tmas_unzipped")
        dest = os.path.join(outdir, f"{s}.zarr")

        # overwrite if exists
        if os.path.exists(dest):
            shutil.rmtree(dest)

        ds.to_zarr(dest)

Found 21 files for entity FL 1
Selected samples:
FL 1_10_1
FL 1_10_2
FL 1_7_1
FL 1_7_2
FL 1_3_1
FL 1_3_2
FL 1_9_1
FL 1_9_2
FL 1_4_1
FL 1_4_2
FL 1_5_1
FL 1_5_2
FL 1_11_1
FL 1_11_2
FL 1_6_1
FL 1_6_2
FL 1_8_1
FL 1_8_2
FL 1_2_2
FL 1_2_1
Found 49 files for entity DLBCL
Selected samples:
DLBCL_20_1
DLBCL_20_2
DLBCL_6_2
DLBCL_6_1
DLBCL_21_1
DLBCL_21_2
DLBCL_25_1
DLBCL_25_2
DLBCL_1_1
DLBCL_1_2
DLBCL_14_1
DLBCL_14_2
DLBCL_23_1
DLBCL_23_2
DLBCL_4_1
DLBCL_4_2
DLBCL_8_1
DLBCL_8_2
DLBCL_13_1
DLBCL_13_2
