In [1]:
import os
import sys
# Use the local Biomni checkout: inserting it at position 0 of sys.path makes
# the `biomni` imports below search this directory before any other entry.
sys.path.insert(0, "/home/runai-home/Biomni/")

In [2]:
from pathlib import Path
from collections import defaultdict
from difflib import SequenceMatcher

In [3]:
from biomni.cogzero.canonicalizer import canonicalize_output, compare_outputs
from biomni.cogzero.utils import load_data, save_data
from biomni.cogzero.nb_utils import *

In [4]:
# Version tag of the generated task set; it selects the sub-directory read below.
ver = "v0"
# Root directory for this version (the cleaned task list is saved here in the last cell).
tasks_dir = Path(f"/mlbio_scratch/kodryan/Biomni/data/generated/tasks/{ver}")
# Raw, unfiltered tasks are read from <tasks_dir>/raw.
raw_tasks_dir = tasks_dir / "raw"
# Presumably maps a trial name (e.g. "trial_0") to that trial's list of task dicts,
# judging by how it is iterated in the next cell — TODO confirm against load_data_dict
# (not imported by name here; presumably provided by the nb_utils star import).
all_tasks_dict = load_data_dict(raw_tasks_dir)

In [5]:
# Flatten the per-trial task lists into one list, reporting each trial's size on the way.
clean_tasks = []

for trial_name, trial_tasks in all_tasks_dict.items():
    print(f"{trial_name}:\t{len(trial_tasks)} tasks")
    clean_tasks.extend(trial_tasks)

print('-' * 25)
print(f"Total:  \t{len(clean_tasks)} tasks")

trial_0:	21 tasks
trial_1:	20 tasks
trial_2:	20 tasks
trial_3:	21 tasks
trial_4:	21 tasks
trial_5:	21 tasks
trial_6:	21 tasks
trial_7:	21 tasks
trial_8:	21 tasks
trial_9:	21 tasks
trial_10:	21 tasks
trial_11:	21 tasks
trial_12:	21 tasks
trial_13:	21 tasks
trial_14:	21 tasks
trial_15:	21 tasks
-------------------------
Total:  	334 tasks


In [6]:
def get_tasks_by_gta(task_list: list[dict[str, str]], sim_thresh=0.9) -> dict[str, list[dict[str, str]]]:
    """
    Bin tasks by ground-truth answers.
    Tasks with similar answers end up in the same bin.

    Each task's ``"ground_truth_answer"`` is lower-cased and passed through
    ``canonicalize_output`` to obtain its key. The key is then compared, in
    insertion order, against the keys of the bins created so far; the task
    joins the FIRST bin whose key has a ``SequenceMatcher`` ratio strictly
    greater than ``sim_thresh``. If no bin qualifies, a new bin is opened
    under the task's own key.

    The binning is therefore greedy and order-dependent: a bin is always
    named after the first answer that opened it, and reordering
    ``task_list`` may change both the bin keys and the bin contents.

    Args:
        task_list: task dicts; each must contain a ``"ground_truth_answer"``
            string.
        sim_thresh: similarity threshold; a ratio must exceed it (strictly)
            for two keys to share a bin.

    Returns:
        A ``defaultdict(list)`` mapping bin key -> tasks in that bin, in the
        order the tasks were encountered. Because it is a ``defaultdict``,
        indexing it with an unknown key silently creates an empty bin.
    """
    tasks_by_gta = defaultdict(list)

    for task in task_list:
        task_gta = task["ground_truth_answer"]
        task_key = canonicalize_output(task_gta.lower())
        sm = SequenceMatcher(a=task_key)

        bin_key = task_key
        for key in tasks_by_gta:
            sm.set_seq2(key)
            # real_quick_ratio() and quick_ratio() are documented upper bounds
            # on ratio(), so they can only rule a candidate out, never in:
            # the expensive ratio() runs only for plausible matches and the
            # resulting bins are identical to calling ratio() alone.
            if (
                sm.real_quick_ratio() > sim_thresh
                and sm.quick_ratio() > sim_thresh
                and sm.ratio() > sim_thresh
            ):
                bin_key = key
                break

        tasks_by_gta[bin_key].append(task)

    return tasks_by_gta

In [7]:
# Bin the flattened tasks by canonicalized ground-truth answer, using the default
# similarity threshold of get_tasks_by_gta.
tasks_by_gta = get_tasks_by_gta(clean_tasks)

In [8]:
# List every answer bin with its size (key left-aligned in a 100-character column).
for gta_key, tasks in tasks_by_gta.items():
    print(f"{gta_key:100}{len(tasks)} tasks")

# Keys of the bins holding more than one task, in bin order; these need manual review.
nonsingle_keys = [gta_key for gta_key, tasks in tasks_by_gta.items() if len(tasks) > 1]

nonmalignant                                                                                        1 tasks
nb_7767_3495_reg1                                                                                   1 tasks
paep                                                                                                2 tasks
lymphocyte                                                                                          5 tasks
natural_killer_cell                                                                                 1 tasks
monocyte                                                                                            2 tasks
3357                                                                                                3 tasks
ensg00000286009                                                                                     1 tasks
gabaergic neuron                                                                                    1 tasks
hoxd13                      

In [9]:
# Summary: how many bins / tasks exist overall and how many sit in non-singleton bins.
nonsingle_key_set = set(nonsingle_keys)
n_all_tasks = sum(len(tasks) for tasks in tasks_by_gta.values())
n_nonsingle_tasks = sum(
    len(tasks) for key, tasks in tasks_by_gta.items() if key in nonsingle_key_set
)

print(f"All keys: {len(tasks_by_gta)} \t nonsingle keys: {len(nonsingle_keys)}")
print(f"All tasks: {n_all_tasks} \t nonsingle tasks: {n_nonsingle_tasks}")

All keys: 212 	 nonsingle keys: 63
All tasks: 334 	 nonsingle tasks: 185


In [10]:
# Dump every non-singleton bin so its tasks can be compared by eye; the "Task <id>"
# numbers are the positions of the tasks inside the bin.
group_rule = '-' * 100

for gta_key in nonsingle_keys:
    tasks = tasks_by_gta[gta_key]
    print(f"Group {gta_key}")
    print(group_rule)
    for tid, task in enumerate(tasks):
        print(
            f"Task {tid}",
            f"Dataset path: {task['dataset_path']}",
            f"Description: {task['task_description']}",
            f"Answer: {task['ground_truth_answer']}",
            "",
            sep="\n",
        )

Group paep
----------------------------------------------------------------------------------------------------
Task 0
Dataset path: /mlbio_scratch/videnovi/papers_BIBA/paper2/data/dataset3.h5ad
Description: In the snRNA-seq dataset of macrophages from high‑risk neuroblastoma before (DX) and after (PTX) chemotherapy, identify the gene that shows the strongest statistically significant up‑regulation (largest positive log₂ fold‑change, FDR < 0.05) in post‑treatment macrophages compared to diagnostic macrophages. Answer format: a single HGNC gene symbol (shape: gene_symbol).
Answer: PAEP

Task 1
Dataset path: /mlbio_scratch/videnovi/papers_BIBA/paper2/data/dataset3.h5ad
Description: Within the macrophage nuclei, identify the gene that shows the strongest statistically significant up‑regulation (largest positive log2‑fold‑change, adjusted p‑value < 0.05) in the PTX stage compared to the DX stage. Answer format: a single HGNC gene symbol (shape: gene_symbol).
Answer: PAEP

Group lymphocyte


In [11]:
# Duplicate annotation for every non-singleton GTA bin (presumably filled in by hand
# after reading the group listing printed above).
# - keys: the non-singleton GTA keys (checked against `nonsingle_keys` in the next cell)
# - values: the bin's task ids (positions in `tasks_by_gta[key]`, i.e. the "Task <id>"
#           numbers printed above) split into groups of mutually duplicate tasks.
#           The FIRST id of each group is the representative of the whole group;
#           `get_clean_tasks` keeps it and drops the remaining ids of the group.
#           A one-element group marks a task that has no duplicate.
duplicates_by_gta = {
    "paep": [[0, 1]],
    "lymphocyte": [[2, 0], [3, 4], [1]],
    "monocyte": [[0], [1]],
    "3357": [[0, 1, 2]],
    "nmj accessory": [[0, 2], [4, 1], [3]],
    "mf-i": [[0], [1], [2]],
    "goblet cell": [[3, 1, 2, 4], [0], [5]],
    "b_cell": [[2, 0], [1]],
    "r24_p2538": [[1, 0]],
    "tom_vct": [[1, 0]],
    "hepatocyte": [[0], [1]],
    "5": [[0], [1]],
    "4": [[0], [1], [2]],
    "control": [[8, 0, 1, 3, 5, 7], [2], [4], [6]],
    "3427": [[0], [1]],
    "xist": [[0], [1], [2], [3], [4]],
    "b-lymphocytes": [[1, 0, 2]],
    "alzheimer disease": [[1, 0]],
    "interferon-response": [[0], [1]],
    "csh1": [[0, 1]],
    "anbl12p1-like chemotherapy": [[0], [1], [2], [3], [4]],
    "mt-co3": [[0], [1], [2]],
    "464c": [[1, 0]],
    "nuclei": [[0], [1]],
    "mf-isn(fg)": [[0], [1], [2], [3], [4]],
    "9": [[1, 0, 2, 3, 5], [4]],
    "b cell": [[3, 0], [1], [2]],
    "13": [[0], [1]],
    "endothelial_cell": [[3, 0, 2], [1]],
    "adipocyte of omentum tissue": [[0], [1], [2], [3]],
    "grin2a": [[0], [1]],
    "caecum": [[0], [1]],
    "6720483e21rik": [[2, 1], [0]],
    "evt_late_3": [[1, 0, 2, 4], [3]],
    "cardiac_neuron": [[0], [1]],
    "mesenteric lymph node": [[1, 0, 2]],
    "tnfrsf12a+muscs": [[2, 1], [0]],
    "500c": [[0], [1]],
    "8": [[2, 0, 1, 3, 4]],
    "14": [[1, 2], [0]],
    "3": [[0, 1], [2]],
    "20": [[1, 0]],
    "central memory cd4-positive, alpha-beta t cell": [[0, 1]],
    "fp236383.1": [[0], [1]],
    "young": [[0], [1]],
    "mf-iisc(fg)": [[0], [1]],
    "tg": [[3, 1], [0], [2], [4]],
    "syncytiotrophoblast cell": [[0], [1]],
    "anbl 09p1": [[0], [1]],
    "mf_typei(cytoplasmic)": [[0], [1]],
    "myh2+myh7+": [[0], [1]],
    "2": [[0], [1]],
    "anbl0532-like therapy with cisplatin & etoposide": [[0], [1]],
    "3313": [[0], [1]],
    "neuron": [[0], [1]],
    "mis-c": [[0], [1], [2]],
    "adh+fibro": [[0, 1]],
    "pf": [[0], [1], [2]],
    "myh1+myh2+myh7+": [[0], [1]],
    "11": [[0], [1]],
    "fibroblast": [[0], [1]],
    "small intestine peyer's patch": [[0, 2], [1]],
    "mf_type-myh8+": [[0], [1]],
}

In [12]:
# The annotation must cover exactly the non-singleton bins: no bin left unannotated
# and no stale key. On failure the message names the offending keys.
assert sorted(nonsingle_keys) == sorted(duplicates_by_gta.keys()), (
    "duplicates_by_gta is out of sync with nonsingle_keys; "
    f"not annotated: {sorted(set(nonsingle_keys) - set(duplicates_by_gta))}, "
    f"unexpected: {sorted(set(duplicates_by_gta) - set(nonsingle_keys))}"
)

In [13]:
# Within every bin the duplicate groups must partition the task ids 0..len(tasks)-1:
# each id is listed exactly once, with none missing and none out of range.
for gta_key in nonsingle_keys:
    tasks = tasks_by_gta[gta_key]
    dups_ids = duplicates_by_gta[gta_key]
    annotated_ids = sorted(tid for group in dups_ids for tid in group)
    # Name the failing group in the message; a bare assert inside the loop would not.
    assert list(range(len(tasks))) == annotated_ids, (
        f"Group {gta_key!r}: annotated ids {annotated_ids} do not cover "
        f"task ids 0..{len(tasks) - 1} exactly once"
    )

In [14]:
# Total number of duplicate groups across all annotated bins (one-element groups included).
n_duplicate_groups = sum(map(len, duplicates_by_gta.values()))
print(f"Duplicate groups: {n_duplicate_groups}")

Duplicate groups: 137


In [15]:
def get_clean_tasks(tasks_by_key, duplicates_by_key):
    """
    Drop annotated duplicates and flatten the surviving tasks into one list.

    Args:
        tasks_by_key: mapping key -> list of tasks.
        duplicates_by_key: mapping key -> list of duplicate groups, where each
            group is a list of task ids (positions in ``tasks_by_key[key]``).
            The first id of a group is its representative and is kept; every
            other id in the group is dropped. Keys of ``tasks_by_key`` without
            an entry keep all their tasks; entries for keys that are not in
            ``tasks_by_key`` and ids outside the task list are ignored.

    Returns:
        A flat list of the kept tasks: keys in ``tasks_by_key`` iteration
        order and, within a key, tasks in their original list order.
    """
    clean_tasks = []

    for key, tasks in tasks_by_key.items():
        removed_ids = set()
        for dup_ids in duplicates_by_key.get(key, []):
            # the first index is the representative of a duplicates group
            removed_ids.update(dup_ids[1:])

        # Walk the list itself rather than a set of ids: set iteration order is
        # not guaranteed, whereas later positional annotations depend on the
        # order of the returned tasks.
        clean_tasks.extend(task for tid, task in enumerate(tasks) if tid not in removed_ids)

    return clean_tasks

In [16]:
# Apply the GTA-level annotation: in every duplicate group only the first
# (representative) task is kept.
# NOTE: `clean_tasks` is rebound here, so from now on it no longer holds the full
# flattened task list built at the top of the notebook.
clean_tasks = get_clean_tasks(tasks_by_gta, duplicates_by_gta)
print(f"Tasks after GTA-wise filtration: {len(clean_tasks)}")

Tasks after GTA-wise filtration: 286


In [17]:
# Presumably a list of dataset paths (its items are compared with task['dataset_path']
# in the next cell) — TODO confirm the JSON schema.
datasets_list = load_data("/mlbio_scratch/kodryan/Biomni/data/all_sc_datasets.json")

In [18]:
# Group the GTA-filtered tasks by dataset in a single pass. Every entry of
# `datasets_list` gets a (possibly empty) list, in `datasets_list` order, and tasks
# keep their `clean_tasks` order within each dataset.
tasks_by_dataset = {ds: [] for ds in datasets_list}
unlisted_tasks = []  # tasks whose dataset_path is not in datasets_list
for task in clean_tasks:
    tasks_by_dataset.get(task['dataset_path'], unlisted_tasks).append(task)

# Such tasks are absent from `tasks_by_dataset` and therefore from everything derived
# from it below; report them instead of losing them silently.
if unlisted_tasks:
    unlisted_paths = list(dict.fromkeys(task['dataset_path'] for task in unlisted_tasks))
    print(
        f"WARNING: {len(unlisted_tasks)} task(s) reference datasets that are not in "
        f"datasets_list and are dropped: {unlisted_paths}"
    )

In [19]:
# Number of remaining tasks per dataset, numbered from 1 in `tasks_by_dataset` order.
for dataset_no, dataset_path in enumerate(tasks_by_dataset, start=1):
    print(
        f"Dataset {dataset_no}: {dataset_path}",
        f"# tasks: {len(tasks_by_dataset[dataset_path])}",
        "",
        sep="\n",
    )

Dataset 1: /mlbio_scratch/videnovi/papers_BIBA/paper2/data/dataset1.h5ad
# tasks: 15

Dataset 2: /mlbio_scratch/videnovi/papers_BIBA/paper2/data/dataset2.h5ad
# tasks: 16

Dataset 3: /mlbio_scratch/videnovi/papers_BIBA/paper2/data/dataset3.h5ad
# tasks: 14

Dataset 4: /mlbio_scratch/videnovi/papers_BIBA/paper4/data/dataset1.h5ad
# tasks: 13

Dataset 5: /mlbio_scratch/videnovi/papers_BIBA/paper4/data/lymph_dataset.h5ad
# tasks: 15

Dataset 6: /mlbio_scratch/videnovi/papers_BIBA/paper4/data/my_dataset.h5ad
# tasks: 11

Dataset 7: /mlbio_scratch/videnovi/papers_BIBA/paper5/data/dataset2.h5ad
# tasks: 12

Dataset 8: /mlbio_scratch/videnovi/papers_BIBA/paper5/data/VA_adipocytes.h5ad
# tasks: 16

Dataset 9: /mlbio_scratch/videnovi/papers_BIBA/paper6/data/dataset1.h5ad
# tasks: 16

Dataset 10: /mlbio_scratch/videnovi/papers_BIBA/paper7/data/dataset2.h5ad
# tasks: 15

Dataset 11: /mlbio_scratch/panigrah/agents/data/nature/paper1/cxg_data.h5ad
# tasks: 13

Dataset 12: /mlbio_scratch/panigrah/ag

In [20]:
# Dump the remaining tasks dataset by dataset for a second manual duplicate review;
# the "Task <id>" numbers are the positions of the tasks inside the dataset's list.
dataset_rule = '-' * 100

for dataset_path, tasks in tasks_by_dataset.items():
    print(f"Dataset: {dataset_path}")
    print(dataset_rule)
    for tid, task in enumerate(tasks):
        print(
            f"Task {tid}",
            f"Description: {task['task_description']}",
            f"Answer: {task['ground_truth_answer']}",
            "",
            sep="\n",
        )

Dataset: /mlbio_scratch/videnovi/papers_BIBA/paper2/data/dataset1.h5ad
----------------------------------------------------------------------------------------------------
Task 0
Description: Using the provided single-nucleus RNA‑seq dataset (dataset1.h5ad), compute the mean Euclidean distance between pre‑treatment cells (where the 'Response' column is missing/NaN) and post‑treatment cells (where 'Response' is not NaN) for each unique value in the 'malignancy' column, based on the first 30 principal components stored in adata.obsm['X_pca']. Report the malignancy label that has the largest mean pre‑post distance. Answer format: a single token exactly matching one entry of the 'malignancy' column (shape: label).
Answer: Nonmalignant

Task 1
Description: Using the provided snRNA‑seq dataset, identify the cell type whose relative abundance increases the most after chemotherapy (Stage_Code = PTX) compared to diagnosis (Stage_Code = DX). For each cell type, compute its proportion of cells wi

In [21]:
# Second-pass duplicate annotation, keyed by dataset path instead of GTA key.
# Each inner list is a group of duplicate task ids (positions in
# `tasks_by_dataset[path]`, i.e. the "Task <id>" numbers printed above); the FIRST id
# is the representative that `get_clean_tasks` keeps, the remaining ids are dropped.
# Datasets without an entry keep all of their tasks.
# NOTE(review): the ids are positional, so they are only valid for the exact task
# order produced by the cells above, and no assertion validates this annotation —
# re-check it after any upstream change.
# The commented-out entries are not applied (the reason is not recorded here).
duplicates_by_dataset= {
    # "/mlbio_scratch/panigrah/agents/data/nature/paper2/processed_MuSC_data.h5ad": [[5, 6]],
    # "/mlbio_scratch/panigrah/agents/data/nature/paper2/processed_myonuclei_data.h5ad": [[10, 9]],
    "/mlbio_scratch/panigrah/agents/data/nature/paper6/data.h5ad": [[4, 0, 2]],
    "/mlbio_scratch/panigrah/agents/data/cell/paper2/data.h5ad": [[6, 8]],
    # "/mlbio_scratch/panigrah/agents/data/cell/paper3/data.h5ad": [[3, 8]],
}

In [22]:
# Apply the dataset-level annotation. The result is rebuilt from `tasks_by_dataset`,
# so `clean_tasks` ends up ordered dataset by dataset and contains only tasks that
# were grouped under a dataset in `tasks_by_dataset`.
clean_tasks = get_clean_tasks(tasks_by_dataset, duplicates_by_dataset)
print(f"Tasks after dataset-wise filtration: {len(clean_tasks)}")

Tasks after dataset-wise filtration: 283


In [23]:
# Save the deduplicated tasks as <tasks_dir>/clean.json, alongside the raw/ directory.
clean_tasks_path = tasks_dir / "clean.json"
save_data(clean_tasks, clean_tasks_path)