In [1]:
import pathlib
import warnings

import pandas as pd

# Detect an IPython session: `get_ipython` is only defined there, so a
# NameError means the code is running under a plain Python interpreter.
try:
    cfg = get_ipython().config
except NameError:
    in_notebook = False
else:
    in_notebook = True

# Import the tqdm variant that matches the detected environment.
if not in_notebook:
    from tqdm import tqdm
else:
    from tqdm.notebook import tqdm
# Locate the Git root: the first directory, starting at the current working
# directory and then walking up through its parents, that contains a `.git`
# directory. `root_dir` stays None when no such directory exists.
cwd = pathlib.Path.cwd()
root_dir = next(
    (
        candidate
        for candidate in (cwd, *cwd.parents)
        if (candidate / ".git").is_dir()
    ),
    None,
)

# The data and figure paths below are built from root_dir, so fail early
# when no Git root was found.
if root_dir is None:
    raise FileNotFoundError("No Git root directory found.")

In [2]:
# Read the patient IDs as a header-less, single-column table of strings and
# flatten that column into a plain Python list.
patient_ids_file = pathlib.Path(f"{root_dir}/data/patient_IDs.txt")
patients = pd.read_csv(
    patient_ids_file, header=None, names=["patient"], dtype=str
)["patient"].tolist()

# One annotated single-cell profile path per patient, in patient order.
list_of_sc_profile_paths = [
    pathlib.Path(
        f"{root_dir}/data/{patient_id}/image_based_profiles/2.annotated_profiles/sc_anno.parquet"
    )
    for patient_id in patients
]

# One annotated organoid profile path per patient, in patient order.
list_of_organoid_profile_paths = [
    pathlib.Path(
        f"{root_dir}/data/{patient_id}/image_based_profiles/2.annotated_profiles/organoid_anno.parquet"
    )
    for patient_id in patients
]

In [3]:
# Columns selected from both the single-cell and the organoid profiles.
_shared_metadata_columns = [
    "patient",
    "object_id",
    "unit",
    "dose",
    "treatment",
    "image_set",
    "Well",
]

# Single-cell selection: the shared columns plus "parent_organoid".
sc_metadata_columns = [*_shared_metadata_columns, "parent_organoid"]

# Organoid selection: the shared columns plus "single_cell_count".
organoid_metadata_columns = [*_shared_metadata_columns, "single_cell_count"]

In [4]:
# Read every single-cell profile (with a progress bar), stack the frames
# under a fresh 0..n-1 index, and keep only the single-cell metadata columns.
sc_progress = tqdm(list_of_sc_profile_paths, desc="Reading sc profiles")
single_cell_counts = pd.concat(
    [pd.read_parquet(profile_path) for profile_path in sc_progress],
    ignore_index=True,
).loc[:, sc_metadata_columns]

Reading sc profiles:   0%|          | 0/8 [00:00<?, ?it/s]

In [5]:
# Read every organoid profile (with a progress bar), stack the frames under
# a fresh 0..n-1 index, and keep only the organoid metadata columns.
organoid_progress = tqdm(
    list_of_organoid_profile_paths, desc="Reading organoid profiles"
)
organoid_counts = pd.concat(
    [pd.read_parquet(profile_path) for profile_path in organoid_progress],
    ignore_index=True,
).loc[:, organoid_metadata_columns]

Reading organoid profiles:   0%|          | 0/8 [00:00<?, ?it/s]

In [6]:
# Replace missing values with 0. The intent is to zero out NaN single-cell
# counts, but fillna(0) is applied to every selected column, so a NaN in any
# other column becomes 0 as well.
# "object_id" is then removed; errors="ignore" makes the drop a no-op when
# the column is not present.
organoid_counts = organoid_counts.fillna(0).drop(
    columns=["object_id"], errors="ignore"
)
sc_counts = single_cell_counts.fillna(0).drop(
    columns=["object_id"], errors="ignore"
)

In [7]:
# Report the (rows, columns) shape of the raw single-cell selection and of
# the organoid frame as it stands at this point in the notebook.
for label, frame in (
    ("Single cell counts shape:", single_cell_counts),
    ("Organoid counts shape:", organoid_counts),
):
    print(label, frame.shape)

Single cell counts shape: (11278, 8)
Organoid counts shape: (1507, 7)


In [8]:
# Keep the first row for each (patient, Well, single_cell_count) combination,
# then sum the numeric columns within each patient/unit/dose/treatment/Well
# group. Non-numeric columns outside the group keys are dropped by
# numeric_only=True.
# NOTE(review): two different organoids in the same well that share the same
# single_cell_count are treated as duplicates here and only one contributes
# to the sum -- confirm this is intended.
organoid_group_keys = ["patient", "unit", "dose", "treatment", "Well"]
organoid_counts = (
    organoid_counts.drop_duplicates(
        subset=["patient", "Well", "single_cell_count"], ignore_index=True
    )
    .groupby(organoid_group_keys)
    .sum(numeric_only=True)
    .reset_index()
)

In [9]:
# Keep one row per (patient, Well, parent_organoid), then count the remaining
# rows in each patient/unit/dose/treatment/Well group. That count is stored
# in the "organoid_count" column.
sc_group_keys = ["patient", "unit", "dose", "treatment", "Well"]
sc_counts = (
    sc_counts.drop_duplicates(
        subset=["patient", "Well", "parent_organoid"], ignore_index=True
    )
    .groupby(sc_group_keys)
    .size()
    .reset_index(name="organoid_count")
)

In [10]:
# Join the per-well organoid sums (left) with the per-well organoid counts
# derived from single cells (right). The inner join keeps only groups that
# are present in both frames.
merge_keys = ["patient", "unit", "dose", "treatment", "Well"]
sc_and_organoid_counts = organoid_counts.merge(
    sc_counts, how="inner", on=merge_keys
)

In [11]:
# Write the merged counts to <root_dir>/figures/results/, creating the
# directory (and any missing parents) first. The DataFrame index is not
# written to the file.
results_dir = pathlib.Path(f"{root_dir}/figures/results/")
results_dir.mkdir(parents=True, exist_ok=True)

counts_file = pathlib.Path(
    f"{root_dir}/figures/results/sc_and_organoid_counts.parquet"
)
sc_and_organoid_counts.to_parquet(counts_file, index=False)

# Show the (rows, columns) shape of what was saved as the cell output.
sc_and_organoid_counts.shape

(393, 7)

In [12]:
# Display the merged per-well table (single_cell_count and organoid_count
# for each patient/unit/dose/treatment/Well group) as the cell output.
sc_and_organoid_counts

Unnamed: 0,patient,unit,dose,treatment,Well,single_cell_count,organoid_count
0,NF0014,%,1,DMSO,C4,49.0,3
1,NF0014,%,1,DMSO,D4,32.0,2
2,NF0014,%,1,DMSO,D9,67.0,4
3,NF0014,%,1,DMSO,E4,36.0,3
4,NF0014,%,1,DMSO,E9,16.0,2
...,...,...,...,...,...,...,...
388,SARCO361,uM,10,Mirdametinib,G8,17.0,3
389,SARCO361,uM,10,Selumetinib,E11,18.0,3
390,SARCO361,uM,10,Selumetinib,F11,4.0,3
391,SARCO361,uM,10,Trametinib,E10,1.0,2
