In [1]:
import argparse
import os
import pathlib
import sys

import pandas as pd
from arg_parsing_utils import parse_args
from notebook_init_utils import bandicoot_check, init_notebook

root_dir, in_notebook = init_notebook()

profile_base_dir = bandicoot_check(
    pathlib.Path(os.path.expanduser("~/mnt/bandicoot")).resolve(), root_dir
)

In [2]:
if not in_notebook:
    args = parse_args()
    well_fov = args["well_fov"]
    patient = args["patient"]
    image_based_profiles_subparent_name = args["image_based_profiles_subparent_name"]

else:
    patient = "SARCO361_T1"
    well_fov = "D2-1"
    image_based_profiles_subparent_name = "image_based_profiles"

In [3]:
def centroid_within_bbox_detection(
    centroid: tuple,
    bbox: tuple,
) -> bool:
    """
    Check if the centroid is within the bbox

    Parameters
    ----------
    centroid : tuple
        Centroid of the object in the order of (z, y, x)
        Order of the centroid is important
    bbox : tuple
        Where the bbox is in the order of (z_min, y_min, x_min, z_max, y_max, x_max)
        Order of the bbox is important

    Returns
    -------
    bool
        True if the centroid is within the bbox, False otherwise
    """
    z_min, y_min, x_min, z_max, y_max, x_max = bbox
    z, y, x = centroid
    # check if the centroid is within the bbox
    if (
        z >= z_min
        and z <= z_max
        and y >= y_min
        and y <= y_max
        and x >= x_min
        and x <= x_max
    ):
        return True
    else:
        return False

### Pathing

In [4]:
# input paths
sc_profile_path = pathlib.Path(
    f"{profile_base_dir}/data/{patient}/{image_based_profiles_subparent_name}/0.converted_profiles/{well_fov}/sc_profiles_{well_fov}.parquet"
).resolve(strict=True)
organoid_profile_path = pathlib.Path(
    f"{profile_base_dir}/data/{patient}/{image_based_profiles_subparent_name}/0.converted_profiles/{well_fov}/organoid_profiles_{well_fov}.parquet"
).resolve(strict=True)
# output paths
sc_profile_output_path = pathlib.Path(
    f"{profile_base_dir}/data/{patient}/{image_based_profiles_subparent_name}/0.converted_profiles/{well_fov}/sc_profiles_{well_fov}_related.parquet"
).resolve()
organoid_profile_output_path = pathlib.Path(
    f"{profile_base_dir}/data/{patient}/{image_based_profiles_subparent_name}/0.converted_profiles/{well_fov}/organoid_profiles_{well_fov}_related.parquet"
).resolve()

In [5]:
sc_profile_df = pd.read_parquet(sc_profile_path)
organoid_profile_df = pd.read_parquet(organoid_profile_path)
print(f"Single-cell profile shape: {sc_profile_df.shape}")
print(f"Organoid profile shape: {organoid_profile_df.shape}")

Single-cell profile shape: (25, 7690)
Organoid profile shape: (8, 2562)


In [6]:
# initialize the parent organoid column
sc_profile_df.insert(2, "parent_organoid", -1)

In [7]:
x_y_z_sc_colnames = [
    x for x in sc_profile_df.columns if "area" in x.lower() and "center" in x.lower()
]
print(
    f"The nuclei centroids in the single-cell profile are in the columns:\n{x_y_z_sc_colnames}"
)

The nuclei centroids in the single-cell profile are in the columns:
['Area.Size.Shape_Nuclei_CENTER.X', 'Area.Size.Shape_Nuclei_CENTER.Y', 'Area.Size.Shape_Nuclei_CENTER.Z', 'Area.Size.Shape_Cell_CENTER.X', 'Area.Size.Shape_Cell_CENTER.Y', 'Area.Size.Shape_Cell_CENTER.Z', 'Area.Size.Shape_Cytoplasm_CENTER.X', 'Area.Size.Shape_Cytoplasm_CENTER.Y', 'Area.Size.Shape_Cytoplasm_CENTER.Z']


In [8]:
organoid_bbox_colnames = [
    x for x in organoid_profile_df.columns if "Area" in x and ("MIN" in x or "MAX" in x)
]
organoid_bbox_colnames = sorted(organoid_bbox_colnames)
print(f"The organoid bounding boxes are in the columns:\n{organoid_bbox_colnames}")

The organoid bounding boxes are in the columns:
['Area.Size.Shape_Organoid_MAX.X', 'Area.Size.Shape_Organoid_MAX.Y', 'Area.Size.Shape_Organoid_MAX.Z', 'Area.Size.Shape_Organoid_MIN.X', 'Area.Size.Shape_Organoid_MIN.Y', 'Area.Size.Shape_Organoid_MIN.Z']


In [9]:
from tqdm import tqdm

# Initialize parent_organoid to -1
sc_profile_df["parent_organoid"] = -1

# Extract single-cell centroids as numpy array for faster access
sc_centroids = sc_profile_df[x_y_z_sc_colnames].values  # (N_cells, 3) array

# Loop through organoids with progress bar
for organoid_index, organoid_row in tqdm(
    organoid_profile_df.iterrows(),
    total=len(organoid_profile_df),
    desc="Assigning cells to organoids",
):
    # Get organoid bbox
    organoid_bbox = (
        organoid_row[organoid_bbox_colnames[5]],  # z_min
        organoid_row[organoid_bbox_colnames[4]],  # y_min
        organoid_row[organoid_bbox_colnames[3]],  # x_min
        organoid_row[organoid_bbox_colnames[2]],  # z_max
        organoid_row[organoid_bbox_colnames[1]],  # y_max
        organoid_row[organoid_bbox_colnames[0]],  # x_max
    )

    z_min, y_min, x_min, z_max, y_max, x_max = organoid_bbox

    # Vectorized bbox check - much faster!
    mask = (
        (sc_centroids[:, 2] >= z_min)
        & (sc_centroids[:, 2] <= z_max)  # z
        & (sc_centroids[:, 1] >= y_min)
        & (sc_centroids[:, 1] <= y_max)  # y
        & (sc_centroids[:, 0] >= x_min)
        & (sc_centroids[:, 0] <= x_max)  # x
    )

    # Only assign if cell doesn't already have a parent
    unassigned_mask = sc_profile_df["parent_organoid"] == -1
    final_mask = mask & unassigned_mask

    # Assign parent organoid to matching cells
    sc_profile_df.loc[final_mask, "parent_organoid"] = organoid_row["object_id"]

print(f"Assigned {(sc_profile_df['parent_organoid'] != -1).sum()} cells to organoids")
print(f"Unassigned cells: {(sc_profile_df['parent_organoid'] == -1).sum()}")

Assigning cells to organoids: 100%|██████████| 8/8 [00:00<00:00, 2247.75it/s]

Assigned 22 cells to organoids
Unassigned cells: 3





### Add single-cell counts for each organoid

In [10]:
organoid_sc_counts = (
    sc_profile_df["parent_organoid"]
    .value_counts()
    .to_frame(name="single_cell_count")
    .reset_index()
)
# merge the organoid profile with the single-cell counts
organoid_profile_df = pd.merge(
    organoid_profile_df,
    organoid_sc_counts,
    left_on="object_id",
    right_on="parent_organoid",
    how="left",
).drop(columns=["parent_organoid"])
sc_count = organoid_profile_df.pop("single_cell_count")
organoid_profile_df.insert(2, "single_cell_count", sc_count)

Even if the file is empty we still want to add it to the final dataframe dictionary so that we can merge on the same columns later.
This will help with file-based checking and merging.


In [11]:
organoid_profile_df

Unnamed: 0,object_id,image_set,single_cell_count,Area.Size.Shape_Organoid_VOLUME,Area.Size.Shape_Organoid_CENTER.X,Area.Size.Shape_Organoid_CENTER.Y,Area.Size.Shape_Organoid_CENTER.Z,Area.Size.Shape_Organoid_BBOX.VOLUME,Area.Size.Shape_Organoid_MIN.X,Area.Size.Shape_Organoid_MAX.X,...,Texture_Organoid_Mito_Difference.Entropy_256.3,Texture_Organoid_Mito_Difference.Variance_256.3,Texture_Organoid_Mito_Entropy_256.3,Texture_Organoid_Mito_Information.Measure.of.Correlation.1_256.3,Texture_Organoid_Mito_Information.Measure.of.Correlation.2_256.3,Texture_Organoid_Mito_Inverse.Difference.Moment_256.3,Texture_Organoid_Mito_Sum.Average_256.3,Texture_Organoid_Mito_Sum.Entropy_256.3,Texture_Organoid_Mito_Sum.Variance_256.3,Texture_Organoid_Mito_Variance_256.3
0,1,D2-1,2.0,189203.0,197.75235,159.986801,0.583225,13500108.0,0.0,516.0,...,0.049443,0.003859,0.05814,-0.231675,0.082439,0.996112,0.083851,0.053193,1.936182,0.689026
1,2,D2-1,,185275.0,748.59613,657.609802,18.0,298775.0,433.0,1062.0,...,0.054738,0.00386,0.065053,-0.225361,0.086144,0.99601,0.494603,0.059767,71.188589,24.989597
2,5,D2-1,6.0,1122501.0,743.9505,722.828979,2.499002,2086368.0,338.0,1162.0,...,0.33749,0.00367,0.438545,-0.27293,0.322616,0.973107,1.114182,0.372728,87.965325,31.348332
3,22,D2-1,,668719.0,242.766983,163.694519,4.557856,1372680.0,0.0,615.0,...,0.183167,0.003759,0.234919,-0.385469,0.311103,0.983104,1.32124,0.210698,102.547089,31.371405
4,40,D2-1,12.0,2888250.0,732.385986,695.191467,10.759619,5870904.0,350.0,1094.0,...,0.714112,0.003348,1.105665,-0.39959,0.634871,0.935878,3.227834,0.882133,216.04596,62.387229
5,43,D2-1,2.0,3346145.0,429.047302,1335.954468,9.902335,4709435.0,146.0,703.0,...,0.611665,0.003405,0.957311,-0.524859,0.700722,0.939171,4.866813,0.813452,356.195108,93.446341
6,47,D2-1,,114620.0,191.118835,886.310303,17.433886,155800.0,102.0,292.0,...,0.039362,0.003868,0.047078,-0.318725,0.123074,0.997058,0.235822,0.043166,20.088772,6.681711
7,49,D2-1,,351864.0,774.080566,652.095947,19.502424,564256.0,454.0,1070.0,...,0.095263,0.003832,0.11434,-0.219136,0.112235,0.992419,0.993387,0.1038,146.980151,51.655468


In [12]:
if organoid_profile_df.empty:
    # add a row with Na values
    organoid_profile_df.loc[len(organoid_profile_df)] = [None] * len(
        organoid_profile_df.columns
    )
    organoid_profile_df["image_set"] = well_fov

In [13]:
print(f"Single-cell profile shape: {sc_profile_df.shape}")

Single-cell profile shape: (25, 7691)


In [14]:
if sc_profile_df.empty:
    # add a row with Na values
    sc_profile_df.loc[len(sc_profile_df)] = [None] * len(sc_profile_df.columns)
    sc_profile_df["image_set"] = well_fov

### Save the profiles

In [15]:
organoid_profile_df.to_parquet(organoid_profile_output_path, index=False)
organoid_profile_df.head()

Unnamed: 0,object_id,image_set,single_cell_count,Area.Size.Shape_Organoid_VOLUME,Area.Size.Shape_Organoid_CENTER.X,Area.Size.Shape_Organoid_CENTER.Y,Area.Size.Shape_Organoid_CENTER.Z,Area.Size.Shape_Organoid_BBOX.VOLUME,Area.Size.Shape_Organoid_MIN.X,Area.Size.Shape_Organoid_MAX.X,...,Texture_Organoid_Mito_Difference.Entropy_256.3,Texture_Organoid_Mito_Difference.Variance_256.3,Texture_Organoid_Mito_Entropy_256.3,Texture_Organoid_Mito_Information.Measure.of.Correlation.1_256.3,Texture_Organoid_Mito_Information.Measure.of.Correlation.2_256.3,Texture_Organoid_Mito_Inverse.Difference.Moment_256.3,Texture_Organoid_Mito_Sum.Average_256.3,Texture_Organoid_Mito_Sum.Entropy_256.3,Texture_Organoid_Mito_Sum.Variance_256.3,Texture_Organoid_Mito_Variance_256.3
0,1,D2-1,2.0,189203.0,197.75235,159.986801,0.583225,13500108.0,0.0,516.0,...,0.049443,0.003859,0.05814,-0.231675,0.082439,0.996112,0.083851,0.053193,1.936182,0.689026
1,2,D2-1,,185275.0,748.59613,657.609802,18.0,298775.0,433.0,1062.0,...,0.054738,0.00386,0.065053,-0.225361,0.086144,0.99601,0.494603,0.059767,71.188589,24.989597
2,5,D2-1,6.0,1122501.0,743.9505,722.828979,2.499002,2086368.0,338.0,1162.0,...,0.33749,0.00367,0.438545,-0.27293,0.322616,0.973107,1.114182,0.372728,87.965325,31.348332
3,22,D2-1,,668719.0,242.766983,163.694519,4.557856,1372680.0,0.0,615.0,...,0.183167,0.003759,0.234919,-0.385469,0.311103,0.983104,1.32124,0.210698,102.547089,31.371405
4,40,D2-1,12.0,2888250.0,732.385986,695.191467,10.759619,5870904.0,350.0,1094.0,...,0.714112,0.003348,1.105665,-0.39959,0.634871,0.935878,3.227834,0.882133,216.04596,62.387229


In [16]:
sc_profile_df.to_parquet(sc_profile_output_path, index=False)
sc_profile_df.head()

Unnamed: 0,object_id,image_set,parent_organoid,Area.Size.Shape_Nuclei_VOLUME,Area.Size.Shape_Nuclei_CENTER.X,Area.Size.Shape_Nuclei_CENTER.Y,Area.Size.Shape_Nuclei_CENTER.Z,Area.Size.Shape_Nuclei_BBOX.VOLUME,Area.Size.Shape_Nuclei_MIN.X,Area.Size.Shape_Nuclei_MAX.X,...,Texture_Cytoplasm_Mito_Difference.Entropy_256.3,Texture_Cytoplasm_Mito_Difference.Variance_256.3,Texture_Cytoplasm_Mito_Entropy_256.3,Texture_Cytoplasm_Mito_Information.Measure.of.Correlation.1_256.3,Texture_Cytoplasm_Mito_Information.Measure.of.Correlation.2_256.3,Texture_Cytoplasm_Mito_Inverse.Difference.Moment_256.3,Texture_Cytoplasm_Mito_Sum.Average_256.3,Texture_Cytoplasm_Mito_Sum.Entropy_256.3,Texture_Cytoplasm_Mito_Sum.Variance_256.3,Texture_Cytoplasm_Mito_Variance_256.3
0,11,D2-1,-1,7508.0,804.349487,611.009583,0.5,10064.0,771.0,839.0,...,0.002992,0.00389,0.003327,-0.169203,0.016471,0.999833,0.032149,0.003114,6.753211,2.542119
1,45,D2-1,5,60100.0,630.456909,824.483215,3.269517,563850.0,538.0,853.0,...,0.006619,0.003888,0.007375,-0.182837,0.031831,0.999645,0.013618,0.006823,1.03639,0.443374
2,45,D2-1,5,60100.0,630.456909,824.483215,3.269517,563850.0,538.0,853.0,...,0.006619,0.003888,0.007375,-0.182837,0.031831,0.999645,0.013618,0.006823,1.03639,0.443374
3,113,D2-1,-1,151504.0,842.568726,189.661865,11.295741,285114.0,775.0,897.0,...,0.042757,0.003868,0.054309,-0.446293,0.174716,0.997069,0.232061,0.048815,19.388863,5.642771
4,118,D2-1,-1,3851.0,937.378113,1072.717529,3.465334,5760.0,910.0,970.0,...,0.000443,0.003891,0.000466,-0.067374,0.003151,0.999981,0.001017,0.000445,0.092373,0.04148
