# Perform organoid-level quality control

In [1]:
import os
import pathlib
import sys

import pandas as pd
from arg_parsing_utils import parse_args
from cosmicqc import find_outliers
from notebook_init_utils import bandicoot_check, init_notebook

root_dir, in_notebook = init_notebook()

profile_base_dir = bandicoot_check(
    pathlib.Path(os.path.expanduser("~/mnt/bandicoot/NF1_organoid_data")).resolve(),
    root_dir,
)

In [2]:
if not in_notebook:
    args = parse_args()
    image_based_profiles_subparent_name = args["image_based_profiles_subparent_name"]

else:
    image_based_profiles_subparent_name = "image_based_profiles"

## Load in all the organoid profiles and concat together

In [None]:
# Path to patient folders
path_to_patients = pathlib.Path(f"{profile_base_dir}/data/")

# Get all organoid profiles per patient folder and concatenate them
dfs = []
for patient_folder in path_to_patients.iterdir():
    organoid_file = pathlib.Path(
        patient_folder
        / f"{image_based_profiles_subparent_name}"
        / "2.annotated_profiles/organoid_anno.parquet"
    )
    if organoid_file.exists():
        print(f"Processing {organoid_file}")
        df = pd.read_parquet(organoid_file)
        dfs.append(df)
    else:
        print(f"Organoid profiles file not found for patient folder: {patient_folder}")
orig_organoid_profiles_df = pd.concat(dfs, ignore_index=True)

# Print the shape and head of the combined organoid profiles DataFrame
print(orig_organoid_profiles_df.shape)
orig_organoid_profiles_df.head()

Processing /home/lippincm/mnt/bandicoot/NF1_organoid_data/data/NF0014_T2/image_based_profiles/2.annotated_profiles/organoid_anno.parquet
Organoid profiles file not found for patient folder: /home/lippincm/mnt/bandicoot/NF1_organoid_data/data/NF0040_T1
Processing /home/lippincm/mnt/bandicoot/NF1_organoid_data/data/NF0030_T1/image_based_profiles/2.annotated_profiles/organoid_anno.parquet
Organoid profiles file not found for patient folder: /home/lippincm/mnt/bandicoot/NF1_organoid_data/data/NF0037_T1-Z-1
Organoid profiles file not found for patient folder: /home/lippincm/mnt/bandicoot/NF1_organoid_data/data/.DS_Store
Organoid profiles file not found for patient folder: /home/lippincm/mnt/bandicoot/NF1_organoid_data/data/patient_IDs.txt
Organoid profiles file not found for patient folder: /home/lippincm/mnt/bandicoot/NF1_organoid_data/data/blocklist
Organoid profiles file not found for patient folder: /home/lippincm/mnt/bandicoot/NF1_organoid_data/data/NF0037_T1-Z-0.1
Organoid profiles fi

Unnamed: 0,Metadata_patient_tumor,Metadata_object_id,Metadata_unit,Metadata_dose,Metadata_treatment,Metadata_image_set,Metadata_Well,Metadata_single_cell_count,Area.Size.Shape_Organoid_VOLUME,Area.Size.Shape_Organoid_CENTER.X,...,Texture_Organoid_Mito_Sum.Average_256.3,Texture_Organoid_Mito_Sum.Entropy_256.3,Texture_Organoid_Mito_Sum.Variance_256.3,Texture_Organoid_Mito_Variance_256.3,Metadata_Target,Metadata_Class,Metadata_Therapeutic_Categories,Metadata_patient,Metadata_tumor,Metadata_patient_id
0,NF0014_T2,14,nM,10.0,Staurosporine,C11-4,C11,,1296580.0,824.287109,...,2.163053,0.278313,228.354152,70.320466,Apoptosis induction,Small Molecule,Experimental,NF0014,T2,NF0014_T2
1,NF0014_T2,18,nM,10.0,Staurosporine,C11-4,C11,,128.0,604.625,...,0.000727,8.2e-05,0.145701,0.071439,Apoptosis induction,Small Molecule,Experimental,NF0014,T2,NF0014_T2
2,NF0014_T2,10,uM,1.0,Selumetinib,D11-5,D11,,2832086.0,760.868103,...,2.067724,0.73414,92.356472,27.130117,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor,NF0014,T2,NF0014_T2
3,NF0014_T2,14,uM,1.0,Selumetinib,D11-5,D11,,288.0,842.194458,...,0.001829,0.000232,0.316325,0.146497,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor,NF0014,T2,NF0014_T2
4,NF0014_T2,26,uM,1.0,Selumetinib,D11-5,D11,,1772.0,1003.268616,...,0.008353,0.00092,1.516083,0.67605,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor,NF0014,T2,NF0014_T2


## Perform a first round of QC by flagging any row with NaNs in metadata

We check for NaNs in the `object_id` and/or the `single_cell_count` column and flag them because:
   - An organoid can not exist if there aren't any cells.
   - NaN in object_id would be incorrect as that means the object/organoid does not exist (will have all NaNs in the feature space).

In [None]:
organoid_profiles_df = orig_organoid_profiles_df.copy()
organoid_profiles_df["Metadata_cqc_nan_detected"] = (
    organoid_profiles_df[
        [
            "Metadata_object_id",
            "Metadata_single_cell_count",
            "Area.Size.Shape_Organoid_VOLUME",
        ]
    ]
    .isna()
    .any(axis=1)
)
# Print the number of organoids flagged
flagged_count = organoid_profiles_df["Metadata_cqc_nan_detected"].sum()
print(f"Number of organoids flagged: {flagged_count}")

organoid_profiles_df.head()

Number of organoids flagged: 3453


Unnamed: 0,Metadata_patient_tumor,Metadata_object_id,Metadata_unit,Metadata_dose,Metadata_treatment,Metadata_image_set,Metadata_Well,Metadata_single_cell_count,Area.Size.Shape_Organoid_VOLUME,Area.Size.Shape_Organoid_CENTER.X,...,Texture_Organoid_Mito_Sum.Entropy_256.3,Texture_Organoid_Mito_Sum.Variance_256.3,Texture_Organoid_Mito_Variance_256.3,Metadata_Target,Metadata_Class,Metadata_Therapeutic_Categories,Metadata_patient,Metadata_tumor,Metadata_patient_id,cqc.nan_detected
0,NF0014_T2,14,nM,10.0,Staurosporine,C11-4,C11,,1296580.0,824.287109,...,0.278313,228.354152,70.320466,Apoptosis induction,Small Molecule,Experimental,NF0014,T2,NF0014_T2,True
1,NF0014_T2,18,nM,10.0,Staurosporine,C11-4,C11,,128.0,604.625,...,8.2e-05,0.145701,0.071439,Apoptosis induction,Small Molecule,Experimental,NF0014,T2,NF0014_T2,True
2,NF0014_T2,10,uM,1.0,Selumetinib,D11-5,D11,,2832086.0,760.868103,...,0.73414,92.356472,27.130117,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor,NF0014,T2,NF0014_T2,True
3,NF0014_T2,14,uM,1.0,Selumetinib,D11-5,D11,,288.0,842.194458,...,0.000232,0.316325,0.146497,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor,NF0014,T2,NF0014_T2,True
4,NF0014_T2,26,uM,1.0,Selumetinib,D11-5,D11,,1772.0,1003.268616,...,0.00092,1.516083,0.67605,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor,NF0014,T2,NF0014_T2,True


## Process non-NaN rows to detect abnormally small and large organoids and flag them

In [None]:
# Set the metadata columns to be used in the QC process
metadata_columns = [
    "Metadata_patient_tumor",
    "Metadata_image_set",
    "Metadata_single_cell_count",
    "Metadata_object_id",
    "Metadata_cqc_nan_detected",
]

In [None]:
# Process each plate (patient_id) independently in the combined dataframe
for plate_name, plate_df in organoid_profiles_df.groupby("Metadata_patient_tumor"):
    print(f"Processing plate: {plate_name}")

    # Only process the rows that are not flagged
    filtered_plate_df = plate_df[~plate_df["Metadata_cqc_nan_detected"]]

    # Find outlier organoids based on the 'Area.Size.Shape_Organoid_VOLUME' column
    print("Finding small organoid outliers...")
    small_size_outliers = find_outliers(
        df=filtered_plate_df,
        metadata_columns=metadata_columns,
        feature_thresholds={
            "Area.Size.Shape_Organoid_VOLUME": -1,  # Detect very small organoids
        },
    )

    # Ensure the column exists before assignment
    plate_df["Metadata_cqc_small_organoid_outlier"] = False
    plate_df.loc[small_size_outliers.index, "Metadata_cqc_small_organoid_outlier"] = (
        True
    )

    print("Finding large organoid outliers...")
    large_size_outliers = find_outliers(
        df=filtered_plate_df,
        metadata_columns=metadata_columns,
        feature_thresholds={
            "Area.Size.Shape_Organoid_VOLUME": 3,  # Detect very large organoids
        },
    )

    # Ensure the column exists before assignment
    plate_df["Metadata_cqc_large_organoid_outlier"] = False
    plate_df.loc[large_size_outliers.index, "Metadata_cqc_large_organoid_outlier"] = (
        True
    )

    # Update original dataframe so flags persist
    organoid_profiles_df.loc[plate_df.index, :] = plate_df

    # Print number of outliers (only in filtered rows)
    small_count = filtered_plate_df.index.intersection(small_size_outliers.index).shape[
        0
    ]
    large_count = filtered_plate_df.index.intersection(large_size_outliers.index).shape[
        0
    ]
    print(f"Small organoid outliers found: {small_count}")
    print(f"Large organoid outliers found: {large_count}")

    # Save updated plate_df with flag columns included
    output_folder = path_to_patients / plate_name / "image_based_profiles/3.qc_profiles"
    output_folder.mkdir(parents=True, exist_ok=True)
    output_file = output_folder / "organoid_flagged_outliers.parquet"
    plate_df.to_parquet(output_file, index=False)
    print(f"Saved organoid profiles with outlier flags to {output_file}\n")

Processing plate: NF0014_T1
Finding small organoid outliers...
Number of outliers: 12 (15.19%)
Outliers Range:
Area.Size.Shape_Organoid_VOLUME Min: 460122.0
Area.Size.Shape_Organoid_VOLUME Max: 2018826.0
Finding large organoid outliers...
Number of outliers: 1 (1.27%)
Outliers Range:
Area.Size.Shape_Organoid_VOLUME Min: 28609348.0
Area.Size.Shape_Organoid_VOLUME Max: 28609348.0
Small organoid outliers found: 12
Large organoid outliers found: 1
Saved organoid profiles with outlier flags to /home/lippincm/mnt/bandicoot/NF1_organoid_data/data/NF0014_T1/image_based_profiles/1a.qc_profiles/organoid_flagged_outliers.parquet

Processing plate: NF0014_T2
Finding small organoid outliers...
Number of outliers: 14 (10.45%)
Outliers Range:
Area.Size.Shape_Organoid_VOLUME Min: 138328.0
Area.Size.Shape_Organoid_VOLUME Max: 576308.0
Finding large organoid outliers...
Number of outliers: 4 (2.99%)
Outliers Range:
Area.Size.Shape_Organoid_VOLUME Min: 8160154.0
Area.Size.Shape_Organoid_VOLUME Max: 95606

In [11]:
# Print example output of the flagged organoid profiles
print(f"Example flagged organoid profiles: {plate_name}")
print(plate_df.shape)
plate_df.head()

Example flagged organoid profiles: NF0030_T1
(396, 2004)


Unnamed: 0,Metadata_patient_tumor,Metadata_object_id,Metadata_unit,Metadata_dose,Metadata_treatment,Metadata_image_set,Metadata_Well,Metadata_single_cell_count,Area.Size.Shape_Organoid_VOLUME,Area.Size.Shape_Organoid_CENTER.X,...,Texture_Organoid_Mito_Variance_256.3,Metadata_Target,Metadata_Class,Metadata_Therapeutic_Categories,Metadata_patient,Metadata_tumor,Metadata_patient_id,cqc.nan_detected,cqc.small_organoid_outlier,cqc.large_organoid_outlier
1432,NF0030_T1,4,nM,10.0,Staurosporine,C11-4,C11,,2632089.0,882.465332,...,48.541913,Apoptosis induction,Small Molecule,Experimental,NF0030,T1,NF0030_T1,True,False,False
1433,NF0030_T1,8,nM,10.0,Staurosporine,C11-4,C11,,372.0,315.010742,...,0.120026,Apoptosis induction,Small Molecule,Experimental,NF0030,T1,NF0030_T1,True,False,False
1434,NF0030_T1,12,nM,10.0,Staurosporine,C11-4,C11,,1510316.0,264.324921,...,43.012409,Apoptosis induction,Small Molecule,Experimental,NF0030,T1,NF0030_T1,True,False,False
1435,NF0030_T1,30,nM,10.0,Staurosporine,C11-4,C11,,2938900.0,713.619446,...,107.057396,Apoptosis induction,Small Molecule,Experimental,NF0030,T1,NF0030_T1,True,False,False
1436,NF0030_T1,33,nM,10.0,Staurosporine,C11-4,C11,,142092.0,804.719055,...,19.405408,Apoptosis induction,Small Molecule,Experimental,NF0030,T1,NF0030_T1,True,False,False
