# Perform organoid-level quality control

In [1]:
import pandas as pd
import pathlib
from cosmicqc import find_outliers

## Load in all the organoid profiles and concat together

In [2]:
# Path to patient folders
path_to_patients = pathlib.Path(
    "/home/jenna/mnt/bandicoot/NF1_organoid_processed_patients/"
)

# Get all organoid profiles per patient folder and concatenate them
dfs = []
for patient_folder in path_to_patients.iterdir():
    organoid_file = (
        patient_folder / "image_based_profiles/1.combined_profiles" / "organoid.parquet"
    )
    if organoid_file.exists():
        df = pd.read_parquet(organoid_file)
        df["patient_id"] = patient_folder.name
        # Group by image_set and count organoids
        organoid_counts = (
            df.groupby("image_set")["object_id"].count().rename("organoid_count")
        )
        df = df.merge(organoid_counts, on="image_set", how="left")
        dfs.append(df)
orig_organoid_profiles_df = pd.concat(dfs, ignore_index=True)

# Print the shape and head of the combined organoid profiles DataFrame
print(orig_organoid_profiles_df.shape)
orig_organoid_profiles_df.head()

(1511, 456)


Unnamed: 0,object_id,image_set,single_cell_count,Area.Size.Shape_Organoid_VOLUME,Area.Size.Shape_Organoid_CENTER.X,Area.Size.Shape_Organoid_CENTER.Y,Area.Size.Shape_Organoid_CENTER.Z,Area.Size.Shape_Organoid_BBOX.VOLUME,Area.Size.Shape_Organoid_MIN.X,Area.Size.Shape_Organoid_MAX.X,...,Texture_Organoid_Mito_Entropy_256.3,Texture_Organoid_Mito_Information.Measure.of.Correlation.1_256.3,Texture_Organoid_Mito_Information.Measure.of.Correlation.2_256.3,Texture_Organoid_Mito_Inverse.Difference.Moment_256.3,Texture_Organoid_Mito_Sum.Average_256.3,Texture_Organoid_Mito_Sum.Entropy_256.3,Texture_Organoid_Mito_Sum.Variance_256.3,Texture_Organoid_Mito_Variance_256.3,patient_id,organoid_count
0,3.0,F10-3,,1781460.0,345.987488,488.705536,1.5,2374240.0,0.0,760.0,...,0.73481,-0.282089,0.418897,0.940579,1.526162,0.641485,37.087719,11.696999,NF0018,1
1,20.0,G8-1,27.0,7292209.0,724.284119,708.943604,23.929886,95081560.0,18.0,1408.0,...,0.860797,-0.270331,0.425176,0.923478,6.401641,0.734478,527.768642,171.57946,NF0018,1
2,29.0,D5-1,5.0,7947467.0,939.415649,710.585999,10.794022,25843536.0,316.0,1540.0,...,2.351234,-0.270822,0.664101,0.768638,17.046799,1.929867,939.303593,281.843141,NF0018,1
3,,F8-1,,,,,,,,,...,,,,,,,,,NF0018,0
4,11.0,G11-1,1.0,349071.0,941.275452,752.774719,14.133512,7905168.0,633.0,1416.0,...,0.148919,-0.226451,0.108759,0.988185,0.860475,0.134373,69.363807,27.387181,NF0018,1


## Perform a first round of QC by flagging any row with NaNs in metadata

We check for NaNs in the `object_id` and/or the `single_cell_count` column and flag them because:
   - An organoid can not exist if there aren't any cells.
   - NaN in object_id would be incorrect as that means the object/organoid does not exist (will have all NaNs in the feature space).

In [3]:
organoid_profiles_df = orig_organoid_profiles_df.copy()
organoid_profiles_df["cqc.nan_detected"] = (
    organoid_profiles_df[["object_id", "single_cell_count"]].isna().any(axis=1)
)

# Print the number of organoids flagged
flagged_count = organoid_profiles_df["cqc.nan_detected"].sum()
print(f"Number of organoids flagged: {flagged_count}")

organoid_profiles_df.head()

Number of organoids flagged: 458


Unnamed: 0,object_id,image_set,single_cell_count,Area.Size.Shape_Organoid_VOLUME,Area.Size.Shape_Organoid_CENTER.X,Area.Size.Shape_Organoid_CENTER.Y,Area.Size.Shape_Organoid_CENTER.Z,Area.Size.Shape_Organoid_BBOX.VOLUME,Area.Size.Shape_Organoid_MIN.X,Area.Size.Shape_Organoid_MAX.X,...,Texture_Organoid_Mito_Information.Measure.of.Correlation.1_256.3,Texture_Organoid_Mito_Information.Measure.of.Correlation.2_256.3,Texture_Organoid_Mito_Inverse.Difference.Moment_256.3,Texture_Organoid_Mito_Sum.Average_256.3,Texture_Organoid_Mito_Sum.Entropy_256.3,Texture_Organoid_Mito_Sum.Variance_256.3,Texture_Organoid_Mito_Variance_256.3,patient_id,organoid_count,cqc.nan_detected
0,3.0,F10-3,,1781460.0,345.987488,488.705536,1.5,2374240.0,0.0,760.0,...,-0.282089,0.418897,0.940579,1.526162,0.641485,37.087719,11.696999,NF0018,1,True
1,20.0,G8-1,27.0,7292209.0,724.284119,708.943604,23.929886,95081560.0,18.0,1408.0,...,-0.270331,0.425176,0.923478,6.401641,0.734478,527.768642,171.57946,NF0018,1,False
2,29.0,D5-1,5.0,7947467.0,939.415649,710.585999,10.794022,25843536.0,316.0,1540.0,...,-0.270822,0.664101,0.768638,17.046799,1.929867,939.303593,281.843141,NF0018,1,False
3,,F8-1,,,,,,,,,...,,,,,,,,NF0018,0,True
4,11.0,G11-1,1.0,349071.0,941.275452,752.774719,14.133512,7905168.0,633.0,1416.0,...,-0.226451,0.108759,0.988185,0.860475,0.134373,69.363807,27.387181,NF0018,1,False


## Process non-NaN rows to detect abnormally small and large organoids and flag them

In [4]:
# Set the metadata columns to be used in the QC process
metadata_columns = [
    "patient_id",
    "image_set",
    "object_id",
    "single_cell_count",
    "organoid_count",
    "cqc.nan_detected",
]

In [5]:
# Process each plate (patient_id) independently in the combined dataframe
for plate_name, plate_df in organoid_profiles_df.groupby("patient_id"):
    print(f"Processing plate: {plate_name}")

    # Only process the rows that are not flagged
    filtered_plate_df = plate_df[~plate_df["cqc.nan_detected"]]

    # Find outlier organoids based on the 'Area.Size.Shape_Organoid_VOLUME' column
    print("Finding small organoid outliers...")
    small_size_outliers = find_outliers(
        df=filtered_plate_df,
        metadata_columns=metadata_columns,
        feature_thresholds={
            "Area.Size.Shape_Organoid_VOLUME": -1,  # Detect very small organoids
        },
    )

    # Ensure the column exists before assignment
    plate_df["cqc.small_organoid_outlier"] = False
    plate_df.loc[small_size_outliers.index, "cqc.small_organoid_outlier"] = True

    print("Finding large organoid outliers...")
    large_size_outliers = find_outliers(
        df=filtered_plate_df,
        metadata_columns=metadata_columns,
        feature_thresholds={
            "Area.Size.Shape_Organoid_VOLUME": 3,  # Detect very large organoids
        },
    )

    # Ensure the column exists before assignment
    plate_df["cqc.large_organoid_outlier"] = False
    plate_df.loc[large_size_outliers.index, "cqc.large_organoid_outlier"] = True

    # Update original dataframe so flags persist
    organoid_profiles_df.loc[plate_df.index, :] = plate_df

    # Print number of outliers (only in filtered rows)
    small_count = filtered_plate_df.index.intersection(small_size_outliers.index).shape[
        0
    ]
    large_count = filtered_plate_df.index.intersection(large_size_outliers.index).shape[
        0
    ]
    print(f"Small organoid outliers found: {small_count}")
    print(f"Large organoid outliers found: {large_count}")

    # Save updated plate_df with flag columns included
    output_folder = (
        path_to_patients / plate_name / "image_based_profiles/1a.qc_profiles"
    )
    output_folder.mkdir(parents=True, exist_ok=True)
    output_file = output_folder / "organoid_flagged_outliers.parquet"
    plate_df.to_parquet(output_file, index=False)
    print(f"Saved organoid profiles with outlier flags to {output_file}\n")

Processing plate: NF0014
Finding small organoid outliers...
Number of outliers: 9 (9.47%)
Outliers Range:
Area.Size.Shape_Organoid_VOLUME Min: 1187129.0
Area.Size.Shape_Organoid_VOLUME Max: 2721536.0
Finding large organoid outliers...
Number of outliers: 2 (2.11%)
Outliers Range:
Area.Size.Shape_Organoid_VOLUME Min: 45764776.0
Area.Size.Shape_Organoid_VOLUME Max: 66231340.0
Small organoid outliers found: 9
Large organoid outliers found: 2
Saved organoid profiles with outlier flags to /home/jenna/mnt/bandicoot/NF1_organoid_processed_patients/NF0014/image_based_profiles/1a.qc_profiles/organoid_flagged_outliers.parquet

Processing plate: NF0016
Finding small organoid outliers...
Number of outliers: 2 (3.85%)
Outliers Range:
Area.Size.Shape_Organoid_VOLUME Min: 138472.0
Area.Size.Shape_Organoid_VOLUME Max: 1473650.0
Finding large organoid outliers...
Number of outliers: 0 (0.00%)
Outliers Range:
Area.Size.Shape_Organoid_VOLUME Min: nan
Area.Size.Shape_Organoid_VOLUME Max: nan
Small organoi

In [6]:
# Print example output of the flagged organoid profiles
print(f"Example flagged organoid profiles: {plate_name}")
print(plate_df.shape)
plate_df.head()

Example flagged organoid profiles: SARCO361
(229, 459)


Unnamed: 0,object_id,image_set,single_cell_count,Area.Size.Shape_Organoid_VOLUME,Area.Size.Shape_Organoid_CENTER.X,Area.Size.Shape_Organoid_CENTER.Y,Area.Size.Shape_Organoid_CENTER.Z,Area.Size.Shape_Organoid_BBOX.VOLUME,Area.Size.Shape_Organoid_MIN.X,Area.Size.Shape_Organoid_MAX.X,...,Texture_Organoid_Mito_Inverse.Difference.Moment_256.3,Texture_Organoid_Mito_Sum.Average_256.3,Texture_Organoid_Mito_Sum.Entropy_256.3,Texture_Organoid_Mito_Sum.Variance_256.3,Texture_Organoid_Mito_Variance_256.3,patient_id,organoid_count,cqc.nan_detected,cqc.small_organoid_outlier,cqc.large_organoid_outlier
811,3,G10-7,,92.0,729.043457,932.782593,1.5,1392.0,723.0,735.0,...,0.999996,0.000788,9.7e-05,0.148239,0.072444,SARCO361,1,True,False,False
812,12,F6-4,1.0,663.0,1147.567139,816.111633,4.803922,11337396.0,534.0,1505.0,...,0.999965,0.005419,0.000671,0.85881,0.416273,SARCO361,1,False,True,False
813,3,C8-7,,428.0,885.102783,635.411194,1.5,87576.0,798.0,976.0,...,0.999989,0.001659,0.000249,0.25635,0.124124,SARCO361,1,True,False,False
814,45,C5-4,12.0,10500540.0,785.57074,791.923218,24.563698,58991504.0,322.0,1196.0,...,0.924954,5.577156,0.922488,385.51961,108.164886,SARCO361,1,False,False,False
815,5,F10-3,,251685.0,1174.855347,1460.179688,17.609604,358530.0,1020.0,1343.0,...,0.995038,0.7181,0.074539,101.321235,29.294415,SARCO361,1,True,False,False
