# Perform single-cell level quality control

In [1]:
import pandas as pd
import pathlib
from cosmicqc import find_outliers

## Load in each single-cell level profile per patient and process

1. Load the single-cell data for each patient and add a `patient_id` column.
2. Load the matching organoid QC data (only the metadata and `cqc` columns) so that cells belonging to an already-flagged organoid can be flagged up front.
   - Also flag single cells that have no organoid segmentation (`parent_organoid == -1`).
   - Also flag single cells whose `object_id` is NaN.
3. Concatenate the single-cell data from all patients into one dataframe.

In [2]:
# Path to patient folders
path_to_patients = pathlib.Path(
    "/home/jenna/mnt/bandicoot/NF1_organoid_processed_patients/"
)

# One single-cell dataframe per patient; concatenated once after the loop
dfs = []
for patient_folder in path_to_patients.iterdir():
    single_cell_file = (
        patient_folder / "image_based_profiles/1.combined_profiles" / "sc.parquet"
    )
    organoid_flags_file = (
        patient_folder
        / "image_based_profiles/1a.qc_profiles"
        / "organoid_flagged_outliers.parquet"
    )

    # Entries without a combined single-cell profile are skipped entirely
    if not single_cell_file.exists():
        continue

    sc_df = pd.read_parquet(single_cell_file)
    sc_df["patient_id"] = patient_folder.name

    # Default QC flags
    sc_df["cqc.organoid_flagged"] = False
    sc_df["cqc.nan_detected"] = sc_df["object_id"].isna()
    sc_df["cqc.missing_organoid"] = sc_df["parent_organoid"] == -1

    if organoid_flags_file.exists():
        # Read the organoid QC file once and keep only the identifier
        # columns plus every column whose name starts with "cqc"
        organoid_qc_df = pd.read_parquet(organoid_flags_file)
        cqc_columns = [
            col for col in organoid_qc_df.columns if col.startswith("cqc")
        ]
        organoid_flags_df = organoid_qc_df[["object_id", "image_set"] + cqc_columns]

        # Get flagged (object_id, image_set) pairs: organoids with at least
        # one truthy cqc column
        flagged_pairs = set(
            organoid_flags_df.loc[
                organoid_flags_df[cqc_columns].any(axis=1),
                ["object_id", "image_set"],
            ].itertuples(index=False, name=None)
        )

        # Flag SC rows where both parent_organoid & image_set match a flagged
        # organoid. Only the two key columns are iterated (instead of a
        # row-wise apply over every feature column), and the explicit bool
        # Series keeps the column well-formed even when sc_df has no rows.
        sc_df["cqc.organoid_flagged"] = pd.Series(
            [
                pair in flagged_pairs
                for pair in zip(sc_df["parent_organoid"], sc_df["image_set"])
            ],
            index=sc_df.index,
            dtype=bool,
        )

    dfs.append(sc_df)

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when nothing was loaded
if not dfs:
    raise FileNotFoundError(
        f"No single-cell profiles (sc.parquet) found under {path_to_patients}"
    )

orig_single_cell_profiles_df = pd.concat(dfs, ignore_index=True)

print(orig_single_cell_profiles_df.shape)
orig_single_cell_profiles_df.head()

(11283, 1362)


Unnamed: 0,object_id,image_set,parent_organoid,Area.Size.Shape_Nuclei_VOLUME,Area.Size.Shape_Nuclei_CENTER.X,Area.Size.Shape_Nuclei_CENTER.Y,Area.Size.Shape_Nuclei_CENTER.Z,Area.Size.Shape_Nuclei_BBOX.VOLUME,Area.Size.Shape_Nuclei_MIN.X,Area.Size.Shape_Nuclei_MAX.X,...,Texture_Cytoplasm_Mito_Information.Measure.of.Correlation.2_256.3,Texture_Cytoplasm_Mito_Inverse.Difference.Moment_256.3,Texture_Cytoplasm_Mito_Sum.Average_256.3,Texture_Cytoplasm_Mito_Sum.Entropy_256.3,Texture_Cytoplasm_Mito_Sum.Variance_256.3,Texture_Cytoplasm_Mito_Variance_256.3,patient_id,cqc.organoid_flagged,cqc.nan_detected,cqc.missing_organoid
0,63,F10-3,-1,28320.0,1248.030884,1102.819214,3.942938,59598.0,1204.0,1290.0,...,0.092827,0.998877,0.044396,0.020818,2.862554,1.015763,NF0018,False,False,True
1,127,F10-3,-1,13926.0,318.801086,457.011475,5.499138,21760.0,283.0,363.0,...,0.902757,0.805164,8.739858,2.151855,184.892341,47.741621,NF0018,False,False,True
2,255,F10-3,-1,3187.0,899.354858,950.341675,6.448698,4340.0,885.0,916.0,...,0.029806,0.999749,0.025128,0.00471,3.03407,1.184079,NF0018,False,False,True
3,39,G8-1,20,25715.0,414.767761,520.522827,14.032004,38808.0,379.0,456.0,...,0.260216,0.995318,0.941244,0.073568,168.183126,45.306241,NF0018,False,False,False
4,47,G8-1,20,22188.0,130.78772,832.4729,14.043807,30660.0,102.0,162.0,...,0.024222,0.999847,0.019745,0.00279,2.774002,1.060193,NF0018,False,False,False


## Detect outlier single-cells using the non-flagged data

We will attempt to detect instances of poor quality segmentations using the nuclei compartment as the base. The conditions we are using are as follows:

1. Abnormally small or large nuclei, based on nuclei volume (`Area.Size.Shape_Nuclei_VOLUME`)
2. Abnormally high mass displacement in the nuclei (`Intensity_Nuclei_DNA_MASS.DISPLACEMENT`), which points to mis-segmented background or nuclei that are no longer in focus

In [3]:
# Columns handed to the QC step as metadata rather than as features.
# The list is assembled from three groups; the resulting order is unchanged.
metadata_columns = (
    # Identifiers of the patient, image set, cell and its parent organoid
    ["patient_id", "image_set", "object_id", "parent_organoid"]
    # Nuclei centre coordinates (X / Y)
    + ["Area.Size.Shape_Nuclei_CENTER.X", "Area.Size.Shape_Nuclei_CENTER.Y"]
    # QC flags assigned while loading the profiles
    + ["cqc.nan_detected", "cqc.organoid_flagged", "cqc.missing_organoid"]
)

In [4]:
def _flag_outlier_rows(profiles_df, outliers_df, flag_column):
    """Add boolean `flag_column` to `profiles_df`, True only on the rows of `outliers_df`.

    `profiles_df` is modified in place. Rows are matched by index label, so
    `outliers_df` must carry index labels that exist in `profiles_df`.
    """
    # Create the column first so every non-outlier row is explicitly False
    profiles_df[flag_column] = False
    profiles_df.loc[outliers_df.index, flag_column] = True


def _count_shared_rows(reference_df, outliers_df):
    """Return how many index labels of `outliers_df` also occur in `reference_df`."""
    return reference_df.index.intersection(outliers_df.index).shape[0]


# Run the nuclei-based outlier detection separately for every plate (patient_id)
for plate_name, plate_df in orig_single_cell_profiles_df.groupby("patient_id"):
    print(f"Processing plate: {plate_name}")

    # Copy the group so the flag columns are added to a standalone frame
    plate_df = plate_df.copy()

    # Keep only cells that passed every earlier check, i.e.
    # not (nan_detected OR organoid_flagged OR missing_organoid)
    filtered_plate_df = plate_df[
        ~plate_df["cqc.nan_detected"]
        & ~plate_df["cqc.organoid_flagged"]
        & ~plate_df["cqc.missing_organoid"]
    ]

    # --- Nuclei size outliers ---
    # NOTE(review): the threshold values are passed straight to cosmicqc's
    # find_outliers; presumably z-score cutoffs — confirm in the cosmicqc docs.
    print("Finding small nuclei outliers...")
    small_nuclei_outliers = find_outliers(
        df=filtered_plate_df,
        metadata_columns=metadata_columns,
        # Detect very small nuclei
        feature_thresholds={"Area.Size.Shape_Nuclei_VOLUME": -1},
    )
    _flag_outlier_rows(plate_df, small_nuclei_outliers, "cqc.small_nuclei_outlier")

    print("Finding large nuclei outliers...")
    large_nuclei_outliers = find_outliers(
        df=filtered_plate_df,
        metadata_columns=metadata_columns,
        # Detect very large nuclei
        feature_thresholds={"Area.Size.Shape_Nuclei_VOLUME": 2},
    )
    _flag_outlier_rows(plate_df, large_nuclei_outliers, "cqc.large_nuclei_outlier")

    # --- Nuclei mass displacement outliers ---
    print("Finding high mass displacement outliers...")
    high_mass_displacement_outliers = find_outliers(
        df=filtered_plate_df,
        metadata_columns=metadata_columns,
        # Detect high mass displacement
        feature_thresholds={"Intensity_Nuclei_DNA_MASS.DISPLACEMENT": 2},
    )
    _flag_outlier_rows(
        plate_df, high_mass_displacement_outliers, "cqc.mass_displacement_outlier"
    )

    # Report how many outliers were found among the filtered (unflagged) rows
    small_count = _count_shared_rows(filtered_plate_df, small_nuclei_outliers)
    large_count = _count_shared_rows(filtered_plate_df, large_nuclei_outliers)
    high_mass_count = _count_shared_rows(
        filtered_plate_df, high_mass_displacement_outliers
    )

    print(f"Small nuclei outliers found: {small_count}")
    print(f"Large nuclei outliers found: {large_count}")
    print(f"High mass displacement outliers found: {high_mass_count}")

    # Write the full plate (all rows, including previously flagged ones) with
    # the new flag columns next to the organoid QC output of the same patient
    output_folder = (
        path_to_patients / plate_name / "image_based_profiles/1a.qc_profiles"
    )
    output_folder.mkdir(parents=True, exist_ok=True)
    output_file = output_folder / "sc_flagged_outliers.parquet"
    plate_df.to_parquet(output_file, index=False)
    print(f"Saved single-cell profiles with outlier flags to {output_file}\n")

Processing plate: NF0014
Finding small nuclei outliers...
Number of outliers: 175 (14.57%)
Outliers Range:
Area.Size.Shape_Nuclei_VOLUME Min: 350.0
Area.Size.Shape_Nuclei_VOLUME Max: 19155.0
Finding large nuclei outliers...
Number of outliers: 67 (5.58%)
Outliers Range:
Area.Size.Shape_Nuclei_VOLUME Min: 163455.0
Area.Size.Shape_Nuclei_VOLUME Max: 310499.0
Finding high mass displacement outliers...
Number of outliers: 18 (1.50%)
Outliers Range:
Intensity_Nuclei_DNA_MASS.DISPLACEMENT Min: 1588.0222
Intensity_Nuclei_DNA_MASS.DISPLACEMENT Max: 1924.7913
Small nuclei outliers found: 175
Large nuclei outliers found: 67
High mass displacement outliers found: 18
Saved single-cell profiles with outlier flags to /home/jenna/mnt/bandicoot/NF1_organoid_processed_patients/NF0014/image_based_profiles/1a.qc_profiles/sc_flagged_outliers.parquet

Processing plate: NF0016
Finding small nuclei outliers...
Number of outliers: 43 (16.10%)
Outliers Range:
Area.Size.Shape_Nuclei_VOLUME Min: 355.0
Area.Size.

In [5]:
# Print example output of the flagged single-cell profiles.
# NOTE: `plate_name` and `plate_df` are the variables left over from the final
# iteration of the per-plate loop, so this displays the last plate processed
# and only works after that loop has run at least once.
print(f"Example flagged single-cell profiles: {plate_name}")
print(plate_df.shape)
plate_df.head()

Example flagged single-cell profiles: SARCO361
(1503, 1365)


Unnamed: 0,object_id,image_set,parent_organoid,Area.Size.Shape_Nuclei_VOLUME,Area.Size.Shape_Nuclei_CENTER.X,Area.Size.Shape_Nuclei_CENTER.Y,Area.Size.Shape_Nuclei_CENTER.Z,Area.Size.Shape_Nuclei_BBOX.VOLUME,Area.Size.Shape_Nuclei_MIN.X,Area.Size.Shape_Nuclei_MAX.X,...,Texture_Cytoplasm_Mito_Sum.Entropy_256.3,Texture_Cytoplasm_Mito_Sum.Variance_256.3,Texture_Cytoplasm_Mito_Variance_256.3,patient_id,cqc.organoid_flagged,cqc.nan_detected,cqc.missing_organoid,cqc.small_nuclei_outlier,cqc.large_nuclei_outlier,cqc.mass_displacement_outlier
6546,63,F6-4,-1,33439.0,280.742004,924.894287,7.15763,67150.0,247.0,326.0,...,0.074045,50.896208,14.104151,SARCO361,False,False,True,False,False,False
6547,127,F6-4,12,48327.0,856.496338,385.540802,8.599976,84840.0,808.0,909.0,...,0.094164,21.10297,5.665539,SARCO361,True,False,False,False,False,False
6548,191,F6-4,-1,51519.0,206.176987,991.958923,6.966712,67774.0,156.0,259.0,...,0.050947,70.259518,23.01607,SARCO361,False,False,True,False,False,False
6549,19,C5-4,45,39623.0,994.490601,753.775269,13.942836,58618.0,952.0,1031.0,...,0.024333,4.438447,1.314797,SARCO361,False,False,False,False,False,False
6550,39,C5-4,45,56150.0,780.578857,817.812744,17.611238,107328.0,742.0,828.0,...,0.024122,64.362335,19.475377,SARCO361,False,False,False,False,False,False
