# Merge single cells from CellProfiler outputs using CytoTable

In [1]:
import argparse
import pathlib
import pprint
import sys

import pandas as pd
from cytotable import convert, presets

sys.path.append("../../../utils")
import sc_extraction_utils as sc_utils
from parsl.config import Config
from parsl.executors import HighThroughputExecutor

## Set paths and variables

All paths must be strings, but we use pathlib to show which variables are paths.

In [2]:
# Detect whether this code is running inside an IPython/Jupyter kernel.
# `get_ipython` only exists in that environment, so a NameError here means
# the file is being executed as a plain Python script.
try:
    cfg = get_ipython().config
    in_notebook = True
except NameError:
    in_notebook = False

if not in_notebook:
    print("Running as script")
    # When run as a script the input directory comes from the command line.
    parser = argparse.ArgumentParser(
        description=(
            "Merge single cells from a CellProfiler SQLite output into a "
            "parquet file using CytoTable"
        )
    )

    parser.add_argument(
        "--input_dir",
        type=str,
        # Without `required=True` a missing flag yields `None`, which then
        # fails inside pathlib with an unhelpful TypeError.
        required=True,
        help="Path to the input directory containing the CellProfiler .sqlite file",
    )

    args = parser.parse_args()
    # strict=True raises if the directory does not exist.
    input_dir = pathlib.Path(args.input_dir).resolve(strict=True)
else:
    print("Running in a notebook")
    # Default directory used for interactive runs.
    input_dir = pathlib.Path(
        "../../3.cellprofiling/analysis_output/W0052_F0001"
    ).resolve(strict=True)

Running in a notebook


In [3]:
# CytoTable output format (parquet is currently the only supported option)
dest_datatype = "parquet"

# folder that receives the converted parquet files; created on demand,
# including any missing parent folders
output_dir = pathlib.Path("../data/converted_data")
output_dir.mkdir(parents=True, exist_ok=True)

In [4]:
# Locate the .sqlite file inside the input directory. The matches are sorted
# so the choice is deterministic if more than one file is present, and an
# empty result raises a descriptive error instead of a bare IndexError.
sqlite_files = sorted(input_dir.glob("*.sqlite"))
if not sqlite_files:
    raise FileNotFoundError(f"No .sqlite file found in {input_dir}")
sqlite_file = sqlite_files[0]

# Mirror the name of the directory holding the sqlite file under output_dir.
dest_path = output_dir / sqlite_file.parent.name
dest_path.mkdir(exist_ok=True, parents=True)

# Final output file: same stem as the sqlite file, with the output extension.
dest_path = dest_path / f"{sqlite_file.stem}.{dest_datatype}"
print(f"Destination path: {dest_path}")

Destination path: ../data/converted_data/W0052_F0001/pyroptosis_timelapse.parquet


## Set config joins for the preset

In [None]:
# Name of the CytoTable preset to use (based on typical CellProfiler outputs).
preset = "cellprofiler_sqlite_pycytominer"

# Override the preset's join query.
# - Image_Metadata_Plate is left out of the SELECT because that metadata was
#   not extracted from the file names.
# - Image_Metadata_FOV is included because it is important metadata when
#   finding where single cells are located.
presets.config[preset]["CONFIG_JOINS"] = """WITH Per_Image_Filtered AS (
                SELECT
                    Metadata_ImageNumber,
                    Image_Metadata_Time,
                    Image_Metadata_Well,
                    Image_Metadata_FOV,
                    Image_PathName_CL488,
                    Image_PathName_CL561,
                    Image_PathName_GSDM,
                    Image_PathName_BF,
                    Image_PathName_DNA,
                    Image_FileName_CL488,
                    Image_FileName_CL561,
                    Image_FileName_GSDM,
                    Image_FileName_BF,
                    Image_FileName_DNA,
                FROM
                    read_parquet('per_image.parquet')
                )
            SELECT
                *
            FROM
                Per_Image_Filtered AS per_image
            LEFT JOIN read_parquet('per_cytoplasm.parquet') AS per_cytoplasm ON
                per_cytoplasm.Metadata_ImageNumber = per_image.Metadata_ImageNumber
            LEFT JOIN read_parquet('per_cells.parquet') AS per_cells ON
                per_cells.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
                AND per_cells.Metadata_Cells_Number_Object_Number = per_cytoplasm.Metadata_Cytoplasm_Parent_Cells
            LEFT JOIN read_parquet('per_nuclei.parquet') AS per_nuclei ON
                per_nuclei.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
                AND per_nuclei.Metadata_Nuclei_Number_Object_Number = per_cytoplasm.Metadata_Cytoplasm_Parent_Nuclei
            """

## Convert SQLite file and merge single cell objects into parquet file

This was not run to completion, as we use the nbconverted Python file for the full run.

In [6]:
# Parsl configuration passed to CytoTable for this conversion.
htex_config = Config(executors=[HighThroughputExecutor()])

# Convert the sqlite file with the customised preset and write the merged
# single cells to `dest_path`; the call's return value is shown as cell output.
convert(
    source_path=sqlite_file,
    dest_path=dest_path,
    dest_datatype=dest_datatype,
    preset=preset,
    parsl_config=htex_config,
    chunk_size=1000,
)

PosixPath('/home/lippincm/Documents/pyroptosis_live-cell_timelapse/Wave1_data/4.processing_profiled_features/data/converted_data/W0052_F0001/pyroptosis_timelapse.parquet')

In [7]:
# Report where the converted file was written.
converted_name = pathlib.Path(dest_path).name
print(f"Merged and converted {converted_name}!")
print(f"Saved to {dest_path}")

# Load the merged single-cell table back into memory.
df = pd.read_parquet(dest_path)

# Build a combined well/timepoint key of the form "<well>_<time>".
well_column = df["Image_Metadata_Well"]
time_column = df["Image_Metadata_Time"]
df["Metadata_Well_Time"] = well_column + "_" + time_column

print(f"Shape of {converted_name}: {df.shape}")
df.head()

Merged and converted pyroptosis_timelapse.parquet!
Saved to ../data/converted_data/W0052_F0001/pyroptosis_timelapse.parquet
Shape of pyroptosis_timelapse.parquet: (12384, 2779)


Unnamed: 0,Metadata_ImageNumber,Image_Metadata_FOV,Image_Metadata_Time,Image_Metadata_Well,Metadata_Cells_Number_Object_Number,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_Nuclei,Metadata_ImageNumber_1,Metadata_ImageNumber_2,Metadata_ImageNumber_3,...,Nuclei_Texture_Variance_CL561_3_03_256,Nuclei_Texture_Variance_DNA_3_00_256,Nuclei_Texture_Variance_DNA_3_01_256,Nuclei_Texture_Variance_DNA_3_02_256,Nuclei_Texture_Variance_DNA_3_03_256,Nuclei_Texture_Variance_GSDM_3_00_256,Nuclei_Texture_Variance_GSDM_3_01_256,Nuclei_Texture_Variance_GSDM_3_02_256,Nuclei_Texture_Variance_GSDM_3_03_256,Metadata_Well_Time
0,1,1,0,52,1.0,1,1,1,1.0,1.0,...,0.0,0.184131,0.188453,0.177188,0.181128,0.103136,0.115436,0.113431,0.134241,0052_00
1,2,1,1,52,1.0,1,1,2,2.0,2.0,...,0.0,0.470863,0.465088,0.462358,0.468509,0.023555,0.019912,0.017416,0.014922,0052_01
2,3,1,2,52,1.0,1,13,3,3.0,3.0,...,0.0,0.277647,0.27147,0.271735,0.270393,0.131301,0.123887,0.119853,0.121257,0052_02
3,4,1,3,52,1.0,1,22,4,4.0,4.0,...,0.0,0.004624,0.005019,0.004734,0.004745,0.269545,0.264233,0.268985,0.272696,0052_03
4,5,1,4,52,1.0,1,2,5,5.0,5.0,...,0.0,0.383196,0.372558,0.3953,0.397009,0.386546,0.379937,0.407594,0.41207,0052_04


In [None]:
# Count how many single cells (rows) belong to each well/timepoint.
# `.size()` yields one row count per group. `.value_counts()` on a grouped
# DataFrame would instead count unique combinations of *all* columns, which
# is not the per-group cell count wanted here.
Metadata_number_of_singlecells_df = (
    df.groupby("Metadata_Well_Time")
    .size()
    .reset_index(name="Metadata_number_of_singlecells")
)

# Attach the per-group count to every single-cell row.
# Dropping any existing count column first keeps this cell safe to re-run
# (otherwise a second run would produce `_x`/`_y` suffixed duplicates).
df = df.drop(columns=["Metadata_number_of_singlecells"], errors="ignore").merge(
    Metadata_number_of_singlecells_df,
    on="Metadata_Well_Time",
    # each key appears exactly once in the counts table
    validate="many_to_one",
)
df.head()

Unnamed: 0,Metadata_ImageNumber,Image_Metadata_FOV,Image_Metadata_Time,Image_Metadata_Well,Metadata_Cells_Number_Object_Number,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_Nuclei,Metadata_ImageNumber_1,Metadata_ImageNumber_2,Metadata_ImageNumber_3,...,Nuclei_Texture_Variance_DNA_3_00_256,Nuclei_Texture_Variance_DNA_3_01_256,Nuclei_Texture_Variance_DNA_3_02_256,Nuclei_Texture_Variance_DNA_3_03_256,Nuclei_Texture_Variance_GSDM_3_00_256,Nuclei_Texture_Variance_GSDM_3_01_256,Nuclei_Texture_Variance_GSDM_3_02_256,Nuclei_Texture_Variance_GSDM_3_03_256,Metadata_Well_Time,Metadata_number_of_singlecells
0,1,1,0,52,1.0,1,1,1,1.0,1.0,...,0.184131,0.188453,0.177188,0.181128,0.103136,0.115436,0.113431,0.134241,0052_00,502
1,1,1,0,52,2.0,2,17,1,1.0,1.0,...,0.212552,0.217013,0.210907,0.213728,0.410092,0.382479,0.408034,0.422193,0052_00,502
2,1,1,0,52,3.0,3,2,1,1.0,1.0,...,0.232724,0.223048,0.226669,0.223174,0.090572,0.08778,0.096994,0.087674,0052_00,502
3,1,1,0,52,4.0,4,3,1,1.0,1.0,...,0.437401,0.436013,0.421019,0.423367,0.0,0.0,0.0,0.0,0052_00,502
4,1,1,0,52,5.0,5,32,1,1.0,1.0,...,0.473553,0.457916,0.465556,0.472717,0.0,0.0,0.0,0.0,0052_00,502


In [9]:
# make sure the single-cell count column is stored as an integer
count_column = "Metadata_number_of_singlecells"
df[count_column] = df[count_column].astype(int)

# write the updated table back to the same parquet file it was read from
df.to_parquet(dest_path)

saved_name = pathlib.Path(dest_path).name
print(f"Shape of {saved_name}: {df.shape}")
print(f"Added single cell count as metadata to {saved_name}!")

Shape of pyroptosis_timelapse.parquet: (12384, 2780)
Added single cell count as metadata to pyroptosis_timelapse.parquet!


In [10]:
# Preview the first rows of the saved dataframe, including the added count column.
df.head()

Unnamed: 0,Metadata_ImageNumber,Image_Metadata_FOV,Image_Metadata_Time,Image_Metadata_Well,Metadata_Cells_Number_Object_Number,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_Nuclei,Metadata_ImageNumber_1,Metadata_ImageNumber_2,Metadata_ImageNumber_3,...,Nuclei_Texture_Variance_DNA_3_00_256,Nuclei_Texture_Variance_DNA_3_01_256,Nuclei_Texture_Variance_DNA_3_02_256,Nuclei_Texture_Variance_DNA_3_03_256,Nuclei_Texture_Variance_GSDM_3_00_256,Nuclei_Texture_Variance_GSDM_3_01_256,Nuclei_Texture_Variance_GSDM_3_02_256,Nuclei_Texture_Variance_GSDM_3_03_256,Metadata_Well_Time,Metadata_number_of_singlecells
0,1,1,0,52,1.0,1,1,1,1.0,1.0,...,0.184131,0.188453,0.177188,0.181128,0.103136,0.115436,0.113431,0.134241,0052_00,502
1,1,1,0,52,2.0,2,17,1,1.0,1.0,...,0.212552,0.217013,0.210907,0.213728,0.410092,0.382479,0.408034,0.422193,0052_00,502
2,1,1,0,52,3.0,3,2,1,1.0,1.0,...,0.232724,0.223048,0.226669,0.223174,0.090572,0.08778,0.096994,0.087674,0052_00,502
3,1,1,0,52,4.0,4,3,1,1.0,1.0,...,0.437401,0.436013,0.421019,0.423367,0.0,0.0,0.0,0.0,0052_00,502
4,1,1,0,52,5.0,5,32,1,1.0,1.0,...,0.473553,0.457916,0.465556,0.472717,0.0,0.0,0.0,0.0,0052_00,502
