# Merge single cells from CellProfiler outputs using CytoTable

In [1]:
import pathlib
import pprint
import sys

import pandas as pd
from cytotable import convert, presets

sys.path.append("../../../utils")
import sc_extraction_utils as sc_utils
from parsl.config import Config
from parsl.executors import HighThroughputExecutor

## Set paths and variables

All paths must be string but we use pathlib to show which variables are paths

In [2]:
# type of file output from CytoTable (currently only parquet)
dest_datatype = "parquet"

# s1lite directory
source_dir = pathlib.Path("../../3.cellprofiling/analysis_output")
# directory where parquet files are saved to
output_dir = pathlib.Path("../data/converted_data")
output_dir.mkdir(exist_ok=True, parents=True)

## set config joins for each preset

In [3]:
# preset configurations based on typical CellProfiler outputs
preset = "cellprofiler_sqlite_pycytominer"
# remove Image_Metadata_Plate from SELECT as this metadata was not extracted from file names
# add Image_Metadata_Site as this is an important metadata when finding where single cells are located
presets_config = """WITH Per_Image_Filtered AS (
                SELECT
                    Metadata_ImageNumber,
                    Image_Metadata_Well,
                    Image_Metadata_FOV,
                    Image_PathName_CL488,
                    Image_PathName_CL561,
                    Image_PathName_GSDM,
                    Image_PathName_BF,
                    Image_PathName_DNA,
                    Image_FileName_CL488,
                    Image_FileName_CL561,
                    Image_FileName_GSDM,
                    Image_FileName_BF,
                    Image_FileName_DNA
                FROM
                    read_parquet('per_image.parquet')
                )
            SELECT
                *
            FROM
                Per_Image_Filtered AS per_image
            LEFT JOIN read_parquet('per_cytoplasm.parquet') AS per_cytoplasm ON
                per_cytoplasm.Metadata_ImageNumber = per_image.Metadata_ImageNumber
            LEFT JOIN read_parquet('per_cells.parquet') AS per_cells ON
                per_cells.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
                AND per_cells.Metadata_Cells_Number_Object_Number = per_cytoplasm.Metadata_Cytoplasm_Parent_Cells
            LEFT JOIN read_parquet('per_nuclei.parquet') AS per_nuclei ON
                per_nuclei.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
                AND per_nuclei.Metadata_Nuclei_Number_Object_Number = per_cytoplasm.Metadata_Cytoplasm_Parent_Nuclei
            """

In [4]:
sqlite_file_paths = pathlib.Path("../../3.cellprofiling/analysis_output").rglob(
    "*.sqlite"
)
# get all directories with raw images
dict_of_runs = {}
sqlite_file_paths = [str(x) for x in sqlite_file_paths]
# order the sqlite files by the name of the directory
sqlite_file_paths = sorted(sqlite_file_paths)

for sqlite in sqlite_file_paths:
    # get the name of the directory
    run_name = sqlite.split("/")[-2].split(".")[0]
    dict_of_runs[run_name] = {
        "source_path": sqlite,
        "dest_path": str(pathlib.Path(output_dir / f"{run_name}.parquet")),
        "preset": presets_config,
    }

## Convert SQLite file and merge single cells into parquet file

This was not run to completion as we use the nbconverted python file for full run.

In [5]:
# run through each run with each set of paths based on dictionary
for sqlite_file, info in dict_of_runs.items():
    source_path = info["source_path"]
    dest_path = info["dest_path"]
    presets.config["cellprofiler_sqlite_pycytominer"]["CONFIG_JOINS"] = info["preset"]
    print(f"Performing merge single cells and conversion on {sqlite_file}!")
    print(f"Source path: {source_path}")
    print(f"Destination path: {dest_path}")
    # merge single cells and output as parquet file
    convert(
        source_path=source_path,
        dest_path=dest_path,
        dest_datatype=dest_datatype,
        preset=preset,
        parsl_config=Config(
            executors=[HighThroughputExecutor()],
        ),
        chunk_size=1000,
    )
    print(f"Merged and converted {pathlib.Path(dest_path).name}!")
    df = pd.read_parquet(dest_path)
    print(f"Shape of {pathlib.Path(dest_path).name}: {df.shape}")
    # add single cell count per well as metadata column to parquet file and save back to same path
    sc_utils.add_sc_count_metadata_file(
        data_path=dest_path,
        well_column_name="Metadata_ImageNumber",
        file_type="parquet",
    )
    # read the parquet file to check if metadata was added
    df1 = pd.read_parquet(dest_path)
    print(f"Shape of {pathlib.Path(dest_path).name}: {df.shape}")
    print(f"Added single cell count as metadata to {pathlib.Path(dest_path).name}!")

Performing merge single cells and conversion on 20241024T194653_W0053_F0001!
Source path: ../../3.cellprofiling/analysis_output/20241024T194653_W0053_F0001/pyroptosis_timelapse.sqlite
Destination path: ../data/converted_data/20241024T194653_W0053_F0001.parquet
Merged and converted 20241024T194653_W0053_F0001.parquet!
Shape of 20241024T194653_W0053_F0001.parquet: (423, 2887)


Reusing previously loaded Parsl configuration.


Shape of 20241024T194653_W0053_F0001.parquet: (423, 2887)
Added single cell count as metadata to 20241024T194653_W0053_F0001.parquet!
Performing merge single cells and conversion on 20241026T104653_W0082_F0005!
Source path: ../../3.cellprofiling/analysis_output/20241026T104653_W0082_F0005/pyroptosis_timelapse.sqlite
Destination path: ../data/converted_data/20241026T104653_W0082_F0005.parquet
Merged and converted 20241026T104653_W0082_F0005.parquet!
Shape of 20241026T104653_W0082_F0005.parquet: (757, 2887)


Reusing previously loaded Parsl configuration.


Shape of 20241026T104653_W0082_F0005.parquet: (757, 2887)
Added single cell count as metadata to 20241026T104653_W0082_F0005.parquet!
Performing merge single cells and conversion on 20241026T134545_W0150_F0004!
Source path: ../../3.cellprofiling/analysis_output/20241026T134545_W0150_F0004/pyroptosis_timelapse.sqlite
Destination path: ../data/converted_data/20241026T134545_W0150_F0004.parquet
Merged and converted 20241026T134545_W0150_F0004.parquet!
Shape of 20241026T134545_W0150_F0004.parquet: (337, 2887)
Shape of 20241026T134545_W0150_F0004.parquet: (337, 2887)
Added single cell count as metadata to 20241026T134545_W0150_F0004.parquet!


In [6]:
df1.head()

Unnamed: 0,Metadata_ImageNumber,Image_Metadata_FOV,Metadata_number_of_singlecells,Image_Metadata_Well,Metadata_Cells_Number_Object_Number,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_Nuclei,Metadata_ImageNumber_1,Metadata_ImageNumber_2,Metadata_ImageNumber_3,...,Nuclei_Texture_Variance_DNA_3_02_256,Nuclei_Texture_Variance_DNA_3_03_256,Nuclei_Texture_Variance_GSDM_3_00_256,Nuclei_Texture_Variance_GSDM_3_01_256,Nuclei_Texture_Variance_GSDM_3_02_256,Nuclei_Texture_Variance_GSDM_3_03_256,Nuclei_Texture_Variance_Whole_Cell_3_00_256,Nuclei_Texture_Variance_Whole_Cell_3_01_256,Nuclei_Texture_Variance_Whole_Cell_3_02_256,Nuclei_Texture_Variance_Whole_Cell_3_03_256
0,1,4,337,150,1,1,1,1,1,1,...,0.26261,0.262635,0.240914,0.241297,0.241129,0.242638,0.417559,0.398609,0.404051,0.426458
1,1,4,337,150,2,2,2,1,1,1,...,0.594994,0.606577,0.473665,0.407374,0.424724,0.447725,2.466934,2.330493,2.575771,2.629137
2,1,4,337,150,3,3,3,1,1,1,...,0.293844,0.301022,0.0,0.0,0.0,0.0,0.376208,0.31932,0.36017,0.356125
3,1,4,337,150,4,4,4,1,1,1,...,3.624972,3.632566,0.661129,0.666471,0.680764,0.655552,12.910636,15.930176,19.616605,15.518947
4,1,4,337,150,5,5,5,1,1,1,...,0.155497,0.15958,0.0,0.0,0.0,0.0,0.03555,0.028821,0.031768,0.009708
