# Merge single cells from CellProfiler outputs using CytoTable

In [1]:
import pathlib
import pprint
import shutil
import sys

import pandas as pd
from cytotable import convert, presets

sys.path.append("../../utils")
import sc_extraction_utils as sc_utils
from parsl.config import Config
from parsl.executors import HighThroughputExecutor

## Set paths and variables

All paths must be string but we use pathlib to show which variables are paths

In [2]:
# type of file output from CytoTable (currently only parquet)
dest_datatype = "parquet"

# s1lite directory
source_dir = pathlib.Path("../../2.cellprofiling/analysis_output/C4-2")
# directory where parquet files are saved to
output_dir = pathlib.Path("../data/converted_data")
output_dir.mkdir(exist_ok=True, parents=True)

## set config joins for each preset

In [3]:
# temporary remove the existing files
paths_to_remove = pathlib.Path("../data").resolve()
if paths_to_remove.exists():
    for f in paths_to_remove.iterdir():
        shutil.rmtree(f)

In [4]:
# preset configurations based on typical CellProfiler outputs
preset = "cellprofiler_sqlite_pycytominer"

presets.config[preset]["CONFIG_NAMES_COMPARTMENTS"] = (
    "cell",
    "nuclei",
    "cytoplasm",
    "organoid",
)

presets.config[preset]["CONFIG_IDENTIFYING_COLUMNS"] = (
    "ImageNumber",
    "Metadata_Well",
    "Parent_Cell",
    "Parent_Nuclei",
    "Parent_Organoid",
)

presets.config[preset]["CONFIG_PAGE_KEYS"] = {
    "image": "ImageNumber",
    "cell": "Cell_Number_Object_Number",
    "nuclei": "Nuclei_Number_Object_Number",
    "cytoplasm": "Cytoplasm_Number_Object_Number",
    "organoid": "Organoid_Number_Object_Number",
    "join": "Cytoplasm_Number_Object_Number",
}

# remove Image_Metadata_Plate from SELECT as this metadata was not extracted from file names
# add Image_Metadata_Site as this is an important metadata when finding where single cells are located
presets.config[preset][
    "CONFIG_JOINS"
    # create filtered list of image features to be extracted and used for merging tables
    # with the list of image features, this will merge the objects together using the image number,
    # and parent objects to create all single cells (all objects associated to one cell)
] = """WITH Per_Image_Filtered AS (
                SELECT
                    Metadata_ImageNumber,
                    Image_Metadata_Well,
                    Image_Metadata_FOV,
                FROM
                    read_parquet('per_image.parquet')
                )
            SELECT
                *
            FROM
                Per_Image_Filtered AS per_image

            LEFT JOIN read_parquet('per_cytoplasm.parquet') AS per_cytoplasm ON
                per_cytoplasm.Metadata_ImageNumber = per_image.Metadata_ImageNumber

            LEFT JOIN read_parquet('per_cell.parquet') AS per_cell ON
                per_cell.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
                AND per_cell.Cell_Number_Object_Number = per_cytoplasm.Cytoplasm_Parent_Cell

            LEFT JOIN read_parquet('per_nuclei.parquet') AS per_nuclei ON
                per_nuclei.Metadata_ImageNumber = per_cell.Metadata_ImageNumber
                AND per_nuclei.Nuclei_Number_Object_Number = per_cell.Cell_Parent_Nuclei
 """
# LEFT JOIN read_parquet('per_organoid.parquet') AS per_organoid ON
#     per_organoid.Metadata_ImageNumber = per_nuclei.Metadata_ImageNumber
#     AND per_organoid.Organoid_Number_Object_Number = per_nuclei.Nuclei_Parent_Organoid


presets.config["cellprofiler_sqlite_pycytominer"]

{'CONFIG_SOURCE_VERSION': {'cellprofiler': 'v4.2.4',
  'pycytominer': 'c90438fd7c11ad8b1689c21db16dab1a5280de6c'},
 'CONFIG_NAMES_COMPARTMENTS': ('cell', 'nuclei', 'cytoplasm', 'organoid'),
 'CONFIG_NAMES_METADATA': ('image',),
 'CONFIG_IDENTIFYING_COLUMNS': ('ImageNumber',
  'Metadata_Well',
  'Parent_Cell',
  'Parent_Nuclei',
  'Parent_Organoid'),
 'CONFIG_PAGE_KEYS': {'image': 'ImageNumber',
  'cell': 'Cell_Number_Object_Number',
  'nuclei': 'Nuclei_Number_Object_Number',
  'cytoplasm': 'Cytoplasm_Number_Object_Number',
  'organoid': 'Organoid_Number_Object_Number',
  'join': 'Cytoplasm_Number_Object_Number'},
 'CONFIG_CHUNK_SIZE': 1000,
 'CONFIG_JOINS': "WITH Per_Image_Filtered AS (\n                SELECT\n                    Metadata_ImageNumber,\n                    Image_Metadata_Well,\n                    Image_Metadata_FOV,\n                FROM\n                    read_parquet('per_image.parquet')\n                )\n            SELECT\n                *\n            FROM\n

In [5]:
input_path = source_dir / "C4-2.sqlite"
dest_path = f"{output_dir}/C4-2.parquet"

## Convert SQLite file and merge single cells into parquet file

This was not run to completion as we use the nbconverted python file for full run.

In [6]:
print(f"Performing merge single cells and conversion on {source_dir}!")
print(f"Source path: {input_path}")
print(f"Destination path: {dest_path}")
# merge single cells and output as parquet file
convert(
    source_path=input_path,
    dest_path=dest_path,
    dest_datatype=dest_datatype,
    preset=preset,
    parsl_config=Config(
        executors=[HighThroughputExecutor()],
    ),
    chunk_size=10000,
)
print(f"Merged and converted {pathlib.Path(dest_path).name}!")
df = pd.read_parquet(dest_path)
print(f"Shape of {pathlib.Path(dest_path).name}: {df.shape}")
# add single cell count per well as metadata column to parquet file and save back to same path
sc_utils.add_sc_count_metadata_file(
    data_path=dest_path,
    well_column_name="Metadata_ImageNumber",
    file_type="parquet",
)
# read the parquet file to check if metadata was added
df1 = pd.read_parquet(dest_path)
print(f"Shape of {pathlib.Path(dest_path).name}: {df.shape}")
print(f"Added single cell count as metadata to {pathlib.Path(dest_path).name}!")

Performing merge single cells and conversion on ../../2.cellprofiling/analysis_output/C4-2!
Source path: ../../2.cellprofiling/analysis_output/C4-2/C4-2.sqlite
Destination path: ../data/converted_data/C4-2.parquet
Merged and converted C4-2.parquet!
Shape of C4-2.parquet: (99, 23)
Shape of C4-2.parquet: (99, 23)
Added single cell count as metadata to C4-2.parquet!


In [7]:
# number of columns to display to max
pd.set_option("display.max_columns", None)

In [24]:
# read in the parquet file to check if metadata was added
df = pd.read_parquet(dest_path)
df.head()

Unnamed: 0,Metadata_ImageNumber,Image_Metadata_FOV,Metadata_number_of_singlecells,Image_Metadata_Well,Metadata_ImageNumber_1,Metadata_ImageNumber_2,Metadata_ImageNumber_3,Cytoplasm_Location_Center_X,Cytoplasm_Location_Center_Y,Cytoplasm_Location_Center_Z,Cytoplasm_Number_Object_Number,Cytoplasm_Parent_Cell,Nuclei_Children_Cell_Count,Nuclei_Location_Center_X,Nuclei_Location_Center_Y,Nuclei_Location_Center_Z,Nuclei_Number_Object_Number,Nuclei_Parent_Organoid,Cell_Children_Cytoplasm_Count,Cell_Location_Center_X,Cell_Location_Center_Y,Cell_Location_Center_Z,Cell_Number_Object_Number,Cell_Parent_Nuclei
0,1,2,99,C4,1,1,1.0,720.758471,193.879486,7.049994,1,1,2.0,715.248712,224.309529,7.80682,1.0,3.0,1,719.122275,202.517496,7.259229,1,1
1,1,2,99,C4,1,1,,958.931941,390.012849,0.0,2,2,,,,,,,1,958.931941,390.012849,0.0,2,0
2,1,2,99,C4,1,1,,794.522771,646.754605,1.0,3,3,,,,,,,1,794.522771,646.754605,1.0,3,0
3,1,2,99,C4,1,1,1.0,470.420402,223.070723,4.97712,4,4,1.0,503.492571,253.396013,4.949646,22.0,3.0,3,482.159045,233.644757,4.886259,4,22
4,1,2,99,C4,1,1,,310.617761,316.538798,1.5,5,5,,,,,,,1,310.617761,316.538798,1.5,5,0


In [26]:
# drop rows that have NaN values
df = df.dropna()
df.shape

(59, 24)

In [27]:
df

Unnamed: 0,Metadata_ImageNumber,Image_Metadata_FOV,Metadata_number_of_singlecells,Image_Metadata_Well,Metadata_ImageNumber_1,Metadata_ImageNumber_2,Metadata_ImageNumber_3,Cytoplasm_Location_Center_X,Cytoplasm_Location_Center_Y,Cytoplasm_Location_Center_Z,Cytoplasm_Number_Object_Number,Cytoplasm_Parent_Cell,Nuclei_Children_Cell_Count,Nuclei_Location_Center_X,Nuclei_Location_Center_Y,Nuclei_Location_Center_Z,Nuclei_Number_Object_Number,Nuclei_Parent_Organoid,Cell_Children_Cytoplasm_Count,Cell_Location_Center_X,Cell_Location_Center_Y,Cell_Location_Center_Z,Cell_Number_Object_Number,Cell_Parent_Nuclei
0,1,2,99,C4,1,1,1.0,720.758471,193.879486,7.049994,1,1,2.0,715.248712,224.309529,7.80682,1.0,3.0,1,719.122275,202.517496,7.259229,1,1
3,1,2,99,C4,1,1,1.0,470.420402,223.070723,4.97712,4,4,1.0,503.492571,253.396013,4.949646,22.0,3.0,3,482.159045,233.644757,4.886259,4,22
5,1,2,99,C4,1,1,1.0,843.622874,292.08789,1.501617,6,6,2.0,715.248712,224.309529,7.80682,1.0,3.0,1,842.57081,291.781097,1.5,6,1
7,1,2,99,C4,1,1,1.0,720.614743,430.717722,15.363581,8,8,1.0,756.831546,531.576773,22.302886,53.0,3.0,4,722.124828,431.04322,16.127434,8,53
8,1,2,99,C4,1,1,1.0,169.431116,368.617577,1.0,9,9,1.0,178.397704,363.35595,1.0,3.0,0.0,1,175.660261,364.962292,1.0,9,3
9,1,2,99,C4,1,1,1.0,426.17462,460.779069,13.276953,10,10,1.0,466.132324,466.749976,13.380236,13.0,3.0,3,436.866267,467.269732,13.524059,10,13
10,1,2,99,C4,1,1,1.0,511.405907,566.291485,4.977179,11,11,1.0,505.684966,556.496021,2.508908,5.0,3.0,3,509.155702,562.963224,4.752233,11,5
13,1,2,99,C4,1,1,1.0,651.506642,639.508669,13.196878,14,14,1.0,631.706705,623.896648,14.842532,18.0,3.0,2,647.205528,630.419021,14.820391,14,18
14,1,2,99,C4,1,1,1.0,1088.85565,697.722843,1.493317,15,15,4.0,1104.703567,673.944565,3.001895,6.0,3.0,1,1098.093325,684.379696,1.5,15,6
15,1,2,99,C4,1,1,1.0,519.055992,730.564191,1.990417,16,16,2.0,565.530449,803.121923,5.578015,8.0,3.0,2,530.597921,748.802547,2.016131,16,8
