In [1]:
import argparse
import pathlib
import sys

import pandas as pd
from cytotable import convert, presets

sys.path.append("../../../utils")
import uuid

from parsl.config import Config
from parsl.executors import HighThroughputExecutor

try:
    cfg = get_ipython().config
    in_notebook = True
except NameError:
    in_notebook = False

In [2]:
if not in_notebook:
    argparser = argparse.ArgumentParser()
    argparser.add_argument(
        "--patient",
        type=str,
        required=True,
        help="Patient ID to process, e.g. 'P01'",
    )
    argparser.add_argument(
        "--well_fov",
        type=str,
        required=True,
        help="Well and field of view to process, e.g. 'A01_1'",
    )
    args = argparser.parse_args()
    patient = args.patient
    well_fov = args.well_fov
else:
    patient = "NF0014"
    well_fov = "C4-2"

In [3]:
input_sqlite_file = pathlib.Path(
    f"../../data/{patient}/converted_profiles/{well_fov}.sqlite"
).resolve(strict=True)
destination_sc_parquet_file = pathlib.Path(
    f"../../data/{patient}/converted_profiles/{well_fov}.parquet"
).resolve()
destination_organoid_parquet_file = pathlib.Path(
    f"../../data/{patient}/converted_profiles/{well_fov}.parquet"
).resolve()
dest_datatype = "parquet"

In [17]:
nuclei_table = pd.read_sql_table(
    "Nuclei",
    str(input_sqlite_file),
    index_col="nucleus_id",
)
cell_table = pd.read_sql_table(
    "Cell",
    str(input_sqlite_file),
    index_col="cell_id",
)
cytoplasm_table = pd.read_sql_table(
    "Cytoplasm",
    str(input_sqlite_file),
    index_col="cytoplasm_id",
)
organoid_table = pd.read_sql_table(
    "Organoid",
    str(input_sqlite_file),
    index_col="organoid_id",
)
nuclei_table

ImportError: Using URI string without sqlalchemy installed.

DatabaseError: Execution failed on sql 'SELECT *
            FROM Cytoplasm
            LEFT JOIN Cell USING (object_id)
            LEFT JOIN Nuclei USING (object_id);': too many columns in result set

In [10]:
# merge single cells and output as parquet file
convert(
    source_path=input_sqlite_file,
    dest_path=destination_sc_parquet_file,
    dest_datatype=dest_datatype,
    preset=preset,
    joins=presets.config[preset]["CONFIG_JOINS"],
    parsl_config=Config(
        executors=[HighThroughputExecutor()],
        run_dir=f"run_dir_{uuid.uuid4().hex}",
    ),
    chunk_size=1000,
)

Reusing previously loaded Parsl configuration.


BinderException: Binder Error: Referenced column "Cytoplasm_Number_Object_Number" not found in FROM clause!
Candidate bindings: "Texture_Cytoplasm_BF_Sum.Average_256.1"

LINE 1: SELECT Cytoplasm_Number_Object_Number FROM sqlite_scan('/home/lipp...
               ^

### Get the organoid profile

In [12]:
# preset configurations based on typical CellProfiler outputs
preset = "cellprofiler_sqlite_pycytominer"
presets.config[preset][
    "CONFIG_JOINS"
    # remove Image_Metadata_Plate from SELECT as this metadata was not extracted from file names
    # add Image_Metadata_FOV as this is an important metadata when finding where single cells are located
] = """"
            SELECT
                *
"""
presets.config[preset]["CONFIG_JOINS"]

'"\n            SELECT\n                *\n'

In [13]:
# merge single cells and output as parquet file
convert(
    source_path=input_sqlite_file,
    dest_path=destination_organoid_parquet_file,
    dest_datatype=dest_datatype,
    preset=preset,
    parsl_config=Config(
        executors=[HighThroughputExecutor()],
        run_dir=f"run_dir_{uuid.uuid4().hex}",
    ),
    chunk_size=1000,
)

Reusing previously loaded Parsl configuration.


BinderException: Binder Error: Referenced column "Cytoplasm_Number_Object_Number" not found in FROM clause!
Candidate bindings: "Texture_Cytoplasm_BF_Sum.Average_256.1"

LINE 1: SELECT Cytoplasm_Number_Object_Number FROM sqlite_scan('/home/lipp...
               ^