In [1]:
import argparse
import os
import pathlib
import sys
import uuid

import duckdb
import pandas as pd
from arg_parsing_utils import parse_args
from cytotable import convert, presets
from notebook_init_utils import bandicoot_check, init_notebook
from parsl.config import Config
from parsl.executors import HighThroughputExecutor

root_dir, in_notebook = init_notebook()

profile_base_dir = bandicoot_check(
    pathlib.Path(os.path.expanduser("~/mnt/bandicoot")).resolve(), root_dir
)

In [2]:
if not in_notebook:
    args = parse_args()
    well_fov = args["well_fov"]
    patient = args["patient"]
    image_based_profiles_subparent_name = args["image_based_profiles_subparent_name"]

else:
    patient = "NF0037_T1-Z-1"
    well_fov = "F4-3"
    image_based_profiles_subparent_name = "image_based_profiles"

In [3]:
input_sqlite_file = pathlib.Path(
    f"{profile_base_dir}/data/{patient}/{image_based_profiles_subparent_name}/0.converted_profiles/{well_fov}/{well_fov}.duckdb"
).resolve(strict=True)
destination_sc_parquet_file = pathlib.Path(
    f"{profile_base_dir}/data/{patient}/{image_based_profiles_subparent_name}/0.converted_profiles/{well_fov}/sc_profiles_{well_fov}.parquet"
).resolve()
destination_organoid_parquet_file = pathlib.Path(
    f"{profile_base_dir}/data/{patient}/{image_based_profiles_subparent_name}/0.converted_profiles/{well_fov}/organoid_profiles_{well_fov}.parquet"
).resolve()
destination_sc_parquet_file.parent.mkdir(parents=True, exist_ok=True)
dest_datatype = "parquet"

In [4]:
# show the tables
with duckdb.connect(input_sqlite_file) as con:
    tables = con.execute("SHOW TABLES").fetchdf()
    print(tables)
    nuclei_table = con.sql("SELECT * FROM Nuclei").df()
    cells_table = con.sql("SELECT * FROM Cell").df()
    cytoplasm_table = con.sql("SELECT * FROM Cytoplasm").df()
    organoid_table = con.sql("SELECT * FROM Organoid").df()

        name
0       Cell
1  Cytoplasm
2     Nuclei
3   Organoid


In [13]:
nuclei_table

Unnamed: 0,object_id,image_set,Area.Size.Shape_Nuclei_VOLUME,Area.Size.Shape_Nuclei_CENTER.X,Area.Size.Shape_Nuclei_CENTER.Y,Area.Size.Shape_Nuclei_CENTER.Z,Area.Size.Shape_Nuclei_BBOX.VOLUME,Area.Size.Shape_Nuclei_MIN.X,Area.Size.Shape_Nuclei_MAX.X,Area.Size.Shape_Nuclei_MIN.Y,...,Texture_Nuclei_Mito_Difference.Entropy_256.3,Texture_Nuclei_Mito_Difference.Variance_256.3,Texture_Nuclei_Mito_Entropy_256.3,Texture_Nuclei_Mito_Information.Measure.of.Correlation.1_256.3,Texture_Nuclei_Mito_Information.Measure.of.Correlation.2_256.3,Texture_Nuclei_Mito_Inverse.Difference.Moment_256.3,Texture_Nuclei_Mito_Sum.Average_256.3,Texture_Nuclei_Mito_Sum.Entropy_256.3,Texture_Nuclei_Mito_Sum.Variance_256.3,Texture_Nuclei_Mito_Variance_256.3


In [5]:
nuclei_id_set = set(nuclei_table["object_id"].to_list())
cells_id_set = set(cells_table["object_id"].to_list())
cytoplasm_id_set = set(cytoplasm_table["object_id"].to_list())
# find the intersection of the three sets
intersection_set = nuclei_id_set.intersection(cells_id_set, cytoplasm_id_set)
# keep only the rows in the three tables that are in the intersection set
nuclei_table = nuclei_table[nuclei_table["object_id"].isin(intersection_set)]
cells_table = cells_table[cells_table["object_id"].isin(intersection_set)]
cytoplasm_table = cytoplasm_table[cytoplasm_table["object_id"].isin(intersection_set)]

In [6]:
# connect to DuckDB and register the tables
with duckdb.connect() as con:
    con.register("nuclei", nuclei_table)
    con.register("cells", cells_table)
    con.register("cytoplasm", cytoplasm_table)
    # Merge them with SQL
    merged_df = con.execute("""
        SELECT *
        FROM nuclei
        LEFT JOIN cells USING (object_id)
        LEFT JOIN cytoplasm USING (object_id)
    """).df()

In [10]:
nuclei_table

Unnamed: 0,object_id,image_set,Area.Size.Shape_Nuclei_VOLUME,Area.Size.Shape_Nuclei_CENTER.X,Area.Size.Shape_Nuclei_CENTER.Y,Area.Size.Shape_Nuclei_CENTER.Z,Area.Size.Shape_Nuclei_BBOX.VOLUME,Area.Size.Shape_Nuclei_MIN.X,Area.Size.Shape_Nuclei_MAX.X,Area.Size.Shape_Nuclei_MIN.Y,...,Texture_Nuclei_Mito_Difference.Entropy_256.3,Texture_Nuclei_Mito_Difference.Variance_256.3,Texture_Nuclei_Mito_Entropy_256.3,Texture_Nuclei_Mito_Information.Measure.of.Correlation.1_256.3,Texture_Nuclei_Mito_Information.Measure.of.Correlation.2_256.3,Texture_Nuclei_Mito_Inverse.Difference.Moment_256.3,Texture_Nuclei_Mito_Sum.Average_256.3,Texture_Nuclei_Mito_Sum.Entropy_256.3,Texture_Nuclei_Mito_Sum.Variance_256.3,Texture_Nuclei_Mito_Variance_256.3


In [11]:
cells_table

Unnamed: 0,object_id,image_set,Area.Size.Shape_Cell_VOLUME,Area.Size.Shape_Cell_CENTER.X,Area.Size.Shape_Cell_CENTER.Y,Area.Size.Shape_Cell_CENTER.Z,Area.Size.Shape_Cell_BBOX.VOLUME,Area.Size.Shape_Cell_MIN.X,Area.Size.Shape_Cell_MAX.X,Area.Size.Shape_Cell_MIN.Y,...,Texture_Cell_Mito_Difference.Entropy_256.3,Texture_Cell_Mito_Difference.Variance_256.3,Texture_Cell_Mito_Entropy_256.3,Texture_Cell_Mito_Information.Measure.of.Correlation.1_256.3,Texture_Cell_Mito_Information.Measure.of.Correlation.2_256.3,Texture_Cell_Mito_Inverse.Difference.Moment_256.3,Texture_Cell_Mito_Sum.Average_256.3,Texture_Cell_Mito_Sum.Entropy_256.3,Texture_Cell_Mito_Sum.Variance_256.3,Texture_Cell_Mito_Variance_256.3


In [12]:
cytoplasm_table

Unnamed: 0,object_id,image_set,Area.Size.Shape_Cytoplasm_VOLUME,Area.Size.Shape_Cytoplasm_CENTER.X,Area.Size.Shape_Cytoplasm_CENTER.Y,Area.Size.Shape_Cytoplasm_CENTER.Z,Area.Size.Shape_Cytoplasm_BBOX.VOLUME,Area.Size.Shape_Cytoplasm_MIN.X,Area.Size.Shape_Cytoplasm_MAX.X,Area.Size.Shape_Cytoplasm_MIN.Y,...,Texture_Cytoplasm_Mito_Difference.Entropy_256.3,Texture_Cytoplasm_Mito_Difference.Variance_256.3,Texture_Cytoplasm_Mito_Entropy_256.3,Texture_Cytoplasm_Mito_Information.Measure.of.Correlation.1_256.3,Texture_Cytoplasm_Mito_Information.Measure.of.Correlation.2_256.3,Texture_Cytoplasm_Mito_Inverse.Difference.Moment_256.3,Texture_Cytoplasm_Mito_Sum.Average_256.3,Texture_Cytoplasm_Mito_Sum.Entropy_256.3,Texture_Cytoplasm_Mito_Sum.Variance_256.3,Texture_Cytoplasm_Mito_Variance_256.3


In [7]:
# save the organoid data as parquet
print(f"Final organoid data shape: {merged_df.shape}")
organoid_table.to_parquet(destination_organoid_parquet_file, index=False)
organoid_table.head()

Final organoid data shape: (0, 7686)


Unnamed: 0,object_id,image_set,Area.Size.Shape_Organoid_VOLUME,Area.Size.Shape_Organoid_CENTER.X,Area.Size.Shape_Organoid_CENTER.Y,Area.Size.Shape_Organoid_CENTER.Z,Area.Size.Shape_Organoid_BBOX.VOLUME,Area.Size.Shape_Organoid_MIN.X,Area.Size.Shape_Organoid_MAX.X,Area.Size.Shape_Organoid_MIN.Y,...,Texture_Organoid_Mito_Difference.Entropy_256.3,Texture_Organoid_Mito_Difference.Variance_256.3,Texture_Organoid_Mito_Entropy_256.3,Texture_Organoid_Mito_Information.Measure.of.Correlation.1_256.3,Texture_Organoid_Mito_Information.Measure.of.Correlation.2_256.3,Texture_Organoid_Mito_Inverse.Difference.Moment_256.3,Texture_Organoid_Mito_Sum.Average_256.3,Texture_Organoid_Mito_Sum.Entropy_256.3,Texture_Organoid_Mito_Sum.Variance_256.3,Texture_Organoid_Mito_Variance_256.3
0,36,F4-3,734680.0,319.866425,1387.379639,11.5,974460.0,0.0,745.0,1199.0,...,0.100005,0.003822,0.120409,-0.297762,0.176048,0.991166,0.822598,0.109579,81.032279,27.854111
1,44,F4-3,741099.0,270.128723,1392.871948,20.992054,1034775.0,0.0,657.0,1211.0,...,0.094105,0.003828,0.116828,-0.357492,0.207307,0.991861,0.946391,0.105923,113.610685,36.512772
2,47,F4-3,321823.0,262.517914,1426.653198,24.943857,505005.0,0.0,655.0,1269.0,...,0.052574,0.003856,0.060145,-0.238333,0.071666,0.995555,0.372785,0.055222,33.647033,13.10235
3,71,F4-3,2254.0,21.566105,1352.020386,40.299469,72498.0,0.0,129.0,1245.0,...,0.000622,0.003891,0.000674,-0.129977,0.005263,0.999967,0.004889,0.000638,0.77778,0.334593
4,77,F4-3,22191750.0,830.759705,742.332764,19.615656,50321804.0,348.0,1336.0,273.0,...,1.133531,0.002809,1.756429,-0.509422,0.83453,0.859593,9.700805,1.485096,434.224148,112.942569


In [8]:
# drop columns that end with _x or _y lowercase
merged_df = merged_df.loc[:, ~merged_df.columns.str.endswith(("_x", "_y"))]

In [9]:
print(f"Final merged single cell dataframe shape: {merged_df.shape}")
# save the sc data as parquet
merged_df.to_parquet(destination_sc_parquet_file, index=False)
merged_df.head()

Final merged single cell dataframe shape: (0, 7686)


Unnamed: 0,object_id,image_set,Area.Size.Shape_Nuclei_VOLUME,Area.Size.Shape_Nuclei_CENTER.X,Area.Size.Shape_Nuclei_CENTER.Y,Area.Size.Shape_Nuclei_CENTER.Z,Area.Size.Shape_Nuclei_BBOX.VOLUME,Area.Size.Shape_Nuclei_MIN.X,Area.Size.Shape_Nuclei_MAX.X,Area.Size.Shape_Nuclei_MIN.Y,...,Texture_Cytoplasm_Mito_Difference.Entropy_256.3,Texture_Cytoplasm_Mito_Difference.Variance_256.3,Texture_Cytoplasm_Mito_Entropy_256.3,Texture_Cytoplasm_Mito_Information.Measure.of.Correlation.1_256.3,Texture_Cytoplasm_Mito_Information.Measure.of.Correlation.2_256.3,Texture_Cytoplasm_Mito_Inverse.Difference.Moment_256.3,Texture_Cytoplasm_Mito_Sum.Average_256.3,Texture_Cytoplasm_Mito_Sum.Entropy_256.3,Texture_Cytoplasm_Mito_Sum.Variance_256.3,Texture_Cytoplasm_Mito_Variance_256.3
