In [1]:
import argparse
import pathlib
import sys

import pandas as pd
from cytotable import convert, presets

sys.path.append("../../../utils")
import uuid

import duckdb
from parsl.config import Config
from parsl.executors import HighThroughputExecutor

try:
    cfg = get_ipython().config
    in_notebook = True
except NameError:
    in_notebook = False

In [2]:
if not in_notebook:
    argparser = argparse.ArgumentParser()
    argparser.add_argument(
        "--patient",
        type=str,
        required=True,
        help="Patient ID to process, e.g. 'P01'",
    )
    argparser.add_argument(
        "--well_fov",
        type=str,
        required=True,
        help="Well and field of view to process, e.g. 'A01_1'",
    )
    args = argparser.parse_args()
    patient = args.patient
    well_fov = args.well_fov
else:
    patient = "NF0014"
    well_fov = "C4-2"

In [3]:
input_sqlite_file = pathlib.Path(
    f"../../data/{patient}/converted_profiles/{well_fov}/{well_fov}.duckdb"
).resolve(strict=True)
destination_sc_parquet_file = pathlib.Path(
    f"../../data/{patient}/image_based_profiles/{well_fov}/sc_profiles_{well_fov}.parquet"
).resolve()
destination_organoid_parquet_file = pathlib.Path(
    f"../../data/{patient}/image_based_profiles/{well_fov}/organoid_profiles_{well_fov}.parquet"
).resolve()
destination_sc_parquet_file.parent.mkdir(parents=True, exist_ok=True)
dest_datatype = "parquet"

In [4]:
# show the tables
con = duckdb.connect(input_sqlite_file)
tables = con.execute("SHOW TABLES").fetchdf()
tables["name"].to_list()

['Cell', 'Cytoplasm', 'Nuclei', 'Organoid']

In [5]:
nuclei_table = con.sql("SELECT * FROM Nuclei").df()
cells_table = con.sql("SELECT * FROM Cell").df()
cytoplasm_table = con.sql("SELECT * FROM Cytoplasm").df()
organoid_table = con.sql("SELECT * FROM Organoid").df()
con.close()

In [6]:
nuclei_id_set = set(nuclei_table["object_id"].to_list())
cells_id_set = set(cells_table["object_id"].to_list())
cytoplasm_id_set = set(cytoplasm_table["object_id"].to_list())
# find the intersection of the three sets
intersection_set = nuclei_id_set.intersection(cells_id_set, cytoplasm_id_set)
# keep only the rows in the three tables that are in the intersection set
nuclei_table = nuclei_table[nuclei_table["object_id"].isin(intersection_set)]
cells_table = cells_table[cells_table["object_id"].isin(intersection_set)]
cytoplasm_table = cytoplasm_table[cytoplasm_table["object_id"].isin(intersection_set)]

In [7]:
con = duckdb.connect()
con.register("df1", nuclei_table)
con.register("df2", cells_table)
con.register("df3", cytoplasm_table)
# Merge them with SQL
merged_df = con.execute("""
    SELECT *
    FROM df1
    LEFT JOIN df2 USING (object_id)
    LEFT JOIN df3 USING (object_id)
""").df()
con.close()

In [8]:
# save the organoid data as parquet
print(f"Final organoid data shape: {merged_df.shape}")
organoid_table.to_parquet(destination_organoid_parquet_file, index=False)
organoid_table.head()

Final organoid data shape: (30, 2106)


Unnamed: 0,object_id,image_set,Area.Size.Shape_Organoid_AGP_VOLUME,Area.Size.Shape_Organoid_AGP_CENTER.X,Area.Size.Shape_Organoid_AGP_CENTER.Y,Area.Size.Shape_Organoid_AGP_CENTER.Z,Area.Size.Shape_Organoid_AGP_BBOX.VOLUME,Area.Size.Shape_Organoid_AGP_MIN.X,Area.Size.Shape_Organoid_AGP_MAX.X,Area.Size.Shape_Organoid_AGP_MIN.Y,...,Texture_Organoid_Mito_Difference.Entropy_256.1,Texture_Organoid_Mito_Difference.Variance_256.1,Texture_Organoid_Mito_Entropy_256.1,Texture_Organoid_Mito_Information.Measure.of.Correlation.1_256.1,Texture_Organoid_Mito_Information.Measure.of.Correlation.2_256.1,Texture_Organoid_Mito_Inverse.Difference.Moment_256.1,Texture_Organoid_Mito_Sum.Average_256.1,Texture_Organoid_Mito_Sum.Entropy_256.1,Texture_Organoid_Mito_Sum.Variance_256.1,Texture_Organoid_Mito_Variance_256.1
0,32,C4-2,20908636.0,669.720104,557.78382,14.473177,33828762.0,167,1173,39,...,1.33351,0.002369,2.76259,-0.507738,0.920417,0.831104,8.297708,2.122714,198.636846,50.32471


In [9]:
print(f"Final merged dataframe shape: {merged_df.shape}")
# save the sc data as parquet
merged_df.to_parquet(destination_sc_parquet_file, index=False)
merged_df.head()

Final merged dataframe shape: (30, 2106)


Unnamed: 0,object_id,image_set,Area.Size.Shape_Nuclei_AGP_VOLUME,Area.Size.Shape_Nuclei_AGP_CENTER.X,Area.Size.Shape_Nuclei_AGP_CENTER.Y,Area.Size.Shape_Nuclei_AGP_CENTER.Z,Area.Size.Shape_Nuclei_AGP_BBOX.VOLUME,Area.Size.Shape_Nuclei_AGP_MIN.X,Area.Size.Shape_Nuclei_AGP_MAX.X,Area.Size.Shape_Nuclei_AGP_MIN.Y,...,Texture_Cytoplasm_Mito_Difference.Entropy_256.1,Texture_Cytoplasm_Mito_Difference.Variance_256.1,Texture_Cytoplasm_Mito_Entropy_256.1,Texture_Cytoplasm_Mito_Information.Measure.of.Correlation.1_256.1,Texture_Cytoplasm_Mito_Information.Measure.of.Correlation.2_256.1,Texture_Cytoplasm_Mito_Inverse.Difference.Moment_256.1,Texture_Cytoplasm_Mito_Sum.Average_256.1,Texture_Cytoplasm_Mito_Sum.Entropy_256.1,Texture_Cytoplasm_Mito_Sum.Variance_256.1,Texture_Cytoplasm_Mito_Variance_256.1
0,15,C4-2,99661.0,473.778268,746.900453,10.934618,142417.0,421,528,687,...,0.051024,0.003861,0.070853,-0.59996,0.242368,0.996241,0.699428,0.063534,121.523028,32.474614
1,26,C4-2,156362.0,715.339418,224.483033,7.888688,256768.0,652,780,165,...,0.061713,0.003853,0.091123,-0.59905,0.273598,0.995213,0.716101,0.078503,99.806008,26.505418
2,37,C4-2,84453.0,503.486353,253.488615,4.147455,130980.0,453,564,195,...,0.030409,0.003874,0.042813,-0.607538,0.191419,0.998169,0.093143,0.037626,3.992738,1.070541
3,43,C4-2,131041.0,693.588457,424.839974,19.420853,334530.0,639,757,369,...,0.051189,0.00386,0.071099,-0.587976,0.239543,0.996087,0.859732,0.06178,175.059103,46.919265
4,51,C4-2,69045.0,399.909088,694.579159,5.122876,105984.0,355,447,649,...,0.043201,0.003866,0.061412,-0.619343,0.231458,0.996829,0.5343,0.0539,83.157453,22.02088
