In [1]:
import argparse
import os
import pathlib
import pprint
import sqlite3
from contextlib import closing

import duckdb
import pandas as pd

# Decide between notebook mode and script mode: if the name `get_ipython`
# cannot be resolved (NameError), the code is treated as a plain script.
try:
    cfg = get_ipython().config
except NameError:
    in_notebook = False
else:
    in_notebook = True

In [2]:
if in_notebook:
    # Interactive run: fall back to fixed example identifiers.
    well_fov = "C4-2"
    patient = "NF0014"
else:
    # Script run: both identifiers are mandatory command-line options.
    argparser = argparse.ArgumentParser()
    argparser.add_argument(
        "--well_fov",
        type=str,
        required=True,
        help="Well and field of view to process, e.g. 'A01_1'",
    )
    argparser.add_argument(
        "--patient",
        type=str,
        required=True,
        help="Patient ID to process, e.g. 'P01'",
    )
    args = argparser.parse_args()
    well_fov = args.well_fov
    patient = args.patient


# Input directory with the extracted features; resolve(strict=True) raises
# if the directory does not exist.
result_path = pathlib.Path(
    f"../../data/{patient}/extracted_features/{well_fov}"
).resolve(strict=True)

# Output directory for the converted profiles, created on demand.
database_path = pathlib.Path(f"../../data/{patient}/converted_profiles/").resolve()
database_path.mkdir(parents=True, exist_ok=True)
# One SQLite database file per well/FOV.
sqlite_path = database_path / f"{well_fov}.sqlite"


# Recursively collect every parquet file below the input directory, sorted.
parquet_files = sorted(result_path.rglob("*.parquet"))
print(f"{len(parquet_files)} parquet files found")

121 parquet files found


In [3]:
# Parquet files grouped as feature_types_dict[compartment][feature_type].
# Every (compartment, feature type) pair starts out with its own empty list.
feature_types_dict = {
    compartment_name: {
        feature_name: []
        for feature_name in (
            "AreaSize_Shape",
            "Colocalization",
            "Intensity",
            "Granularity",
            "Neighbor",
            "Texture",
        )
    }
    for compartment_name in ("Organoid", "Cell", "Nuclei", "Cytoplasm")
}

# A file is assigned to every pair whose compartment name AND feature-type
# name both occur as substrings of the file's full path.
for file in parquet_files:
    path_text = str(file)
    for compartment, files_by_feature in feature_types_dict.items():
        for feature_type, matched_files in files_by_feature.items():
            if compartment in path_text and feature_type in path_text:
                matched_files.append(file)

In [4]:
# Load every parquet file into a DataFrame.
#
# merged_df_dict mirrors the layout of feature_types_dict:
#   merged_df_dict[compartment][feature_type] -> list of DataFrames,
# one per file that was loaded successfully. When no files were found for a
# pair, a single empty DataFrame is stored as a placeholder.
merged_df_dict = {
    compartment: {feature_type: [] for feature_type in feature_types}
    for compartment, feature_types in feature_types_dict.items()
}

# Columns every loaded table must provide; later cells merge on them.
required_columns = ("object_id", "image_set")

for compartment, files_by_feature in feature_types_dict.items():
    for feature_type, files in files_by_feature.items():
        loaded_frames = merged_df_dict[compartment][feature_type]

        if len(files) > 0:
            for file in files:
                # skip files that disappeared since the directory was scanned
                if not file.exists():
                    print(f"File {file} does not exist")
                    continue
                # check if the file is a parquet file
                if not file.name.endswith(".parquet"):
                    print(f"File {file} is not a parquet file")
                    continue
                # read the parquet file
                try:
                    df = duckdb.read_parquet(str(file)).to_df()
                except Exception as e:
                    # Report the file that failed and move on. Without the
                    # `continue`, a stale `df` from the previous iteration
                    # (or an undefined name) would be appended below.
                    print(f"Error reading {file}: {e}")
                    continue
                # A table without the key columns cannot be merged later,
                # so report it here instead of failing during the merge.
                missing_columns = [
                    column for column in required_columns if column not in df.columns
                ]
                if missing_columns:
                    print(
                        f"File {file} does not have the required columns: {missing_columns}"
                    )
                    continue

                # add the dataframe to the dictionary
                loaded_frames.append(df)
        else:
            # Missing Neighbor files outside the Nuclei compartment are not
            # reported; every other missing pair is.
            if not (
                "neighbor" in feature_type.lower()
                and "nuclei" not in compartment.lower()
            ):
                print(
                    f"No files found for {compartment} {feature_type}. Please check the directory."
                )
            # Placeholder so the pair still has an (empty) entry.
            loaded_frames.append(pd.DataFrame())

In [5]:
from functools import reduce

In [6]:
# Destination for the merged tables: final_df_dict[compartment][feature_type].
# Each entry starts as its own empty DataFrame.
final_df_dict = {
    compartment_name: {
        feature_name: pd.DataFrame()
        for feature_name in (
            "AreaSize_Shape",
            "Colocalization",
            "Intensity",
            "Granularity",
            "Neighbor",
            "Texture",
        )
    }
    for compartment_name in ("Organoid", "Cell", "Nuclei", "Cytoplasm")
}

In [7]:
# Collapse the per-file DataFrames of each (compartment, feature type) pair
# into a single table, left-joined on the object/image-set keys.
for compartment, frames_by_feature in merged_df_dict.items():
    for feature_type, frames in frames_by_feature.items():
        # Remove the "__index_level_0__" column (presumably a serialized
        # index left over from writing the parquet files) from EVERY frame
        # before merging, so repeated copies of it cannot collide during
        # the merge.
        for df in frames:
            df.drop(columns=["__index_level_0__"], inplace=True, errors="ignore")

        # Nothing was loaded for this pair: keep the empty DataFrame that is
        # already stored in final_df_dict.
        if all(df.empty for df in frames):
            continue

        # Merge once per pair; the first frame determines which rows are kept
        # because every step is a left join.
        final_df_dict[compartment][feature_type] = reduce(
            lambda left, right: pd.merge(
                left, right, how="left", on=["object_id", "image_set"]
            ),
            frames,
        )

In [8]:
# Empty table that only declares the two key columns (no rows).
merged_df = pd.DataFrame({key: [] for key in ("object_id", "image_set")})

In [9]:
# One accumulator table per compartment, each starting as its own empty
# DataFrame.
compartment_merged_dict = {
    compartment_name: pd.DataFrame()
    for compartment_name in ("Organoid", "Cell", "Nuclei", "Cytoplasm")
}

In [10]:
# Quick look at the merged Nuclei tables. These are bare expressions: each
# lookup is evaluated (and raises KeyError if the entry is missing), but only
# the value of the last expression -- the Texture table -- is rendered as the
# cell output.
final_df_dict["Nuclei"]["AreaSize_Shape"]
final_df_dict["Nuclei"]["Colocalization"]
final_df_dict["Nuclei"]["Granularity"]
final_df_dict["Nuclei"]["Intensity"]
final_df_dict["Nuclei"]["Neighbor"]
final_df_dict["Nuclei"]["Texture"]

Unnamed: 0,image_set,object_id,Texture_Nuclei_AGP_Angular.Second.Moment_256.1,Texture_Nuclei_AGP_Contrast_256.1,Texture_Nuclei_AGP_Correlation_256.1,Texture_Nuclei_AGP_Difference.Entropy_256.1,Texture_Nuclei_AGP_Difference.Variance_256.1,Texture_Nuclei_AGP_Entropy_256.1,Texture_Nuclei_AGP_Information.Measure.of.Correlation.1_256.1,Texture_Nuclei_AGP_Information.Measure.of.Correlation.2_256.1,...,Texture_Nuclei_Mito_Difference.Entropy_256.1,Texture_Nuclei_Mito_Difference.Variance_256.1,Texture_Nuclei_Mito_Entropy_256.1,Texture_Nuclei_Mito_Information.Measure.of.Correlation.1_256.1,Texture_Nuclei_Mito_Information.Measure.of.Correlation.2_256.1,Texture_Nuclei_Mito_Inverse.Difference.Moment_256.1,Texture_Nuclei_Mito_Sum.Average_256.1,Texture_Nuclei_Mito_Sum.Entropy_256.1,Texture_Nuclei_Mito_Sum.Variance_256.1,Texture_Nuclei_Mito_Variance_256.1
0,C4-2,15,0.997186,3.107954,0.897379,0.008178,0.003886,0.019586,-0.793075,0.159178,...,0.01793,0.003881,0.024493,-0.676109,0.156971,0.998789,0.355498,0.021977,93.696801,24.650923
1,C4-2,26,0.995561,3.539515,0.880603,0.013527,0.003883,0.029857,-0.751623,0.187515,...,0.028064,0.003876,0.038491,-0.629546,0.186038,0.998065,0.346335,0.034011,57.470498,15.262003
2,C4-2,37,0.997755,1.733566,0.912409,0.006933,0.003887,0.016212,-0.819409,0.149153,...,0.014956,0.003883,0.020667,-0.696157,0.147727,0.999025,0.242404,0.018427,54.627623,14.201402
3,C4-2,43,0.996404,5.239457,0.917928,0.008285,0.003886,0.023344,-0.814383,0.17756,...,0.022518,0.003879,0.030515,-0.680547,0.175937,0.99846,0.418224,0.027489,100.742714,26.348306
4,C4-2,51,0.998152,2.018091,0.918554,0.004753,0.003888,0.013371,-0.838982,0.138317,...,0.012235,0.003885,0.016743,-0.715638,0.135943,0.99922,0.142915,0.015166,22.900737,5.951695
5,C4-2,52,0.999638,0.840413,0.844654,0.001443,0.00389,0.003033,-0.742542,0.059608,...,0.00322,0.00389,0.004132,-0.62357,0.060923,0.999832,0.040744,0.003692,10.018713,2.775265
6,C4-2,69,0.997527,3.757993,0.879065,0.007379,0.003887,0.016731,-0.790981,0.146914,...,0.016014,0.003883,0.021754,-0.669497,0.146923,0.998938,0.227031,0.019362,43.563159,11.540071
7,C4-2,70,0.997107,3.717415,0.898603,0.007687,0.003887,0.019851,-0.806834,0.16257,...,0.018424,0.003881,0.02502,-0.686036,0.160418,0.998763,0.357046,0.022603,91.451871,24.00583
8,C4-2,79,0.997795,3.689817,0.881328,0.005365,0.003888,0.014485,-0.794471,0.137215,...,0.014635,0.003884,0.019426,-0.659637,0.137326,0.999086,0.086694,0.017644,7.174277,1.912492
9,C4-2,86,0.998825,1.483206,0.816778,0.004911,0.003889,0.009646,-0.736932,0.105437,...,0.008069,0.003887,0.010643,-0.661144,0.101992,0.9995,0.108068,0.009747,21.162186,5.691714


In [11]:
# Combine all feature-type tables of a compartment into one wide table,
# outer-joined on the object/image-set keys, and store the result in
# compartment_merged_dict[compartment].
for compartment, tables_by_feature in final_df_dict.items():
    print(f"Processing compartment: {compartment}")
    for feature_type, feature_df in tables_by_feature.items():
        # Neighbor features are only merged for the Nuclei compartment.
        if compartment != "Nuclei" and feature_type == "Neighbor":
            print(
                f"Skipping {compartment} {feature_type} as it is not applicable for this compartment."
            )
            continue
        # A table without the key columns (e.g. the empty placeholder left
        # when no files were loaded) cannot be joined; merging it would raise
        # a KeyError, so it is skipped with a message instead.
        if not {"object_id", "image_set"}.issubset(feature_df.columns):
            print(
                f"Skipping {compartment} {feature_type} as it has no data to merge."
            )
            continue
        if compartment_merged_dict[compartment].empty:
            # First usable table: start the accumulator from a copy so the
            # entry in final_df_dict is never modified through it.
            compartment_merged_dict[compartment] = feature_df.copy()
        else:
            compartment_merged_dict[compartment] = pd.merge(
                compartment_merged_dict[compartment],
                feature_df,
                on=["object_id", "image_set"],
                how="outer",
            )

Processing compartment: Organoid
Skipping Organoid Neighbor as it is not applicable for this compartment.
Processing compartment: Cell
Skipping Cell Neighbor as it is not applicable for this compartment.
Processing compartment: Nuclei
Processing compartment: Cytoplasm
Skipping Cytoplasm Neighbor as it is not applicable for this compartment.


In [12]:
# Write one table per compartment into the SQLite database at sqlite_path.
# closing() guarantees the connection is closed even if a write fails.
with closing(sqlite3.connect(sqlite_path)) as cx:
    for compartment, compartment_df in compartment_merged_dict.items():
        # A frame without any columns cannot be turned into a table (the
        # generated CREATE TABLE statement would have an empty column list),
        # so report it and continue with the remaining compartments.
        if compartment_df.columns.empty:
            print(f"Skipping {compartment}: no features to write.")
            continue
        # An existing table of the same name is replaced; the DataFrame index
        # is not written.
        compartment_df.to_sql(
            compartment,
            cx,
            if_exists="replace",
            index=False,
        )