In [1]:
import pathlib

import duckdb
import pandas as pd

In [15]:
result_path = pathlib.Path("../../2.cellprofiling/results/C4-2/").resolve(strict=True)
# get a list of all parquets in the directory
parquet_files = list(result_path.glob("*.parquet"))
parquet_files.sort()

In [16]:
pd.options.display.max_columns = None

In [17]:
feature_types_dict = {
    "Organoid": {
        "AreaSize_Shape": [],
        "Colocalization": [],
        "Intensity": [],
        "Granularity": [],
        "Neighbor": [],
        "Texture": [],
    },
    "Cell": {
        "AreaSize_Shape": [],
        "Colocalization": [],
        "Intensity": [],
        "Granularity": [],
        "Neighbor": [],
        "Texture": [],
    },
    "Nuclei": {
        "AreaSize_Shape": [],
        "Colocalization": [],
        "Intensity": [],
        "Granularity": [],
        "Neighbor": [],
        "Texture": [],
    },
    "Cytoplasm": {
        "AreaSize_Shape": [],
        "Colocalization": [],
        "Intensity": [],
        "Granularity": [],
        "Neighbor": [],
        "Texture": [],
    },
}
for file in parquet_files:
    for compartment in feature_types_dict.keys():
        for feature_type in feature_types_dict[compartment].keys():
            if compartment in file.name and feature_type in file.name:
                feature_types_dict[compartment][feature_type].append(file)

In [22]:
# create a record for each compartment
merged_df_dict = {
    "Nuclei": {
        "AreaSize_Shape": [],
        "Colocalization": [],
        "Intensity": [],
        "Granularity": [],
        "Neighbor": [],
        "Texture": [],
    },
}

for feature_type in feature_types_dict["Nuclei"].keys():
    if len(feature_types_dict["Nuclei"][feature_type]) > 0:
        for file in feature_types_dict["Nuclei"][feature_type]:
            # check if the file exists
            if not file.exists():
                print(f"File {file} does not exist")
                continue
            # check if the file is a parquet file
            if not file.name.endswith(".parquet"):
                print(f"File {file} is not a parquet file")
                continue
            # read the parquet files
            try:
                df = duckdb.read_parquet(str(file)).to_df()
            except Exception as e:
                print(
                    f"Error reading {feature_types_dict['Nuclei'][feature_type]}: {e}"
                )
                continue
        # add a column with the compartment name
        df["Compartment"] = "Nuclei"
        # add a column with the feature type
        df["FeatureType"] = feature_type
        # add the dataframe to the dictionary
        merged_df_dict["Nuclei"][feature_type] = df
merged_df_dict["Nuclei"]["AreaSize_Shape"].head()

Unnamed: 0,Nuclei_Mito_object_id,Nuclei_Mito_VOLUME,Nuclei_Mito_CENTER.X,Nuclei_Mito_CENTER.Y,Nuclei_Mito_CENTER.Z,Nuclei_Mito_BBOX.VOLUME,Nuclei_Mito_MIN.X,Nuclei_Mito_MAX.X,Nuclei_Mito_MIN.Y,Nuclei_Mito_MAX.Y,Nuclei_Mito_MIN.Z,Nuclei_Mito_MAX.Z,Nuclei_Mito_EXTENT,Nuclei_Mito_EULER.NUMBER,Nuclei_Mito_EQUIVALENT.DIAMETER,Nuclei_Mito_SURFACE.AREA,image_set,Compartment,FeatureType
0,1,12336.0,177.250811,363.405156,3.522049,17160.0,148,203,338,390,1,7,0.718881,1,28.667638,161.957922,C4-2,Nuclei,AreaSize_Shape
1,2,88442.0,742.105323,386.923532,5.314149,131130.0,671,812,343,436,1,11,0.67446,1,55.278126,594.900645,C4-2,Nuclei,AreaSize_Shape
2,3,87388.0,469.986497,471.372626,6.181077,164220.0,410,525,411,530,1,13,0.53214,1,55.057657,650.578847,C4-2,Nuclei,AreaSize_Shape
3,4,80885.0,507.441367,560.068023,4.669457,119286.0,458,552,489,630,1,10,0.678076,1,53.656597,550.127748,C4-2,Nuclei,AreaSize_Shape
4,5,40573.0,1104.57314,672.455894,3.916767,57288.0,1074,1136,605,737,1,8,0.708229,1,42.633027,374.459451,C4-2,Nuclei,AreaSize_Shape


In [8]:
df = pd.read_parquet(feature_types_dict["Nuclei"]["AreaSize_Shape"][0])
df.head()

Unnamed: 0,Nuclei_AGP_object_id,Nuclei_AGP_VOLUME,Nuclei_AGP_CENTER.X,Nuclei_AGP_CENTER.Y,Nuclei_AGP_CENTER.Z,Nuclei_AGP_BBOX.VOLUME,Nuclei_AGP_MIN.X,Nuclei_AGP_MAX.X,Nuclei_AGP_MIN.Y,Nuclei_AGP_MAX.Y,Nuclei_AGP_MIN.Z,Nuclei_AGP_MAX.Z,Nuclei_AGP_EXTENT,Nuclei_AGP_EULER.NUMBER,Nuclei_AGP_EQUIVALENT.DIAMETER,Nuclei_AGP_SURFACE.AREA,image_set
0,1,12336.0,177.250811,363.405156,3.522049,17160.0,148,203,338,390,1,7,0.718881,1,28.667638,161.957922,C4-2
1,2,88442.0,742.105323,386.923532,5.314149,131130.0,671,812,343,436,1,11,0.67446,1,55.278126,594.900645,C4-2
2,3,87388.0,469.986497,471.372626,6.181077,164220.0,410,525,411,530,1,13,0.53214,1,55.057657,650.578847,C4-2
3,4,80885.0,507.441367,560.068023,4.669457,119286.0,458,552,489,630,1,10,0.678076,1,53.656597,550.127748,C4-2
4,5,40573.0,1104.57314,672.455894,3.916767,57288.0,1074,1136,605,737,1,8,0.708229,1,42.633027,374.459451,C4-2


In [None]:
df_dict = {
    "organoid": [],
    "nuclei": [],
    "cell": [],
    "cytoplasm": [],
}
for areasizeshape_df in feature_types_dict["AreaSize_Shape"]:
    df = pd.read_parquet(areasizeshape_df)
    # rename any column that contain object id to "ObjectID"
    for col in df.columns:
        if "object_id" in col:
            df.rename(columns={col: "ObjectID"}, inplace=True)
        elif "image_set" in col:
            continue
        else:
            # prepend the column name with the feature type
            df.rename(columns={col: f"AreaSizeShape_{col}"}, inplace=True)
    if "Organoid" in areasizeshape_df.stem:
        df_dict["organoid"].append(df)
    elif "Nuclei" in areasizeshape_df.stem:
        df_dict["nuclei"].append(df)
    elif "Cell" in areasizeshape_df.stem:
        df_dict["cell"].append(df)
    elif "Cytoplasm" in areasizeshape_df.stem:
        df_dict["cytoplasm"].append(df)

# merge all of the organoid dataframes into one by ObjectID and image_set with duckdb
con = duckdb.connect()
dfs = {}
for i, df in enumerate(df_dict["organoid"]):
    dfs[f"{i}"] = df