This notebook combines the profiles from all well FOVs for each patient into a single file per profile type (single-cell and organoid).


In [None]:
import argparse
import pathlib
import sys

import duckdb
import pandas as pd

cwd = pathlib.Path.cwd()

# Locate the repository root: the first directory, starting at the current
# working directory and moving upward, that contains a `.git` directory.
root_dir = next(
    (
        candidate
        for candidate in (cwd, *cwd.parents)
        if (candidate / ".git").is_dir()
    ),
    None,
)
if root_dir is None:
    # Without this guard the path join below would fail with an opaque
    # TypeError on `None / "utils"`.
    raise FileNotFoundError(
        f"Could not find a git repository root at or above {cwd}"
    )

# Make the repository's `utils` directory importable so the project-local
# helper modules below can be found.
sys.path.append(str(root_dir / "utils"))
from arg_parsing_utils import parse_args
from notebook_init_utils import bandicoot_check, init_notebook

# init_notebook() supplies the root directory and a flag telling whether the
# code is running inside a notebook; root_dir is overwritten with its value.
root_dir, in_notebook = init_notebook()

# NOTE(review): hard-coded, user-specific mount point. bandicoot_check is a
# project helper; presumably it selects between this mount and root_dir —
# confirm against its implementation.
profile_base_dir = bandicoot_check(
    pathlib.Path("/home/lippincm/mnt/bandicoot").resolve(), root_dir
)

In [None]:
# Choose the patient to process: a fixed example ID when running inside a
# notebook, otherwise the "patient" entry of the parsed arguments.
if in_notebook:
    patient = "NF0018"
else:
    args = parse_args()
    patient = args["patient"]

In [None]:
# Input: directory holding this patient's converted profiles. strict=True makes
# resolve() raise if the directory does not exist.
converted_profiles_dir = (
    f"{profile_base_dir}/data/{patient}/image_based_profiles/0.converted_profiles"
)
profiles_path = pathlib.Path(converted_profiles_dir).resolve(strict=True)

# Outputs: one combined parquet file per profile type.
sc_output_file = f"{profile_base_dir}/data/{patient}/image_based_profiles/1.combined_profiles/sc.parquet"
organoid_output_file = f"{profile_base_dir}/data/{patient}/image_based_profiles/1.combined_profiles/organoid.parquet"
sc_merged_output_path = pathlib.Path(sc_output_file).resolve()
organoid_merged_output_path = pathlib.Path(organoid_output_file).resolve()

# Both outputs share the same parent directory, so creating it once is enough.
organoid_merged_output_path.parent.mkdir(parents=True, exist_ok=True)

In [4]:
# get all profiles in the directory recursively; sorted so that the order in
# which files are concatenated does not depend on filesystem enumeration order
profiles = sorted(profiles_path.glob("**/*.parquet"))
# keep only the profiles whose full path contains "related"
# NOTE(review): this substring test also matches "unrelated" and any parent
# directory whose name contains "related" — confirm that is intended
profiles = [x for x in profiles if "related" in str(x)]

In [5]:
# Split the retained profile paths into single-cell and organoid lists based on
# the file name (not the full path); paths are stored as plain strings.
sc_profiles = []
organoid_profiles = []
for profile_path in profiles:
    file_name = str(profile_path.name)
    # Two independent checks: a file name containing both markers lands in both lists.
    if "sc" in file_name:
        sc_profiles.append(str(profile_path))
    if "organoid" in file_name:
        organoid_profiles.append(str(profile_path))

In [6]:
# Fail early with a clear message when no input files were found, rather than
# handing an empty file list to duckdb.
if not sc_profiles:
    raise FileNotFoundError(f"No single-cell profiles found under {profiles_path}")
if not organoid_profiles:
    raise FileNotFoundError(f"No organoid profiles found under {profiles_path}")

# concat all sc and organoid profiles with duckdb
# union_by_name=true aligns columns by name across files instead of by position
with duckdb.connect() as conn:
    sc_profile = conn.execute(
        f"SELECT * FROM read_parquet({sc_profiles}, union_by_name=true)"
    ).df()
    organoid_profile = conn.execute(
        f"SELECT * FROM read_parquet({organoid_profiles}, union_by_name=true)"
    ).df()
print(f"Single-cell profiles concatenated. Shape: {sc_profile.shape}")
print(f"Organoid profiles concatenated. Shape: {organoid_profile.shape}")

# drop image_set_1 and image_set_2 columns if they exist;
# errors="ignore" skips any of these labels that are not present
image_set_columns = ["image_set_1", "image_set_2"]
sc_profile = sc_profile.drop(columns=image_set_columns, errors="ignore")
organoid_profile = organoid_profile.drop(columns=image_set_columns, errors="ignore")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Single-cell profiles concatenated. Shape: (670, 1927)
Organoid profiles concatenated. Shape: (154, 643)


In [7]:
# display the first rows of the combined single-cell profiles
sc_profile.head()

Unnamed: 0,object_id,image_set,parent_organoid,Area.Size.Shape_Nuclei_VOLUME,Area.Size.Shape_Nuclei_CENTER.X,Area.Size.Shape_Nuclei_CENTER.Y,Area.Size.Shape_Nuclei_CENTER.Z,Area.Size.Shape_Nuclei_BBOX.VOLUME,Area.Size.Shape_Nuclei_MIN.X,Area.Size.Shape_Nuclei_MAX.X,...,Texture_Cytoplasm_Mito_Difference.Entropy_256.3,Texture_Cytoplasm_Mito_Difference.Variance_256.3,Texture_Cytoplasm_Mito_Entropy_256.3,Texture_Cytoplasm_Mito_Information.Measure.of.Correlation.1_256.3,Texture_Cytoplasm_Mito_Information.Measure.of.Correlation.2_256.3,Texture_Cytoplasm_Mito_Inverse.Difference.Moment_256.3,Texture_Cytoplasm_Mito_Sum.Average_256.3,Texture_Cytoplasm_Mito_Sum.Entropy_256.3,Texture_Cytoplasm_Mito_Sum.Variance_256.3,Texture_Cytoplasm_Mito_Variance_256.3
0,63,F10-3,-1,28320.0,1248.030884,1102.819214,3.942938,59598.0,1204.0,1290.0,...,0.019702,0.003882,0.023836,-0.315742,0.092827,0.998877,0.044396,0.020818,2.862554,1.015763
1,127,F10-3,-1,13926.0,318.801086,457.011475,5.499138,21760.0,283.0,363.0,...,1.486478,0.002275,2.610394,-0.488758,0.902757,0.805164,8.739858,2.151855,184.892341,47.741621
2,255,F10-3,-1,3187.0,899.354858,950.341675,6.448698,4340.0,885.0,916.0,...,0.004574,0.003889,0.005051,-0.197363,0.029806,0.999749,0.025128,0.00471,3.03407,1.184079
3,39,G8-1,20,25715.0,414.767761,520.522827,14.032004,38808.0,379.0,456.0,...,0.059522,0.003854,0.081704,-0.600691,0.260216,0.995318,0.941244,0.073568,168.183126,45.306241
4,47,G8-1,20,22188.0,130.78772,832.4729,14.043807,30660.0,102.0,162.0,...,0.002662,0.00389,0.002965,-0.203013,0.024222,0.999847,0.019745,0.00279,2.774002,1.060193


## Remove all BF channels


In [8]:
# --- single-cell profiles: drop every column whose name contains "BF" ---
print(f"Single-cell profiles shape: {sc_profile.shape}")
list_of_columns_to_drop = list(
    filter(lambda column_name: "BF" in column_name, sc_profile.columns)
)
sc_profile = sc_profile.drop(columns=list_of_columns_to_drop)
print(f"Single-cell profiles shape after dropping BF channels: {sc_profile.shape}")

# --- organoid profiles: same substring-based removal ---
print(f"Organoid profiles shape: {organoid_profile.shape}")
list_of_columns_to_drop = list(
    filter(lambda column_name: "BF" in column_name, organoid_profile.columns)
)
organoid_profile = organoid_profile.drop(columns=list_of_columns_to_drop)
print(f"Organoid profiles shape after dropping BF channels: {organoid_profile.shape}")

Single-cell profiles shape: (670, 1925)
Single-cell profiles shape after dropping BF channels: (670, 1358)
Organoid profiles shape: (154, 643)
Organoid profiles shape after dropping BF channels: (154, 454)


In [9]:
# Write both combined tables to parquet without the DataFrame index, then
# report where each one was saved.
frames_and_destinations = (
    (sc_profile, sc_merged_output_path),
    (organoid_profile, organoid_merged_output_path),
)
for frame, destination in frames_and_destinations:
    frame.to_parquet(destination, index=False)

print(f"Single-cell profiles saved to {sc_merged_output_path}")
print(f"Organoid profiles saved to {organoid_merged_output_path}")

Single-cell profiles saved to /home/lippincm/Documents/GFF_3D_organoid_profiling_pipeline/data/NF0018/image_based_profiles/1.combined_profiles/sc.parquet
Organoid profiles saved to /home/lippincm/Documents/GFF_3D_organoid_profiling_pipeline/data/NF0018/image_based_profiles/1.combined_profiles/organoid.parquet
