This notebook combines all well fovs for each patient into a single file.


In [1]:
import argparse
import pathlib
import sys

import duckdb
import pandas as pd

cwd = pathlib.Path.cwd()

if (cwd / ".git").is_dir():
    root_dir = cwd
else:
    root_dir = None
    for parent in cwd.parents:
        if (parent / ".git").is_dir():
            root_dir = parent
            break
sys.path.append(str(root_dir / "utils"))
from arg_parsing_utils import parse_args
from notebook_init_utils import bandicoot_check, init_notebook

root_dir, in_notebook = init_notebook()

profile_base_dir = bandicoot_check(
    pathlib.Path("/home/lippincm/mnt/bandicoot/NF1_organoid_data").resolve(), root_dir
)

In [2]:
if not in_notebook:
    args = parse_args()
    patient = args["patient"]

else:
    patient = "NF0014_T1"

In [3]:
# set paths
profiles_path = pathlib.Path(
    f"{profile_base_dir}/data/{patient}/image_based_profiles/0.converted_profiles"
).resolve(strict=True)
# output_paths
sc_merged_output_path = pathlib.Path(
    f"{profile_base_dir}/data/{patient}/image_based_profiles/1.combined_profiles/sc.parquet"
).resolve()
organoid_merged_output_path = pathlib.Path(
    f"{profile_base_dir}/data/{patient}/image_based_profiles/1.combined_profiles/organoid.parquet"
).resolve()
organoid_merged_output_path.parent.mkdir(parents=True, exist_ok=True)

In [4]:
# get all profiles in the directory recursively
profiles = list(profiles_path.glob("**/*.parquet"))
# filter out profiles that are not related
profiles = [x for x in profiles if "related" in str(x)]

In [5]:
sc_profiles = [str(x) for x in profiles if "sc" in str(x.name)]
organoid_profiles = [str(x) for x in profiles if "organoid" in str(x.name)]

In [6]:
# for p in sc_profiles:
#     try:
#         df = pd.read_parquet(p)['Intensity_Nuclei_DNA_MAX.Z_y']
#         print(f"Read {p} with shape {df.shape}")
#     except Exception as e:
# pass

for col in pd.read_parquet(
    "/home/lippincm/mnt/bandicoot/NF1_organoid_data/data/NF0014_T1/image_based_profiles/0.converted_profiles/G2-2/sc_profiles_G2-2_related.parquet"
).columns:
    if "intensity" in col.lower():
        print(col)

Intensity_Nuclei_AGP_CM.X
Intensity_Nuclei_AGP_CM.Y
Intensity_Nuclei_AGP_CM.Z
Intensity_Nuclei_AGP_CMI.X
Intensity_Nuclei_AGP_CMI.Y
Intensity_Nuclei_AGP_CMI.Z
Intensity_Nuclei_AGP_DIFF.X
Intensity_Nuclei_AGP_DIFF.Y
Intensity_Nuclei_AGP_DIFF.Z
Intensity_Nuclei_AGP_EDGE.COUNT
Intensity_Nuclei_AGP_I.X
Intensity_Nuclei_AGP_I.Y
Intensity_Nuclei_AGP_I.Z
Intensity_Nuclei_AGP_INTEGRATED.INTENSITY
Intensity_Nuclei_AGP_INTEGRATED.INTENSITY.EDGE
Intensity_Nuclei_AGP_LOWER.QUARTILE.INTENSITY
Intensity_Nuclei_AGP_MAD.INTENSITY
Intensity_Nuclei_AGP_MASS.DISPLACEMENT
Intensity_Nuclei_AGP_MAX.INTENSITY
Intensity_Nuclei_AGP_MAX.INTENSITY.EDGE
Intensity_Nuclei_AGP_MAX.X
Intensity_Nuclei_AGP_MAX.Y
Intensity_Nuclei_AGP_MAX.Z
Intensity_Nuclei_AGP_MEAN.INTENSITY
Intensity_Nuclei_AGP_MEAN.INTENSITY.EDGE
Intensity_Nuclei_AGP_MEDIAN.INTENSITY
Intensity_Nuclei_AGP_MIN.INTENSITY
Intensity_Nuclei_AGP_MIN.INTENSITY.EDGE
Intensity_Nuclei_AGP_STD.INTENSITY
Intensity_Nuclei_AGP_STD.INTENSITY.EDGE
Intensity_Nuclei_AGP

In [7]:
# concat all sc profiles with duckdb
with duckdb.connect() as conn:
    sc_profile = conn.execute(
        f"SELECT * FROM read_parquet({sc_profiles}, union_by_name=true)"
    ).df()
    organoid_profile = conn.execute(
        f"SELECT * FROM read_parquet({organoid_profiles}, union_by_name=true)"
    ).df()
print(f"Single-cell profiles concatenated. Shape: {sc_profile.shape}")
print(f"Organoid profiles concatenated. Shape: {organoid_profile.shape}")
# drop imageset_1 and image_set_2 columns if they exist
if "image_set_1" in sc_profile.columns:
    sc_profile = sc_profile.drop(columns=["image_set_1"])
if "image_set_2" in sc_profile.columns:
    sc_profile = sc_profile.drop(columns=["image_set_2"])
if "image_set_1" in organoid_profile.columns:
    organoid_profile = organoid_profile.drop(columns=["image_set_1"])
if "image_set_2" in organoid_profile.columns:
    organoid_profile = organoid_profile.drop(columns=["image_set_2"])

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Single-cell profiles concatenated. Shape: (1406, 2567)
Organoid profiles concatenated. Shape: (105, 643)


## Remove all BF channels


In [8]:
print(f"Single-cell profiles shape: {sc_profile.shape}")
list_of_columns_to_drop = [col for col in sc_profile.columns if "BF" in col]
sc_profile = sc_profile.drop(columns=list_of_columns_to_drop)
print(f"Single-cell profiles shape after dropping BF channels: {sc_profile.shape}")

print(f"Organoid profiles shape: {organoid_profile.shape}")
list_of_columns_to_drop = [col for col in organoid_profile.columns if "BF" in col]
organoid_profile = organoid_profile.drop(columns=list_of_columns_to_drop)
print(f"Organoid profiles shape after dropping BF channels: {organoid_profile.shape}")

Single-cell profiles shape: (1406, 2565)
Single-cell profiles shape after dropping BF channels: (1406, 1870)
Organoid profiles shape: (105, 643)
Organoid profiles shape after dropping BF channels: (105, 454)


In [9]:
sc_profile.to_parquet(sc_merged_output_path, index=False)
organoid_profile.to_parquet(organoid_merged_output_path, index=False)
print(f"Single-cell profiles saved to {sc_merged_output_path}")
print(f"Organoid profiles saved to {organoid_merged_output_path}")

Single-cell profiles saved to /home/lippincm/mnt/bandicoot/NF1_organoid_data/data/NF0014_T1/image_based_profiles/1.combined_profiles/sc.parquet
Organoid profiles saved to /home/lippincm/mnt/bandicoot/NF1_organoid_data/data/NF0014_T1/image_based_profiles/1.combined_profiles/organoid.parquet


In [10]:
sc_profile

Unnamed: 0,object_id,image_set,parent_organoid,Area.Size.Shape_Nuclei_VOLUME,Area.Size.Shape_Nuclei_CENTER.X,Area.Size.Shape_Nuclei_CENTER.Y,Area.Size.Shape_Nuclei_CENTER.Z,Area.Size.Shape_Nuclei_BBOX.VOLUME,Area.Size.Shape_Nuclei_MIN.X,Area.Size.Shape_Nuclei_MAX.X,...,Intensity_Cytoplasm_Mito_MAX.Z_y,Intensity_Cytoplasm_Mito_MEAN.INTENSITY_y,Intensity_Cytoplasm_Mito_MEAN.INTENSITY.EDGE_y,Intensity_Cytoplasm_Mito_MEDIAN.INTENSITY_y,Intensity_Cytoplasm_Mito_MIN.INTENSITY_y,Intensity_Cytoplasm_Mito_MIN.INTENSITY.EDGE_y,Intensity_Cytoplasm_Mito_STD.INTENSITY_y,Intensity_Cytoplasm_Mito_STD.INTENSITY.EDGE_y,Intensity_Cytoplasm_Mito_UPPER.QUARTILE.INTENSITY_y,Intensity_Cytoplasm_Mito_VOLUME_y
0,255,E5-2,19,31267.0,1247.273682,712.686646,8.180062,49560.0,1186.0,1304.0,...,,,,,,,,,,
1,25,D5-2,-1,35478.0,1115.733765,982.765442,4.788883,1358544.0,822.0,1163.0,...,,,,,,,,,,
2,51,D5-2,-1,9615.0,1050.518799,1137.028442,4.491316,18270.0,1024.0,1082.0,...,,,,,,,,,,
3,63,D5-2,-1,3784.0,708.924438,1048.733398,3.542812,5168.0,691.0,729.0,...,,,,,,,,,,
4,76,D5-2,-1,2883.0,414.661469,1173.106812,2.997919,3876.0,397.0,431.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1401,72,D11-3,34,72059.0,937.616699,902.791138,12.654977,122080.0,882.0,991.0,...,,,,,,,,,,
1402,109,D11-3,34,44175.0,801.196960,973.865845,14.527062,119392.0,765.0,847.0,...,,,,,,,,,,
1403,145,D11-3,34,41016.0,1052.388428,1240.983276,15.477399,71820.0,1005.0,1100.0,...,,,,,,,,,,
1404,182,D11-3,34,44480.0,811.228638,1047.775146,16.683836,73528.0,765.0,866.0,...,,,,,,,,,,
