This notebook combines all well fovs for each patient into a single file.


In [1]:
import argparse
import os
import pathlib
import sys

import duckdb
import pandas as pd
from arg_parsing_utils import parse_args
from notebook_init_utils import bandicoot_check, init_notebook

root_dir, in_notebook = init_notebook()

profile_base_dir = bandicoot_check(
    pathlib.Path(os.path.expanduser("~/mnt/bandicoot/NF1_organoid_data")).resolve(),
    root_dir,
)

In [2]:
if not in_notebook:
    args = parse_args()
    patient = args["patient"]
    image_based_profiles_subparent_name = args["image_based_profiles_subparent_name"]

else:
    patient = "NF0014_T1"
    image_based_profiles_subparent_name = "convolution_1_image_based_profiles"

In [3]:
# set paths
profiles_path = pathlib.Path(
    f"{profile_base_dir}/data/{patient}/{image_based_profiles_subparent_name}/0.converted_profiles"
).resolve(strict=True)
# output_paths
sc_merged_output_path = pathlib.Path(
    f"{profile_base_dir}/data/{patient}/{image_based_profiles_subparent_name}/1.combined_profiles/sc.parquet"
).resolve()
organoid_merged_output_path = pathlib.Path(
    f"{profile_base_dir}/data/{patient}/{image_based_profiles_subparent_name}/1.combined_profiles/organoid.parquet"
).resolve()
organoid_merged_output_path.parent.mkdir(parents=True, exist_ok=True)

In [4]:
# get all profiles in the directory recursively
profiles = list(profiles_path.glob("**/*.parquet"))
# filter out profiles that are not related
profiles = [x for x in profiles if "related" in str(x)]

In [5]:
sc_profiles = [str(x) for x in profiles if "sc" in str(x.name)]
organoid_profiles = [str(x) for x in profiles if "organoid" in str(x.name)]

In [6]:
# for p in sc_profiles:
#     try:
#         df = pd.read_parquet(p)['Intensity_Nuclei_DNA_MAX.Z_y']
#         print(f"Read {p} with shape {df.shape}")
#     except Exception as e:
# pass

for col in pd.read_parquet(
    pathlib.Path(
        os.path.expanduser(
            f"~/mnt/bandicoot/NF1_organoid_data/data/NF0014_T1/{image_based_profiles_subparent_name}/0.converted_profiles/C4-2/sc_profiles_C4-2_related.parquet"
        )
    )
).columns:
    if "intensity" in col.lower():
        print(col)

Intensity_Nuclei_AGP_CM.X
Intensity_Nuclei_AGP_CM.Y
Intensity_Nuclei_AGP_CM.Z
Intensity_Nuclei_AGP_CMI.X
Intensity_Nuclei_AGP_CMI.Y
Intensity_Nuclei_AGP_CMI.Z
Intensity_Nuclei_AGP_DIFF.X
Intensity_Nuclei_AGP_DIFF.Y
Intensity_Nuclei_AGP_DIFF.Z
Intensity_Nuclei_AGP_EDGE.COUNT
Intensity_Nuclei_AGP_I.X
Intensity_Nuclei_AGP_I.Y
Intensity_Nuclei_AGP_I.Z
Intensity_Nuclei_AGP_INTEGRATED.INTENSITY
Intensity_Nuclei_AGP_INTEGRATED.INTENSITY.EDGE
Intensity_Nuclei_AGP_LOWER.QUARTILE.INTENSITY
Intensity_Nuclei_AGP_MAD.INTENSITY
Intensity_Nuclei_AGP_MASS.DISPLACEMENT
Intensity_Nuclei_AGP_MAX.INTENSITY
Intensity_Nuclei_AGP_MAX.INTENSITY.EDGE
Intensity_Nuclei_AGP_MAX.X
Intensity_Nuclei_AGP_MAX.Y
Intensity_Nuclei_AGP_MAX.Z
Intensity_Nuclei_AGP_MEAN.INTENSITY
Intensity_Nuclei_AGP_MEAN.INTENSITY.EDGE
Intensity_Nuclei_AGP_MEDIAN.INTENSITY
Intensity_Nuclei_AGP_MIN.INTENSITY
Intensity_Nuclei_AGP_MIN.INTENSITY.EDGE
Intensity_Nuclei_AGP_STD.INTENSITY
Intensity_Nuclei_AGP_STD.INTENSITY.EDGE
Intensity_Nuclei_AGP

In [7]:
# concat all sc profiles with duckdb
with duckdb.connect() as conn:
    sc_profile = conn.execute(
        f"SELECT * FROM read_parquet({sc_profiles}, union_by_name=true)"
    ).df()
    organoid_profile = conn.execute(
        f"SELECT * FROM read_parquet({organoid_profiles}, union_by_name=true)"
    ).df()
print(f"Single-cell profiles concatenated. Shape: {sc_profile.shape}")
print(f"Organoid profiles concatenated. Shape: {organoid_profile.shape}")
# drop imageset_1 and image_set_2 columns if they exist
if "image_set_1" in sc_profile.columns:
    sc_profile = sc_profile.drop(columns=["image_set_1"])
if "image_set_2" in sc_profile.columns:
    sc_profile = sc_profile.drop(columns=["image_set_2"])
if "image_set_1" in organoid_profile.columns:
    organoid_profile = organoid_profile.drop(columns=["image_set_1"])
if "image_set_2" in organoid_profile.columns:
    organoid_profile = organoid_profile.drop(columns=["image_set_2"])

Single-cell profiles concatenated. Shape: (30, 1360)
Organoid profiles concatenated. Shape: (1, 454)


## Remove all BF channels


In [8]:
print(f"Single-cell profiles shape: {sc_profile.shape}")
list_of_columns_to_drop = [col for col in sc_profile.columns if "BF" in col]
sc_profile = sc_profile.drop(columns=list_of_columns_to_drop)
print(f"Single-cell profiles shape after dropping BF channels: {sc_profile.shape}")

print(f"Organoid profiles shape: {organoid_profile.shape}")
list_of_columns_to_drop = [col for col in organoid_profile.columns if "BF" in col]
organoid_profile = organoid_profile.drop(columns=list_of_columns_to_drop)
print(f"Organoid profiles shape after dropping BF channels: {organoid_profile.shape}")

Single-cell profiles shape: (30, 1358)
Single-cell profiles shape after dropping BF channels: (30, 1358)
Organoid profiles shape: (1, 454)
Organoid profiles shape after dropping BF channels: (1, 454)


In [9]:
sc_profile.to_parquet(sc_merged_output_path, index=False)
organoid_profile.to_parquet(organoid_merged_output_path, index=False)
print(f"Single-cell profiles saved to {sc_merged_output_path}")
print(f"Organoid profiles saved to {organoid_merged_output_path}")

Single-cell profiles saved to /home/lippincm/mnt/bandicoot/NF1_organoid_data/data/NF0014_T1/convolution_1_image_based_profiles/1.combined_profiles/sc.parquet
Organoid profiles saved to /home/lippincm/mnt/bandicoot/NF1_organoid_data/data/NF0014_T1/convolution_1_image_based_profiles/1.combined_profiles/organoid.parquet


In [10]:
sc_profile

Unnamed: 0,object_id,image_set,parent_organoid,Area.Size.Shape_Nuclei_VOLUME,Area.Size.Shape_Nuclei_CENTER.X,Area.Size.Shape_Nuclei_CENTER.Y,Area.Size.Shape_Nuclei_CENTER.Z,Area.Size.Shape_Nuclei_BBOX.VOLUME,Area.Size.Shape_Nuclei_MIN.X,Area.Size.Shape_Nuclei_MAX.X,...,Texture_Cytoplasm_Mito_Difference.Entropy_256.3,Texture_Cytoplasm_Mito_Difference.Variance_256.3,Texture_Cytoplasm_Mito_Entropy_256.3,Texture_Cytoplasm_Mito_Information.Measure.of.Correlation.1_256.3,Texture_Cytoplasm_Mito_Information.Measure.of.Correlation.2_256.3,Texture_Cytoplasm_Mito_Inverse.Difference.Moment_256.3,Texture_Cytoplasm_Mito_Sum.Average_256.3,Texture_Cytoplasm_Mito_Sum.Entropy_256.3,Texture_Cytoplasm_Mito_Sum.Variance_256.3,Texture_Cytoplasm_Mito_Variance_256.3
0,2,C4-2,32,5995.0,847.588806,74.060219,0.0,7304.0,804.0,892.0,...,0.015759,0.003884,0.020423,-0.413421,0.102524,0.999158,0.121734,0.017205,23.915774,6.923885
1,18,C4-2,32,28163.0,508.470001,255.540817,1.012534,38610.0,453.0,563.0,...,0.031411,0.003875,0.041915,-0.422158,0.148441,0.998147,0.088743,0.035999,5.313621,1.57714
2,28,C4-2,32,125455.0,912.757629,419.849945,16.152229,192600.0,858.0,965.0,...,0.094189,0.003833,0.141666,-0.530708,0.309519,0.993609,0.251323,0.120947,9.124662,2.542987
3,29,C4-2,32,66761.0,409.968811,652.16925,4.913228,973760.0,355.0,534.0,...,0.032753,0.003876,0.041724,-0.369338,0.135441,0.99818,0.271283,0.035434,49.103827,15.13515
4,33,C4-2,32,128119.0,693.761047,423.789062,19.189606,334152.0,639.0,758.0,...,0.08813,0.003847,0.118534,-0.394003,0.235301,0.994618,0.97449,0.098255,204.130495,60.071187
5,39,C4-2,32,12965.0,175.754959,363.513214,3.827227,19600.0,146.0,202.0,...,0.005109,0.003889,0.005824,-0.254247,0.03986,0.999758,0.032513,0.005351,6.175817,2.162641
6,48,C4-2,32,83239.0,742.896912,386.984192,4.943344,116748.0,671.0,812.0,...,0.042261,0.003871,0.055359,-0.409934,0.16642,0.997552,0.324096,0.046892,51.820071,15.246906
7,57,C4-2,32,79716.0,506.548248,557.98999,4.390549,111744.0,456.0,553.0,...,0.031206,0.003873,0.043535,-0.437722,0.153119,0.998041,0.032729,0.038891,0.914186,0.287058
8,66,C4-2,32,82275.0,418.282196,545.677307,13.582315,144200.0,349.0,489.0,...,0.039205,0.003873,0.047104,-0.284079,0.119175,0.997755,0.278011,0.041257,40.235163,13.623226
9,85,C4-2,32,89227.0,752.275391,542.383301,17.870554,135720.0,692.0,812.0,...,0.055923,0.003862,0.074484,-0.423008,0.195178,0.996561,0.22394,0.064134,15.508634,4.735271
