This notebook performs profile aggregation.

In [1]:
import argparse
import pathlib

import pandas as pd
from pycytominer import aggregate

# Detect the execution context. `get_ipython` is only available as a bare name
# inside an IPython/Jupyter session, so a NameError here means this file is
# being run as a plain Python script.
try:
    cfg = get_ipython().config
except NameError:
    in_notebook = False
else:
    in_notebook = True

In [2]:
if in_notebook:
    # Interactive runs use a fixed patient ID.
    patient = "NF0014"
else:
    # Script runs must receive the patient ID on the command line.
    argparser = argparse.ArgumentParser()
    argparser.add_argument(
        "--patient",
        required=True,
        type=str,
        help="Patient ID to process, e.g. 'P01'",
    )
    args = argparser.parse_args()
    patient = args.patient

In [3]:
# pathing
# Every input and output for a patient lives in the same directory, so build
# that directory once instead of repeating it in each path literal.
profiles_dir = pathlib.Path(f"../../data/{patient}/image_based_profiles")

# input paths: strict resolution raises FileNotFoundError here if a required
# input file is missing, rather than later when it is read
sc_fs_path = (profiles_dir / "3.sc_fs_profiles.parquet").resolve(strict=True)
organoid_fs_path = (profiles_dir / "3.organoid_fs_profiles.parquet").resolve(
    strict=True
)

# output paths: resolved non-strictly because these files may not exist yet
sc_agg_well_output_path = (
    profiles_dir / "4.sc_agg_well_level_profiles.parquet"
).resolve()
sc_agg_well_parent_organoid_output_path = (
    profiles_dir / "4.sc_agg_well_parent_organoid_level_profiles.parquet"
).resolve()
sc_consensus_output_path = (profiles_dir / "4.sc_consensus_profiles.parquet").resolve()

organoid_agg_well_output_path = (
    profiles_dir / "4.organoid_agg_well_level_profiles.parquet"
).resolve()
organoid_consensus_output_path = (
    profiles_dir / "4.organoid_consensus_profiles.parquet"
).resolve()

In [4]:
# load the single-cell and organoid profile tables from their parquet files
sc_fs, organoid_fs = (
    pd.read_parquet(profile_path) for profile_path in (sc_fs_path, organoid_fs_path)
)

### Aggregate the single cell profiles
We will aggregate using a few different stratifications:
1. Well
2. Well and parent organoid
3. Treatment - i.e. the consensus profile for each treatment

In [5]:
# preview the first rows of the single-cell profiles to check the column layout
sc_fs.head()

Unnamed: 0,patient,object_id,unit,dose,treatment,image_set,Well,parent_organoid,Area.Size.Shape_Nuclei_EXTENT,Area.Size.Shape_Nuclei_EULER.NUMBER,...,Granularity_Cytoplasm_BF_GRANULARITY.4,Granularity_Cytoplasm_DNA_GRANULARITY.15,Granularity_Cytoplasm_ER_GRANULARITY.16,Granularity_Cytoplasm_ER_GRANULARITY.8,Texture_Cytoplasm_AGP_Correlation_256.1,Texture_Cytoplasm_AGP_Sum.Variance_256.1,Texture_Cytoplasm_DNA_Contrast_256.1,Texture_Cytoplasm_ER_Inverse.Difference.Moment_256.1,Texture_Cytoplasm_Mito_Contrast_256.1,Texture_Cytoplasm_Mito_Information.Measure.of.Correlation.1_256.1
0,NF0014,15,%,1,DMSO,C4-2,C4,32,0.836761,-0.392232,...,-1.236246,-1.309578,0.0,-1.303511,0.442841,-0.229443,0.09705,0.020495,0.203806,-0.144187
1,NF0014,26,%,1,DMSO,C4-2,C4,32,0.214518,2.54951,...,0.158438,-0.988114,0.0,0.999034,0.345999,-0.401708,-0.286432,-0.417305,-0.236065,-0.123169
2,NF0014,37,%,1,DMSO,C4-2,C4,32,0.459903,-0.392232,...,0.022219,-0.974631,0.0,-1.201527,0.147025,-0.953462,-1.084089,0.727439,-1.443039,-0.31915
3,NF0014,43,%,1,DMSO,C4-2,C4,32,-1.2739,2.54951,...,-1.129586,0.435303,1.421085e-14,0.952505,-0.183131,0.153477,0.997786,-0.052029,1.067864,0.132546
4,NF0014,51,%,1,DMSO,C4-2,C4,32,0.505727,-0.392232,...,-1.257155,-1.801191,0.0,0.096698,-0.030948,-1.0673,-0.954839,0.243587,-0.498713,-0.591741


In [6]:
# Columns that identify or describe each single cell; everything else in
# `sc_fs` is treated as a feature.
sc_metadata_columns = [
    "patient",
    "object_id",
    "unit",
    "dose",
    "treatment",
    "image_set",
    "Well",
    "parent_organoid",
]

# Feature names, in their original column order.
sc_features_columns = sc_fs.columns[
    ~sc_fs.columns.isin(sc_metadata_columns)
].tolist()

# Feature-only view of the table; errors="ignore" tolerates metadata columns
# that are absent from the frame.
sc_features_df = sc_fs.drop(errors="ignore", columns=sc_metadata_columns)

In [7]:
# All three single-cell aggregations share the same input frame, feature list
# and operation; only the grouping columns (strata) differ.
sc_median_kwargs = dict(
    population_df=sc_fs,
    features=sc_features_columns,
    operation="median",
)

# stratification approach #1: one profile per well
sc_well_agg = aggregate(strata=["Well"], **sc_median_kwargs)
sc_well_agg.to_parquet(sc_agg_well_output_path, index=False)

# stratification approach #2: one profile per (well, parent organoid) pair
sc_well_parent_organoid_agg = aggregate(
    strata=["Well", "parent_organoid"], **sc_median_kwargs
)
sc_well_parent_organoid_agg.to_parquet(
    sc_agg_well_parent_organoid_output_path, index=False
)

# stratification approach #3: one profile per treatment (a.k.a. consensus)
sc_consensus = aggregate(strata=["treatment"], **sc_median_kwargs)
sc_consensus.to_parquet(sc_consensus_output_path, index=False)

### Aggregate the organoid profiles
We will aggregate using two different stratifications:
1. Well
2. Treatment - i.e. the consensus profile for each treatment

In [8]:
# preview the first rows of the organoid profiles to check the column layout
organoid_fs.head()

Unnamed: 0,patient,object_id,unit,dose,treatment,image_set,Well,single_cell_count
0,NF0014,32,%,1,DMSO,C4-2,C4,29


In [9]:
# Columns that identify or describe each organoid; everything else in
# `organoid_fs` is treated as a feature.
organoid_metadata_columns = [
    "patient",
    "object_id",
    "unit",
    "dose",
    "treatment",
    "image_set",
    "Well",
    "parent_organoid",
]
# NOTE(review): `single_cell_count` is not listed above, so it is aggregated as
# a feature below — confirm that this is intended.
organoidfeatures_columns = [
    col for col in organoid_fs.columns if col not in organoid_metadata_columns
]
# Use the organoid metadata list here, not the single-cell one, so this cell
# keeps working if the two lists ever diverge. errors="ignore" tolerates listed
# columns that are absent from the frame.
organoid_features_df = organoid_fs.drop(
    columns=organoid_metadata_columns, errors="ignore"
)

In [10]:
# Both organoid aggregations share the same input frame, feature list and
# operation; only the grouping columns (strata) differ.
organoid_median_kwargs = dict(
    population_df=organoid_fs,
    features=organoidfeatures_columns,
    operation="median",
)

# stratification approach #1: one profile per well
organoid_well_agg = aggregate(strata=["Well"], **organoid_median_kwargs)
organoid_well_agg.to_parquet(organoid_agg_well_output_path, index=False)

# stratification approach #2: one profile per treatment (a.k.a. consensus)
organoid_consensus = aggregate(strata=["treatment"], **organoid_median_kwargs)
organoid_consensus.to_parquet(organoid_consensus_output_path, index=False)