This notebook performs profile normalization.
All profiles are standardized relative to the DMSO-treated control profiles.

In [None]:
import argparse
import os
import pathlib
import sys

import numpy as np
import pandas as pd
from pycytominer import normalize

cwd = pathlib.Path.cwd()

# Locate the repository root: the current working directory itself or the
# nearest ancestor that contains a ".git" directory.
root_dir = next(
    (
        candidate
        for candidate in (cwd, *cwd.parents)
        if (candidate / ".git").is_dir()
    ),
    None,
)
if root_dir is None:
    # Fail with a clear message instead of a TypeError from `None / "utils"`.
    raise FileNotFoundError(
        f"Could not find a Git repository root at or above {cwd}"
    )

# Make the project-local helper modules in <root>/utils importable.
sys.path.append(str(root_dir / "utils"))
from arg_parsing_utils import parse_args
from notebook_init_utils import bandicoot_check, init_notebook

# init_notebook() supplies the root directory and a flag that is used below to
# decide whether the patient comes from the CLI or from a hard-coded default.
root_dir, in_notebook = init_notebook()

# NOTE(review): bandicoot_check presumably picks between the mounted
# "~/mnt/bandicoot" location and root_dir as the data base directory --
# confirm against utils/notebook_init_utils.
profile_base_dir = bandicoot_check(
    pathlib.Path(os.path.expanduser("~/mnt/bandicoot")).resolve(), root_dir
)

In [2]:
# Choose the patient to process: a fixed example when run interactively,
# otherwise the value supplied through the parsed arguments.
if in_notebook:
    patient = "NF0014_T1"
else:
    args = parse_args()
    patient = args["patient"]

In [3]:
# pathing
# Every profile file for this patient lives below a common directory, so build
# it once and derive the input/output locations from it.
profiles_dir = (
    pathlib.Path(profile_base_dir) / "data" / patient / "image_based_profiles"
)
annotated_dir = profiles_dir / "2.annotated_profiles"
normalized_dir = profiles_dir / "3.normalized_profiles"

# input paths: strict=True makes resolve() raise if the file does not exist,
# so a missing input is reported here rather than at read time
sc_annotated_path = (annotated_dir / "sc_anno.parquet").resolve(strict=True)
organoid_annotated_path = (annotated_dir / "organoid_anno.parquet").resolve(
    strict=True
)

# output path
sc_normalized_output_path = (normalized_dir / "sc_norm.parquet").resolve()
organoid_normalized_output_path = (
    normalized_dir / "organoid_norm.parquet"
).resolve()

# Create the parent directory of every output explicitly instead of relying
# on both outputs happening to share one directory.
for output_path in (sc_normalized_output_path, organoid_normalized_output_path):
    output_path.parent.mkdir(parents=True, exist_ok=True)

In [4]:
# load the annotated single-cell and organoid profiles from parquet
sc_annotated_profiles = pd.read_parquet(path=sc_annotated_path)
organoid_annotated_profiles = pd.read_parquet(path=organoid_annotated_path)

In [5]:
# preview the first rows of the annotated single-cell profiles
sc_annotated_profiles.head()

Unnamed: 0,Metadata_patient_tumor,Metadata_object_id,Metadata_unit,Metadata_dose,Metadata_treatment,Metadata_image_set,Metadata_Well,Metadata_parent_organoid,Area.Size.Shape_Nuclei_VOLUME,Area.Size.Shape_Nuclei_CENTER.X,...,Intensity_Cytoplasm_Mito_MIN.INTENSITY.EDGE_y,Intensity_Cytoplasm_Mito_STD.INTENSITY_y,Intensity_Cytoplasm_Mito_STD.INTENSITY.EDGE_y,Intensity_Cytoplasm_Mito_UPPER.QUARTILE.INTENSITY_y,Intensity_Cytoplasm_Mito_VOLUME_y,Metadata_Target,Metadata_Class,Metadata_Therapeutic_Categories,Metadata_patient,Metadata_tumor
0,NF0014_T1,255,uM,1,Fimepinostat,E5-2,E5,19,31267.0,1247.273682,...,,,,,,PI3K and HDAC inhibitor,Small Molecule,Investigational,NF0014,T1
1,NF0014_T1,25,uM,1,Fimepinostat,D5-2,D5,-1,35478.0,1115.733765,...,,,,,,PI3K and HDAC inhibitor,Small Molecule,Investigational,NF0014,T1
2,NF0014_T1,51,uM,1,Fimepinostat,D5-2,D5,-1,9615.0,1050.518799,...,,,,,,PI3K and HDAC inhibitor,Small Molecule,Investigational,NF0014,T1
3,NF0014_T1,63,uM,1,Fimepinostat,D5-2,D5,-1,3784.0,708.924438,...,,,,,,PI3K and HDAC inhibitor,Small Molecule,Investigational,NF0014,T1
4,NF0014_T1,76,uM,1,Fimepinostat,D5-2,D5,-1,2883.0,414.661469,...,,,,,,PI3K and HDAC inhibitor,Small Molecule,Investigational,NF0014,T1


### Normalize the single-cell profiles

In [6]:
# Metadata columns: anything whose name contains "Metadata", plus the cell
# center coordinates, which are kept out of the feature set.
sc_metadata_columns = [
    column for column in sc_annotated_profiles.columns if "Metadata" in column
] + [
    "Area.Size.Shape_Cell_CENTER.X",
    "Area.Size.Shape_Cell_CENTER.Y",
    "Area.Size.Shape_Cell_CENTER.Z",
]

# Feature columns: every remaining column, in the dataframe's original order.
sc_features_columns = sc_annotated_profiles.columns[
    ~sc_annotated_profiles.columns.isin(sc_metadata_columns)
].tolist()

In [7]:
# turn +/- infinity in the feature columns into missing values (NaN)
sc_annotated_profiles[sc_features_columns] = sc_annotated_profiles[
    sc_features_columns
].replace(to_replace=[np.inf, -np.inf], value=np.nan)

In [8]:
# Standardize the single-cell features, using the rows selected by the DMSO
# query as the reference samples; metadata columns are carried through.
# NOTE(review): the recorded output of this cell shows RuntimeWarnings and
# NaN-only `_y` feature columns -- presumably features with no finite values;
# confirm whether they should be removed before this step.
sc_normalized_profiles = normalize(
    sc_annotated_profiles,
    samples="Metadata_treatment == 'DMSO'",
    method="standardize",
    meta_features=sc_metadata_columns,
    features=sc_features_columns,
)

# persist the normalized profiles (without the index) and preview them
sc_normalized_profiles.to_parquet(sc_normalized_output_path, index=False)
sc_normalized_profiles.head()

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


Unnamed: 0,Metadata_patient_tumor,Metadata_object_id,Metadata_unit,Metadata_dose,Metadata_treatment,Metadata_image_set,Metadata_Well,Metadata_parent_organoid,Metadata_Target,Metadata_Class,...,Intensity_Cytoplasm_Mito_MAX.Z_y,Intensity_Cytoplasm_Mito_MEAN.INTENSITY_y,Intensity_Cytoplasm_Mito_MEAN.INTENSITY.EDGE_y,Intensity_Cytoplasm_Mito_MEDIAN.INTENSITY_y,Intensity_Cytoplasm_Mito_MIN.INTENSITY_y,Intensity_Cytoplasm_Mito_MIN.INTENSITY.EDGE_y,Intensity_Cytoplasm_Mito_STD.INTENSITY_y,Intensity_Cytoplasm_Mito_STD.INTENSITY.EDGE_y,Intensity_Cytoplasm_Mito_UPPER.QUARTILE.INTENSITY_y,Intensity_Cytoplasm_Mito_VOLUME_y
0,NF0014_T1,255,uM,1,Fimepinostat,E5-2,E5,19,PI3K and HDAC inhibitor,Small Molecule,...,,,,,,,,,,
1,NF0014_T1,25,uM,1,Fimepinostat,D5-2,D5,-1,PI3K and HDAC inhibitor,Small Molecule,...,,,,,,,,,,
2,NF0014_T1,51,uM,1,Fimepinostat,D5-2,D5,-1,PI3K and HDAC inhibitor,Small Molecule,...,,,,,,,,,,
3,NF0014_T1,63,uM,1,Fimepinostat,D5-2,D5,-1,PI3K and HDAC inhibitor,Small Molecule,...,,,,,,,,,,
4,NF0014_T1,76,uM,1,Fimepinostat,D5-2,D5,-1,PI3K and HDAC inhibitor,Small Molecule,...,,,,,,,,,,


### Normalize the organoid profiles

In [9]:
# preview the first rows of the annotated organoid profiles
organoid_annotated_profiles.head()

Unnamed: 0,Metadata_patient_tumor,Metadata_object_id,Metadata_unit,Metadata_dose,Metadata_treatment,Metadata_image_set,Metadata_Well,Metadata_single_cell_count,Area.Size.Shape_Organoid_VOLUME,Area.Size.Shape_Organoid_CENTER.X,...,Texture_Organoid_Mito_Inverse.Difference.Moment_256.3,Texture_Organoid_Mito_Sum.Average_256.3,Texture_Organoid_Mito_Sum.Entropy_256.3,Texture_Organoid_Mito_Sum.Variance_256.3,Texture_Organoid_Mito_Variance_256.3,Metadata_Target,Metadata_Class,Metadata_Therapeutic_Categories,Metadata_patient,Metadata_tumor
0,NF0014_T1,19,uM,1,Fimepinostat,E5-2,E5,1.0,3268026.0,938.570068,...,0.888459,8.625683,1.346913,702.361139,188.51203,PI3K and HDAC inhibitor,Small Molecule,Investigational,NF0014,T1
1,NF0014_T1,15,uM,1,Fimepinostat,D5-2,D5,11.0,5570203.0,950.582825,...,0.853993,7.177906,1.556027,297.955168,85.399048,PI3K and HDAC inhibitor,Small Molecule,Investigational,NF0014,T1
2,NF0014_T1,47,uM,1,Ketotifen,G6-1,G6,9.0,6134128.0,795.807983,...,0.875041,13.244656,1.343332,1153.970316,303.579081,histamine H1 receptor antagonist,Small Molecule,Anti-Allergic Agents,NF0014,T1
3,NF0014_T1,40,uM,10,Mirdametinib,G8-1,G8,8.0,6416256.0,769.689636,...,0.943387,5.15355,0.835848,479.284825,121.910734,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor,NF0014,T1
4,NF0014_T1,37,uM,10,Mirdametinib,C9-2,C9,13.0,14239851.0,817.411133,...,0.818021,12.778783,2.293907,971.36421,258.637076,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor,NF0014,T1


In [10]:
# Metadata columns: anything whose name contains "Metadata", plus the organoid
# center coordinates, which are kept out of the feature set.
organoid_metadata_columns = [
    x for x in organoid_annotated_profiles.columns if "Metadata" in x
]
organoid_metadata_columns += [
    "Area.Size.Shape_Organoid_CENTER.X",
    "Area.Size.Shape_Organoid_CENTER.Y",
    "Area.Size.Shape_Organoid_CENTER.Z",
]
# Feature columns: every remaining column, in the dataframe's original order.
organoid_features_columns = [
    col
    for col in organoid_annotated_profiles.columns
    if col not in organoid_metadata_columns
]
# find inf values and replace with NaN, mirroring the single-cell
# preprocessing: standardization rejects infinite values, so a single inf
# would otherwise abort the normalization
organoid_annotated_profiles[organoid_features_columns] = organoid_annotated_profiles[
    organoid_features_columns
].replace([float("inf"), -float("inf")], np.nan)
# normalize the data, using the rows selected by the DMSO query as reference
organoid_normalized_profiles = normalize(
    organoid_annotated_profiles,
    features=organoid_features_columns,
    meta_features=organoid_metadata_columns,
    method="standardize",
    samples="Metadata_treatment == 'DMSO'",
)
# persist the normalized profiles (without the index) and preview them
organoid_normalized_profiles.to_parquet(organoid_normalized_output_path, index=False)
organoid_normalized_profiles.head()

Unnamed: 0,Metadata_patient_tumor,Metadata_object_id,Metadata_unit,Metadata_dose,Metadata_treatment,Metadata_image_set,Metadata_Well,Metadata_single_cell_count,Metadata_Target,Metadata_Class,...,Texture_Organoid_Mito_Difference.Entropy_256.3,Texture_Organoid_Mito_Difference.Variance_256.3,Texture_Organoid_Mito_Entropy_256.3,Texture_Organoid_Mito_Information.Measure.of.Correlation.1_256.3,Texture_Organoid_Mito_Information.Measure.of.Correlation.2_256.3,Texture_Organoid_Mito_Inverse.Difference.Moment_256.3,Texture_Organoid_Mito_Sum.Average_256.3,Texture_Organoid_Mito_Sum.Entropy_256.3,Texture_Organoid_Mito_Sum.Variance_256.3,Texture_Organoid_Mito_Variance_256.3
0,NF0014_T1,19,uM,1,Fimepinostat,E5-2,E5,1.0,PI3K and HDAC inhibitor,Small Molecule,...,-0.744079,0.929269,-0.876963,-0.149888,-0.453953,0.924721,-1.012954,-0.840821,-0.826647,-0.812516
1,NF0014_T1,15,uM,1,Fimepinostat,D5-2,D5,11.0,PI3K and HDAC inhibitor,Small Molecule,...,-0.018758,0.263903,-0.497285,1.297494,-1.614111,0.248123,-1.166291,-0.38611,-1.094204,-1.08214
2,NF0014_T1,47,uM,1,Ketotifen,G6-1,G6,9.0,histamine H1 receptor antagonist,Small Molecule,...,-0.825789,0.752942,-0.748346,0.050014,-0.465444,0.661318,-0.523751,-0.848608,-0.527859,-0.511635
3,NF0014_T1,40,uM,10,Mirdametinib,G8-1,G8,8.0,MEK1/2 inhibitor,Small Molecule,...,-1.928917,1.998597,-1.865807,-0.72397,-1.48379,2.00297,-1.380694,-1.952115,-0.974235,-0.986668
4,NF0014_T1,37,uM,10,Mirdametinib,C9-2,C9,13.0,MEK1/2 inhibitor,Small Molecule,...,1.306989,-0.62237,1.532605,0.813374,0.531467,-0.458024,-0.573092,1.218386,-0.648673,-0.629151
