This notebook normalizes the annotated image-based profiles at both the single-cell and organoid level.
All profiles are standardized against the DMSO-treated control profiles.

In [1]:
import argparse
import pathlib

import numpy as np
import pandas as pd
from pycytominer import normalize

# Locate the repository root: the nearest directory, starting at the current
# working directory and walking upward, that contains a `.git` directory.
cwd = pathlib.Path.cwd()

root_dir = None
for candidate in (cwd, *cwd.parents):
    if (candidate / ".git").is_dir():
        root_dir = candidate
        break

# Every data path in this notebook is built from the repository root,
# so stop early if it could not be found.
if root_dir is None:
    raise FileNotFoundError("No Git root directory found.")

# `get_ipython` only exists as a name inside an IPython/Jupyter session; a
# NameError therefore means the code is running as a plain Python script.
try:
    cfg = get_ipython().config
except NameError:
    in_notebook = False
else:
    in_notebook = True

In [2]:
# Select the patient to process:
#   - inside a notebook session, fall back to a fixed default patient;
#   - as a script, require the ID on the command line via --patient.
if in_notebook:
    patient = "NF0014"
else:
    argparser = argparse.ArgumentParser(
        description="Normalize single-cell and organoid profiles for one patient."
    )
    argparser.add_argument(
        "--patient",
        type=str,
        required=True,
        # Example matches the ID format used by the notebook default above.
        help="Patient ID to process, e.g. 'NF0014'",
    )
    args = argparser.parse_args()
    patient = args.patient

In [3]:
# All inputs and outputs live under <root_dir>/data/<patient>/image_based_profiles
profiles_dir = pathlib.Path(f"{root_dir}/data/{patient}/image_based_profiles")
annotated_dir = profiles_dir / "2.annotated_profiles"
normalized_dir = profiles_dir / "3.normalized_profiles"

# Input files: strict resolution fails immediately if a file is missing
sc_annotated_path = (annotated_dir / "sc_anno.parquet").resolve(strict=True)
organoid_annotated_path = (annotated_dir / "organoid_anno.parquet").resolve(
    strict=True
)

# Output files: these may not exist yet, so resolve without the strict check
sc_normalized_output_path = (normalized_dir / "sc_norm.parquet").resolve()
organoid_normalized_output_path = (normalized_dir / "organoid_norm.parquet").resolve()

# Both outputs are written to the same directory; make sure it exists
organoid_normalized_output_path.parent.mkdir(parents=True, exist_ok=True)

In [4]:
# Load the annotated single-cell and organoid profiles (in that order)
sc_annotated_profiles, organoid_annotated_profiles = map(
    pd.read_parquet, (sc_annotated_path, organoid_annotated_path)
)

### Normalize the single-cell profiles

In [6]:
# Columns treated as metadata for the single-cell profiles; they are passed to
# `normalize` as `meta_features` and are excluded from the feature list below.
sc_metadata_columns = [
    "patient",
    "object_id",
    "unit",
    "dose",
    "treatment",
    "Target",
    "Class",
    "Therapeutic Categories",
    "image_set",
    "Well",
    "parent_organoid",
]

# Every remaining column is treated as a feature; the frame's column order is kept.
_sc_metadata_lookup = frozenset(sc_metadata_columns)
sc_features_columns = [
    column
    for column in sc_annotated_profiles.columns
    if column not in _sc_metadata_lookup
]

In [7]:
# Swap +inf / -inf for NaN in the feature columns only (metadata is untouched).
# NOTE: this overwrites the feature columns of `sc_annotated_profiles` in place.
sc_annotated_profiles[sc_features_columns] = sc_annotated_profiles[
    sc_features_columns
].replace(to_replace=[np.inf, -np.inf], value=np.nan)

In [8]:
# Standardize the single-cell features, using the rows where
# treatment == 'DMSO' as the reference samples for the normalization.
sc_normalized_profiles = normalize(
    sc_annotated_profiles,
    samples="treatment == 'DMSO'",
    method="standardize",
    meta_features=sc_metadata_columns,
    features=sc_features_columns,
)

# Save the normalized profiles (without the index), then preview the first rows
sc_normalized_profiles.to_parquet(sc_normalized_output_path, index=False)
sc_normalized_profiles.head()

Unnamed: 0,patient,object_id,unit,dose,treatment,Target,Class,Therapeutic Categories,image_set,Well,...,Texture_Cytoplasm_Mito_Difference.Entropy_256.3,Texture_Cytoplasm_Mito_Difference.Variance_256.3,Texture_Cytoplasm_Mito_Entropy_256.3,Texture_Cytoplasm_Mito_Information.Measure.of.Correlation.1_256.3,Texture_Cytoplasm_Mito_Information.Measure.of.Correlation.2_256.3,Texture_Cytoplasm_Mito_Inverse.Difference.Moment_256.3,Texture_Cytoplasm_Mito_Sum.Average_256.3,Texture_Cytoplasm_Mito_Sum.Entropy_256.3,Texture_Cytoplasm_Mito_Sum.Variance_256.3,Texture_Cytoplasm_Mito_Variance_256.3
0,NF0014,25,uM,10,Mirdametinib,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor,G8-1,G8,...,0.076564,0.020575,0.033899,0.37894,0.274566,0.024535,0.009685,0.034226,0.132669,0.157663
1,NF0014,51,uM,10,Mirdametinib,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor,G8-1,G8,...,0.143096,-0.020886,0.093548,-0.170211,0.573598,-0.01253,0.078748,0.097408,0.197211,0.205372
2,NF0014,76,uM,10,Mirdametinib,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor,G8-1,G8,...,0.016383,0.054972,-0.007522,0.174039,0.198937,0.057107,-0.067183,-0.011351,-0.025754,-0.015079
3,NF0014,102,uM,10,Mirdametinib,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor,G8-1,G8,...,-0.228797,0.18067,-0.19331,0.601983,-0.54528,0.170874,-0.320329,-0.21069,-0.502716,-0.508395
4,NF0014,127,uM,10,Mirdametinib,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor,G8-1,G8,...,-0.117688,0.123971,-0.108999,-0.14889,-0.076024,0.120146,-0.181596,-0.118077,-0.224879,-0.235531


### Normalize the organoid profiles

In [9]:
# Preview the annotated organoid profiles before normalization
organoid_annotated_profiles.head()

Unnamed: 0,patient,object_id,unit,dose,treatment,image_set,Well,single_cell_count,Area.Size.Shape_Organoid_VOLUME,Area.Size.Shape_Organoid_CENTER.X,...,Texture_Organoid_Mito_Information.Measure.of.Correlation.1_256.3,Texture_Organoid_Mito_Information.Measure.of.Correlation.2_256.3,Texture_Organoid_Mito_Inverse.Difference.Moment_256.3,Texture_Organoid_Mito_Sum.Average_256.3,Texture_Organoid_Mito_Sum.Entropy_256.3,Texture_Organoid_Mito_Sum.Variance_256.3,Texture_Organoid_Mito_Variance_256.3,Target,Class,Therapeutic Categories
0,NF0014,40,uM,10,Mirdametinib,G8-1,G8,8.0,6416256.0,769.689636,...,-0.538078,0.720674,0.943387,5.15355,0.835848,479.284825,121.910734,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor
1,NF0014,24,uM,1,Fimepinostat,D5-1,D5,5.0,3727848.0,904.163208,...,-0.198021,0.423806,0.792245,4.166492,1.731223,266.549623,92.589375,PI3K and HDAC inhibitor,Small Molecule,Investigational
2,NF0014,85,uM,1,Mirdametinib,F8-1,F8,9.0,19925580.0,609.403259,...,-0.416777,0.919123,0.758222,9.48062,2.757967,167.525981,43.794476,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor
3,NF0014,18,nM,10,Staurosporine,G11-1,G11,1.0,2721536.0,985.260254,...,-0.577786,0.694415,0.947585,6.919087,0.700396,862.401508,221.828718,Apoptosis induction,Small Molecule,Experimental
4,NF0014,24,uM,1,Binimetinib,G7-1,G7,7.0,6343511.0,925.20459,...,-0.525013,0.819592,0.90309,14.716854,1.289324,2075.41599,527.099865,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor


In [10]:
# Columns treated as metadata for the organoid profiles; they are passed to
# `normalize` as `meta_features` and are excluded from the feature list below.
organoid_metadata_columns = [
    "patient",
    "object_id",
    "unit",
    "dose",
    "treatment",
    "Target",
    "Class",
    "Therapeutic Categories",
    "image_set",
    "Well",
    "single_cell_count",
]
# Every remaining column is treated as a feature; column order is preserved.
organoid_features_columns = [
    col
    for col in organoid_annotated_profiles.columns
    if col not in organoid_metadata_columns
]

# Replace +inf / -inf with NaN in the feature columns, mirroring the
# preprocessing applied to the single-cell profiles before normalization.
# This is a no-op when the features contain no infinities. The replacement is
# done on a copy so `organoid_annotated_profiles` (previewed earlier) is not
# mutated and this cell stays safe to re-run.
organoid_finite_profiles = organoid_annotated_profiles.copy()
organoid_finite_profiles[organoid_features_columns] = organoid_finite_profiles[
    organoid_features_columns
].replace([np.inf, -np.inf], np.nan)

# Standardize the organoid features, using the rows where
# treatment == 'DMSO' as the reference samples for the normalization.
organoid_normalized_profiles = normalize(
    organoid_finite_profiles,
    features=organoid_features_columns,
    meta_features=organoid_metadata_columns,
    method="standardize",
    samples="treatment == 'DMSO'",
)

# Save the normalized profiles (without the index), then preview the first rows
organoid_normalized_profiles.to_parquet(organoid_normalized_output_path, index=False)
organoid_normalized_profiles.head()

Unnamed: 0,patient,object_id,unit,dose,treatment,Target,Class,Therapeutic Categories,image_set,Well,...,Texture_Organoid_Mito_Difference.Entropy_256.3,Texture_Organoid_Mito_Difference.Variance_256.3,Texture_Organoid_Mito_Entropy_256.3,Texture_Organoid_Mito_Information.Measure.of.Correlation.1_256.3,Texture_Organoid_Mito_Information.Measure.of.Correlation.2_256.3,Texture_Organoid_Mito_Inverse.Difference.Moment_256.3,Texture_Organoid_Mito_Sum.Average_256.3,Texture_Organoid_Mito_Sum.Entropy_256.3,Texture_Organoid_Mito_Sum.Variance_256.3,Texture_Organoid_Mito_Variance_256.3
0,NF0014,40,uM,10,Mirdametinib,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor,G8-1,G8,...,-1.928917,1.998597,-1.865807,-0.72397,-1.48379,2.00297,-1.380694,-1.952115,-0.974235,-0.986668
1,NF0014,24,uM,1,Fimepinostat,PI3K and HDAC inhibitor,Small Molecule,Investigational,D5-1,D5,...,0.968154,-0.821376,-0.079639,2.964748,-5.489025,-0.964006,-1.485235,-0.005153,-1.114982,-1.063338
2,NF0014,85,uM,1,Mirdametinib,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor,F8-1,F8,...,1.718297,-2.243469,2.413318,0.591827,1.193602,-1.631894,-0.922406,2.227469,-1.180497,-1.190929
3,NF0014,18,nM,10,Staurosporine,Apoptosis induction,Small Molecule,Experimental,G11-1,G11,...,-2.242927,2.173837,-2.179663,-1.154694,-1.838068,2.085389,-1.193703,-2.246652,-0.720763,-0.725399
4,NF0014,24,uM,1,Binimetinib,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor,G7-1,G7,...,-0.950405,1.237115,-0.916919,-0.582249,-0.149224,1.211934,-0.367827,-0.966048,0.081775,0.072836
