This notebook performs profile normalization.
All profiles are normalized to the DMSO control treated profiles.

In [1]:
import argparse
import pathlib

import numpy as np
import pandas as pd
from pycytominer import normalize

# Get the current working directory
cwd = pathlib.Path.cwd()

if (cwd / ".git").is_dir():
    root_dir = cwd

else:
    root_dir = None
    for parent in cwd.parents:
        if (parent / ".git").is_dir():
            root_dir = parent
            break

# Check if a Git root directory was found
if root_dir is None:
    raise FileNotFoundError("No Git root directory found.")
try:
    cfg = get_ipython().config
    in_notebook = True
except NameError:
    in_notebook = False

In [2]:
if not in_notebook:
    argparser = argparse.ArgumentParser()
    argparser.add_argument(
        "--patient",
        type=str,
        required=True,
        help="Patient ID to process, e.g. 'P01'",
    )
    args = argparser.parse_args()
    patient = args.patient

else:
    patient = "NF0014"

In [3]:
# pathing
sc_annotated_path = pathlib.Path(
    f"{root_dir}/data/{patient}/image_based_profiles/2.annotated_profiles/sc_anno.parquet"
).resolve(strict=True)
organoid_annotated_path = pathlib.Path(
    f"{root_dir}/data/{patient}/image_based_profiles/2.annotated_profiles/organoid_anno.parquet"
).resolve(strict=True)


# output path
sc_normalized_output_path = pathlib.Path(
    f"{root_dir}/data/{patient}/image_based_profiles/3.normalized_profiles/sc_norm.parquet"
).resolve()
organoid_normalized_output_path = pathlib.Path(
    f"{root_dir}/data/{patient}/image_based_profiles/3.normalized_profiles/organoid_norm.parquet"
).resolve()

organoid_normalized_output_path.parent.mkdir(parents=True, exist_ok=True)

In [4]:
# read in the data
sc_annotated_profiles = pd.read_parquet(sc_annotated_path)
organoid_annotated_profiles = pd.read_parquet(organoid_annotated_path)

### Normalize the single-cell profiles

In [5]:
sc_annotated_profiles

Unnamed: 0,patient,object_id,unit,dose,treatment,image_set,Well,parent_organoid,Colocalization_Nuclei_AGP.BF_MEAN.CORRELATION.COEFF,Colocalization_Nuclei_AGP.BF_MEDIAN.CORRELATION.COEFF,...,Area.Size.Shape_Cytoplasm_MAX.Y,Area.Size.Shape_Cytoplasm_MIN.Z,Area.Size.Shape_Cytoplasm_MAX.Z,Area.Size.Shape_Cytoplasm_EXTENT,Area.Size.Shape_Cytoplasm_EULER.NUMBER,Area.Size.Shape_Cytoplasm_EQUIVALENT.DIAMETER,Area.Size.Shape_Cytoplasm_SURFACE.AREA,Target,Class,Therapeutic Categories
0,NF0014,25,uM,10,Mirdametinib,G8-1,G8,40,-0.039767,-0.039767,...,909.0,0.0,29.0,0.100018,-44.0,86.478996,8537.206055,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor
1,NF0014,51,uM,10,Mirdametinib,G8-1,G8,40,0.028712,0.028712,...,916.0,0.0,41.0,0.131350,-54.0,90.847000,7764.473145,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor
2,NF0014,76,uM,10,Mirdametinib,G8-1,G8,40,-0.027481,-0.027481,...,762.0,0.0,41.0,0.162104,-36.0,81.312607,3883.599609,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor
3,NF0014,102,uM,10,Mirdametinib,G8-1,G8,40,-0.012504,-0.012504,...,872.0,0.0,39.0,0.159934,-37.0,62.487492,1958.485352,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor
4,NF0014,127,uM,10,Mirdametinib,G8-1,G8,40,0.063136,0.063136,...,786.0,0.0,41.0,0.211394,-55.0,73.653473,3029.156738,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1396,NF0014,36,uM,1,Trametinib,C10-1,C10,-1,-0.046983,-0.046983,...,1158.0,0.0,18.0,0.102583,-7.0,48.076462,1402.699829,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor
1397,NF0014,72,uM,1,Trametinib,C10-1,C10,44,0.123355,0.123355,...,1117.0,0.0,31.0,0.109106,-25.0,84.423706,5054.512695,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor
1398,NF0014,145,uM,1,Trametinib,C10-1,C10,44,0.018397,0.018397,...,1040.0,0.0,33.0,0.140407,-29.0,69.789490,1238.590454,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor
1399,NF0014,182,uM,1,Trametinib,C10-1,C10,44,0.029158,0.029158,...,1142.0,0.0,45.0,0.243124,-46.0,148.760895,12245.774414,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor


In [6]:
sc_metadata_columns = [
    "patient",
    "object_id",
    "unit",
    "dose",
    "treatment",
    "Target",
    "Class",
    "Therapeutic Categories",
    "image_set",
    "Well",
    "parent_organoid",
]
sc_features_columns = [
    col for col in sc_annotated_profiles.columns if col not in sc_metadata_columns
]

In [7]:
# find inf values and replace with NaN
sc_annotated_profiles[sc_features_columns] = sc_annotated_profiles[
    sc_features_columns
].replace([float("inf"), -float("inf")], np.nan)

In [8]:
# normalize the data
sc_normalized_profiles = normalize(
    sc_annotated_profiles,
    features=sc_features_columns,
    meta_features=sc_metadata_columns,
    method="standardize",
    samples="treatment == 'DMSO'",
)
sc_normalized_profiles.to_parquet(sc_normalized_output_path, index=False)
sc_normalized_profiles.head()

Unnamed: 0,patient,object_id,unit,dose,treatment,Target,Class,Therapeutic Categories,image_set,Well,...,Area.Size.Shape_Cytoplasm_MIN.X,Area.Size.Shape_Cytoplasm_MAX.X,Area.Size.Shape_Cytoplasm_MIN.Y,Area.Size.Shape_Cytoplasm_MAX.Y,Area.Size.Shape_Cytoplasm_MIN.Z,Area.Size.Shape_Cytoplasm_MAX.Z,Area.Size.Shape_Cytoplasm_EXTENT,Area.Size.Shape_Cytoplasm_EULER.NUMBER,Area.Size.Shape_Cytoplasm_EQUIVALENT.DIAMETER,Area.Size.Shape_Cytoplasm_SURFACE.AREA
0,NF0014,25,uM,10,Mirdametinib,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor,G8-1,G8,...,-0.368271,0.087323,-0.472441,0.024274,-0.733567,-1.092632,-0.965185,-0.394215,0.160842,0.942953
1,NF0014,51,uM,10,Mirdametinib,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor,G8-1,G8,...,-0.314821,0.005405,-0.035168,0.048724,-0.733567,-0.102714,-0.529325,-0.810092,0.301193,0.751769
2,NF0014,76,uM,10,Mirdametinib,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor,G8-1,G8,...,0.053986,-0.184031,-0.500205,-0.489164,-0.733567,-0.102714,-0.101508,-0.061513,-0.005162,-0.208408
3,NF0014,102,uM,10,Mirdametinib,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor,G8-1,G8,...,0.134162,-0.24547,0.232054,-0.104958,-0.733567,-0.267701,-0.131695,-0.103101,-0.610043,-0.684706
4,NF0014,127,uM,10,Mirdametinib,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor,G8-1,G8,...,-0.17585,-0.568023,-0.167044,-0.405337,-0.733567,-0.102714,0.584169,-0.85168,-0.251262,-0.419808


### Normalize the organoid profiles

In [9]:
organoid_annotated_profiles.head()

Unnamed: 0,patient,object_id,unit,dose,treatment,image_set,Well,single_cell_count,Colocalization_Organoid_AGP.BF_MEAN.CORRELATION.COEFF,Colocalization_Organoid_AGP.BF_MEDIAN.CORRELATION.COEFF,...,Area.Size.Shape_Organoid_MAX.Y,Area.Size.Shape_Organoid_MIN.Z,Area.Size.Shape_Organoid_MAX.Z,Area.Size.Shape_Organoid_EXTENT,Area.Size.Shape_Organoid_EULER.NUMBER,Area.Size.Shape_Organoid_EQUIVALENT.DIAMETER,Area.Size.Shape_Organoid_SURFACE.AREA,Target,Class,Therapeutic Categories
0,NF0014,40,uM,10,Mirdametinib,G8-1,G8,8.0,-0.047442,-0.047442,...,1162.0,0.0,41.0,0.267881,1.0,230.547821,9698.893555,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor
1,NF0014,24,uM,1,Fimepinostat,D5-1,D5,5.0,0.034538,0.034538,...,1518.0,0.0,11.0,0.144968,5.0,192.377014,33464.886719,PI3K and HDAC inhibitor,Small Molecule,Investigational
2,NF0014,85,uM,1,Mirdametinib,F8-1,F8,9.0,0.01887,0.01887,...,1537.0,0.0,23.0,0.418138,14.0,336.359833,24271.376953,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor
3,NF0014,18,nM,10,Staurosporine,G11-1,G11,1.0,0.005859,0.005859,...,1085.0,0.0,19.0,0.557427,1.0,173.222839,3804.474365,Apoptosis induction,Small Molecule,Experimental
4,NF0014,24,uM,1,Binimetinib,G7-1,G7,7.0,-0.022475,-0.022475,...,1227.0,0.0,25.0,0.462178,1.0,229.673218,6281.933594,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor


In [10]:
organoid_metadata_columns = [
    "patient",
    "object_id",
    "unit",
    "dose",
    "treatment",
    "Target",
    "Class",
    "Therapeutic Categories",
    "image_set",
    "Well",
    "single_cell_count",
]
organoid_features_columns = [
    col
    for col in organoid_annotated_profiles.columns
    if col not in organoid_metadata_columns
]
# normalize the data
organoid_normalized_profiles = normalize(
    organoid_annotated_profiles,
    features=organoid_features_columns,
    meta_features=organoid_metadata_columns,
    method="standardize",
    samples="treatment == 'DMSO'",
)
organoid_normalized_profiles.to_parquet(organoid_normalized_output_path, index=False)
organoid_normalized_profiles.head()

Unnamed: 0,patient,object_id,unit,dose,treatment,Target,Class,Therapeutic Categories,image_set,Well,...,Area.Size.Shape_Organoid_MIN.X,Area.Size.Shape_Organoid_MAX.X,Area.Size.Shape_Organoid_MIN.Y,Area.Size.Shape_Organoid_MAX.Y,Area.Size.Shape_Organoid_MIN.Z,Area.Size.Shape_Organoid_MAX.Z,Area.Size.Shape_Organoid_EXTENT,Area.Size.Shape_Organoid_EULER.NUMBER,Area.Size.Shape_Organoid_EQUIVALENT.DIAMETER,Area.Size.Shape_Organoid_SURFACE.AREA
0,NF0014,40,uM,10,Mirdametinib,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor,G8-1,G8,...,0.73947,-0.591772,0.300786,-0.758819,-0.357143,-0.229918,-1.66474,-0.40883,-2.100216,-1.007454
1,NF0014,24,uM,1,Fimepinostat,PI3K and HDAC inhibitor,Small Molecule,Investigational,D5-1,D5,...,-1.476846,2.007526,-1.1774,1.090707,-0.357143,-2.484878,-2.484073,-0.326823,-2.949785,1.041542
2,NF0014,85,uM,1,Mirdametinib,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor,F8-1,F8,...,-0.9736,1.404696,-1.1774,1.189417,-0.357143,-1.582894,-0.663129,-0.142307,0.254844,0.24892
3,NF0014,18,nM,10,Staurosporine,Apoptosis induction,Small Molecule,Experimental,G11-1,G11,...,2.452539,-0.021353,1.14036,-1.158857,-0.357143,-1.883555,0.265372,-0.40883,-3.3761,-1.515644
4,NF0014,24,uM,1,Binimetinib,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor,G7-1,G7,...,1.425714,0.7176,1.118031,-0.421125,-0.357143,-1.432563,-0.369556,-0.40883,-2.119682,-1.302049
