This notebook performs profile normalization.
All profiles are normalized to the DMSO control treated profiles.

In [1]:
import argparse
import pathlib

import pandas as pd
from pycytominer import normalize

try:
    cfg = get_ipython().config
    in_notebook = True
except NameError:
    in_notebook = False

In [2]:
if not in_notebook:
    argparser = argparse.ArgumentParser()
    argparser.add_argument(
        "--patient",
        type=str,
        required=True,
        help="Patient ID to process, e.g. 'P01'",
    )
    args = argparser.parse_args()
    patient = args.patient

else:
    patient = "NF0014"

In [3]:
# pathing
sc_annotated_path = pathlib.Path(
    f"../../data/{patient}/image_based_profiles/1.sc_annotated_profiles.parquet"
).resolve(strict=True)
organoid_annotated_path = pathlib.Path(
    f"../../data/{patient}/image_based_profiles/1.organoid_annotated_profiles.parquet"
).resolve(strict=True)


# output path
sc_normalized_output_path = pathlib.Path(
    f"../../data/{patient}/image_based_profiles/2.sc_normalized_profiles.parquet"
).resolve()
organoid_normalized_output_path = pathlib.Path(
    f"../../data/{patient}/image_based_profiles/2.organoid_normalized_profiles.parquet"
).resolve()

In [4]:
# read in the data
sc_annotated_profiles = pd.read_parquet(sc_annotated_path)
organoid_annotated_profiles = pd.read_parquet(organoid_annotated_path)

In [5]:
sc_annotated_profiles.head()

Unnamed: 0,patient,object_id,unit,dose,treatment,image_set,Well,parent_organoid,Area.Size.Shape_Nuclei_VOLUME,Area.Size.Shape_Nuclei_CENTER.X,...,Texture_Cytoplasm_Mito_Difference.Entropy_256.1,Texture_Cytoplasm_Mito_Difference.Variance_256.1,Texture_Cytoplasm_Mito_Entropy_256.1,Texture_Cytoplasm_Mito_Information.Measure.of.Correlation.1_256.1,Texture_Cytoplasm_Mito_Information.Measure.of.Correlation.2_256.1,Texture_Cytoplasm_Mito_Inverse.Difference.Moment_256.1,Texture_Cytoplasm_Mito_Sum.Average_256.1,Texture_Cytoplasm_Mito_Sum.Entropy_256.1,Texture_Cytoplasm_Mito_Sum.Variance_256.1,Texture_Cytoplasm_Mito_Variance_256.1
0,NF0014,15,%,1,DMSO,C4-2,C4,32,99661.0,473.778268,...,0.051024,0.003861,0.070853,-0.59996,0.242368,0.996241,0.699428,0.063534,121.523028,32.474614
1,NF0014,26,%,1,DMSO,C4-2,C4,32,156362.0,715.339418,...,0.061713,0.003853,0.091123,-0.59905,0.273598,0.995213,0.716101,0.078503,99.806008,26.505418
2,NF0014,37,%,1,DMSO,C4-2,C4,32,84453.0,503.486353,...,0.030409,0.003874,0.042813,-0.607538,0.191419,0.998169,0.093143,0.037626,3.992738,1.070541
3,NF0014,43,%,1,DMSO,C4-2,C4,32,131041.0,693.588457,...,0.051189,0.00386,0.071099,-0.587976,0.239543,0.996087,0.859732,0.06178,175.059103,46.919265
4,NF0014,51,%,1,DMSO,C4-2,C4,32,69045.0,399.909088,...,0.043201,0.003866,0.061412,-0.619343,0.231458,0.996829,0.5343,0.0539,83.157453,22.02088


In [44]:
[x for x in sc_annotated_profiles.columns if "image" in x]

['image_set', 'image_set_1', 'image_set_2']

In [6]:
metadata_columns = [
    "patient",
    "object_id",
    "unit",
    "dose",
    "treatment",
    "image_set",
    "Well",
    "parent_organoid",
]
features_columns = [
    col for col in sc_annotated_profiles.columns if col not in metadata_columns
]

In [7]:
metadata_columns

['patient',
 'object_id',
 'unit',
 'dose',
 'treatment',
 'image_set',
 'Well',
 'parent_organoid']

In [8]:
print(features_columns)

['Area.Size.Shape_Nuclei_VOLUME', 'Area.Size.Shape_Nuclei_CENTER.X', 'Area.Size.Shape_Nuclei_CENTER.Y', 'Area.Size.Shape_Nuclei_CENTER.Z', 'Area.Size.Shape_Nuclei_BBOX.VOLUME', 'Area.Size.Shape_Nuclei_MIN.X', 'Area.Size.Shape_Nuclei_MAX.X', 'Area.Size.Shape_Nuclei_MIN.Y', 'Area.Size.Shape_Nuclei_MAX.Y', 'Area.Size.Shape_Nuclei_MIN.Z', 'Area.Size.Shape_Nuclei_MAX.Z', 'Area.Size.Shape_Nuclei_EXTENT', 'Area.Size.Shape_Nuclei_EULER.NUMBER', 'Area.Size.Shape_Nuclei_EQUIVALENT.DIAMETER', 'Area.Size.Shape_Nuclei_SURFACE.AREA', 'Colocalization_Nuclei_AGP.BF_MEAN.CORRELATION.COEFF', 'Colocalization_Nuclei_AGP.BF_MEDIAN.CORRELATION.COEFF', 'Colocalization_Nuclei_AGP.BF_MIN.CORRELATION.COEFF', 'Colocalization_Nuclei_AGP.BF_MAX.CORRELATION.COEFF', 'Colocalization_Nuclei_AGP.BF_MEAN.MANDERS.COEFF.M1', 'Colocalization_Nuclei_AGP.BF_MEDIAN.MANDERS.COEFF.M1', 'Colocalization_Nuclei_AGP.BF_MIN.MANDERS.COEFF.M1', 'Colocalization_Nuclei_AGP.BF_MAX.MANDERS.COEFF.M1', 'Colocalization_Nuclei_AGP.BF_MEAN.MAN

In [9]:
# normalize the data
sc_normalized_profiles = normalize(
    sc_annotated_profiles,
    features=features_columns,
    meta_features=metadata_columns,
    method="standardize",
    samples="treatment == 'DMSO'",
)

ValueError: could not convert string to float: 'C4-2'

In [19]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

In [20]:
samples = "treatment == 'DMSO'"

In [31]:
sc_annotated_profiles.query(samples).loc[:, features_columns]

Unnamed: 0,Area.Size.Shape_Nuclei_VOLUME,Area.Size.Shape_Nuclei_CENTER.X,Area.Size.Shape_Nuclei_CENTER.Y,Area.Size.Shape_Nuclei_CENTER.Z,Area.Size.Shape_Nuclei_BBOX.VOLUME,Area.Size.Shape_Nuclei_MIN.X,Area.Size.Shape_Nuclei_MAX.X,Area.Size.Shape_Nuclei_MIN.Y,Area.Size.Shape_Nuclei_MAX.Y,Area.Size.Shape_Nuclei_MIN.Z,...,Texture_Cytoplasm_Mito_Difference.Entropy_256.1,Texture_Cytoplasm_Mito_Difference.Variance_256.1,Texture_Cytoplasm_Mito_Entropy_256.1,Texture_Cytoplasm_Mito_Information.Measure.of.Correlation.1_256.1,Texture_Cytoplasm_Mito_Information.Measure.of.Correlation.2_256.1,Texture_Cytoplasm_Mito_Inverse.Difference.Moment_256.1,Texture_Cytoplasm_Mito_Sum.Average_256.1,Texture_Cytoplasm_Mito_Sum.Entropy_256.1,Texture_Cytoplasm_Mito_Sum.Variance_256.1,Texture_Cytoplasm_Mito_Variance_256.1
0,99661.0,473.778268,746.900453,10.934618,142417.0,421,528,687,808,6,...,0.051024,0.003861,0.070853,-0.59996,0.242368,0.996241,0.699428,0.063534,121.523028,32.474614
1,156362.0,715.339418,224.483033,7.888688,256768.0,652,780,165,283,0,...,0.061713,0.003853,0.091123,-0.59905,0.273598,0.995213,0.716101,0.078503,99.806008,26.505418
2,84453.0,503.486353,253.488615,4.147455,130980.0,453,564,195,313,0,...,0.030409,0.003874,0.042813,-0.607538,0.191419,0.998169,0.093143,0.037626,3.992738,1.070541
3,131041.0,693.588457,424.839974,19.420853,334530.0,639,757,369,474,6,...,0.051189,0.00386,0.071099,-0.587976,0.239543,0.996087,0.859732,0.06178,175.059103,46.919265
4,69045.0,399.909088,694.579159,5.122876,105984.0,355,447,649,745,0,...,0.043201,0.003866,0.061412,-0.619343,0.231458,0.996829,0.5343,0.0539,83.157453,22.02088
5,86784.0,742.615724,386.904775,5.289858,130200.0,671,811,343,436,1,...,0.030958,0.003874,0.041485,-0.59946,0.186677,0.997857,0.495259,0.037377,107.698922,28.89957
6,103190.0,913.824343,419.788274,15.276907,160888.0,860,964,358,477,9,...,0.103144,0.003824,0.152031,-0.61663,0.355819,0.992648,0.420117,0.132571,18.670762,4.874049
7,76429.0,506.448168,557.908909,4.353884,106784.0,458,552,488,630,1,...,0.027901,0.003875,0.037989,-0.580134,0.174671,0.998338,0.065873,0.033324,2.148029,0.587854
8,39975.0,1104.534159,673.237273,3.934509,57288.0,1074,1136,605,737,1,...,0.012742,0.003885,0.016609,-0.58509,0.11666,0.999179,0.179568,0.015166,37.330151,10.29696
9,44843.0,1239.7706,1430.440448,1.992485,60522.0,1174,1305,1354,1508,1,...,0.004986,0.003889,0.005991,-0.383177,0.052841,0.999698,0.040712,0.005591,5.67538,1.794193


In [35]:
import numpy as np

In [None]:
sc_annotated_profiles.loc[:, features_columns]

  is_string = sc_annotated_profiles.loc[:, features_columns].applymap(lambda x: isinstance(x, str))


In [41]:
import numpy as np

# Boolean DataFrame: True where cell equals "C4-2"
mask = sc_annotated_profiles.loc[:, features_columns].applymap(lambda x: x == "C4-2")

# Get (row, column) indices where "C4-2" is found
rows, cols = np.where(mask)
for row, col in zip(rows, cols):
    print(f'"C4-2" found at row {row}, column "{sc_annotated_profiles.columns[col]}"')

"C4-2" found at row 0, column "Texture_Nuclei_Mito_Entropy_256.1"
"C4-2" found at row 0, column "Texture_Cell_Mito_Entropy_256.1"
"C4-2" found at row 1, column "Texture_Nuclei_Mito_Entropy_256.1"
"C4-2" found at row 1, column "Texture_Cell_Mito_Entropy_256.1"
"C4-2" found at row 2, column "Texture_Nuclei_Mito_Entropy_256.1"
"C4-2" found at row 2, column "Texture_Cell_Mito_Entropy_256.1"
"C4-2" found at row 3, column "Texture_Nuclei_Mito_Entropy_256.1"
"C4-2" found at row 3, column "Texture_Cell_Mito_Entropy_256.1"
"C4-2" found at row 4, column "Texture_Nuclei_Mito_Entropy_256.1"
"C4-2" found at row 4, column "Texture_Cell_Mito_Entropy_256.1"
"C4-2" found at row 5, column "Texture_Nuclei_Mito_Entropy_256.1"
"C4-2" found at row 5, column "Texture_Cell_Mito_Entropy_256.1"
"C4-2" found at row 6, column "Texture_Nuclei_Mito_Entropy_256.1"
"C4-2" found at row 6, column "Texture_Cell_Mito_Entropy_256.1"
"C4-2" found at row 7, column "Texture_Nuclei_Mito_Entropy_256.1"
"C4-2" found at row 7, c

  mask = sc_annotated_profiles.loc[:, features_columns].applymap(lambda x: x == "C4-2")


In [32]:
# scaler.fit(sc_annotated_profiles.query(samples).loc[:, features_columns])