This notebook performs profile normalization.
All profiles are normalized relative to the DMSO-treated control profiles.

In [1]:
import argparse
import pathlib

import pandas as pd
from pycytominer import normalize

# Detect an IPython/Jupyter session: `get_ipython` only exists as a name when
# IPython is running, so a NameError means this is a plain Python script.
try:
    cfg = get_ipython().config
except NameError:
    in_notebook = False
else:
    in_notebook = True

In [2]:
# Choose the patient to process: read it from the command line when executed
# as a script, otherwise fall back to a fixed default for interactive use.
if not in_notebook:
    argparser = argparse.ArgumentParser()
    argparser.add_argument(
        "--patient",
        type=str,
        required=True,
        # the example mirrors the patient ID format used by this dataset
        help="Patient ID to process, e.g. 'NF0014'",
    )
    args = argparser.parse_args()
    patient = args.patient

else:
    # interactive default so the notebook runs without CLI arguments
    patient = "NF0014"

In [3]:
# Input paths: annotated profiles for the selected patient.
# strict=True makes resolution fail if a file does not exist, so a missing
# input is reported here rather than at read time.
sc_annotated_path = pathlib.Path(
    f"../../data/{patient}/image_based_profiles/1.sc_annotated_profiles.parquet"
)
sc_annotated_path = sc_annotated_path.resolve(strict=True)
organoid_annotated_path = pathlib.Path(
    f"../../data/{patient}/image_based_profiles/1.organoid_annotated_profiles.parquet"
)
organoid_annotated_path = organoid_annotated_path.resolve(strict=True)


# Output paths: resolved non-strictly because these files may not exist yet.
sc_normalized_output_path = pathlib.Path(
    f"../../data/{patient}/image_based_profiles/2.sc_normalized_profiles.parquet"
)
sc_normalized_output_path = sc_normalized_output_path.resolve()
organoid_normalized_output_path = pathlib.Path(
    f"../../data/{patient}/image_based_profiles/2.organoid_normalized_profiles.parquet"
)
organoid_normalized_output_path = organoid_normalized_output_path.resolve()

In [4]:
# Load the annotated single-cell and organoid profiles into DataFrames.
sc_annotated_profiles = pd.read_parquet(path=sc_annotated_path)
organoid_annotated_profiles = pd.read_parquet(path=organoid_annotated_path)

### Normalize the single-cell profiles

In [5]:
# Preview the first rows of the annotated single-cell profiles (pre-normalization).
sc_annotated_profiles.head()

Unnamed: 0,patient,object_id,MOA,unit,dose,treatment,image_set,Well,parent_organoid,Area.Size.Shape_Nuclei_VOLUME,...,Texture_Cytoplasm_Mito_Difference.Entropy_256.1,Texture_Cytoplasm_Mito_Difference.Variance_256.1,Texture_Cytoplasm_Mito_Entropy_256.1,Texture_Cytoplasm_Mito_Information.Measure.of.Correlation.1_256.1,Texture_Cytoplasm_Mito_Information.Measure.of.Correlation.2_256.1,Texture_Cytoplasm_Mito_Inverse.Difference.Moment_256.1,Texture_Cytoplasm_Mito_Sum.Average_256.1,Texture_Cytoplasm_Mito_Sum.Entropy_256.1,Texture_Cytoplasm_Mito_Sum.Variance_256.1,Texture_Cytoplasm_Mito_Variance_256.1
0,NF0014,70,MEK1/MEK2 Inhibitor,uM,10,Mirdametinib,G8-1,G8,40,43811.0,...,0.051589,0.003864,0.072747,-0.592117,0.243435,0.996572,0.73999,0.063095,161.781247,42.669714
1,NF0014,122,MEK1/MEK2 Inhibitor,uM,10,Mirdametinib,G8-1,G8,40,59089.0,...,0.057172,0.00386,0.082322,-0.608133,0.263337,0.996108,0.806632,0.071535,165.755624,43.49038
2,NF0014,162,MEK1/MEK2 Inhibitor,uM,10,Mirdametinib,G8-1,G8,40,62055.0,...,0.044009,0.003868,0.062735,-0.60307,0.229471,0.997135,0.617856,0.05445,133.682412,35.208834
3,NF0014,165,MEK1/MEK2 Inhibitor,uM,10,Mirdametinib,G8-1,G8,40,54844.0,...,0.02155,0.00388,0.029536,-0.586036,0.155376,0.998651,0.310195,0.025525,70.092415,18.834159
4,NF0014,182,MEK1/MEK2 Inhibitor,uM,10,Mirdametinib,G8-1,G8,40,37713.0,...,0.033084,0.003874,0.04585,-0.6077,0.197999,0.997895,0.484859,0.040119,107.155814,28.216124


In [6]:
# Metadata columns of the single-cell table. The order is kept as-is because
# the list is passed to normalize() as `meta_features`.
sc_metadata_columns = [
    "patient", "object_id",
    "unit", "dose", "treatment",
    "image_set", "Well",
    "parent_organoid",
    "MOA",
]
# Every remaining column is treated as a feature, in original column order.
sc_features_columns = sc_annotated_profiles.columns[
    ~sc_annotated_profiles.columns.isin(sc_metadata_columns)
].tolist()

In [7]:
# Standardize the single-cell features, using the DMSO-treated rows
# as the normalization reference.
sc_normalized_profiles = normalize(
    sc_annotated_profiles,
    method="standardize",
    samples="treatment == 'DMSO'",
    features=sc_features_columns,
    meta_features=sc_metadata_columns,
)
# Write the result without the DataFrame index, then preview it.
sc_normalized_profiles.to_parquet(sc_normalized_output_path, index=False)
sc_normalized_profiles.head()

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


Unnamed: 0,patient,object_id,unit,dose,treatment,image_set,Well,parent_organoid,MOA,Area.Size.Shape_Nuclei_VOLUME,...,Texture_Cytoplasm_Mito_Difference.Entropy_256.1,Texture_Cytoplasm_Mito_Difference.Variance_256.1,Texture_Cytoplasm_Mito_Entropy_256.1,Texture_Cytoplasm_Mito_Information.Measure.of.Correlation.1_256.1,Texture_Cytoplasm_Mito_Information.Measure.of.Correlation.2_256.1,Texture_Cytoplasm_Mito_Inverse.Difference.Moment_256.1,Texture_Cytoplasm_Mito_Sum.Average_256.1,Texture_Cytoplasm_Mito_Sum.Entropy_256.1,Texture_Cytoplasm_Mito_Sum.Variance_256.1,Texture_Cytoplasm_Mito_Variance_256.1
0,NF0014,70,uM,10,Mirdametinib,G8-1,G8,40,MEK1/MEK2 Inhibitor,-0.68873,...,0.07147,0.014529,0.043398,0.553207,0.433568,0.020056,0.042939,0.044455,0.210545,0.217315
1,NF0014,122,uM,10,Mirdametinib,G8-1,G8,40,MEK1/MEK2 Inhibitor,-0.389751,...,0.125826,-0.02077,0.094458,0.160591,0.654296,-0.011324,0.098997,0.100074,0.240877,0.241641
2,NF0014,162,uM,10,Mirdametinib,G8-1,G8,40,MEK1/MEK2 Inhibitor,-0.331708,...,-0.002328,0.054434,-0.009989,0.284718,0.278696,0.0581,-0.059797,-0.012521,-0.003908,-0.003837
3,NF0014,165,uM,10,Mirdametinib,G8-1,G8,40,MEK1/MEK2 Inhibitor,-0.472822,...,-0.220983,0.168983,-0.187017,0.70227,-0.54307,0.160654,-0.318593,-0.203146,-0.489233,-0.489208
4,NF0014,182,uM,10,Mirdametinib,G8-1,G8,40,MEK1/MEK2 Inhibitor,-0.808064,...,-0.108694,0.111325,-0.100026,0.171216,-0.070353,0.109511,-0.17167,-0.106965,-0.206362,-0.211112


### Normalize the organoid profiles

In [8]:
# Preview the first rows of the annotated organoid profiles (pre-normalization).
organoid_annotated_profiles.head()

Unnamed: 0,patient,object_id,MOA,unit,dose,treatment,image_set,Well,single_cell_count,Area.Size.Shape_Organoid_VOLUME,...,Texture_Organoid_Mito_Difference.Entropy_256.1,Texture_Organoid_Mito_Difference.Variance_256.1,Texture_Organoid_Mito_Entropy_256.1,Texture_Organoid_Mito_Information.Measure.of.Correlation.1_256.1,Texture_Organoid_Mito_Information.Measure.of.Correlation.2_256.1,Texture_Organoid_Mito_Inverse.Difference.Moment_256.1,Texture_Organoid_Mito_Sum.Average_256.1,Texture_Organoid_Mito_Sum.Entropy_256.1,Texture_Organoid_Mito_Sum.Variance_256.1,Texture_Organoid_Mito_Variance_256.1
0,NF0014,40,MEK1/MEK2 Inhibitor,uM,10,Mirdametinib,G8-1,G8,8,6416256.0,...,0.545338,0.003446,0.976656,-0.609637,0.758225,0.948133,5.216739,0.823947,483.70571,121.888125
1,NF0014,24,PI3K and HDAC inhibitor,uM,1,Fimepinostat,D5-1,D5,5,3727848.0,...,1.310975,0.002675,1.979332,-0.295282,0.647958,0.850992,4.07191,1.607417,262.768396,81.736101
2,NF0014,85,MEK1/MEK2 Inhibitor,uM,1,Mirdametinib,F8-1,F8,9,19925581.0,...,1.711179,0.001925,3.50347,-0.448653,0.930773,0.769392,9.46117,2.711064,168.775477,43.661196
3,NF0014,18,Apoptosis,nM,10,STAURO,G11-1,G11,1,2721536.0,...,0.455617,0.003506,0.780322,-0.633195,0.717287,0.950478,6.900637,0.678164,866.667507,219.817098
4,NF0014,24,MEK1/MEK2 Inhibitor,uM,1,Binimetinib,G7-1,G7,6,6343511.0,...,0.838981,0.003178,1.516453,-0.58463,0.844911,0.908857,14.740146,1.266238,2093.731127,527.137452


In [9]:
# Metadata columns of the organoid table. The order is kept as-is because
# the list is passed to normalize() as `meta_features`.
organoid_metadata_columns = [
    "patient", "object_id",
    "unit", "dose", "treatment",
    "image_set", "Well",
    "single_cell_count",
    "MOA",
]
# Every remaining column is treated as a feature, in original column order.
organoid_features_columns = organoid_annotated_profiles.columns[
    ~organoid_annotated_profiles.columns.isin(organoid_metadata_columns)
].tolist()
# Standardize the organoid features, using the DMSO-treated rows
# as the normalization reference.
organoid_normalized_profiles = normalize(
    organoid_annotated_profiles,
    method="standardize",
    samples="treatment == 'DMSO'",
    features=organoid_features_columns,
    meta_features=organoid_metadata_columns,
)
# Write the result without the DataFrame index, then preview it.
organoid_normalized_profiles.to_parquet(organoid_normalized_output_path, index=False)
organoid_normalized_profiles.head()

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


Unnamed: 0,patient,object_id,unit,dose,treatment,image_set,Well,single_cell_count,MOA,Area.Size.Shape_Organoid_VOLUME,...,Texture_Organoid_Mito_Difference.Entropy_256.1,Texture_Organoid_Mito_Difference.Variance_256.1,Texture_Organoid_Mito_Entropy_256.1,Texture_Organoid_Mito_Information.Measure.of.Correlation.1_256.1,Texture_Organoid_Mito_Information.Measure.of.Correlation.2_256.1,Texture_Organoid_Mito_Inverse.Difference.Moment_256.1,Texture_Organoid_Mito_Sum.Average_256.1,Texture_Organoid_Mito_Sum.Entropy_256.1,Texture_Organoid_Mito_Sum.Variance_256.1,Texture_Organoid_Mito_Variance_256.1
0,NF0014,40,uM,10,Mirdametinib,G8-1,G8,8,MEK1/MEK2 Inhibitor,-1.718171,...,-2.133984,2.118421,-1.889329,-1.389515,-2.146901,2.142268,-1.376009,-1.968139,-0.980562,-0.987831
1,NF0014,24,uM,1,Fimepinostat,D5-1,D5,5,PI3K and HDAC inhibitor,-2.085979,...,0.595326,-0.249176,-0.130507,3.504391,-4.467319,-0.035009,-1.498039,-0.153214,-1.127113,-1.093515
2,NF0014,85,uM,1,Mirdametinib,F8-1,F8,9,MEK1/MEK2 Inhibitor,0.130074,...,2.021954,-2.551488,2.543025,1.116702,1.4841,-1.863943,-0.923582,2.403406,-1.18946,-1.193731
3,NF0014,18,nM,10,STAURO,G11-1,G11,1,Apoptosis,-2.223655,...,-2.453815,2.30287,-2.233725,-1.75627,-3.008386,2.194833,-1.196517,-2.305849,-0.726536,-0.730073
4,NF0014,24,uM,1,Binimetinib,G7-1,G7,6,MEK1/MEK2 Inhibitor,-1.728123,...,-1.087216,1.297326,-0.942457,-1.000208,-0.322736,1.26196,-0.360881,-0.943562,0.087396,0.078821
