This notebook performs IBP on the scDINO profile data.
We perform the following steps:
1. Load the scDINO profile data.
2. Normalize the data.
3. Feature select the data.
4. Aggregate the data.

In [1]:
import pathlib

import pandas as pd
from pycytominer import aggregate, feature_select, normalize

In [2]:
scDINO_profile_path = pathlib.Path(
    "../../1.scDINO_run/outputdir/apoptosis_timelapse/CLS_features/CLS_features_annotated.parquet"
).resolve(strict=True)

scDINO_normalized_path = pathlib.Path(
    "../../1.scDINO_run/outputdir/apoptosis_timelapse/CLS_features/CLS_features_annotated_normalized.parquet"
).resolve()
scDINO_feature_selected_path = pathlib.Path(
    "../../1.scDINO_run/outputdir/apoptosis_timelapse/CLS_features/CLS_features_annotated_normalized_feature_selected.parquet"
).resolve()
scDINO_aggregated_path = pathlib.Path(
    "../../1.scDINO_run/outputdir/apoptosis_timelapse/CLS_features/CLS_features_annotated_normalized_feature_selected_aggregated.parquet"
).resolve()

scDINO_profile = pd.read_parquet(scDINO_profile_path)
print(f"scDINO profile shape: {scDINO_profile.shape}")
scDINO_profile.head()

scDINO profile shape: (188065, 1546)


Unnamed: 0,Metadata_image_path,Metadata_Well,Metadata_FOV,Metadata_Time,Metadata_ImageNumber,Metadata_Nuclei_Number_Object_Number,Metadata_original_index,Metadata_compound,Metadata_dose,Metadata_control,...,channel_DNA_cls_feature_90,channel_DNA_cls_feature_91,channel_DNA_cls_feature_92,channel_DNA_cls_feature_93,channel_DNA_cls_feature_94,channel_DNA_cls_feature_95,channel_DNA_cls_feature_96,channel_DNA_cls_feature_97,channel_DNA_cls_feature_98,channel_DNA_cls_feature_99
0,../0.pre-process_images/data/processed_images/...,C-02,1,1,1,102,111256,Staurosporine,0.0,negative,...,0.034336,-0.005963,-0.032243,0.056527,-0.082777,0.036178,0.009163,0.018071,0.038674,-0.015397
1,../0.pre-process_images/data/processed_images/...,C-02,1,1,1,106,111257,Staurosporine,0.0,negative,...,0.008716,-0.049256,-0.029706,0.013455,-0.090113,0.029853,0.014054,-0.042019,0.045846,-0.025033
2,../0.pre-process_images/data/processed_images/...,C-02,1,1,1,121,111258,Staurosporine,0.0,negative,...,0.016367,0.026408,-0.000336,0.040077,-0.120304,0.012534,0.008838,0.048772,0.048959,0.016963
3,../0.pre-process_images/data/processed_images/...,C-02,1,1,1,127,111259,Staurosporine,0.0,negative,...,-0.009971,-0.015348,-0.035576,0.037665,-0.102958,-0.030505,0.026679,-0.000865,0.092421,-0.001723
4,../0.pre-process_images/data/processed_images/...,C-02,1,1,1,13,111241,Staurosporine,0.0,negative,...,0.016587,-0.023263,-0.012859,0.031189,-0.064073,0.017178,0.006649,0.058667,0.051056,-0.021589


## Normalization

In [3]:
metadata_columns = [x for x in scDINO_profile.columns if "metadata" in x.lower()]
features = [x for x in scDINO_profile.columns if x not in metadata_columns]
normalized_df = normalize(
    # df with annotated raw merged single cell features
    profiles=scDINO_profile,
    # specify samples used as normalization reference (negative control)
    samples="Metadata_compound == 'Staurosporine' and Metadata_dose == 0.0",
    # normalization method used
    method="standardize",
    features=features,
    meta_features=metadata_columns,
)

normalized_df.to_parquet(
    scDINO_normalized_path,
    index=False,
)
# check to see if the features have been normalized
print(normalized_df.shape)
normalized_df.head()

(188065, 1546)


Unnamed: 0,Metadata_image_path,Metadata_Well,Metadata_FOV,Metadata_Time,Metadata_ImageNumber,Metadata_Nuclei_Number_Object_Number,Metadata_original_index,Metadata_compound,Metadata_dose,Metadata_control,...,channel_DNA_cls_feature_90,channel_DNA_cls_feature_91,channel_DNA_cls_feature_92,channel_DNA_cls_feature_93,channel_DNA_cls_feature_94,channel_DNA_cls_feature_95,channel_DNA_cls_feature_96,channel_DNA_cls_feature_97,channel_DNA_cls_feature_98,channel_DNA_cls_feature_99
0,../0.pre-process_images/data/processed_images/...,C-02,1,1,1,102,111256,Staurosporine,0.0,negative,...,2.368147,0.052654,-0.157713,1.402609,0.177238,1.538436,-0.238824,0.268903,-0.480108,-0.35201
1,../0.pre-process_images/data/processed_images/...,C-02,1,1,1,106,111257,Staurosporine,0.0,negative,...,1.238575,-1.691482,-0.051,-1.444994,-0.03534,1.254336,-0.015985,-1.869041,-0.215325,-1.100292
2,../0.pre-process_images/data/processed_images/...,C-02,1,1,1,121,111258,Staurosporine,0.0,negative,...,1.575882,1.356789,1.184473,0.31503,-0.910127,0.47636,-0.253669,1.361193,-0.100408,2.16081
3,../0.pre-process_images/data/processed_images/...,C-02,1,1,1,127,111259,Staurosporine,0.0,negative,...,0.414664,-0.325409,-0.297905,0.155579,-0.407515,-1.456983,0.559278,-0.404836,1.504038,0.709756
4,../0.pre-process_images/data/processed_images/...,C-02,1,1,1,13,111241,Staurosporine,0.0,negative,...,1.585601,-0.644312,0.657657,-0.272598,0.719175,0.684934,-0.353381,1.713252,-0.023018,-0.83281


## Feature Selection

In [4]:
# define operations to be performed on the data
feature_select_ops = [
    "variance_threshold",
    "blocklist",
    "drop_na_columns",
    "correlation_threshold",
]
manual_block_list = [  # block list of features to be removed
    # these features are not useful for downstream analysis
    x
    for x in normalized_df.columns
    if "bounding" in x.lower()
    or "Location_Center_Y" in x.lower()
    or "Location_Center_X" in x.lower()
]
metadata_columns = [x for x in normalized_df.columns if "metadata" in x.lower()]
features = [
    x for x in normalized_df.columns if x not in metadata_columns + manual_block_list
]

# perform feature selection with the operations specified
feature_select_df = feature_select(
    normalized_df,
    operation=feature_select_ops,
    features=features,
)
# merge the metadata columns back into the feature selected dataframe
feature_select_df = pd.merge(
    feature_select_df,
    normalized_df[metadata_columns],
    how="left",
)
print(f"Number of features before feature selection: {len(features)}")
print(f"Number of features after feature selection: {len(feature_select_df.columns)}")
feature_select_df.to_parquet(
    scDINO_feature_selected_path,
    index=False,
)
feature_select_df.head()

Number of features before feature selection: 1536
Number of features after feature selection: 1546


Unnamed: 0,Metadata_image_path,Metadata_Well,Metadata_FOV,Metadata_Time,Metadata_ImageNumber,Metadata_Nuclei_Number_Object_Number,Metadata_original_index,Metadata_compound,Metadata_dose,Metadata_control,...,channel_DNA_cls_feature_90,channel_DNA_cls_feature_91,channel_DNA_cls_feature_92,channel_DNA_cls_feature_93,channel_DNA_cls_feature_94,channel_DNA_cls_feature_95,channel_DNA_cls_feature_96,channel_DNA_cls_feature_97,channel_DNA_cls_feature_98,channel_DNA_cls_feature_99
0,../0.pre-process_images/data/processed_images/...,C-02,1,1,1,102,111256,Staurosporine,0.0,negative,...,2.368147,0.052654,-0.157713,1.402609,0.177238,1.538436,-0.238824,0.268903,-0.480108,-0.35201
1,../0.pre-process_images/data/processed_images/...,C-02,1,1,1,106,111257,Staurosporine,0.0,negative,...,1.238575,-1.691482,-0.051,-1.444994,-0.03534,1.254336,-0.015985,-1.869041,-0.215325,-1.100292
2,../0.pre-process_images/data/processed_images/...,C-02,1,1,1,121,111258,Staurosporine,0.0,negative,...,1.575882,1.356789,1.184473,0.31503,-0.910127,0.47636,-0.253669,1.361193,-0.100408,2.16081
3,../0.pre-process_images/data/processed_images/...,C-02,1,1,1,127,111259,Staurosporine,0.0,negative,...,0.414664,-0.325409,-0.297905,0.155579,-0.407515,-1.456983,0.559278,-0.404836,1.504038,0.709756
4,../0.pre-process_images/data/processed_images/...,C-02,1,1,1,13,111241,Staurosporine,0.0,negative,...,1.585601,-0.644312,0.657657,-0.272598,0.719175,0.684934,-0.353381,1.713252,-0.023018,-0.83281


## Aggregation

In [5]:
metadata_cols = [
    "Metadata_Well",
    "Metadata_Time",
    "Metadata_compound",
    "Metadata_dose",
    "Metadata_control",
]
feature_cols = feature_select_df.columns[
    ~feature_select_df.columns.str.contains("Metadata")
].to_list()
aggregated_df = aggregate(
    feature_select_df,
    features=feature_cols,
    strata=["Metadata_Well", "Metadata_Time"],
    operation="median",
)
aggregated_df = pd.merge(
    aggregated_df,
    feature_select_df[metadata_cols],
    how="left",
    on=["Metadata_Well", "Metadata_Time"],
)
aggregated_df.drop_duplicates(
    subset=metadata_cols,
    inplace=True,
)
aggregated_df.reset_index(drop=True, inplace=True)
aggregated_df["Metadata_Time"] = (
    aggregated_df["Metadata_Time"].astype(int) - 1
)  # adjust for 0-based indexing
print(f"Number of samples before aggregation: {feature_select_df.shape[0]}")
print(f"Number of samples after aggregation: {aggregated_df.shape[0]}")
aggregated_df.to_parquet(
    scDINO_aggregated_path,
    index=False,
)
aggregated_df.head()

Number of samples before aggregation: 188065
Number of samples after aggregation: 390


Unnamed: 0,Metadata_Well,Metadata_Time,channel488-1_cls_feature_0,channel488-1_cls_feature_1,channel488-1_cls_feature_10,channel488-1_cls_feature_100,channel488-1_cls_feature_101,channel488-1_cls_feature_102,channel488-1_cls_feature_103,channel488-1_cls_feature_104,...,channel_DNA_cls_feature_93,channel_DNA_cls_feature_94,channel_DNA_cls_feature_95,channel_DNA_cls_feature_96,channel_DNA_cls_feature_97,channel_DNA_cls_feature_98,channel_DNA_cls_feature_99,Metadata_compound,Metadata_dose,Metadata_control
0,C-02,0,0.036395,0.086592,0.068658,0.263662,0.035191,0.227475,0.033971,0.100787,...,-0.233945,-0.151954,0.586218,0.025415,0.050097,-0.259022,-0.182365,Staurosporine,0.0,negative
1,C-02,1,0.126309,0.235881,-0.006457,0.001049,0.023703,-0.095641,-0.219018,-0.123594,...,-0.174913,-0.080899,0.111478,0.052516,-0.064528,-0.032682,-0.063839,Staurosporine,0.0,negative
2,C-02,2,0.134325,0.135026,-0.086505,-0.048807,0.075291,-0.083251,-0.217281,-0.087305,...,-0.087288,-0.058948,0.117263,-0.011659,-0.029003,-0.076662,-0.134727,Staurosporine,0.0,negative
3,C-02,3,0.020182,0.194855,-0.200572,0.038177,0.039418,-0.103094,-0.296713,-0.13065,...,-0.181671,-0.078114,0.069019,0.103721,0.055212,0.001864,-0.043144,Staurosporine,0.0,negative
4,C-02,4,0.085957,0.251722,-0.089373,0.002672,-0.001731,-0.116209,-0.296459,-0.23559,...,-0.08306,-0.200147,-0.003858,0.120898,-0.12683,-0.02238,-0.038543,Staurosporine,0.0,negative
