This notebook performs IBP on the scDINO profile data.
We perform the following steps:
1. Load the scDINO profile data.
2. Normalize the data.
3. Feature select the data.
4. Aggregate the data.

In [1]:
import pathlib

import pandas as pd
from pycytominer import aggregate, feature_select, normalize

In [2]:
scDINO_profile_path = pathlib.Path(
    "../../1.scDINO_run/outputdir/apoptosis_timelapse/CLS_features/CLS_features_annotated.parquet"
).resolve(strict=True)

scDINO_normalized_path = pathlib.Path(
    "../../1.scDINO_run/outputdir/apoptosis_timelapse/CLS_features/CLS_features_annotated_normalized.parquet"
).resolve()
scDINO_feature_selected_path = pathlib.Path(
    "../../1.scDINO_run/outputdir/apoptosis_timelapse/CLS_features/CLS_features_annotated_normalized_feature_selected.parquet"
).resolve()
scDINO_aggregated_path = pathlib.Path(
    "../../1.scDINO_run/outputdir/apoptosis_timelapse/CLS_features/CLS_features_annotated_normalized_feature_selected_aggregated.parquet"
).resolve()

scDINO_profile = pd.read_parquet(scDINO_profile_path)
print(f"scDINO profile shape: {scDINO_profile.shape}")
scDINO_profile.head()

scDINO profile shape: (140235, 1546)


Unnamed: 0,Metadata_image_path,Metadata_Well,Metadata_FOV,Metadata_Time,Metadata_ImageNumber,Metadata_Nuclei_Number_Object_Number,Metadata_original_index,Metadata_compound,Metadata_dose,Metadata_control,...,channel_DNA_cls_feature_90,channel_DNA_cls_feature_91,channel_DNA_cls_feature_92,channel_DNA_cls_feature_93,channel_DNA_cls_feature_94,channel_DNA_cls_feature_95,channel_DNA_cls_feature_96,channel_DNA_cls_feature_97,channel_DNA_cls_feature_98,channel_DNA_cls_feature_99
0,../0.pre-process_images/data/processed_images/...,C-02,1,1,1,102.0,145086,Staurosporine,0.0,negative,...,0.035033,-0.000629,-0.034413,0.056733,-0.078023,0.033735,0.006015,0.016154,0.03666,-0.014219
1,../0.pre-process_images/data/processed_images/...,C-02,1,1,1,106.0,145087,Staurosporine,0.0,negative,...,-0.005406,-0.033099,-0.04295,0.012875,-0.074455,0.030203,0.032813,-0.017182,0.052587,-0.005798
2,../0.pre-process_images/data/processed_images/...,C-02,1,1,1,122.0,145088,Staurosporine,0.0,negative,...,0.01569,0.027015,0.001716,0.039788,-0.119122,0.010958,0.007134,0.048436,0.046638,0.01856
3,../0.pre-process_images/data/processed_images/...,C-02,1,1,1,128.0,145090,Staurosporine,0.0,negative,...,-0.009345,-0.016075,-0.035206,0.037123,-0.102122,-0.030821,0.02755,0.001286,0.091023,-0.000872
4,../0.pre-process_images/data/processed_images/...,C-02,1,1,1,13.0,145073,Staurosporine,0.0,negative,...,0.026046,-0.004125,0.002168,0.03304,-0.054002,0.024171,-0.018173,0.041654,0.04535,-0.033588


## Normalization

In [3]:
metadata_columns = [x for x in scDINO_profile.columns if "metadata" in x.lower()]
features = [x for x in scDINO_profile.columns if x not in metadata_columns]
normalized_df = normalize(
    # df with annotated raw merged single cell features
    profiles=scDINO_profile,
    # specify samples used as normalization reference (negative control)
    samples="Metadata_compound == 'Staurosporine' and Metadata_dose == 0.0",
    # normalization method used
    method="standardize",
    features=features,
    meta_features=metadata_columns,
)

normalized_df.to_parquet(
    scDINO_normalized_path,
    index=False,
)
# check to see if the features have been normalized
print(normalized_df.shape)
normalized_df.head()

(140235, 1546)


Unnamed: 0,Metadata_image_path,Metadata_Well,Metadata_FOV,Metadata_Time,Metadata_ImageNumber,Metadata_Nuclei_Number_Object_Number,Metadata_original_index,Metadata_compound,Metadata_dose,Metadata_control,...,channel_DNA_cls_feature_90,channel_DNA_cls_feature_91,channel_DNA_cls_feature_92,channel_DNA_cls_feature_93,channel_DNA_cls_feature_94,channel_DNA_cls_feature_95,channel_DNA_cls_feature_96,channel_DNA_cls_feature_97,channel_DNA_cls_feature_98,channel_DNA_cls_feature_99
0,../0.pre-process_images/data/processed_images/...,C-02,1,1,1,102.0,145086,Staurosporine,0.0,negative,...,2.357905,0.202156,-0.278771,1.407479,0.261751,1.43974,-0.345966,0.178777,-0.525531,-0.259747
1,../0.pre-process_images/data/processed_images/...,C-02,1,1,1,106.0,145087,Staurosporine,0.0,negative,...,0.625998,-1.097302,-0.640105,-1.409948,0.364399,1.282464,0.860702,-0.994511,0.053722,0.392298
2,../0.pre-process_images/data/processed_images/...,C-02,1,1,1,122.0,145088,Staurosporine,0.0,negative,...,1.529507,1.308494,1.250326,0.318921,-0.920333,0.425651,-0.29558,1.315003,-0.162635,2.278403
3,../0.pre-process_images/data/processed_images/...,C-02,1,1,1,128.0,145090,Staurosporine,0.0,negative,...,0.457299,-0.415988,-0.312368,0.147751,-0.431376,-1.434391,0.623737,-0.344507,1.451535,0.773698
4,../0.pre-process_images/data/processed_images/...,C-02,1,1,1,13.0,145073,Staurosporine,0.0,negative,...,1.973008,0.062269,1.269449,-0.114555,0.952655,1.013934,-1.435072,1.076282,-0.209469,-1.7596


## Feature Selection

In [4]:
# define operations to be performed on the data
# list of operations for feature select function to use on input profile
feature_select_ops = [
    "variance_threshold",
    "blocklist",
    "drop_na_columns",
    "correlation_threshold",
]
manual_block_list = [
    x
    for x in normalized_df.columns
    if "bounding" in x.lower()
    or "Location_Center_Y" in x.lower()
    or "Location_Center_X" in x.lower()
]
metadata_columns = [x for x in normalized_df.columns if "metadata" in x.lower()]
features = [
    x for x in normalized_df.columns if x not in metadata_columns + manual_block_list
]

# perform feature selection with the operations specified
feature_select_df = feature_select(
    normalized_df,
    operation=feature_select_ops,
    features=features,
)
# merge the metadata columns back into the feature selected dataframe
feature_select_df = pd.merge(
    feature_select_df,
    normalized_df[metadata_columns],
    how="left",
)
print(f"Number of features before feature selection: {len(features)}")
print(f"Number of features after feature selection: {len(feature_select_df.columns)}")
feature_select_df.to_parquet(
    scDINO_feature_selected_path,
    index=False,
)
feature_select_df.head()

Number of features before feature selection: 1536
Number of features after feature selection: 1546


Unnamed: 0,Metadata_image_path,Metadata_Well,Metadata_FOV,Metadata_Time,Metadata_ImageNumber,Metadata_Nuclei_Number_Object_Number,Metadata_original_index,Metadata_compound,Metadata_dose,Metadata_control,...,channel_DNA_cls_feature_90,channel_DNA_cls_feature_91,channel_DNA_cls_feature_92,channel_DNA_cls_feature_93,channel_DNA_cls_feature_94,channel_DNA_cls_feature_95,channel_DNA_cls_feature_96,channel_DNA_cls_feature_97,channel_DNA_cls_feature_98,channel_DNA_cls_feature_99
0,../0.pre-process_images/data/processed_images/...,C-02,1,1,1,102.0,145086,Staurosporine,0.0,negative,...,2.357905,0.202156,-0.278771,1.407479,0.261751,1.43974,-0.345966,0.178777,-0.525531,-0.259747
1,../0.pre-process_images/data/processed_images/...,C-02,1,1,1,106.0,145087,Staurosporine,0.0,negative,...,0.625998,-1.097302,-0.640105,-1.409948,0.364399,1.282464,0.860702,-0.994511,0.053722,0.392298
2,../0.pre-process_images/data/processed_images/...,C-02,1,1,1,122.0,145088,Staurosporine,0.0,negative,...,1.529507,1.308494,1.250326,0.318921,-0.920333,0.425651,-0.29558,1.315003,-0.162635,2.278403
3,../0.pre-process_images/data/processed_images/...,C-02,1,1,1,128.0,145090,Staurosporine,0.0,negative,...,0.457299,-0.415988,-0.312368,0.147751,-0.431376,-1.434391,0.623737,-0.344507,1.451535,0.773698
4,../0.pre-process_images/data/processed_images/...,C-02,1,1,1,13.0,145073,Staurosporine,0.0,negative,...,1.973008,0.062269,1.269449,-0.114555,0.952655,1.013934,-1.435072,1.076282,-0.209469,-1.7596


## Aggregation

In [5]:
metadata_cols = [
    "Metadata_Well",
    "Metadata_Time",
    "Metadata_compound",
    "Metadata_dose",
    "Metadata_control",
]
feature_cols = feature_select_df.columns[
    ~feature_select_df.columns.str.contains("Metadata")
].to_list()
aggregated_df = aggregate(
    feature_select_df,
    features=feature_cols,
    strata=["Metadata_Well", "Metadata_Time"],
    operation="median",
)
aggregated_df = pd.merge(
    aggregated_df,
    feature_select_df[metadata_cols],
    how="left",
    on=["Metadata_Well", "Metadata_Time"],
)
aggregated_df.drop_duplicates(
    subset=metadata_cols,
    inplace=True,
)
aggregated_df.reset_index(drop=True, inplace=True)
aggregated_df["Metadata_Time"] = (
    aggregated_df["Metadata_Time"].astype(int) - 1
)  # adjust for 0-based indexing
print(f"Number of samples before aggregation: {feature_select_df.shape[0]}")
print(f"Number of samples after aggregation: {aggregated_df.shape[0]}")
aggregated_df.to_parquet(
    scDINO_aggregated_path,
    index=False,
)
aggregated_df.head()

Number of samples before aggregation: 140235
Number of samples after aggregation: 389


Unnamed: 0,Metadata_Well,Metadata_Time,channel488-1_cls_feature_0,channel488-1_cls_feature_1,channel488-1_cls_feature_10,channel488-1_cls_feature_100,channel488-1_cls_feature_101,channel488-1_cls_feature_102,channel488-1_cls_feature_103,channel488-1_cls_feature_104,...,channel_DNA_cls_feature_93,channel_DNA_cls_feature_94,channel_DNA_cls_feature_95,channel_DNA_cls_feature_96,channel_DNA_cls_feature_97,channel_DNA_cls_feature_98,channel_DNA_cls_feature_99,Metadata_compound,Metadata_dose,Metadata_control
0,C-02,0,0.018916,0.239166,0.064957,0.311643,-0.064534,0.129686,0.002053,0.180979,...,-0.05951,-0.218752,0.42844,-0.032853,0.018116,-0.186052,-0.25793,Staurosporine,0.0,negative
1,C-02,1,0.060277,0.240965,-0.044576,0.059983,0.038035,-0.084567,-0.258948,-0.155832,...,-0.187375,-0.110289,0.096925,0.144873,-0.033542,0.004723,-0.075264,Staurosporine,0.0,negative
2,C-02,2,0.130321,0.184951,9.3e-05,-0.003949,0.034528,-0.124687,-0.202871,-0.103265,...,-0.063743,-0.012421,0.057629,0.033587,-0.022887,-0.04863,-0.119355,Staurosporine,0.0,negative
3,C-02,3,0.093349,0.227056,-0.077017,0.002847,-0.046308,-0.052228,-0.27439,-0.111946,...,-0.043093,-0.028822,0.102412,0.144641,-0.029151,-0.024309,-0.137908,Staurosporine,0.0,negative
4,C-02,4,0.100236,0.217678,-0.150183,0.016396,-0.008802,-0.069641,-0.279787,-0.219125,...,-0.070611,-0.147043,-0.017499,0.032548,-0.157934,0.035059,-0.028296,Staurosporine,0.0,negative
