This notebook combines the CellProfiler-extracted morphology features and the scDINO-extracted morphology features into a single feature space. Downstream notebooks normalize the combined data and perform feature selection.

In [1]:
import pathlib

import numpy as np
import pandas as pd

In [2]:
# ---- input / output locations -------------------------------------------
# All paths are relative to the current working directory. Inputs are resolved
# with strict=True so a missing file raises FileNotFoundError in this cell.

# CellProfiler features (annotated, every time point)
cellprofiler_fs_features_path = pathlib.Path(
    "../../6.process_CP_features/data/3.combined_data/profiles/combined_data.parquet"
)
cellprofiler_fs_features_path = cellprofiler_fs_features_path.resolve(strict=True)

# scDINO CLS features (annotated, every time point)
scdino_features = pathlib.Path(
    "../../7.scDINO_analysis/1.scDINO_run/outputdir/apoptosis_timelapse/CLS_features/CLS_features_annotated.parquet"
)
scdino_features = scdino_features.resolve(strict=True)

# destination of the combined profile; it does not have to exist yet,
# so it is resolved non-strictly
output_path = pathlib.Path(
    "../data/CP_scDINO_features/combined_CP_scDINO_data.parquet"
)
output_path = output_path.resolve()

# create the destination directory (and any missing parents) up front
output_path.parent.mkdir(exist_ok=True, parents=True)

In [3]:
# read both feature tables, CellProfiler first and scDINO second
cellprofiler_data, scdino_data = (
    pd.read_parquet(parquet_path)
    for parquet_path in (cellprofiler_fs_features_path, scdino_features)
)

# report (rows, columns) of each table as loaded
print(f"cellprofiler data shape: {cellprofiler_data.shape}")
print(f"scDINO data shape: {scdino_data.shape}")

cellprofiler data shape: (216151, 2335)
scDINO data shape: (188065, 1546)


In [4]:
# Copy the CellProfiler frame's index labels into a regular metadata column.
# "Metadata_original_index" is one of the keys used for the merge with the
# scDINO data further down in this notebook.
cellprofiler_data["Metadata_original_index"] = cellprofiler_data.index

In [5]:
# peek at the first scDINO row to check the available columns
scdino_data.iloc[:1]

Unnamed: 0,Metadata_image_path,Metadata_Well,Metadata_FOV,Metadata_Time,Metadata_ImageNumber,Metadata_Nuclei_Number_Object_Number,Metadata_original_index,Metadata_compound,Metadata_dose,Metadata_control,...,channel_DNA_cls_feature_90,channel_DNA_cls_feature_91,channel_DNA_cls_feature_92,channel_DNA_cls_feature_93,channel_DNA_cls_feature_94,channel_DNA_cls_feature_95,channel_DNA_cls_feature_96,channel_DNA_cls_feature_97,channel_DNA_cls_feature_98,channel_DNA_cls_feature_99
0,../0.pre-process_images/data/processed_images/...,C-02,1,1,1,102,111256,Staurosporine,0.0,negative,...,0.034336,-0.005963,-0.032243,0.056527,-0.082777,0.036178,0.009163,0.018071,0.038674,-0.015397


In [6]:
# Tag every feature column with the tool that produced it ("_CP" or "_scDINO")
# so the two feature sets stay distinguishable after the merge. Columns whose
# name contains "Metadata" are left untouched.
#
# The new labels are computed in one pass and assigned to `.columns` directly,
# instead of calling `rename(..., inplace=True)` once per feature column.
# NOTE: this cell is not idempotent - running it twice appends the suffix
# twice. Re-run from the data loading cell if it needs to be repeated.
def _with_feature_suffix(columns, suffix):
    """Return the column labels with `suffix` appended to non-metadata names.

    Parameters
    ----------
    columns : iterable of str
        Column labels of a feature table.
    suffix : str
        Text appended to every label that does not contain "Metadata".

    Returns
    -------
    list of str
        New labels, in the same order as `columns`.
    """
    return [col if "Metadata" in col else f"{col}{suffix}" for col in columns]


cellprofiler_data.columns = _with_feature_suffix(cellprofiler_data.columns, "_CP")
scdino_data.columns = _with_feature_suffix(scdino_data.columns, "_scDINO")

In [7]:
# Metadata columns shared by the CellProfiler and scDINO tables.
# Together they form the key on which the two tables are joined; the order
# below is kept as-is.
cellprofiler_metadata_columns = [
    # image-level identifiers
    "Metadata_Well", "Metadata_FOV", "Metadata_Time", "Metadata_ImageNumber",
    # object-level identifier
    "Metadata_Nuclei_Number_Object_Number",
    # treatment annotations
    "Metadata_compound", "Metadata_dose", "Metadata_control",
    # row index carried over from the CellProfiler table
    "Metadata_original_index",
]

In [8]:
# Cast the scDINO time point to float and shift it down by one in a single step.
# NOTE(review): presumably scDINO time points are 1-based while the
# CellProfiler ones are 0-based - confirm against the upstream notebooks.
# Running this cell more than once shifts the values again.
scdino_data["Metadata_Time"] = scdino_data["Metadata_Time"].astype(float) - 1
scdino_data.head()

Unnamed: 0,Metadata_image_path,Metadata_Well,Metadata_FOV,Metadata_Time,Metadata_ImageNumber,Metadata_Nuclei_Number_Object_Number,Metadata_original_index,Metadata_compound,Metadata_dose,Metadata_control,...,channel_DNA_cls_feature_90_scDINO,channel_DNA_cls_feature_91_scDINO,channel_DNA_cls_feature_92_scDINO,channel_DNA_cls_feature_93_scDINO,channel_DNA_cls_feature_94_scDINO,channel_DNA_cls_feature_95_scDINO,channel_DNA_cls_feature_96_scDINO,channel_DNA_cls_feature_97_scDINO,channel_DNA_cls_feature_98_scDINO,channel_DNA_cls_feature_99_scDINO
0,../0.pre-process_images/data/processed_images/...,C-02,1,0.0,1,102,111256,Staurosporine,0.0,negative,...,0.034336,-0.005963,-0.032243,0.056527,-0.082777,0.036178,0.009163,0.018071,0.038674,-0.015397
1,../0.pre-process_images/data/processed_images/...,C-02,1,0.0,1,106,111257,Staurosporine,0.0,negative,...,0.008716,-0.049256,-0.029706,0.013455,-0.090113,0.029853,0.014054,-0.042019,0.045846,-0.025033
2,../0.pre-process_images/data/processed_images/...,C-02,1,0.0,1,121,111258,Staurosporine,0.0,negative,...,0.016367,0.026408,-0.000336,0.040077,-0.120304,0.012534,0.008838,0.048772,0.048959,0.016963
3,../0.pre-process_images/data/processed_images/...,C-02,1,0.0,1,127,111259,Staurosporine,0.0,negative,...,-0.009971,-0.015348,-0.035576,0.037665,-0.102958,-0.030505,0.026679,-0.000865,0.092421,-0.001723
4,../0.pre-process_images/data/processed_images/...,C-02,1,0.0,1,13,111241,Staurosporine,0.0,negative,...,0.016587,-0.023263,-0.012859,0.031189,-0.064073,0.017178,0.006649,0.058667,0.051056,-0.021589


In [9]:
# Step 1 - validate. Make sure every merge key exists in BOTH tables before
# anything is modified, and report all missing keys of a table at once.
# Raises ValueError, as before, when a key column is absent.
for frame_label, frame in (
    ("cellprofiler", cellprofiler_data),
    ("scDINO", scdino_data),
):
    missing = [
        col for col in cellprofiler_metadata_columns if col not in frame.columns
    ]
    if missing:
        raise ValueError(f"{', '.join(missing)} not found in {frame_label} data.")

# Step 2 - cast. Convert the merge keys to strings in both tables so the key
# dtypes agree. String keys only match when both sides render identically
# (e.g. "0.0" vs "0"), so numeric keys must share a dtype before this cast.
for frame in (cellprofiler_data, scdino_data):
    for col in cellprofiler_metadata_columns:
        frame[col] = frame[col].astype(str)

In [10]:
# Shapes going into the merge (no sorting is performed in this notebook, so
# the labels say "before merging").
print(f"cellprofiler data shape before merging: {cellprofiler_data.shape}")
print(f"scDINO data shape before merging: {scdino_data.shape}")

# Inner join on the shared metadata keys: only rows whose full key is present
# in both the CellProfiler and the scDINO table are kept.
merged_df = pd.merge(
    cellprofiler_data,
    scdino_data,
    how="inner",
    on=cellprofiler_metadata_columns,
)
print(f"merged data shape: {merged_df.shape}")

# If a key occurs more than once after the join, keep only its last row.
merged_df = merged_df.drop_duplicates(
    subset=cellprofiler_metadata_columns,
    keep="last",
)
print(f"merged data shape after dropping duplicates: {merged_df.shape}")

cellprofiler data shape after sorting: (216151, 2336)
scDINO data shape after sorting: (188065, 1546)
merged data shape: (188065, 3873)
merged data shape after dropping duplicates: (188065, 3873)


In [11]:
# Drop rows in which EVERY column is NaN (`how="all"`). Rows that are only
# partially missing are kept.
print(f"merged_df shape before dropping all-NaN rows: {merged_df.shape}")
merged_df = merged_df.dropna(axis=0, how="all")
print(f"merged_df shape after dropping all-NaN rows: {merged_df.shape}")

# Write the combined profile to disk without the pandas index, then preview it.
merged_df.to_parquet(output_path, index=False)
merged_df.head()

merged_df shape: (188065, 3873)
merged_df shape: (188065, 3873)


Unnamed: 0,Metadata_plate,Metadata_Well,Metadata_number_of_singlecells,Metadata_compound,Metadata_dose,Metadata_control,Metadata_ImageNumber,Metadata_FOV,Metadata_Time,Metadata_Cells_Number_Object_Number,...,channel_DNA_cls_feature_90_scDINO,channel_DNA_cls_feature_91_scDINO,channel_DNA_cls_feature_92_scDINO,channel_DNA_cls_feature_93_scDINO,channel_DNA_cls_feature_94_scDINO,channel_DNA_cls_feature_95_scDINO,channel_DNA_cls_feature_96_scDINO,channel_DNA_cls_feature_97_scDINO,channel_DNA_cls_feature_98_scDINO,channel_DNA_cls_feature_99_scDINO
0,1,C-09,168,Staurosporine,39.06,positive,1,2,0.0,7,...,0.010555,-0.029674,-0.048203,0.030452,-0.091131,0.030641,0.043315,0.036352,0.043984,0.022231
1,1,C-09,168,Staurosporine,39.06,positive,1,2,0.0,9,...,-0.007216,0.010287,-0.03158,0.037308,-0.075982,0.011744,-0.021006,0.074796,0.023915,-0.014053
2,1,C-09,168,Staurosporine,39.06,positive,1,2,0.0,10,...,-0.0208,-0.00798,-0.021566,0.042379,-0.095577,-0.016842,0.021705,0.038181,0.037327,-0.001478
3,1,C-09,168,Staurosporine,39.06,positive,1,2,0.0,11,...,-0.006244,-0.063925,-0.032741,0.046174,-0.123933,0.014068,-0.016183,0.002253,0.053429,-0.004423
4,1,C-09,168,Staurosporine,39.06,positive,1,2,0.0,12,...,-0.028743,-0.025415,-0.011485,0.019648,-0.130011,-0.010214,0.030637,-0.035683,-0.000255,-0.01357
