This notebook performs a fuzzy match of single cells from the profile data to the endpoint data.
This is necessary because the endpoint data was not included in the tracking module.
Furthermore, this notebook provides information about how long each cell track is.

In [1]:
import pathlib

import numpy as np
import pandas as pd

# Choose the tqdm flavour for the current front end: `get_ipython` being
# undefined (NameError) is treated as "not running inside a notebook".
try:
    cfg = get_ipython().config
except NameError:
    in_notebook = False
else:
    in_notebook = True

# The widget-based progress bar is used in notebooks, the plain one otherwise.
if not in_notebook:
    from tqdm import tqdm
else:
    from tqdm.notebook import tqdm

In [2]:
# --- locate the input profiles (strict=True raises if a file is missing) ---
sc_profile_file_path = pathlib.Path(
    "../../data/CP_scDINO_features/combined_CP_scDINO_norm_fs.parquet"
).resolve(strict=True)
endpoint_sc_profile_file_path = pathlib.Path(
    "../../data/CP_feature_select/endpoints/features_selected_profile.parquet"
).resolve(strict=True)

# --- load both single-cell profile tables ---
sc_profile_df = pd.read_parquet(sc_profile_file_path)
endpoint_sc_profile_df = pd.read_parquet(endpoint_sc_profile_file_path)

# --- endpoint profiles: add the "<well>_<fov>" identifier ---
endpoint_well = endpoint_sc_profile_df["Metadata_Well"].astype(str)
endpoint_fov = endpoint_sc_profile_df["Metadata_FOV"].astype(str)
endpoint_sc_profile_df["Metadata_Well_FOV"] = endpoint_well + "_" + endpoint_fov
print(endpoint_sc_profile_df.shape)

# --- timelapse profiles: add "<well>_<fov>_<track>" and "<well>_<fov>" ---
# The track id column is written first so the column order stays
# Metadata_sc_unique_track_id, Metadata_Well_FOV.
sc_well_fov = (
    sc_profile_df["Metadata_Well"].astype(str)
    + "_"
    + sc_profile_df["Metadata_FOV"].astype(str)
)
sc_profile_df["Metadata_sc_unique_track_id"] = (
    sc_well_fov + "_" + sc_profile_df["Metadata_track_id"].astype(str)
)
sc_profile_df["Metadata_Well_FOV"] = sc_well_fov

# --- cast the time column to int in both tables (via float) ---
for profile_df in (sc_profile_df, endpoint_sc_profile_df):
    profile_df["Metadata_Time"] = (
        profile_df["Metadata_Time"].astype(float).astype(int)
    )

print(sc_profile_df.shape)
sc_profile_df.head()

(9918, 544)
(188065, 2380)


Unnamed: 0,Metadata_plate,Metadata_Well,Metadata_number_of_singlecells,Metadata_compound,Metadata_dose,Metadata_control,Metadata_ImageNumber,Metadata_FOV,Metadata_Time,Metadata_Cells_Number_Object_Number,...,channel_DNA_cls_feature_98_scDINO,channel_DNA_cls_feature_99_scDINO,channel_DNA_cls_feature_9_scDINO,Metadata_Image_FileName_CL_488_1_crop,Metadata_Image_FileName_CL_488_2_crop,Metadata_Image_FileName_CL_561_crop,Metadata_Image_FileName_DNA_crop,Metadata_parent_path,Metadata_sc_unique_track_id,Metadata_Well_FOV
0,1,C-09,168,Staurosporine,39.06,positive,1,2,0,7,...,-0.12352,2.401852,1.516202,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,C-09_0002_5,C-09_0002
1,1,C-09,168,Staurosporine,39.06,positive,1,2,0,9,...,-0.835988,-0.264486,0.153676,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,C-09_0002_6,C-09_0002
2,1,C-09,168,Staurosporine,39.06,positive,1,2,0,10,...,-0.359857,0.659583,0.537619,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,C-09_0002_7,C-09_0002
3,1,C-09,168,Staurosporine,39.06,positive,1,2,0,11,...,0.211796,0.443178,1.129714,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,C-09_0002_8,C-09_0002
4,1,C-09,168,Staurosporine,39.06,positive,1,2,0,12,...,-1.694061,-0.22899,0.648714,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,C-09_0002_9,C-09_0002


In [3]:
# Keep only timelapse rows that have no missing value in any column,
# renumbering the index from 0, and report how many rows were removed.
original_sc_profile_shape = sc_profile_df.shape
sc_profile = sc_profile_df.dropna(how="any").reset_index(drop=True)
n_sc_rows_dropped = original_sc_profile_shape[0] - sc_profile.shape[0]
print(f"Dropped {n_sc_rows_dropped} rows with NaN values")

Dropped 2563 rows with NaN values


In [4]:
# Keep only endpoint rows that have no missing value in any column,
# renumbering the index from 0, and report how many rows were removed.
# NOTE(review): `endpoint_sc_profile` is not referenced by the later cells
# visible in this notebook — confirm whether it is still needed.
original_endpoint_sc_profile_shape = endpoint_sc_profile_df.shape
endpoint_sc_profile = endpoint_sc_profile_df.dropna(how="any").reset_index(drop=True)
n_endpoint_rows_dropped = (
    original_endpoint_sc_profile_shape[0] - endpoint_sc_profile.shape[0]
)
print(f"Dropped {n_endpoint_rows_dropped} rows with NaN values")

Dropped 554 rows with NaN values


In [5]:
# Remove endpoint cells whose nuclei centre coordinates are missing.
nuclei_location_columns = [
    "Metadata_Nuclei_Location_Center_X",
    "Metadata_Nuclei_Location_Center_Y",
]
endpoint_sc_profile_df = endpoint_sc_profile_df.dropna(subset=nuclei_location_columns)

In [6]:
# Select the NaN-free timelapse rows recorded at the largest Metadata_Time.
last_timepoint = sc_profile["Metadata_Time"].max()
is_last_timepoint = sc_profile["Metadata_Time"] == last_timepoint
final_timepoint_df = sc_profile.loc[is_last_timepoint]

In [7]:
# Build the "<well>_<fov>_<track>" identifier for every endpoint cell.
endpoint_well_str = endpoint_sc_profile_df["Metadata_Well"].astype(str)
endpoint_fov_str = endpoint_sc_profile_df["Metadata_FOV"].astype(str)
endpoint_track_str = endpoint_sc_profile_df["Metadata_track_id"].astype(str)
endpoint_sc_profile_df["Metadata_sc_unique_track_id"] = (
    endpoint_well_str + "_" + endpoint_fov_str + "_" + endpoint_track_str
)

In [8]:
# Match final-timepoint timelapse cells to endpoint cells with an inner merge
# on well, FOV and the unique track id, then collect the track ids that
# obtained a match.
print(final_timepoint_df.shape)
print(endpoint_sc_profile_df.shape)
# key columns the two frames are joined on
common_columns = [
    "Metadata_Well",
    "Metadata_FOV",
    "Metadata_sc_unique_track_id",
]
# Harmonise the key dtypes: when the two sides disagree, compare them as
# strings so the merge keys are of the same type.
for col in common_columns:
    if col in final_timepoint_df.columns and col in endpoint_sc_profile_df.columns:
        dtype1 = final_timepoint_df[col].dtype
        dtype2 = endpoint_sc_profile_df[col].dtype

        if dtype1 != dtype2:
            print(f"Converting {col} from {dtype1}, {dtype2} to string")
            # Rebind through `assign` instead of writing into the frames:
            # final_timepoint_df is a filtered selection of sc_profile, and
            # setting a column on it raises SettingWithCopyWarning on pandas
            # versions without copy-on-write.
            final_timepoint_df = final_timepoint_df.assign(
                **{col: final_timepoint_df[col].astype(str)}
            )
            endpoint_sc_profile_df = endpoint_sc_profile_df.assign(
                **{col: endpoint_sc_profile_df[col].astype(str)}
            )

# Non-key columns present in both frames receive pandas' default _x/_y suffixes.
merged_df = pd.merge(
    final_timepoint_df,
    endpoint_sc_profile_df,
    on=common_columns,
    how="inner",
)

# numpy arrays are unhashable and would make drop_duplicates fail, so they are
# replaced by their string form. Only object-dtype columns can hold arrays, so
# the per-cell Python check is limited to those columns.
array_candidate_columns = merged_df.select_dtypes(include="object").columns
for col in array_candidate_columns:
    merged_df[col] = merged_df[col].apply(
        lambda x: str(x) if isinstance(x, np.ndarray) else x
    )

merged_df = merged_df.drop_duplicates()
print("Merged df shape:")
print(merged_df.shape)
# Track ids at the final timepoint that have a matching endpoint cell.
Metadata_sc_unique_track_ids_with_ground_truth = merged_df[
    "Metadata_sc_unique_track_id"
].unique()
print(
    f"Number of unique tracks with ground truth: {len(Metadata_sc_unique_track_ids_with_ground_truth)}"
)

(15099, 2380)
(9918, 545)
Merged df shape:
(8744, 2922)
Number of unique tracks with ground truth: 7541


In [9]:
# Store the matched track ids as a one-column parquet table.
ground_truth_ids_file_path = pathlib.Path("../results/ground_truth_ids.parquet")
# create the results directory on first use
ground_truth_ids_file_path.parent.mkdir(parents=True, exist_ok=True)

ground_truth_ids_df = pd.DataFrame(
    Metadata_sc_unique_track_ids_with_ground_truth,
    columns=["Metadata_sc_unique_track_id"],
)
ground_truth_ids_df.to_parquet(ground_truth_ids_file_path, index=False)

In [10]:
# Write both annotated profile tables to the results directory.
# NOTE(review): the frames saved here are sc_profile_df and
# endpoint_sc_profile_df, not the all-NaN-dropped sc_profile /
# endpoint_sc_profile built earlier — confirm this is intended.

# timelapse profiles
cleaned_sc_profile_file_path = pathlib.Path(
    "../results/cleaned_timelapse_profiles.parquet"
).resolve()
sc_profile_df.to_parquet(cleaned_sc_profile_file_path, index=False)

# endpoint profiles (this rebinds the name that held the input path)
endpoint_sc_profile_file_path = pathlib.Path(
    "../results/cleaned_endpoint_profiles.parquet"
).resolve()
endpoint_sc_profile_df.to_parquet(endpoint_sc_profile_file_path, index=False)