This notebook performs a fuzzy match on single-cells from the profile data to the endpoint data.
This is necessary because  the endpoint data was not included in the tracking module.
Further this notebook provides information about how long the cell track is.

In [1]:
import pathlib
import time

import numpy as np
import pandas as pd
from scipy.spatial.distance import euclidean

try:
    cfg = get_ipython().config
    in_notebook = True
except NameError:
    in_notebook = False
if in_notebook:
    from tqdm.notebook import tqdm
else:
    from tqdm import tqdm

In [2]:
sc_profile_file_path = pathlib.Path(
    "../../data/CP_scDINO_features/combined_CP_scDINO_norm_fs.parquet"
).resolve(strict=True)
endpoint_sc_profile_file_path = pathlib.Path(
    "../../data/CP_feature_select/endpoints/features_selected_profile.parquet"
).resolve(strict=True)
sc_profile_df = pd.read_parquet(sc_profile_file_path)
endpoint_sc_profile_df = pd.read_parquet(endpoint_sc_profile_file_path)
endpoint_sc_profile_df["Metadata_Well_FOV"] = (
    endpoint_sc_profile_df["Metadata_Well"].astype(str)
    + "_"
    + endpoint_sc_profile_df["Metadata_FOV"].astype(str)
)
print(endpoint_sc_profile_df.shape)
sc_profile_df["Metadata_sc_unique_track_id"] = (
    sc_profile_df["Metadata_Well"].astype(str)
    + "_"
    + sc_profile_df["Metadata_FOV"].astype(str)
    + "_"
    + sc_profile_df["Metadata_track_id"].astype(str)
)
sc_profile_df["Metadata_Well_FOV"] = (
    sc_profile_df["Metadata_Well"].astype(str)
    + "_"
    + sc_profile_df["Metadata_FOV"].astype(str)
)
print(sc_profile_df.shape)
sc_profile_df.head()

(9918, 544)
(188065, 2375)


Unnamed: 0,Metadata_plate,Metadata_Well,Metadata_number_of_singlecells,Metadata_compound,Metadata_dose,Metadata_control,Metadata_ImageNumber,Metadata_FOV,Metadata_Time,Metadata_Cells_Number_Object_Number,...,channel_DNA_cls_feature_93_scDINO,channel_DNA_cls_feature_94_scDINO,channel_DNA_cls_feature_95_scDINO,channel_DNA_cls_feature_96_scDINO,channel_DNA_cls_feature_97_scDINO,channel_DNA_cls_feature_98_scDINO,channel_DNA_cls_feature_99_scDINO,channel_DNA_cls_feature_9_scDINO,Metadata_sc_unique_track_id,Metadata_Well_FOV
0,1,C-09,168,Staurosporine,39.06,positive,1,2,0.0,7,...,-0.085856,0.080005,0.918828,1.373562,0.980376,-0.12352,2.401852,1.516202,C-09_0002_5,C-09_0002
1,1,C-09,168,Staurosporine,39.06,positive,1,2,0.0,9,...,0.358887,0.486711,0.074517,-1.604183,2.407552,-0.835988,-0.264486,0.153676,C-09_0002_6,C-09_0002
2,1,C-09,168,Staurosporine,39.06,positive,1,2,0.0,10,...,0.687862,-0.039333,-1.202663,0.373103,1.048282,-0.359857,0.659583,0.537619,C-09_0002_7,C-09_0002
3,1,C-09,168,Staurosporine,39.06,positive,1,2,0.0,11,...,0.934017,-0.800594,0.178355,-1.380925,-0.285477,0.211796,0.443178,1.129714,C-09_0002_8,C-09_0002
4,1,C-09,168,Staurosporine,39.06,positive,1,2,0.0,12,...,-0.786748,-0.963768,-0.90653,0.786611,-1.693796,-1.694061,-0.22899,0.648714,C-09_0002_9,C-09_0002


In [3]:
# drop all nan values in the location columns
endpoint_sc_profile_df = endpoint_sc_profile_df.dropna(
    subset=["Metadata_Nuclei_Location_Center_X", "Metadata_Nuclei_Location_Center_Y"]
)

In [4]:
last_time_point_df = sc_profile_df.loc[
    sc_profile_df["Metadata_Well_FOV"].isin(
        endpoint_sc_profile_df["Metadata_Well_FOV"].unique()
    )
]

In [5]:
# chunk the dataframe by well_fov so that there are fewer indexed records to search at once during fuzzy matching
dict_of_sc_well_fovs = {}
for well_fov in last_time_point_df["Metadata_Well_FOV"].unique():
    dict_of_sc_well_fovs[well_fov] = last_time_point_df[
        last_time_point_df["Metadata_Well_FOV"] == well_fov
    ].copy()
    # get only the last timepoint of the track

    dict_of_sc_well_fovs[well_fov].reset_index(drop=True, inplace=True)
dict_of_sc_well_fovs_endpoint = {}
for well_fov in endpoint_sc_profile_df["Metadata_Well_FOV"].unique():
    dict_of_sc_well_fovs_endpoint[well_fov] = endpoint_sc_profile_df[
        endpoint_sc_profile_df["Metadata_Well_FOV"] == well_fov
    ].copy()
    dict_of_sc_well_fovs_endpoint[well_fov].reset_index(drop=True, inplace=True)

In [6]:
start_time = time.time()

In [7]:
for well_fov in tqdm(list(dict_of_sc_well_fovs.keys()), desc="Processing Well-FOVs"):
    for i, row in tqdm(
        dict_of_sc_well_fovs[well_fov].iterrows(),
        total=len(dict_of_sc_well_fovs[well_fov]),
        desc="Outer Loop",
        leave=False,
    ):
        for j, row2 in dict_of_sc_well_fovs_endpoint[well_fov].iterrows():
            # check that the well_fov is the same
            if row["Metadata_Well_FOV"] == row2["Metadata_Well_FOV"]:
                distance = abs(
                    euclidean(
                        [
                            row["Metadata_Nuclei_Location_Center_X"],
                            row["Metadata_Nuclei_Location_Center_Y"],
                        ],
                        [
                            row2["Metadata_Nuclei_Location_Center_X"],
                            row2["Metadata_Nuclei_Location_Center_Y"],
                        ],
                    )
                )
                if distance < 10:
                    dict_of_sc_well_fovs_endpoint[well_fov].at[
                        j, "Metadata_sc_unique_track_id"
                    ] = row["Metadata_sc_unique_track_id"]

Processing Well-FOVs:   0%|          | 0/117 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1406 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1714 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1455 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1540 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1780 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1493 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1770 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1380 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1679 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1589 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1856 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1715 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1741 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1513 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1207 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1412 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1710 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1525 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1448 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1262 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1752 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1552 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1697 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1671 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1319 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1654 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1455 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1581 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1362 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1491 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1612 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1673 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1702 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1759 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1691 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1282 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1581 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1649 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1569 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1909 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1460 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1058 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1408 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1427 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1606 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1620 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1687 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1635 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1712 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1467 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1695 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1525 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1597 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1201 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1599 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1561 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1805 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1576 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1559 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1750 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1280 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1362 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1465 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1417 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1572 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1630 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1199 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1650 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1664 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1607 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1623 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1604 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1740 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1798 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1457 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1589 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1278 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1214 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1876 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1590 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1565 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1537 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1189 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/2026 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1719 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1616 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1622 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1415 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1624 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1736 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1805 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1604 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1497 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1470 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1833 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1678 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1562 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1403 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1240 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1596 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1614 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1818 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1077 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1709 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1517 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1248 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1792 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1698 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1165 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1339 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1794 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1939 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1450 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/2010 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1212 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1951 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1413 [00:00<?, ?it/s]

In [8]:
print("Fuzzy matching completed!")
print(f"Took {time.time() - start_time} seconds")
print(f"Took {round((time.time() - start_time) / 60, 2)} minutes")
print(f"Took {round((time.time() - start_time) / 3600, 2)} hours")

Fuzzy matching completed!
Took 764.6259286403656 seconds
Took 12.74 minutes
Took 0.21 hours


In [9]:
sc_well_fovs_endpoint_df = pd.concat(
    dict_of_sc_well_fovs_endpoint.values(), ignore_index=True
)
# drop the rows where Metadata_sc_unique_track_id is NaN
sc_well_fovs_endpoint_df = sc_well_fovs_endpoint_df.dropna(
    subset=["Metadata_sc_unique_track_id"]
)
print(sc_well_fovs_endpoint_df.shape)
sc_well_fovs_endpoint_df.reset_index(drop=True, inplace=True)
sc_well_fovs_endpoint_df["Metadata_Time"] = 13.0
sc_well_fovs_endpoint_df.head()

(5767, 545)


Unnamed: 0,Metadata_plate,Metadata_Well,Metadata_number_of_singlecells,Metadata_compound,Metadata_dose,Metadata_control,Metadata_ImageNumber,Metadata_FOV,Metadata_Time,Metadata_Cells_Number_Object_Number,...,Nuclei_Texture_Correlation_AnnexinV_3_02_256,Nuclei_Texture_Correlation_AnnexinV_3_03_256,Nuclei_Texture_Correlation_DNA_3_02_256,Nuclei_Texture_DifferenceVariance_AnnexinV_3_01_256,Nuclei_Texture_InverseDifferenceMoment_AnnexinV_3_03_256,Nuclei_Texture_InverseDifferenceMoment_DNA_3_03_256,Nuclei_Texture_SumAverage_AnnexinV_3_00_256,Nuclei_Texture_SumAverage_DNA_3_01_256,Metadata_Well_FOV,Metadata_sc_unique_track_id
0,1,C-09,153,Staurosporine,39.06,positive,1,2,13.0,3.0,...,-0.471301,-0.589558,-1.321561,0.652602,0.668286,0.260828,0.034058,0.566404,C-09_0002,C-09_0002_32
1,1,C-09,153,Staurosporine,39.06,positive,1,2,13.0,6.0,...,-0.832951,-0.666071,0.643979,-1.267717,-1.853163,0.47445,-1.437757,0.347586,C-09_0002,C-09_0002_52
2,1,C-09,153,Staurosporine,39.06,positive,1,2,13.0,22.0,...,0.167245,-0.055811,-0.535309,-0.677362,0.194116,-0.309046,0.545504,2.499717,C-09_0002,C-09_0002_115
3,1,C-09,153,Staurosporine,39.06,positive,1,2,13.0,25.0,...,0.167245,-0.055811,-0.535309,-0.677362,0.194116,-0.309046,0.545504,2.499717,C-09_0002,C-09_0002_115
4,1,C-09,153,Staurosporine,39.06,positive,1,2,13.0,27.0,...,0.707274,-0.882432,-0.685778,0.05484,0.505591,-0.37758,0.083942,3.070313,C-09_0002,C-09_0002_104


In [10]:
# map the value counts to a new column for each Metadata_sc_unique_track_id
sc_profile_df["Metadata_sc_unique_track_id_count"] = sc_profile_df[
    "Metadata_sc_unique_track_id"
].map(sc_profile_df["Metadata_sc_unique_track_id"].value_counts())
sc_profile_df.head()

Unnamed: 0,Metadata_plate,Metadata_Well,Metadata_number_of_singlecells,Metadata_compound,Metadata_dose,Metadata_control,Metadata_ImageNumber,Metadata_FOV,Metadata_Time,Metadata_Cells_Number_Object_Number,...,channel_DNA_cls_feature_94_scDINO,channel_DNA_cls_feature_95_scDINO,channel_DNA_cls_feature_96_scDINO,channel_DNA_cls_feature_97_scDINO,channel_DNA_cls_feature_98_scDINO,channel_DNA_cls_feature_99_scDINO,channel_DNA_cls_feature_9_scDINO,Metadata_sc_unique_track_id,Metadata_Well_FOV,Metadata_sc_unique_track_id_count
0,1,C-09,168,Staurosporine,39.06,positive,1,2,0.0,7,...,0.080005,0.918828,1.373562,0.980376,-0.12352,2.401852,1.516202,C-09_0002_5,C-09_0002,2
1,1,C-09,168,Staurosporine,39.06,positive,1,2,0.0,9,...,0.486711,0.074517,-1.604183,2.407552,-0.835988,-0.264486,0.153676,C-09_0002_6,C-09_0002,5
2,1,C-09,168,Staurosporine,39.06,positive,1,2,0.0,10,...,-0.039333,-1.202663,0.373103,1.048282,-0.359857,0.659583,0.537619,C-09_0002_7,C-09_0002,2
3,1,C-09,168,Staurosporine,39.06,positive,1,2,0.0,11,...,-0.800594,0.178355,-1.380925,-0.285477,0.211796,0.443178,1.129714,C-09_0002_8,C-09_0002,8
4,1,C-09,168,Staurosporine,39.06,positive,1,2,0.0,12,...,-0.963768,-0.90653,0.786611,-1.693796,-1.694061,-0.22899,0.648714,C-09_0002_9,C-09_0002,13


In [11]:
# write the cleaned dataframe to a parquet file
output_sc_file_path = pathlib.Path("../results/cleaned_sc_profile.parquet").resolve(
    strict=False
)
output_sc_endpoint_file_path = pathlib.Path(
    "../results/cleaned_endpoint_sc_profile.parquet"
).resolve(strict=False)
output_sc_file_path.parent.mkdir(parents=True, exist_ok=True)

# we save the two profiles separately because they have different feature spaces

sc_profile_df.to_parquet(output_sc_file_path, index=False)
sc_well_fovs_endpoint_df.to_parquet(output_sc_endpoint_file_path, index=False)