In [1]:
import pathlib
import time

import numpy as np
import pandas as pd
from scipy.spatial.distance import euclidean

try:
    cfg = get_ipython().config
    in_notebook = True
except NameError:
    in_notebook = False
if in_notebook:
    from tqdm.notebook import tqdm
else:
    from tqdm import tqdm

In [2]:
sc_profile_file_path = pathlib.Path(
    "../../data/CP_scDINO_features/combined_CP_scDINO_norm_fs.parquet"
).resolve(strict=True)
endpoint_sc_profile_file_path = pathlib.Path(
    "../../data/CP_feature_select/endpoints/features_selected_profile.parquet"
).resolve(strict=True)
sc_profile_df = pd.read_parquet(sc_profile_file_path)
endpoint_sc_profile_df = pd.read_parquet(endpoint_sc_profile_file_path)
endpoint_sc_profile_df["Metadata_Well_FOV"] = (
    endpoint_sc_profile_df["Metadata_Well"].astype(str)
    + "_"
    + endpoint_sc_profile_df["Metadata_FOV"].astype(str)
)
print(endpoint_sc_profile_df.shape)
sc_profile_df["Metadata_sc_unique_track_id"] = (
    sc_profile_df["Metadata_Well"].astype(str)
    + "_"
    + sc_profile_df["Metadata_FOV"].astype(str)
    + "_"
    + sc_profile_df["Metadata_track_id"].astype(str)
)
sc_profile_df["Metadata_Well_FOV"] = (
    sc_profile_df["Metadata_Well"].astype(str)
    + "_"
    + sc_profile_df["Metadata_FOV"].astype(str)
)
print(sc_profile_df.shape)
sc_profile_df.head()

(144324, 367)
(182804, 2375)


Unnamed: 0,Metadata_plate,Metadata_Well,Metadata_number_of_singlecells,Metadata_compound,Metadata_dose,Metadata_control,Metadata_ImageNumber,Metadata_FOV,Metadata_Time,Metadata_Cells_Number_Object_Number,...,channel_DNA_cls_feature_93_scDINO,channel_DNA_cls_feature_94_scDINO,channel_DNA_cls_feature_95_scDINO,channel_DNA_cls_feature_96_scDINO,channel_DNA_cls_feature_97_scDINO,channel_DNA_cls_feature_98_scDINO,channel_DNA_cls_feature_99_scDINO,channel_DNA_cls_feature_9_scDINO,Metadata_sc_unique_track_id,Metadata_Well_FOV
0,1,C-02,180,Staurosporine,0.0,negative,1,1,0.0,101,...,1.589703,0.313944,1.126927,-0.143103,0.241127,-0.293259,-0.283715,1.434163,C-02_0001_17,C-02_0001
1,1,C-02,180,Staurosporine,0.0,negative,1,1,0.0,111,...,-1.208776,0.10275,0.845704,0.08393,-1.990931,-0.030848,-1.033722,-0.942127,C-02_0001_18,C-02_0001
2,1,C-02,180,Staurosporine,0.0,negative,1,1,0.0,11,...,-0.075728,0.810937,0.30094,-0.22878,1.782329,0.153739,-0.763335,0.725093,C-02_0001_5,C-02_0001
3,1,C-02,180,Staurosporine,0.0,negative,1,1,0.0,128,...,0.509754,-0.711263,0.067196,-0.149771,1.40565,0.063245,2.16211,3.187469,C-02_0001_19,C-02_0001
4,1,C-02,180,Staurosporine,0.0,negative,1,1,0.0,132,...,0.344723,-0.25113,-1.851114,0.669517,-0.439855,1.576201,0.747753,0.895601,C-02_0001_20,C-02_0001


In [3]:
# drop all nan values in the location columns
endpoint_sc_profile_df = endpoint_sc_profile_df.dropna(
    subset=["Metadata_Nuclei_Location_Center_X", "Metadata_Nuclei_Location_Center_Y"]
)

In [4]:
last_time_point_df = sc_profile_df.loc[
    sc_profile_df["Metadata_Well_FOV"].isin(
        endpoint_sc_profile_df["Metadata_Well_FOV"].unique()
    )
]

In [5]:
print(len(last_time_point_df["Metadata_Well_FOV"].unique()))
print(len(endpoint_sc_profile_df["Metadata_Well_FOV"].unique()))

80
80


In [6]:
dict_of_sc_well_fovs = {}
for well_fov in last_time_point_df["Metadata_Well_FOV"].unique():
    dict_of_sc_well_fovs[well_fov] = last_time_point_df[
        last_time_point_df["Metadata_Well_FOV"] == well_fov
    ].copy()
    # get only the last timepoint of the track

    dict_of_sc_well_fovs[well_fov].reset_index(drop=True, inplace=True)
dict_of_sc_well_fovs_endpoint = {}
for well_fov in endpoint_sc_profile_df["Metadata_Well_FOV"].unique():
    dict_of_sc_well_fovs_endpoint[well_fov] = endpoint_sc_profile_df[
        endpoint_sc_profile_df["Metadata_Well_FOV"] == well_fov
    ].copy()
    dict_of_sc_well_fovs_endpoint[well_fov].reset_index(drop=True, inplace=True)

In [7]:
start_time = time.time()

In [8]:
for well_fov in tqdm(list(dict_of_sc_well_fovs.keys()), desc="Processing Well-FOVs"):
    for i, row in tqdm(
        dict_of_sc_well_fovs[well_fov].iterrows(),
        total=len(dict_of_sc_well_fovs[well_fov]),
        desc="Outer Loop",
        leave=False,
    ):
        for j, row2 in dict_of_sc_well_fovs_endpoint[well_fov].iterrows():
            # check that the well_fov is the same
            if row["Metadata_Well_FOV"] == row2["Metadata_Well_FOV"]:
                distance = abs(
                    euclidean(
                        [
                            row["Metadata_Nuclei_Location_Center_X"],
                            row["Metadata_Nuclei_Location_Center_Y"],
                        ],
                        [
                            row2["Metadata_Nuclei_Location_Center_X"],
                            row2["Metadata_Nuclei_Location_Center_Y"],
                        ],
                    )
                )
                if distance < 10:
                    dict_of_sc_well_fovs_endpoint[well_fov].at[
                        j, "Metadata_sc_unique_track_id"
                    ] = row["Metadata_sc_unique_track_id"]

Processing Well-FOVs:   0%|          | 0/80 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1421 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1687 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1588 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1182 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1692 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1628 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1481 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1682 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1582 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1566 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1464 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1427 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1217 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1581 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1452 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1631 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1462 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1434 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1584 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1369 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1610 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1657 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1623 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1852 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1244 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1616 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1202 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1613 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1674 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1370 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1819 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1549 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1615 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1339 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1481 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1383 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1651 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1466 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/888 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1523 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1608 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1564 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1486 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1274 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1596 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1702 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1603 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1764 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1829 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1676 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1814 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1695 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1793 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1771 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1731 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1768 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/2036 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1718 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1752 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1526 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1618 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1617 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1533 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1613 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1906 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1843 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1600 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1766 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/2035 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1745 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1838 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1638 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1496 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1028 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1764 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1507 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1548 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1213 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1444 [00:00<?, ?it/s]

Outer Loop:   0%|          | 0/1524 [00:00<?, ?it/s]

In [9]:
print("Fuzzy matching completed!")
print(f"Took {time.time() - start_time} seconds")
print(f"Took {round((time.time() - start_time) / 60, 2)} minutes")
print(f"Took {round((time.time() - start_time) / 3600, 2)} hours")

Fuzzy matching completed!
Took 8285.907985687256 seconds
Took 138.1 minutes
Took 2.3 hours


In [10]:
sc_well_fovs_endpoint_df = pd.concat(
    dict_of_sc_well_fovs_endpoint.values(), ignore_index=True
)
# drop the rows where Metadata_sc_unique_track_id is NaN
sc_well_fovs_endpoint_df = sc_well_fovs_endpoint_df.dropna(
    subset=["Metadata_sc_unique_track_id"]
)
print(sc_well_fovs_endpoint_df.shape)
sc_well_fovs_endpoint_df.reset_index(drop=True, inplace=True)
sc_well_fovs_endpoint_df["Metadata_Time"] = 13.0
sc_well_fovs_endpoint_df.head()

(11340, 368)


Unnamed: 0,Metadata_plate,Metadata_Well,Metadata_number_of_singlecells,Metadata_compound,Metadata_dose,Metadata_control,Metadata_ImageNumber,Metadata_FOV,Metadata_Time,Metadata_Cells_Number_Object_Number,...,Cells_Texture_Correlation_AnnexinV_3_01_256,Cells_Texture_Correlation_AnnexinV_3_02_256,Cells_Texture_Correlation_AnnexinV_3_03_256,Cells_Texture_Correlation_DNA_3_02_256,Cells_Texture_DifferenceVariance_AnnexinV_3_02_256,Cells_Texture_InverseDifferenceMoment_AnnexinV_3_03_256,Cells_Texture_SumAverage_AnnexinV_3_00_256,Cells_Texture_SumAverage_DNA_3_01_256,Metadata_Well_FOV,Metadata_sc_unique_track_id
0,1,C-09,69,Staurosporine,39.06,positive,1,2,13.0,23,...,-0.105321,0.258335,0.245637,-0.370941,0.063992,0.382514,-0.135229,2.375118,C-09_0002,C-09_0002_129
1,1,C-09,69,Staurosporine,39.06,positive,1,2,13.0,25,...,-0.650287,-0.535347,-0.911682,-0.218577,0.378465,0.509536,-0.265312,3.65782,C-09_0002,C-09_0002_39
2,1,C-09,69,Staurosporine,39.06,positive,1,2,13.0,36,...,-0.153755,0.307227,-0.233445,-0.14766,-0.076404,0.218955,-0.069362,2.324912,C-09_0002,C-09_0002_66
3,1,C-09,69,Staurosporine,39.06,positive,1,2,13.0,38,...,0.270074,0.500432,0.445759,-0.1628,-0.105987,0.30409,-0.0258,2.870185,C-09_0002,C-09_0002_65
4,1,C-09,69,Staurosporine,39.06,positive,1,2,13.0,42,...,-0.312933,0.471374,-0.05758,-1.426125,-0.291901,0.369278,-0.142464,0.237744,C-09_0002,C-09_0002_75


In [11]:
# find the unique values in the Metadata_sc_unique_track_id column that have more than 11 time values
sc_profile_df["Metadata_sc_unique_track_id"].value_counts()
# map the value counts to a new column for each Metadata_sc_unique_track_id
sc_profile_df["Metadata_sc_unique_track_id_count"] = sc_profile_df[
    "Metadata_sc_unique_track_id"
].map(sc_profile_df["Metadata_sc_unique_track_id"].value_counts())
sc_profile_df.head()

Unnamed: 0,Metadata_plate,Metadata_Well,Metadata_number_of_singlecells,Metadata_compound,Metadata_dose,Metadata_control,Metadata_ImageNumber,Metadata_FOV,Metadata_Time,Metadata_Cells_Number_Object_Number,...,channel_DNA_cls_feature_94_scDINO,channel_DNA_cls_feature_95_scDINO,channel_DNA_cls_feature_96_scDINO,channel_DNA_cls_feature_97_scDINO,channel_DNA_cls_feature_98_scDINO,channel_DNA_cls_feature_99_scDINO,channel_DNA_cls_feature_9_scDINO,Metadata_sc_unique_track_id,Metadata_Well_FOV,Metadata_sc_unique_track_id_count
0,1,C-02,180,Staurosporine,0.0,negative,1,1,0.0,101,...,0.313944,1.126927,-0.143103,0.241127,-0.293259,-0.283715,1.434163,C-02_0001_17,C-02_0001,13
1,1,C-02,180,Staurosporine,0.0,negative,1,1,0.0,111,...,0.10275,0.845704,0.08393,-1.990931,-0.030848,-1.033722,-0.942127,C-02_0001_18,C-02_0001,13
2,1,C-02,180,Staurosporine,0.0,negative,1,1,0.0,11,...,0.810937,0.30094,-0.22878,1.782329,0.153739,-0.763335,0.725093,C-02_0001_5,C-02_0001,1
3,1,C-02,180,Staurosporine,0.0,negative,1,1,0.0,128,...,-0.711263,0.067196,-0.149771,1.40565,0.063245,2.16211,3.187469,C-02_0001_19,C-02_0001,13
4,1,C-02,180,Staurosporine,0.0,negative,1,1,0.0,132,...,-0.25113,-1.851114,0.669517,-0.439855,1.576201,0.747753,0.895601,C-02_0001_20,C-02_0001,6


In [12]:
# write the cleaned dataframe to a parquet file
output_sc_file_path = pathlib.Path("../results/cleaned_sc_profile.parquet").resolve(
    strict=False
)
output_sc_endpoint_file_path = pathlib.Path(
    "../results/cleaned_endpoint_sc_profile.parquet"
).resolve(strict=False)
output_sc_file_path.parent.mkdir(parents=True, exist_ok=True)

sc_profile_df.to_parquet(output_sc_file_path, index=False)
sc_well_fovs_endpoint_df.to_parquet(output_sc_endpoint_file_path, index=False)