This notebook performs a fuzzy match on single-cells from the profile data to the endpoint data.
This is necessary because  the endpoint data was not included in the tracking module.
Further this notebook provides information about how long the cell track is.

In [1]:
import pathlib
import time

import numpy as np
import pandas as pd
from scipy.spatial.distance import euclidean

try:
    cfg = get_ipython().config
    in_notebook = True
except NameError:
    in_notebook = False
if in_notebook:
    from tqdm.notebook import tqdm
else:
    from tqdm import tqdm

In [2]:
sc_profile_file_path = pathlib.Path(
    "../../data/CP_scDINO_features/combined_CP_scDINO_norm_fs.parquet"
).resolve(strict=True)
endpoint_sc_profile_file_path = pathlib.Path(
    "../../data/CP_feature_select/endpoints/features_selected_profile.parquet"
).resolve(strict=True)
sc_profile_df = pd.read_parquet(sc_profile_file_path)
endpoint_sc_profile_df = pd.read_parquet(endpoint_sc_profile_file_path)
endpoint_sc_profile_df["Metadata_Well_FOV"] = (
    endpoint_sc_profile_df["Metadata_Well"].astype(str)
    + "_"
    + endpoint_sc_profile_df["Metadata_FOV"].astype(str)
)
print(endpoint_sc_profile_df.shape)
sc_profile_df["Metadata_sc_unique_track_id"] = (
    sc_profile_df["Metadata_Well"].astype(str)
    + "_"
    + sc_profile_df["Metadata_FOV"].astype(str)
    + "_"
    + sc_profile_df["Metadata_track_id"].astype(str)
)
sc_profile_df["Metadata_Well_FOV"] = (
    sc_profile_df["Metadata_Well"].astype(str)
    + "_"
    + sc_profile_df["Metadata_FOV"].astype(str)
)
print(sc_profile_df.shape)
sc_profile_df.head()

(9918, 544)
(188065, 2380)


Unnamed: 0,Metadata_plate,Metadata_Well,Metadata_number_of_singlecells,Metadata_compound,Metadata_dose,Metadata_control,Metadata_ImageNumber,Metadata_FOV,Metadata_Time,Metadata_Cells_Number_Object_Number,...,channel_DNA_cls_feature_98_scDINO,channel_DNA_cls_feature_99_scDINO,channel_DNA_cls_feature_9_scDINO,Metadata_Image_FileName_CL_488_1_crop,Metadata_Image_FileName_CL_488_2_crop,Metadata_Image_FileName_CL_561_crop,Metadata_Image_FileName_DNA_crop,Metadata_parent_path,Metadata_sc_unique_track_id,Metadata_Well_FOV
0,1,C-09,168,Staurosporine,39.06,positive,1,2,0.0,7,...,-0.12352,2.401852,1.516202,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,C-09_0002_5,C-09_0002
1,1,C-09,168,Staurosporine,39.06,positive,1,2,0.0,9,...,-0.835988,-0.264486,0.153676,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,C-09_0002_6,C-09_0002
2,1,C-09,168,Staurosporine,39.06,positive,1,2,0.0,10,...,-0.359857,0.659583,0.537619,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,C-09_0002_7,C-09_0002
3,1,C-09,168,Staurosporine,39.06,positive,1,2,0.0,11,...,0.211796,0.443178,1.129714,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,C-09_0002_8,C-09_0002
4,1,C-09,168,Staurosporine,39.06,positive,1,2,0.0,12,...,-1.694061,-0.22899,0.648714,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,C-09_0002_9,C-09_0002


In [3]:
original_sc_profile_shape = sc_profile_df.shape
# drop rows with NaN values
sc_profile = sc_profile_df.dropna(how="any")
print(
    f"Dropped {original_sc_profile_shape[0] - sc_profile.shape[0]} rows with NaN values"
)
sc_profile.reset_index(drop=True, inplace=True)

Dropped 2563 rows with NaN values


In [4]:
original_endpoint_sc_profile_shape = endpoint_sc_profile_df.shape
# drop rows with NaN values
endpoint_sc_profile = endpoint_sc_profile_df.dropna(how="any")
print(
    f"Dropped {original_endpoint_sc_profile_shape[0] - endpoint_sc_profile.shape[0]} rows with NaN values"
)
endpoint_sc_profile.reset_index(drop=True, inplace=True)

Dropped 554 rows with NaN values


In [5]:
# drop all nan values in the location columns
endpoint_sc_profile_df = endpoint_sc_profile_df.dropna(
    subset=["Metadata_Nuclei_Location_Center_X", "Metadata_Nuclei_Location_Center_Y"]
)

In [6]:
last_time_point_df = sc_profile_df.loc[
    sc_profile_df["Metadata_Well_FOV"].isin(
        endpoint_sc_profile_df["Metadata_Well_FOV"].unique()
    )
]

In [7]:
# chunk the dataframe by well_fov so that there are fewer indexed records to search at once during fuzzy matching
dict_of_sc_well_fovs = {}
for well_fov in last_time_point_df["Metadata_Well_FOV"].unique():
    dict_of_sc_well_fovs[well_fov] = last_time_point_df[
        last_time_point_df["Metadata_Well_FOV"] == well_fov
    ].copy()
    # get only the last timepoint of the track

    dict_of_sc_well_fovs[well_fov].reset_index(drop=True, inplace=True)
dict_of_sc_well_fovs_endpoint = {}
for well_fov in endpoint_sc_profile_df["Metadata_Well_FOV"].unique():
    dict_of_sc_well_fovs_endpoint[well_fov] = endpoint_sc_profile_df[
        endpoint_sc_profile_df["Metadata_Well_FOV"] == well_fov
    ].copy()
    dict_of_sc_well_fovs_endpoint[well_fov].reset_index(drop=True, inplace=True)

In [8]:
start_time = time.time()

In [9]:
for well_fov in tqdm(list(dict_of_sc_well_fovs.keys()), desc="Processing Well-FOVs"):
    used_track_ids = set()  # Track which IDs have already been assigned

    # For each endpoint cell, find the best available match
    for j, endpoint_row in tqdm(
        dict_of_sc_well_fovs_endpoint[well_fov].iterrows(),
        total=len(dict_of_sc_well_fovs_endpoint[well_fov]),
        desc="Matching endpoint cells",
        leave=False,
    ):
        best_distance = float("inf")
        best_track_id = None

        # Find the closest available tracked cell to this endpoint cell
        for i, tracked_row in dict_of_sc_well_fovs[well_fov].iterrows():
            track_id = tracked_row["Metadata_sc_unique_track_id"]

            # Skip if this track ID has already been used
            if track_id in used_track_ids:
                continue

            if tracked_row["Metadata_Well_FOV"] == endpoint_row["Metadata_Well_FOV"]:
                distance = euclidean(
                    [
                        tracked_row["Metadata_Nuclei_Location_Center_X"],
                        tracked_row["Metadata_Nuclei_Location_Center_Y"],
                    ],
                    [
                        endpoint_row["Metadata_Nuclei_Location_Center_X"],
                        endpoint_row["Metadata_Nuclei_Location_Center_Y"],
                    ],
                )

                # Update best match if this is closer and within threshold
                if distance < 10 and distance < best_distance:
                    best_distance = distance
                    best_track_id = track_id

        # Assign the best match (if any) to this endpoint cell
        if best_track_id is not None:
            dict_of_sc_well_fovs_endpoint[well_fov].at[
                j, "Metadata_sc_unique_track_id"
            ] = best_track_id
            used_track_ids.add(best_track_id)  # Mark this track ID as used

Processing Well-FOVs:   0%|          | 0/117 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/51 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/111 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/75 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/100 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/129 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/94 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/116 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/85 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/52 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/99 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/82 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/47 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/64 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/86 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/48 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/87 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/65 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/88 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/31 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/65 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/104 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/38 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/115 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/116 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/35 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/106 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/74 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/100 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/92 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/112 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/114 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/94 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/105 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/119 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/113 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/27 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/116 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/99 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/105 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/88 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/89 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/22 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/52 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/73 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/107 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/116 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/121 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/111 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/115 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/33 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/112 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/41 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/111 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/70 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/110 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/98 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/111 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/99 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/86 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/121 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/83 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/75 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/102 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/88 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/90 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/111 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/43 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/46 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/122 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/84 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/95 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/88 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/59 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/125 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/87 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/107 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/28 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/60 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/116 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/95 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/55 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/100 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/43 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/106 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/99 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/41 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/116 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/82 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/102 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/74 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/117 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/98 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/101 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/76 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/128 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/112 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/53 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/39 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/63 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/57 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/98 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/101 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/21 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/118 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/47 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/78 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/67 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/46 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/25 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/29 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/123 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/120 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/90 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/68 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/61 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/128 [00:00<?, ?it/s]

Matching endpoint cells:   0%|          | 0/87 [00:00<?, ?it/s]

In [10]:
print("Fuzzy matching completed!")
print(f"Took {time.time() - start_time} seconds")
print(f"Took {round((time.time() - start_time) / 60, 2)} minutes")
print(f"Took {round((time.time() - start_time) / 3600, 2)} hours")

Fuzzy matching completed!
Took 1321.0126721858978 seconds
Took 22.02 minutes
Took 0.37 hours


In [11]:
sc_well_fovs_endpoint_df = pd.concat(
    dict_of_sc_well_fovs_endpoint.values(), ignore_index=True
)
# drop the rows where Metadata_sc_unique_track_id is NaN
sc_well_fovs_endpoint_df = sc_well_fovs_endpoint_df.dropna(
    subset=["Metadata_sc_unique_track_id"]
)
print(sc_well_fovs_endpoint_df.shape)
sc_well_fovs_endpoint_df.reset_index(drop=True, inplace=True)
sc_well_fovs_endpoint_df["Metadata_Time"] = 13.0
sc_well_fovs_endpoint_df.head()

(4979, 545)


Unnamed: 0,Metadata_plate,Metadata_Well,Metadata_number_of_singlecells,Metadata_compound,Metadata_dose,Metadata_control,Metadata_ImageNumber,Metadata_FOV,Metadata_Time,Metadata_Cells_Number_Object_Number,...,Nuclei_Texture_Correlation_AnnexinV_3_02_256,Nuclei_Texture_Correlation_AnnexinV_3_03_256,Nuclei_Texture_Correlation_DNA_3_02_256,Nuclei_Texture_DifferenceVariance_AnnexinV_3_01_256,Nuclei_Texture_InverseDifferenceMoment_AnnexinV_3_03_256,Nuclei_Texture_InverseDifferenceMoment_DNA_3_03_256,Nuclei_Texture_SumAverage_AnnexinV_3_00_256,Nuclei_Texture_SumAverage_DNA_3_01_256,Metadata_Well_FOV,Metadata_sc_unique_track_id
0,1,C-09,153,Staurosporine,39.06,positive,1,2,13.0,3.0,...,-0.471301,-0.589558,-1.321561,0.652602,0.668286,0.260828,0.034058,0.566404,C-09_0002,C-09_0002_32
1,1,C-09,153,Staurosporine,39.06,positive,1,2,13.0,6.0,...,-0.832951,-0.666071,0.643979,-1.267717,-1.853163,0.47445,-1.437757,0.347586,C-09_0002,C-09_0002_52
2,1,C-09,153,Staurosporine,39.06,positive,1,2,13.0,22.0,...,0.167245,-0.055811,-0.535309,-0.677362,0.194116,-0.309046,0.545504,2.499717,C-09_0002,C-09_0002_115
3,1,C-09,153,Staurosporine,39.06,positive,1,2,13.0,27.0,...,0.707274,-0.882432,-0.685778,0.05484,0.505591,-0.37758,0.083942,3.070313,C-09_0002,C-09_0002_104
4,1,C-09,153,Staurosporine,39.06,positive,1,2,13.0,31.0,...,-0.519578,-0.373536,-1.63672,-0.63538,0.294652,0.469701,0.303541,0.351592,C-09_0002,C-09_0002_167


In [12]:
# map the value counts to a new column for each Metadata_sc_unique_track_id
sc_profile_df["Metadata_sc_unique_track_id_count"] = sc_profile_df[
    "Metadata_sc_unique_track_id"
].map(sc_profile_df["Metadata_sc_unique_track_id"].value_counts())
sc_profile_df.head()

Unnamed: 0,Metadata_plate,Metadata_Well,Metadata_number_of_singlecells,Metadata_compound,Metadata_dose,Metadata_control,Metadata_ImageNumber,Metadata_FOV,Metadata_Time,Metadata_Cells_Number_Object_Number,...,channel_DNA_cls_feature_99_scDINO,channel_DNA_cls_feature_9_scDINO,Metadata_Image_FileName_CL_488_1_crop,Metadata_Image_FileName_CL_488_2_crop,Metadata_Image_FileName_CL_561_crop,Metadata_Image_FileName_DNA_crop,Metadata_parent_path,Metadata_sc_unique_track_id,Metadata_Well_FOV,Metadata_sc_unique_track_id_count
0,1,C-09,168,Staurosporine,39.06,positive,1,2,0.0,7,...,2.401852,1.516202,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,C-09_0002_5,C-09_0002,2
1,1,C-09,168,Staurosporine,39.06,positive,1,2,0.0,9,...,-0.264486,0.153676,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,C-09_0002_6,C-09_0002,5
2,1,C-09,168,Staurosporine,39.06,positive,1,2,0.0,10,...,0.659583,0.537619,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,C-09_0002_7,C-09_0002,2
3,1,C-09,168,Staurosporine,39.06,positive,1,2,0.0,11,...,0.443178,1.129714,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,C-09_0002_8,C-09_0002,8
4,1,C-09,168,Staurosporine,39.06,positive,1,2,0.0,12,...,-0.22899,0.648714,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,C-09_0002_9,C-09_0002,13


In [13]:
# write the cleaned dataframe to a parquet file
output_sc_file_path = pathlib.Path("../results/cleaned_sc_profile.parquet").resolve(
    strict=False
)
output_sc_endpoint_file_path = pathlib.Path(
    "../results/cleaned_endpoint_sc_profile.parquet"
).resolve(strict=False)
output_sc_file_path.parent.mkdir(parents=True, exist_ok=True)

# we save the two profiles separately because they have different feature spaces

sc_profile_df.to_parquet(output_sc_file_path, index=False)
sc_well_fovs_endpoint_df.to_parquet(output_sc_endpoint_file_path, index=False)

In [14]:
sc_well_fovs_endpoint_df.isna().sum().sum()

801

In [15]:
sc_profile_df.isna().sum().sum()

9529