This is quite the complex data splitting procedure.
The data is split into holdout data, training, validation, and testing.
The training and validation data contain only single cells that have ground truth labels at the terminal time point.
The testing and holdout data contain both cells that do and cells that do not have ground truth labels at the terminal time point.

In [1]:
import itertools
import pathlib

import numpy as np
import pandas as pd
from scipy.spatial.distance import euclidean
from sklearn.model_selection import train_test_split

# Detect whether this code runs inside an IPython/Jupyter session:
# `get_ipython` only exists as a name there, so a NameError means plain Python.
try:
    cfg = get_ipython().config
except NameError:
    in_notebook = False
else:
    in_notebook = True

# Pick the progress-bar flavour that matches the environment.
if not in_notebook:
    from tqdm import tqdm
else:
    from tqdm.notebook import tqdm

In [2]:
# Resolve every input location first; strict=True raises immediately when a file
# is missing instead of failing later inside the parquet reader.
(
    sc_file_path,
    sc_endpoint_file_path,
    train_test_wells_file_path,
) = (
    pathlib.Path(relative_location).resolve(strict=True)
    for relative_location in (
        "../results/cleaned_sc_profile.parquet",
        "../results/cleaned_endpoint_sc_profile.parquet",
        "../../5.bulk_timelapse_model/data_splits/train_test_wells.parquet",
    )
)

# Load the time-lapse single-cell profiles, the endpoint single-cell profiles,
# and the well-level split assignment reused from the bulk time-lapse model.
sc_profile = pd.read_parquet(sc_file_path)
sc_endpoint_profile = pd.read_parquet(sc_endpoint_file_path)
train_test_wells = pd.read_parquet(train_test_wells_file_path)

print(f"sc_profile shape: {sc_profile.shape}")
print(f"sc_endpoint_profile shape: {sc_endpoint_profile.shape}")

sc_profile shape: (185502, 2383)
sc_endpoint_profile shape: (4733, 545)


In [3]:
# preview the first rows of the endpoint single-cell profile
sc_endpoint_profile.head()

Unnamed: 0,Metadata_plate,Metadata_Well,Metadata_number_of_singlecells,Metadata_compound,Metadata_dose,Metadata_control,Metadata_ImageNumber,Metadata_FOV,Metadata_Time,Metadata_Cells_Number_Object_Number,...,Nuclei_Texture_Correlation_AnnexinV_3_02_256,Nuclei_Texture_Correlation_AnnexinV_3_03_256,Nuclei_Texture_Correlation_DNA_3_02_256,Nuclei_Texture_DifferenceVariance_AnnexinV_3_01_256,Nuclei_Texture_InverseDifferenceMoment_AnnexinV_3_03_256,Nuclei_Texture_InverseDifferenceMoment_DNA_3_03_256,Nuclei_Texture_SumAverage_AnnexinV_3_00_256,Nuclei_Texture_SumAverage_DNA_3_01_256,Metadata_Well_FOV,Metadata_sc_unique_track_id
0,1,C-09,153,Staurosporine,39.06,positive,1,2,13.0,3.0,...,-0.471301,-0.589558,-1.321561,0.652602,0.668286,0.260828,0.034058,0.566404,C-09_0002,C-09_0002_32
1,1,C-09,153,Staurosporine,39.06,positive,1,2,13.0,6.0,...,-0.832951,-0.666071,0.643979,-1.267717,-1.853163,0.47445,-1.437757,0.347586,C-09_0002,C-09_0002_52
2,1,C-09,153,Staurosporine,39.06,positive,1,2,13.0,22.0,...,0.167245,-0.055811,-0.535309,-0.677362,0.194116,-0.309046,0.545504,2.499717,C-09_0002,C-09_0002_115
3,1,C-09,153,Staurosporine,39.06,positive,1,2,13.0,27.0,...,0.707274,-0.882432,-0.685778,0.05484,0.505591,-0.37758,0.083942,3.070313,C-09_0002,C-09_0002_104
4,1,C-09,153,Staurosporine,39.06,positive,1,2,13.0,31.0,...,-0.519578,-0.373536,-1.63672,-0.63538,0.294652,0.469701,0.303541,0.351592,C-09_0002,C-09_0002_167


In [4]:
# preview the first rows of the time-lapse single-cell profile
sc_profile.head()

Unnamed: 0,Metadata_plate,Metadata_Well,Metadata_number_of_singlecells,Metadata_compound,Metadata_dose,Metadata_control,Metadata_ImageNumber,Metadata_FOV,Metadata_Time,Metadata_Cells_Number_Object_Number,...,Metadata_Image_FileName_CL_488_1_crop,Metadata_Image_FileName_CL_488_2_crop,Metadata_Image_FileName_CL_561_crop,Metadata_Image_FileName_DNA_crop,Metadata_parent_path,Metadata_sc_unique_track_id,Metadata_Well_FOV,Metadata_sc_unique_track_id_count,Metadata_ground_truth_present,Metadata_data_split
0,1,C-09,168,Staurosporine,39.06,positive,1,2,0.0,7,...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,C-09_0002_5,C-09_0002,2,False,train
1,1,C-09,168,Staurosporine,39.06,positive,1,2,0.0,9,...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,C-09_0002_6,C-09_0002,5,False,train
2,1,C-09,168,Staurosporine,39.06,positive,1,2,0.0,10,...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,C-09_0002_7,C-09_0002,2,False,train
3,1,C-09,168,Staurosporine,39.06,positive,1,2,0.0,11,...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,C-09_0002_8,C-09_0002,8,False,train
4,1,C-09,168,Staurosporine,39.06,positive,1,2,0.0,12,...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,C-09_0002_9,C-09_0002,13,True,train


In [5]:
# Drop rows that contain NaNs from both profiles and report how many were removed.
#
# The index is reset after dropping so that it is always a gap-free 0..n-1 range.
# This matters because the row index of sc_profile is later recorded as the
# "index" key of the data splits, and both profiles are written back with
# index=False: row labels therefore have to equal row positions, also in the
# case where rows really are dropped here.
before_shape = sc_profile.shape
print(f"sc_profile shape before dropping NaNs: {before_shape}")
sc_profile = sc_profile.dropna().reset_index(drop=True)
print(f"sc_profile shape after dropping NaNs: {sc_profile.shape}")
print(f"Dropped {before_shape[0] - sc_profile.shape[0]} rows with NaNs")

# same for endpoint profile
sc_endpoint_profile_before_shape = sc_endpoint_profile.shape
print(
    f"sc_endpoint_profile shape before dropping NaNs: {sc_endpoint_profile_before_shape}"
)
sc_endpoint_profile = sc_endpoint_profile.dropna().reset_index(drop=True)
print(f"sc_endpoint_profile shape after dropping NaNs: {sc_endpoint_profile.shape}")
print(
    f"Dropped {sc_endpoint_profile_before_shape[0] - sc_endpoint_profile.shape[0]} rows with NaNs"
)

sc_profile shape before dropping NaNs: (185502, 2383)
sc_profile shape after dropping NaNs: (185502, 2383)
Dropped 0 rows with NaNs
sc_endpoint_profile shape before dropping NaNs: (4733, 545)
sc_endpoint_profile shape after dropping NaNs: (4733, 545)
Dropped 0 rows with NaNs


In [6]:
# A cell "has ground truth" when its unique track id also occurs in the
# endpoint profile; store that as a boolean column on sc_profile.
endpoint_track_ids = sc_endpoint_profile["Metadata_sc_unique_track_id"]
tracked_to_endpoint = sc_profile["Metadata_sc_unique_track_id"].isin(
    endpoint_track_ids
)
sc_profile["Metadata_ground_truth_present"] = tracked_to_endpoint.astype(bool)

At this point there are two subsets of the dataset, which will be split into the following data splits:
- Single-cells that have a single cell ground truth
    - Holdout wells: 1/3 of wells
    - Train: 80% of all cells with a single cell ground truth from the non-holdout wells
    - Validation: 10% of all cells with a single cell ground truth from the non-holdout wells
    - Test: 10% of all cells with a single cell ground truth from the non-holdout wells
- Single-cells that do not have a single cell ground truth
    - Holdout wells: 1/3 of wells
    - Test: 100% of all cells without a single cell ground truth from the non-holdout wells


### hold out wells regardless of ground truth

In [7]:
# Accumulator for the split bookkeeping: every key starts with its own empty
# list, and later cells append one chunk (a list of values) per partition.
index_data_split_and_ground_truth_dict = {
    record_key: [] for record_key in ("index", "data_split", "ground_truth")
}

In [8]:
# Look up the well-level split for every cell via its well.
# NOTE(review): wells missing from train_test_wells map to NaN and therefore end
# up in `non_holdout_wells` below - confirm every well is expected to be listed.
well_to_split = train_test_wells.set_index("Metadata_Well")["data_split"]
sc_profile["Metadata_data_split"] = sc_profile["Metadata_Well"].map(well_to_split)

# Wells labelled "test" at the well level become the held-out wells here.
from_test_well = sc_profile["Metadata_data_split"] == "test"
sc_profile.loc[from_test_well, "Metadata_data_split"] = "well_holdout"

in_holdout_well = sc_profile["Metadata_data_split"] == "well_holdout"
holdout_df = sc_profile.loc[in_holdout_well]

# Record index, split label and ground-truth flag of the held-out cells.
for record_key, recorded_values in (
    ("index", holdout_df.index),
    ("data_split", holdout_df["Metadata_data_split"]),
    ("ground_truth", holdout_df["Metadata_ground_truth_present"]),
):
    index_data_split_and_ground_truth_dict[record_key].append(
        recorded_values.tolist()
    )

# Everything that is not held out continues on to the cell-level splitting.
non_holdout_wells = sc_profile.loc[~in_holdout_well]
# NOTE(review): the first message is labelled "sc_profile" but reports the shape
# of non_holdout_wells.
print(f"sc_profile shape after mapping data_split: {non_holdout_wells.shape}")
print(f"holdout_df shape: {holdout_df.shape}")

sc_profile shape after mapping data_split: (121129, 2383)
holdout_df shape: (64373, 2383)


### Cells that have a single cell ground truth

In [9]:
# Partition the non-holdout cells by whether a ground truth is available.
# Copies are taken because both frames get new column values assigned later.
ground_truth_flag = non_holdout_wells["Metadata_ground_truth_present"]
cell_w_ground_truth_df = non_holdout_wells[ground_truth_flag.eq(True)].copy()
cell_wout_ground_truth_df = non_holdout_wells[ground_truth_flag.eq(False)].copy()

print(f"cell_w_ground_truth_df shape: {cell_w_ground_truth_df.shape}")
print(f"cell_wout_ground_truth_df shape: {cell_wout_ground_truth_df.shape}")

cell_w_ground_truth_df shape: (32432, 2383)
cell_wout_ground_truth_df shape: (88697, 2383)


#### Train, validation, and test split of cells with ground truth

In [10]:
# 80 / 10 / 10 split of the cells with ground truth, stratified by well.
# Step 1 keeps 80% for training; step 2 halves the remaining 20% into test and
# validation (50% of 20% is 10% each).
# NOTE(review): rows are split individually, so rows sharing the same
# Metadata_sc_unique_track_id can presumably land in different splits - confirm
# this is intended.
train_sc_w_ground_truth_df, held_back_df = train_test_split(
    cell_w_ground_truth_df,
    test_size=0.2,
    stratify=cell_w_ground_truth_df["Metadata_Well"],
    random_state=0,
)
test_sc_w_ground_truth_df, val_sc_w_ground_truth_df = train_test_split(
    held_back_df,
    test_size=0.5,
    stratify=held_back_df["Metadata_Well"],
    random_state=0,
)

# (split label, frame, expected fraction of all cells with ground truth)
labelled_splits = (
    ("train", train_sc_w_ground_truth_df, 0.8),
    ("val", val_sc_w_ground_truth_df, 0.1),
    ("test", test_sc_w_ground_truth_df, 0.1),
)

# Stamp every partition with its split label; all of them have a ground truth.
for split_label, split_df, _ in labelled_splits:
    split_df["Metadata_data_split"] = split_label
    split_df["Metadata_ground_truth_present"] = True

print(f"train_sc_w_ground_truth_df shape: {train_sc_w_ground_truth_df.shape[0]}")
print(f"val_sc_w_ground_truth_df shape: {val_sc_w_ground_truth_df.shape[0]}")
print(f"test_sc_w_ground_truth_df shape: {test_sc_w_ground_truth_df.shape[0]}")

# Sanity checks: the partitions cover every cell and match the target ratios.
total_cells_w_ground_truth = cell_w_ground_truth_df.shape[0]
assert (
    sum(split_df.shape[0] for _, split_df, _ in labelled_splits)
    == total_cells_w_ground_truth
)
for _, split_df, expected_fraction in labelled_splits:
    assert (
        np.round(split_df.shape[0] / total_cells_w_ground_truth, 2)
        == expected_fraction
    )

# add to records (train, then val, then test)
for _, split_df, _ in labelled_splits:
    index_data_split_and_ground_truth_dict["index"].append(split_df.index.tolist())
    index_data_split_and_ground_truth_dict["data_split"].append(
        split_df["Metadata_data_split"].tolist()
    )
    index_data_split_and_ground_truth_dict["ground_truth"].append(
        split_df["Metadata_ground_truth_present"].tolist()
    )

train_sc_w_ground_truth_df shape: 25945
val_sc_w_ground_truth_df shape: 3244
test_sc_w_ground_truth_df shape: 3243


#### Non tracked cells

In [11]:
# Every non-holdout cell without a ground truth goes to the test split.
cell_wout_ground_truth_df["Metadata_data_split"] = "test"

# add to records
for record_key, recorded_values in (
    ("index", cell_wout_ground_truth_df.index),
    ("data_split", cell_wout_ground_truth_df["Metadata_data_split"]),
    ("ground_truth", cell_wout_ground_truth_df["Metadata_ground_truth_present"]),
):
    index_data_split_and_ground_truth_dict[record_key].append(
        recorded_values.tolist()
    )

print(f"test_sc_wo_ground_truth_df shape: {cell_wout_ground_truth_df.shape[0]}")

test_sc_wo_ground_truth_df shape: 88697


### Fetch the indices from each ground truth and data split and add the status back to sc_profile

In [12]:
# Flatten each list of per-partition chunks in the dictionary into one flat list.
# The flattened lists go into a NEW dict instead of overwriting the accumulator:
# flattening in place made this cell fail on a second run, because
# chain.from_iterable would then be iterating over scalars rather than chunks.
flattened_split_records = {
    record_key: list(itertools.chain.from_iterable(chunks))
    for record_key, chunks in index_data_split_and_ground_truth_dict.items()
}
data_split_data_df = pd.DataFrame.from_dict(
    flattened_split_records,
    orient="columns",
)

# Every row of sc_profile must have been assigned to exactly one partition.
assert (
    data_split_data_df.shape[0] == sc_profile.shape[0]
), "number of recorded split assignments differs from the number of rows in sc_profile"
assert data_split_data_df[
    "index"
].is_unique, "a row of sc_profile was recorded in more than one data split"

In [13]:
# Order the split assignments by the recorded row label and renumber the frame's
# own index from 0, keeping the label available as the "index" column.
data_split_data_df = data_split_data_df.sort_values(by="index").reset_index(
    drop=True
)
data_split_data_df.head()

Unnamed: 0,index,data_split,ground_truth
0,0,test,False
1,1,test,False
2,2,test,False
3,3,test,False
4,4,val,True


In [14]:
# Attach the final split label and ground-truth flag to every row of sc_profile.
#
# Rows are matched on the recorded row label (the "index" column of
# data_split_data_df) instead of on row position. A positional concat only lines
# up correctly when sc_profile's index is exactly 0..n-1; any gap in the index
# would pair cells with the wrong labels and introduce NaN rows.
split_labels_df = data_split_data_df.rename(
    columns={
        "data_split": "Metadata_data_split",
        "ground_truth": "Metadata_ground_truth_present",
    }
)
# use the recorded row label as the join key; the "index" column itself is kept
# so the resulting frame has the same columns as before
split_labels_df.index = split_labels_df["index"].to_numpy()

# The well-level split column and the original flag are replaced by the final
# per-cell values coming from split_labels_df.
sc_profile_with_data_splits_df = sc_profile.drop(
    columns=["Metadata_data_split", "Metadata_ground_truth_present"]
).join(split_labels_df, how="left")

assert (
    sc_profile_with_data_splits_df["Metadata_data_split"].notna().all()
), "some rows of sc_profile did not receive a data split label"

In [15]:
# total number of NaN entries across all columns of the merged frame
sc_profile_with_data_splits_df.isna().sum().sum()

0

In [16]:
# display the merged frame (profile columns plus the final split columns)
sc_profile_with_data_splits_df

Unnamed: 0,Metadata_plate,Metadata_Well,Metadata_number_of_singlecells,Metadata_compound,Metadata_dose,Metadata_control,Metadata_ImageNumber,Metadata_FOV,Metadata_Time,Metadata_Cells_Number_Object_Number,...,Metadata_Image_FileName_CL_488_2_crop,Metadata_Image_FileName_CL_561_crop,Metadata_Image_FileName_DNA_crop,Metadata_parent_path,Metadata_sc_unique_track_id,Metadata_Well_FOV,Metadata_sc_unique_track_id_count,index,Metadata_data_split,Metadata_ground_truth_present
0,1,C-09,168,Staurosporine,39.06,positive,1,0002,0.0,7,...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,C-09_0002_5,C-09_0002,2,0,test,False
1,1,C-09,168,Staurosporine,39.06,positive,1,0002,0.0,9,...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,C-09_0002_6,C-09_0002,5,1,test,False
2,1,C-09,168,Staurosporine,39.06,positive,1,0002,0.0,10,...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,C-09_0002_7,C-09_0002,2,2,test,False
3,1,C-09,168,Staurosporine,39.06,positive,1,0002,0.0,11,...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,C-09_0002_8,C-09_0002,8,3,test,False
4,1,C-09,168,Staurosporine,39.06,positive,1,0002,0.0,12,...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,C-09_0002_9,C-09_0002,13,4,val,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185497,1,E-04,161,Staurosporine,1.22,test,8,0003,7.0,149,...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,E-04_0003_105,E-04_0003,10,185497,test,False
185498,1,E-04,161,Staurosporine,1.22,test,8,0003,7.0,150,...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,E-04_0003_112,E-04_0003,9,185498,test,False
185499,1,E-04,161,Staurosporine,1.22,test,8,0003,7.0,152,...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,E-04_0003_100,E-04_0003,11,185499,test,True
185500,1,E-04,161,Staurosporine,1.22,test,8,0003,7.0,153,...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,E-04_0003_106,E-04_0003,13,185500,train,True


In [17]:
# Remove any row that still contains a NaN after attaching the split columns,
# then display the result.
sc_profile_with_data_splits_df = sc_profile_with_data_splits_df.dropna()
sc_profile_with_data_splits_df

Unnamed: 0,Metadata_plate,Metadata_Well,Metadata_number_of_singlecells,Metadata_compound,Metadata_dose,Metadata_control,Metadata_ImageNumber,Metadata_FOV,Metadata_Time,Metadata_Cells_Number_Object_Number,...,Metadata_Image_FileName_CL_488_2_crop,Metadata_Image_FileName_CL_561_crop,Metadata_Image_FileName_DNA_crop,Metadata_parent_path,Metadata_sc_unique_track_id,Metadata_Well_FOV,Metadata_sc_unique_track_id_count,index,Metadata_data_split,Metadata_ground_truth_present
0,1,C-09,168,Staurosporine,39.06,positive,1,0002,0.0,7,...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,C-09_0002_5,C-09_0002,2,0,test,False
1,1,C-09,168,Staurosporine,39.06,positive,1,0002,0.0,9,...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,C-09_0002_6,C-09_0002,5,1,test,False
2,1,C-09,168,Staurosporine,39.06,positive,1,0002,0.0,10,...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,C-09_0002_7,C-09_0002,2,2,test,False
3,1,C-09,168,Staurosporine,39.06,positive,1,0002,0.0,11,...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,C-09_0002_8,C-09_0002,8,3,test,False
4,1,C-09,168,Staurosporine,39.06,positive,1,0002,0.0,12,...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,C-09_0002_9,C-09_0002,13,4,val,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185497,1,E-04,161,Staurosporine,1.22,test,8,0003,7.0,149,...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,E-04_0003_105,E-04_0003,10,185497,test,False
185498,1,E-04,161,Staurosporine,1.22,test,8,0003,7.0,150,...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,E-04_0003_112,E-04_0003,9,185498,test,False
185499,1,E-04,161,Staurosporine,1.22,test,8,0003,7.0,152,...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,E-04_0003_100,E-04_0003,11,185499,test,True
185500,1,E-04,161,Staurosporine,1.22,test,8,0003,7.0,153,...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,/home/lippincm/4TB_A/live_cell_timelapse_apopt...,E-04_0003_106,E-04_0003,13,185500,train,True


In [18]:
# Final breakdown of the data: one frame per (data split, ground truth) pairing.
split_label = sc_profile_with_data_splits_df["Metadata_data_split"]
ground_truth_flag = sc_profile_with_data_splits_df["Metadata_ground_truth_present"]
with_gt = ground_truth_flag.eq(True)
without_gt = ground_truth_flag.eq(False)

in_train = split_label.eq("train")
in_val = split_label.eq("val")
in_test = split_label.eq("test")
in_holdout = split_label.eq("well_holdout")

train_gt = sc_profile_with_data_splits_df[in_train & with_gt].copy()
val_gt = sc_profile_with_data_splits_df[in_val & with_gt].copy()
test_gt = sc_profile_with_data_splits_df[in_test & with_gt].copy()
test_wo_gt = sc_profile_with_data_splits_df[in_test & without_gt].copy()
holdout_w_gt = sc_profile_with_data_splits_df[in_holdout & with_gt].copy()
holdout_wo_gt = sc_profile_with_data_splits_df[in_holdout & without_gt].copy()

# The six partitions together must account for every row exactly once.
partition_sizes = [
    partition.shape[0]
    for partition in (
        train_gt,
        val_gt,
        test_gt,
        test_wo_gt,
        holdout_w_gt,
        holdout_wo_gt,
    )
]
assert sc_profile_with_data_splits_df.shape[0] == sum(partition_sizes)

In [19]:
# Persist only the split bookkeeping (row index, data split, ground-truth flag)
# as a parquet file; the profile features themselves are not part of this file.
data_split_file_path = pathlib.Path("../results/data_splits.parquet").resolve()
data_split_data_df.to_parquet(data_split_file_path, index=False)

In [20]:
# Write both profiles back to the locations they were loaded from, without the
# pandas index.
# NOTE(review): this overwrites the input files in place, and the frame written
# is sc_profile (whose Metadata_data_split holds the well-level label), not
# sc_profile_with_data_splits_df - confirm that this is the intended output.
for profile_df, destination_path in (
    (sc_profile, sc_file_path),
    (sc_endpoint_profile, sc_endpoint_file_path),
):
    profile_df.to_parquet(destination_path, index=False)