In [1]:
import itertools
import json
import pathlib

import numpy as np
import pandas as pd

In [2]:
# Input locations. Strict resolution raises immediately if a file is missing,
# before any data is read.
bulk_data_file_path = pathlib.Path(
    "../../data/CP_scDINO_features/combined_CP_scDINO_norm_fs_aggregated.parquet"
).resolve(strict=True)
whole_image_final_data_file_path = pathlib.Path(
    "../../data/CP_aggregated/endpoints/aggregated_whole_image.parquet"
).resolve(strict=True)
ground_truth_file_path = pathlib.Path(
    "../../1.ground_truth/data/0.ground_truth/ground_truth.csv"
).resolve(strict=True)

# Output directory for the train/test splits; created if it does not exist yet.
data_splits_dir = pathlib.Path("../data_splits/").resolve()
data_splits_dir.mkdir(parents=True, exist_ok=True)

# Read the three input tables.
bulk_df = pd.read_parquet(bulk_data_file_path)
ground_truth_df = pd.read_csv(ground_truth_file_path)
whole_image_final_df = pd.read_parquet(whole_image_final_data_file_path)

# Cast dose and time to float64, keep only the rows recorded at the latest
# timepoint, and then discard the (now constant) time column.
bulk_df = bulk_df.astype({"Metadata_dose": "float64", "Metadata_Time": "float64"})
is_final_timepoint = bulk_df["Metadata_Time"] == bulk_df["Metadata_Time"].max()
bulk_df = bulk_df.loc[is_final_timepoint].drop(columns=["Metadata_Time"])
bulk_df.head()

Unnamed: 0,Metadata_Well,Metadata_dose,Metadata_plate,Metadata_number_of_singlecells,Metadata_compound,Metadata_control,Metadata_ImageNumber,Metadata_FOV,Metadata_Cells_Number_Object_Number,Metadata_Cytoplasm_Parent_Cells,...,channel_DNA_cls_feature_91_scDINO,channel_DNA_cls_feature_92_scDINO,channel_DNA_cls_feature_93_scDINO,channel_DNA_cls_feature_94_scDINO,channel_DNA_cls_feature_95_scDINO,channel_DNA_cls_feature_96_scDINO,channel_DNA_cls_feature_97_scDINO,channel_DNA_cls_feature_98_scDINO,channel_DNA_cls_feature_99_scDINO,channel_DNA_cls_feature_9_scDINO
1450,C-02,0.0,1,174,Staurosporine,negative,13,3,3.0,3,...,0.033469,0.06268,0.357989,0.291235,-0.693899,0.37725,0.338003,0.342171,0.058405,-0.129908
1451,C-02,0.0,1,174,Staurosporine,negative,13,3,5.0,5,...,0.033469,0.06268,0.357989,0.291235,-0.693899,0.37725,0.338003,0.342171,0.058405,-0.129908
1452,C-02,0.0,1,174,Staurosporine,negative,13,3,8.0,8,...,0.033469,0.06268,0.357989,0.291235,-0.693899,0.37725,0.338003,0.342171,0.058405,-0.129908
1453,C-02,0.0,1,174,Staurosporine,negative,13,3,11.0,11,...,0.033469,0.06268,0.357989,0.291235,-0.693899,0.37725,0.338003,0.342171,0.058405,-0.129908
1454,C-02,0.0,1,174,Staurosporine,negative,13,3,12.0,12,...,0.033469,0.06268,0.357989,0.291235,-0.693899,0.37725,0.338003,0.342171,0.058405,-0.129908


In [3]:
# Prefix every whole-image column with "Terminal_", except "Metadata_dose" and
# "Metadata_Well", which keep their names because they are the join keys used
# when this frame is merged into the bulk data.
# All renames are applied in one call (instead of one in-place rename per
# column), and columns that already carry the prefix are skipped so that
# re-running this cell does not produce "Terminal_Terminal_..." names.
terminal_prefix = "Terminal_"
merge_key_columns = {"Metadata_dose", "Metadata_Well"}
whole_image_final_df = whole_image_final_df.rename(
    columns={
        col: terminal_prefix + col
        for col in whole_image_final_df.columns
        if col not in merge_key_columns and not col.startswith(terminal_prefix)
    }
)

In [4]:
# Report the (rows, columns) of both input frames before they are merged.
for shape_label, frame in (
    ("Bulk data shape: ", bulk_df),
    ("Whole image final data shape: ", whole_image_final_df),
):
    print(shape_label, frame.shape)

Bulk data shape:  (10467, 2424)
Whole image final data shape:  (30, 25)


In [5]:
# Attach the per-dose apoptosis ground-truth label to every bulk row.
n_rows_before_merge = len(bulk_df)
bulk_df = bulk_df.merge(
    ground_truth_df[["Metadata_dose", "apoptosis"]],
    how="left",
    on="Metadata_dose",
)
# A left merge multiplies rows when the right-hand key is not unique. Fail
# loudly here rather than silently duplicating cells ahead of the split.
assert (
    len(bulk_df) == n_rows_before_merge
), "ground truth merge changed the row count: Metadata_dose is not unique in ground_truth_df"

# Move the label to column position 3 under its final metadata name.
apoptosis_labels = bulk_df.pop("apoptosis")
bulk_df.insert(3, "Metadata_apoptosis_ground_truth", apoptosis_labels)

# Attach the whole-image features, matched on (dose, well).
bulk_df = bulk_df.merge(
    whole_image_final_df,
    how="left",
    on=["Metadata_dose", "Metadata_Well"],
)
assert (
    len(bulk_df) == n_rows_before_merge
), "whole image merge changed the row count: (Metadata_dose, Metadata_Well) is not unique in whole_image_final_df"
bulk_df.head()

Unnamed: 0,Metadata_Well,Metadata_dose,Metadata_plate,Metadata_apoptosis_ground_truth,Metadata_number_of_singlecells,Metadata_compound,Metadata_control,Metadata_ImageNumber,Metadata_FOV,Metadata_Cells_Number_Object_Number,...,Terminal_Texture_DifferenceEntropy_DNA_3_00_256,Terminal_Texture_DifferenceVariance_AnnexinV_3_02_256,Terminal_Texture_DifferenceVariance_DNA_3_00_256,Terminal_Texture_InfoMeas1_AnnexinV_3_03_256,Terminal_Texture_InfoMeas1_DNA_3_01_256,Terminal_Texture_InfoMeas2_AnnexinV_3_03_256,Terminal_Texture_InfoMeas2_DNA_3_00_256,Terminal_Texture_InverseDifferenceMoment_AnnexinV_3_00_256,Terminal_Texture_SumVariance_AnnexinV_3_03_256,Terminal_Texture_SumVariance_DNA_3_02_256
0,C-02,0.0,1,control,174,Staurosporine,negative,13,3,3.0,...,0.670063,-1.416381,-0.288271,0.776827,0.938617,-1.208536,0.292902,1.162458,-0.191628,0.117745
1,C-02,0.0,1,control,174,Staurosporine,negative,13,3,5.0,...,0.670063,-1.416381,-0.288271,0.776827,0.938617,-1.208536,0.292902,1.162458,-0.191628,0.117745
2,C-02,0.0,1,control,174,Staurosporine,negative,13,3,8.0,...,0.670063,-1.416381,-0.288271,0.776827,0.938617,-1.208536,0.292902,1.162458,-0.191628,0.117745
3,C-02,0.0,1,control,174,Staurosporine,negative,13,3,11.0,...,0.670063,-1.416381,-0.288271,0.776827,0.938617,-1.208536,0.292902,1.162458,-0.191628,0.117745
4,C-02,0.0,1,control,174,Staurosporine,negative,13,3,12.0,...,0.670063,-1.416381,-0.288271,0.776827,0.938617,-1.208536,0.292902,1.162458,-0.191628,0.117745


In [6]:
# Table of the distinct (dose, well) pairs present in the merged data,
# re-indexed from 0.
dose_wells = (
    bulk_df.loc[:, ["Metadata_dose", "Metadata_Well"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [7]:
# The data are expected to contain 10 doses with three wells each.
# One randomly chosen well per dose is held out for testing; every remaining
# well is used for training.
# The generator is seeded so the same wells are selected on every
# Restart & Run All, which keeps the saved splits reproducible.
SPLIT_SEED = 0
split_rng = np.random.default_rng(SPLIT_SEED)

test_wells = []
for dose in dose_wells["Metadata_dose"].unique():
    wells = dose_wells.loc[
        dose_wells["Metadata_dose"] == dose, "Metadata_Well"
    ].tolist()
    # cast the numpy string returned by the generator to a plain Python str
    selected_well = str(split_rng.choice(wells))
    print(f"Selected well {selected_well} for dose {dose}")
    test_wells.append(selected_well)

# all wells that were not selected for testing
train_wells = dose_wells[~dose_wells["Metadata_Well"].isin(test_wells)][
    "Metadata_Well"
].tolist()

Selected well C-02 for dose 0.0
Selected well D-03 for dose 0.61
Selected well C-04 for dose 1.22
Selected well D-05 for dose 2.44
Selected well D-06 for dose 4.88
Selected well D-07 for dose 9.77
Selected well C-08 for dose 19.53
Selected well D-09 for dose 39.06
Selected well C-10 for dose 78.13
Selected well D-11 for dose 156.25


In [8]:
# Partition the rows by well membership and renumber each split from 0.
in_train_wells = bulk_df["Metadata_Well"].isin(train_wells)
in_test_wells = bulk_df["Metadata_Well"].isin(test_wells)
train_df = bulk_df.loc[in_train_wells].reset_index(drop=True)
test_df = bulk_df.loc[in_test_wells].reset_index(drop=True)

# Persist both splits as parquet files in the data-splits directory
# (train first, then test), without writing the index.
train_df_file_path = data_splits_dir / "train.parquet"
train_df.to_parquet(train_df_file_path, index=False)
test_df_file_path = data_splits_dir / "test.parquet"
test_df.to_parquet(test_df_file_path, index=False)

In [9]:
# Report the size of the training split, then preview its first five rows.
train_shape = train_df.shape
print("Train data shape: ", train_shape)
train_df.head(n=5)

Train data shape:  (6814, 2448)


Unnamed: 0,Metadata_Well,Metadata_dose,Metadata_plate,Metadata_apoptosis_ground_truth,Metadata_number_of_singlecells,Metadata_compound,Metadata_control,Metadata_ImageNumber,Metadata_FOV,Metadata_Cells_Number_Object_Number,...,Terminal_Texture_DifferenceEntropy_DNA_3_00_256,Terminal_Texture_DifferenceVariance_AnnexinV_3_02_256,Terminal_Texture_DifferenceVariance_DNA_3_00_256,Terminal_Texture_InfoMeas1_AnnexinV_3_03_256,Terminal_Texture_InfoMeas1_DNA_3_01_256,Terminal_Texture_InfoMeas2_AnnexinV_3_03_256,Terminal_Texture_InfoMeas2_DNA_3_00_256,Terminal_Texture_InverseDifferenceMoment_AnnexinV_3_00_256,Terminal_Texture_SumVariance_AnnexinV_3_03_256,Terminal_Texture_SumVariance_DNA_3_02_256
0,C-03,0.61,1,negative,185,Staurosporine,test,13,2,1.0,...,0.424535,0.357361,-0.701164,0.012743,0.76919,-0.252766,0.186356,1.041248,-1.172505,0.076896
1,C-03,0.61,1,negative,185,Staurosporine,test,13,2,6.0,...,0.424535,0.357361,-0.701164,0.012743,0.76919,-0.252766,0.186356,1.041248,-1.172505,0.076896
2,C-03,0.61,1,negative,185,Staurosporine,test,13,2,9.0,...,0.424535,0.357361,-0.701164,0.012743,0.76919,-0.252766,0.186356,1.041248,-1.172505,0.076896
3,C-03,0.61,1,negative,185,Staurosporine,test,13,2,12.0,...,0.424535,0.357361,-0.701164,0.012743,0.76919,-0.252766,0.186356,1.041248,-1.172505,0.076896
4,C-03,0.61,1,negative,185,Staurosporine,test,13,2,14.0,...,0.424535,0.357361,-0.701164,0.012743,0.76919,-0.252766,0.186356,1.041248,-1.172505,0.076896


In [10]:
# Report the size of the held-out test split, then preview its first five rows.
test_shape = test_df.shape
print("Test data shape: ", test_shape)
test_df.head(n=5)

Test data shape:  (3653, 2448)


Unnamed: 0,Metadata_Well,Metadata_dose,Metadata_plate,Metadata_apoptosis_ground_truth,Metadata_number_of_singlecells,Metadata_compound,Metadata_control,Metadata_ImageNumber,Metadata_FOV,Metadata_Cells_Number_Object_Number,...,Terminal_Texture_DifferenceEntropy_DNA_3_00_256,Terminal_Texture_DifferenceVariance_AnnexinV_3_02_256,Terminal_Texture_DifferenceVariance_DNA_3_00_256,Terminal_Texture_InfoMeas1_AnnexinV_3_03_256,Terminal_Texture_InfoMeas1_DNA_3_01_256,Terminal_Texture_InfoMeas2_AnnexinV_3_03_256,Terminal_Texture_InfoMeas2_DNA_3_00_256,Terminal_Texture_InverseDifferenceMoment_AnnexinV_3_00_256,Terminal_Texture_SumVariance_AnnexinV_3_03_256,Terminal_Texture_SumVariance_DNA_3_02_256
0,C-02,0.0,1,control,174,Staurosporine,negative,13,3,3.0,...,0.670063,-1.416381,-0.288271,0.776827,0.938617,-1.208536,0.292902,1.162458,-0.191628,0.117745
1,C-02,0.0,1,control,174,Staurosporine,negative,13,3,5.0,...,0.670063,-1.416381,-0.288271,0.776827,0.938617,-1.208536,0.292902,1.162458,-0.191628,0.117745
2,C-02,0.0,1,control,174,Staurosporine,negative,13,3,8.0,...,0.670063,-1.416381,-0.288271,0.776827,0.938617,-1.208536,0.292902,1.162458,-0.191628,0.117745
3,C-02,0.0,1,control,174,Staurosporine,negative,13,3,11.0,...,0.670063,-1.416381,-0.288271,0.776827,0.938617,-1.208536,0.292902,1.162458,-0.191628,0.117745
4,C-02,0.0,1,control,174,Staurosporine,negative,13,3,12.0,...,0.670063,-1.416381,-0.288271,0.776827,0.938617,-1.208536,0.292902,1.162458,-0.191628,0.117745


In [11]:
# Build a lookup table with one row per well and the split it belongs to
# ("train" or "test"); training wells come first, followed by the test wells.
train_well_df = pd.DataFrame({"Metadata_Well": train_wells}).assign(
    data_split="train"
)
test_well_df = pd.DataFrame({"Metadata_Well": test_wells}).assign(
    data_split="test"
)
train_test_well_df = pd.concat([train_well_df, test_well_df], axis=0)

# Store the well-to-split assignment next to the split files.
train_test_well_file_path = data_splits_dir / "train_test_wells.parquet"
train_test_well_df.to_parquet(train_test_well_file_path, index=False)