In [1]:
import itertools
import json
import pathlib

import numpy as np
import pandas as pd

In [2]:
# Input files. resolve(strict=True) raises FileNotFoundError immediately when a
# path does not exist, so a missing input fails here rather than at read time.
bulk_data_file_path = pathlib.Path(
    "../../data/CP_scDINO_features/combined_CP_scDINO_norm_fs_aggregated.parquet"
).resolve(strict=True)
whole_image_final_data_file_path = pathlib.Path(
    "../../data/CP_aggregated/endpoints/aggregated_profile.parquet"
).resolve(strict=True)
ground_truth_file_path = pathlib.Path(
    "../../1.ground_truth/data/0.ground_truth/ground_truth.csv"
).resolve(strict=True)

# Output directory for the train/test splits (created if it does not exist)
data_splits_dir = pathlib.Path("../data_splits/").resolve()
data_splits_dir.mkdir(parents=True, exist_ok=True)

# Load the data
bulk_df = pd.read_parquet(bulk_data_file_path)
ground_truth_df = pd.read_csv(ground_truth_file_path)
whole_image_final_df = pd.read_parquet(whole_image_final_data_file_path)

# Cast dose and time to float64 (dose is used as a merge key later on)
bulk_df = bulk_df.astype({"Metadata_dose": "float64", "Metadata_Time": "float64"})

# get the final_timepoint only for the bulk data; the time column is constant
# after filtering, so it is dropped. Chained, non-mutating calls avoid an
# in-place drop on a filtered slice of the original frame.
final_timepoint = bulk_df["Metadata_Time"].max()
bulk_df = bulk_df.loc[bulk_df["Metadata_Time"] == final_timepoint].drop(
    columns=["Metadata_Time"]
)
bulk_df.head()

Unnamed: 0,Metadata_Well,Metadata_dose,Metadata_number_of_singlecells,Metadata_plate,Metadata_compound,Metadata_control,Cells_AreaShape_Compactness_CP,Cells_AreaShape_Eccentricity_CP,Cells_AreaShape_Extent_CP,Cells_AreaShape_MinorAxisLength_CP,...,channel_DNA_cls_feature_91_scDINO,channel_DNA_cls_feature_92_scDINO,channel_DNA_cls_feature_93_scDINO,channel_DNA_cls_feature_94_scDINO,channel_DNA_cls_feature_95_scDINO,channel_DNA_cls_feature_96_scDINO,channel_DNA_cls_feature_97_scDINO,channel_DNA_cls_feature_98_scDINO,channel_DNA_cls_feature_99_scDINO,channel_DNA_cls_feature_9_scDINO
4,C-02,0.0,154.0,1,Staurosporine,negative,0.035033,0.554286,-0.355493,0.352657,...,-0.093603,-0.027693,0.373045,-0.008521,-0.774369,0.443792,0.342298,0.417925,0.019141,-0.034194
17,C-03,0.61,167.0,1,Staurosporine,test,-0.370743,0.297396,0.328665,0.264511,...,0.083305,0.286193,0.466636,0.320256,-0.579143,0.518275,0.181053,0.148265,0.286599,-0.173756
30,C-04,1.22,166.0,1,Staurosporine,test,-0.239434,0.368518,0.282078,0.088974,...,-0.2148,0.087457,0.451985,0.43219,-0.666231,0.225114,0.175131,0.353043,0.296713,0.229739
43,C-05,2.44,164.0,1,Staurosporine,test,-0.540085,0.050681,0.477738,-0.119831,...,0.02974,-0.004862,0.333109,0.324664,-0.544039,0.429608,0.062017,0.499831,0.039398,-0.056706
56,C-06,4.88,134.0,1,Staurosporine,test,-0.853027,-0.335576,0.908984,-0.505968,...,-0.06909,0.12943,0.158417,0.18103,-0.739533,0.438399,0.052955,0.28011,-0.007682,-0.045407


In [3]:
# prepend "Terminal_" to all columns in the whole image final dataframe, except
# Metadata_dose and Metadata_Well, which are used as merge keys and must keep
# their names. The mapping is built once and applied in a single rename call.
# Columns that already carry the prefix are skipped so that re-running this
# cell does not stack prefixes ("Terminal_Terminal_...").
TERMINAL_PREFIX = "Terminal_"
merge_key_columns = {"Metadata_dose", "Metadata_Well"}
terminal_column_names = {
    col: TERMINAL_PREFIX + col
    for col in whole_image_final_df.columns
    if col not in merge_key_columns and not col.startswith(TERMINAL_PREFIX)
}
whole_image_final_df = whole_image_final_df.rename(columns=terminal_column_names)

In [4]:
# Report (rows, columns) of both frames before they are merged
for shape_label, shape_frame in (
    ("Bulk data shape: ", bulk_df),
    ("Whole image final data shape: ", whole_image_final_df),
):
    print(shape_label, shape_frame.shape)

Bulk data shape:  (29, 2395)
Whole image final data shape:  (30, 524)


In [5]:
# Preview the first five rows of the bulk profiles
bulk_df.head(n=5)

Unnamed: 0,Metadata_Well,Metadata_dose,Metadata_number_of_singlecells,Metadata_plate,Metadata_compound,Metadata_control,Cells_AreaShape_Compactness_CP,Cells_AreaShape_Eccentricity_CP,Cells_AreaShape_Extent_CP,Cells_AreaShape_MinorAxisLength_CP,...,channel_DNA_cls_feature_91_scDINO,channel_DNA_cls_feature_92_scDINO,channel_DNA_cls_feature_93_scDINO,channel_DNA_cls_feature_94_scDINO,channel_DNA_cls_feature_95_scDINO,channel_DNA_cls_feature_96_scDINO,channel_DNA_cls_feature_97_scDINO,channel_DNA_cls_feature_98_scDINO,channel_DNA_cls_feature_99_scDINO,channel_DNA_cls_feature_9_scDINO
4,C-02,0.0,154.0,1,Staurosporine,negative,0.035033,0.554286,-0.355493,0.352657,...,-0.093603,-0.027693,0.373045,-0.008521,-0.774369,0.443792,0.342298,0.417925,0.019141,-0.034194
17,C-03,0.61,167.0,1,Staurosporine,test,-0.370743,0.297396,0.328665,0.264511,...,0.083305,0.286193,0.466636,0.320256,-0.579143,0.518275,0.181053,0.148265,0.286599,-0.173756
30,C-04,1.22,166.0,1,Staurosporine,test,-0.239434,0.368518,0.282078,0.088974,...,-0.2148,0.087457,0.451985,0.43219,-0.666231,0.225114,0.175131,0.353043,0.296713,0.229739
43,C-05,2.44,164.0,1,Staurosporine,test,-0.540085,0.050681,0.477738,-0.119831,...,0.02974,-0.004862,0.333109,0.324664,-0.544039,0.429608,0.062017,0.499831,0.039398,-0.056706
56,C-06,4.88,134.0,1,Staurosporine,test,-0.853027,-0.335576,0.908984,-0.505968,...,-0.06909,0.12943,0.158417,0.18103,-0.739533,0.438399,0.052955,0.28011,-0.007682,-0.045407


In [6]:
# Display the ground-truth table read from ground_truth.csv
# (one apoptosis label per Metadata_dose value)
ground_truth_df

Unnamed: 0,Metadata_dose,apoptosis
0,0.61,negative
1,1.22,negative
2,2.44,negative
3,4.88,negative
4,9.77,negative
5,19.53,negative
6,39.06,positive
7,78.13,positive
8,156.25,positive
9,0.0,control


In [7]:
# Attach the apoptosis ground-truth label to every well by dose.
# how="left" keeps every bulk row (doses without a label get NaN).
# validate="many_to_one" raises pandas.errors.MergeError if a dose appears more
# than once in the ground truth, which would otherwise silently duplicate wells.
bulk_df = pd.merge(
    bulk_df,
    ground_truth_df[["Metadata_dose", "apoptosis"]],
    how="left",
    on="Metadata_dose",
    validate="many_to_one",
)
# Move the label next to the other metadata columns under a Metadata_ name
gt = bulk_df.pop("apoptosis")
bulk_df.insert(3, "Metadata_apoptosis_ground_truth", gt)
bulk_df.head()

Unnamed: 0,Metadata_Well,Metadata_dose,Metadata_number_of_singlecells,Metadata_apoptosis_ground_truth,Metadata_plate,Metadata_compound,Metadata_control,Cells_AreaShape_Compactness_CP,Cells_AreaShape_Eccentricity_CP,Cells_AreaShape_Extent_CP,...,channel_DNA_cls_feature_91_scDINO,channel_DNA_cls_feature_92_scDINO,channel_DNA_cls_feature_93_scDINO,channel_DNA_cls_feature_94_scDINO,channel_DNA_cls_feature_95_scDINO,channel_DNA_cls_feature_96_scDINO,channel_DNA_cls_feature_97_scDINO,channel_DNA_cls_feature_98_scDINO,channel_DNA_cls_feature_99_scDINO,channel_DNA_cls_feature_9_scDINO
0,C-02,0.0,154.0,control,1,Staurosporine,negative,0.035033,0.554286,-0.355493,...,-0.093603,-0.027693,0.373045,-0.008521,-0.774369,0.443792,0.342298,0.417925,0.019141,-0.034194
1,C-03,0.61,167.0,negative,1,Staurosporine,test,-0.370743,0.297396,0.328665,...,0.083305,0.286193,0.466636,0.320256,-0.579143,0.518275,0.181053,0.148265,0.286599,-0.173756
2,C-04,1.22,166.0,negative,1,Staurosporine,test,-0.239434,0.368518,0.282078,...,-0.2148,0.087457,0.451985,0.43219,-0.666231,0.225114,0.175131,0.353043,0.296713,0.229739
3,C-05,2.44,164.0,negative,1,Staurosporine,test,-0.540085,0.050681,0.477738,...,0.02974,-0.004862,0.333109,0.324664,-0.544039,0.429608,0.062017,0.499831,0.039398,-0.056706
4,C-06,4.88,134.0,negative,1,Staurosporine,test,-0.853027,-0.335576,0.908984,...,-0.06909,0.12943,0.158417,0.18103,-0.739533,0.438399,0.052955,0.28011,-0.007682,-0.045407


In [8]:
# Attach the whole-image terminal features to each well, matching on dose and
# well. how="left" keeps every bulk row (unmatched wells get NaN features).
# validate="many_to_one" raises pandas.errors.MergeError if a (dose, well) pair
# appears more than once in the whole-image frame, which would otherwise
# silently duplicate rows that later leak into the train/test split.
bulk_df = pd.merge(
    bulk_df,
    whole_image_final_df,
    how="left",
    on=["Metadata_dose", "Metadata_Well"],
    validate="many_to_one",
)
bulk_df.head()

Unnamed: 0,Metadata_Well,Metadata_dose,Metadata_number_of_singlecells,Metadata_apoptosis_ground_truth,Metadata_plate,Metadata_compound,Metadata_control,Cells_AreaShape_Compactness_CP,Cells_AreaShape_Eccentricity_CP,Cells_AreaShape_Extent_CP,...,Terminal_Nuclei_Texture_Correlation_AnnexinV_3_00_256,Terminal_Nuclei_Texture_Correlation_AnnexinV_3_01_256,Terminal_Nuclei_Texture_Correlation_AnnexinV_3_02_256,Terminal_Nuclei_Texture_Correlation_AnnexinV_3_03_256,Terminal_Nuclei_Texture_Correlation_DNA_3_02_256,Terminal_Nuclei_Texture_DifferenceVariance_AnnexinV_3_01_256,Terminal_Nuclei_Texture_InverseDifferenceMoment_AnnexinV_3_03_256,Terminal_Nuclei_Texture_InverseDifferenceMoment_DNA_3_01_256,Terminal_Nuclei_Texture_SumAverage_AnnexinV_3_00_256,Terminal_Nuclei_Texture_SumAverage_DNA_3_00_256
0,C-02,0.0,154.0,control,1,Staurosporine,negative,0.035033,0.554286,-0.355493,...,-0.4321,-0.587543,-0.440194,-0.486455,0.670119,-0.103772,0.463732,0.521993,-0.073682,0.416877
1,C-03,0.61,167.0,negative,1,Staurosporine,test,-0.370743,0.297396,0.328665,...,-0.739634,-0.648602,-0.791632,-0.665333,0.670119,-0.526875,0.332559,0.521993,-0.121325,0.416877
2,C-04,1.22,166.0,negative,1,Staurosporine,test,-0.239434,0.368518,0.282078,...,-0.83764,-0.648602,-0.839328,-0.665333,0.670119,-0.774637,-0.010179,0.521993,-0.121325,0.416877
3,C-05,2.44,164.0,negative,1,Staurosporine,test,-0.540085,0.050681,0.477738,...,-0.458198,-0.648602,-0.647667,-0.665333,0.670119,-0.588308,0.165274,0.521993,-0.104207,0.416877
4,C-06,4.88,134.0,negative,1,Staurosporine,test,-0.853027,-0.335576,0.908984,...,-0.594116,-0.648602,-0.60168,-0.665333,0.670119,-0.662763,0.175557,0.521993,-0.108781,0.416877


In [9]:
# Unique (dose, well) pairs present in the merged bulk data, re-indexed from 0
dose_wells = (
    bulk_df[["Metadata_dose", "Metadata_Well"]]
    .copy()
    .drop_duplicates()
    .reset_index(drop=True)
)

In [10]:
# each dose has a small number of replicate wells
# one well per dose is held out for testing; all remaining wells are used for
# training
# A locally seeded generator makes the split reproducible: re-running this cell
# (or the whole notebook) always selects the same test wells.
SPLIT_SEED = 0
rng = np.random.default_rng(SPLIT_SEED)

test_wells = []
for dose in dose_wells["Metadata_dose"].unique():
    wells = dose_wells.loc[dose_wells["Metadata_dose"] == dose, "Metadata_Well"].tolist()
    selected_well = rng.choice(wells)
    print(f"Selected well {selected_well} for dose {dose}")
    # store a plain Python str rather than a numpy string scalar
    test_wells.append(str(selected_well))

# every well that was not held out for testing is a training well
train_wells = dose_wells.loc[
    ~dose_wells["Metadata_Well"].isin(test_wells), "Metadata_Well"
].tolist()

Selected well D-02 for dose 0.0
Selected well C-03 for dose 0.61
Selected well C-04 for dose 1.22
Selected well C-05 for dose 2.44
Selected well D-06 for dose 4.88
Selected well E-08 for dose 19.53
Selected well E-09 for dose 39.06
Selected well E-10 for dose 78.13
Selected well C-11 for dose 156.25
Selected well D-07 for dose 9.77


In [11]:
# Split the merged profiles by well membership and re-index each split from 0
in_train_wells = bulk_df["Metadata_Well"].isin(train_wells)
in_test_wells = bulk_df["Metadata_Well"].isin(test_wells)
train_df = bulk_df.loc[in_train_wells].reset_index(drop=True)
test_df = bulk_df.loc[in_test_wells].reset_index(drop=True)

# write the train and test dataframes to parquet files
train_df_file_path = data_splits_dir / "train.parquet"
test_df_file_path = data_splits_dir / "test.parquet"
train_df.to_parquet(train_df_file_path, index=False)
test_df.to_parquet(test_df_file_path, index=False)

In [12]:
# Report the size of the training split, then preview its first five rows
train_shape = train_df.shape
print("Train data shape: ", train_shape)
train_df.head(n=5)

Train data shape:  (19, 2918)


Unnamed: 0,Metadata_Well,Metadata_dose,Metadata_number_of_singlecells,Metadata_apoptosis_ground_truth,Metadata_plate,Metadata_compound,Metadata_control,Cells_AreaShape_Compactness_CP,Cells_AreaShape_Eccentricity_CP,Cells_AreaShape_Extent_CP,...,Terminal_Nuclei_Texture_Correlation_AnnexinV_3_00_256,Terminal_Nuclei_Texture_Correlation_AnnexinV_3_01_256,Terminal_Nuclei_Texture_Correlation_AnnexinV_3_02_256,Terminal_Nuclei_Texture_Correlation_AnnexinV_3_03_256,Terminal_Nuclei_Texture_Correlation_DNA_3_02_256,Terminal_Nuclei_Texture_DifferenceVariance_AnnexinV_3_01_256,Terminal_Nuclei_Texture_InverseDifferenceMoment_AnnexinV_3_03_256,Terminal_Nuclei_Texture_InverseDifferenceMoment_DNA_3_01_256,Terminal_Nuclei_Texture_SumAverage_AnnexinV_3_00_256,Terminal_Nuclei_Texture_SumAverage_DNA_3_00_256
0,C-02,0.0,154.0,control,1,Staurosporine,negative,0.035033,0.554286,-0.355493,...,-0.4321,-0.587543,-0.440194,-0.486455,0.670119,-0.103772,0.463732,0.521993,-0.073682,0.416877
1,C-06,4.88,134.0,negative,1,Staurosporine,test,-0.853027,-0.335576,0.908984,...,-0.594116,-0.648602,-0.60168,-0.665333,0.670119,-0.662763,0.175557,0.521993,-0.108781,0.416877
2,C-08,19.53,133.0,negative,1,Staurosporine,test,-0.946101,-0.547828,1.173377,...,-0.5842,-0.580215,-0.466295,-0.665333,0.670119,0.048387,0.436127,0.521993,-0.099644,0.416877
3,C-09,39.06,88.0,positive,1,Staurosporine,positive,-0.983683,-0.845926,1.188321,...,-0.223819,-0.343047,-0.069238,-0.485761,-0.68228,0.087037,0.501674,0.11508,0.019048,1.008354
4,C-10,78.13,111.0,positive,1,Staurosporine,test,-1.073107,-1.094211,1.275899,...,-0.034566,-0.254892,0.156787,-0.33671,-0.601644,-0.183362,0.438211,-0.123443,0.121158,2.583897


In [13]:
# missing C-07 add it as a test well
# NOTE(review): the rows come from bulk_df, so if C-07 is present there and was
# assigned to the training wells it would end up in both splits — confirm intent.
if "C-07" not in test_df["Metadata_Well"].unique():
    c_07_df = bulk_df[bulk_df["Metadata_Well"] == "C-07"].reset_index(drop=True)
    # nothing to append when bulk_df has no C-07 rows
    if not c_07_df.empty:
        test_df = pd.concat([test_df, c_07_df], ignore_index=True)
        # test.parquet was written before this cell ran; re-write it so the
        # file on disk matches the in-memory test split
        test_df.to_parquet(data_splits_dir / "test.parquet", index=False)
print("Test data shape: ", test_df.shape)

Test data shape:  (10, 2918)


In [14]:
# Build a lookup table with one row per well and the split it was assigned to
# (columns: Metadata_Well, data_split); training wells are listed first
train_well_df = pd.DataFrame(train_wells, columns=["Metadata_Well"]).assign(
    data_split="train"
)
test_well_df = pd.DataFrame(test_wells, columns=["Metadata_Well"]).assign(
    data_split="test"
)
train_test_well_df = pd.concat([train_well_df, test_well_df], axis=0)

# save the train test well df to a parquet file
train_test_well_file_path = data_splits_dir / "train_test_wells.parquet"
train_test_well_df.to_parquet(train_test_well_file_path, index=False)