In [1]:
import itertools
import json
import pathlib

import numpy as np
import pandas as pd

In [2]:
# --- Input locations ---
# strict=True makes resolve() raise when the file is absent, so a missing input
# is reported here rather than at read time.
bulk_data_file_path = pathlib.Path(
    "../../data/CP_scDINO_features/combined_CP_scDINO_norm_fs_aggregated.parquet"
).resolve(strict=True)
whole_image_final_data_file_path = pathlib.Path(
    "../../data/CP_aggregated/endpoints/aggregated_profile.parquet"
).resolve(strict=True)
ground_truth_file_path = pathlib.Path(
    "../../1.ground_truth/data/0.ground_truth/ground_truth.csv"
).resolve(strict=True)

# --- Output location ---
# Resolved non-strictly because the directory is created on demand.
data_splits_dir = pathlib.Path("../data_splits/").resolve()
data_splits_dir.mkdir(parents=True, exist_ok=True)

# --- Read the three input tables ---
bulk_df = pd.read_parquet(bulk_data_file_path)
ground_truth_df = pd.read_csv(ground_truth_file_path)
whole_image_final_df = pd.read_parquet(whole_image_final_data_file_path)

# Cast dose and time to float64 (dose is used as a merge key further down).
bulk_df = bulk_df.astype({"Metadata_dose": "float64", "Metadata_Time": "float64"})

# Keep only the rows of the latest timepoint; the time column is then dropped.
last_timepoint = bulk_df["Metadata_Time"].max()
bulk_df = bulk_df.loc[bulk_df["Metadata_Time"] == last_timepoint].drop(
    columns=["Metadata_Time"]
)
bulk_df.head()

Unnamed: 0,Metadata_Well,Metadata_dose,Metadata_number_of_singlecells,Metadata_plate,Metadata_compound,Metadata_control,Cells_AreaShape_Compactness_CP,Cells_AreaShape_Eccentricity_CP,Cells_AreaShape_Extent_CP,Cells_AreaShape_FormFactor_CP,...,channel_DNA_cls_feature_91_scDINO,channel_DNA_cls_feature_92_scDINO,channel_DNA_cls_feature_93_scDINO,channel_DNA_cls_feature_94_scDINO,channel_DNA_cls_feature_95_scDINO,channel_DNA_cls_feature_96_scDINO,channel_DNA_cls_feature_97_scDINO,channel_DNA_cls_feature_98_scDINO,channel_DNA_cls_feature_99_scDINO,channel_DNA_cls_feature_9_scDINO
4,C-02,0.0,175.0,1,Staurosporine,negative,0.045544,0.423336,0.05703,-0.445734,...,0.035974,-0.044032,0.49004,0.287544,-0.584386,0.145022,0.071476,0.225753,-0.012832,-0.152288
17,C-03,0.61,179.0,1,Staurosporine,test,0.119876,0.39144,-0.130041,-0.510291,...,0.066651,0.002412,0.532592,0.328678,-0.679102,0.124509,0.068227,0.234068,0.07515,-0.096884
30,C-04,1.22,174.0,1,Staurosporine,test,0.203221,0.351816,-0.23049,-0.577859,...,-0.099588,-0.060728,0.492346,0.243888,-0.577691,0.047189,-0.009075,0.222638,0.122493,-0.01279
43,C-05,2.44,175.0,1,Staurosporine,test,0.283922,0.424011,-0.336284,-0.638914,...,-0.051959,-0.064873,0.391529,0.204624,-0.427636,0.237936,0.082126,0.37474,-0.056254,-0.061668
56,C-06,4.88,149.0,1,Staurosporine,test,0.446401,0.397404,-0.490245,-0.750406,...,0.022113,0.026482,0.358817,0.204848,-0.696369,0.20369,0.002269,0.184915,0.00807,-0.073982


In [3]:
# Prepend "Terminal_" to every column of the whole-image final dataframe except
# the two merge keys (Metadata_dose, Metadata_Well), which must keep their names
# so the frame can be joined to the bulk data on them.
#
# The rename is done once with a single mapping instead of one in-place rename
# per column. Columns that already carry the prefix are skipped, so re-running
# this cell does not produce "Terminal_Terminal_..." names.
# NOTE(review): assumes no raw input column legitimately starts with
# "Terminal_" — confirm against the aggregated profile schema.
terminal_prefix = "Terminal_"
merge_key_columns = {"Metadata_dose", "Metadata_Well"}
whole_image_final_df = whole_image_final_df.rename(
    columns={
        col: terminal_prefix + col
        for col in whole_image_final_df.columns
        if col not in merge_key_columns and not col.startswith(terminal_prefix)
    }
)

In [4]:
# Report the (rows, columns) shape of both frames before they are merged.
for label, frame in (
    ("Bulk data shape: ", bulk_df),
    ("Whole image final data shape: ", whole_image_final_df),
):
    print(label, frame.shape)

Bulk data shape:  (30, 2342)
Whole image final data shape:  (30, 517)


In [5]:
# Preview the first five rows of the final-timepoint bulk profiles.
bulk_df.head(n=5)

Unnamed: 0,Metadata_Well,Metadata_dose,Metadata_number_of_singlecells,Metadata_plate,Metadata_compound,Metadata_control,Cells_AreaShape_Compactness_CP,Cells_AreaShape_Eccentricity_CP,Cells_AreaShape_Extent_CP,Cells_AreaShape_FormFactor_CP,...,channel_DNA_cls_feature_91_scDINO,channel_DNA_cls_feature_92_scDINO,channel_DNA_cls_feature_93_scDINO,channel_DNA_cls_feature_94_scDINO,channel_DNA_cls_feature_95_scDINO,channel_DNA_cls_feature_96_scDINO,channel_DNA_cls_feature_97_scDINO,channel_DNA_cls_feature_98_scDINO,channel_DNA_cls_feature_99_scDINO,channel_DNA_cls_feature_9_scDINO
4,C-02,0.0,175.0,1,Staurosporine,negative,0.045544,0.423336,0.05703,-0.445734,...,0.035974,-0.044032,0.49004,0.287544,-0.584386,0.145022,0.071476,0.225753,-0.012832,-0.152288
17,C-03,0.61,179.0,1,Staurosporine,test,0.119876,0.39144,-0.130041,-0.510291,...,0.066651,0.002412,0.532592,0.328678,-0.679102,0.124509,0.068227,0.234068,0.07515,-0.096884
30,C-04,1.22,174.0,1,Staurosporine,test,0.203221,0.351816,-0.23049,-0.577859,...,-0.099588,-0.060728,0.492346,0.243888,-0.577691,0.047189,-0.009075,0.222638,0.122493,-0.01279
43,C-05,2.44,175.0,1,Staurosporine,test,0.283922,0.424011,-0.336284,-0.638914,...,-0.051959,-0.064873,0.391529,0.204624,-0.427636,0.237936,0.082126,0.37474,-0.056254,-0.061668
56,C-06,4.88,149.0,1,Staurosporine,test,0.446401,0.397404,-0.490245,-0.750406,...,0.022113,0.026482,0.358817,0.204848,-0.696369,0.20369,0.002269,0.184915,0.00807,-0.073982


In [6]:
# Display the full ground-truth table (one apoptosis label per dose).
ground_truth_df

Unnamed: 0,Metadata_dose,apoptosis
0,0.61,negative
1,1.22,negative
2,2.44,negative
3,4.88,positive
4,9.77,negative
5,19.53,positive
6,39.06,positive
7,78.13,positive
8,156.25,positive
9,0.0,control


In [7]:
# Attach the apoptosis label of each dose to the bulk profiles. A left join keeps
# every bulk row; a dose absent from the ground-truth table gets a missing label.
ground_truth_labels = ground_truth_df[["Metadata_dose", "apoptosis"]]
bulk_df = bulk_df.merge(ground_truth_labels, how="left", on="Metadata_dose")

# Move the label next to the other metadata columns (position 3) under a
# Metadata_-prefixed name.
bulk_df.insert(3, "Metadata_apoptosis_ground_truth", bulk_df.pop("apoptosis"))
bulk_df.head()

Unnamed: 0,Metadata_Well,Metadata_dose,Metadata_number_of_singlecells,Metadata_apoptosis_ground_truth,Metadata_plate,Metadata_compound,Metadata_control,Cells_AreaShape_Compactness_CP,Cells_AreaShape_Eccentricity_CP,Cells_AreaShape_Extent_CP,...,channel_DNA_cls_feature_91_scDINO,channel_DNA_cls_feature_92_scDINO,channel_DNA_cls_feature_93_scDINO,channel_DNA_cls_feature_94_scDINO,channel_DNA_cls_feature_95_scDINO,channel_DNA_cls_feature_96_scDINO,channel_DNA_cls_feature_97_scDINO,channel_DNA_cls_feature_98_scDINO,channel_DNA_cls_feature_99_scDINO,channel_DNA_cls_feature_9_scDINO
0,C-02,0.0,175.0,control,1,Staurosporine,negative,0.045544,0.423336,0.05703,...,0.035974,-0.044032,0.49004,0.287544,-0.584386,0.145022,0.071476,0.225753,-0.012832,-0.152288
1,C-03,0.61,179.0,negative,1,Staurosporine,test,0.119876,0.39144,-0.130041,...,0.066651,0.002412,0.532592,0.328678,-0.679102,0.124509,0.068227,0.234068,0.07515,-0.096884
2,C-04,1.22,174.0,negative,1,Staurosporine,test,0.203221,0.351816,-0.23049,...,-0.099588,-0.060728,0.492346,0.243888,-0.577691,0.047189,-0.009075,0.222638,0.122493,-0.01279
3,C-05,2.44,175.0,negative,1,Staurosporine,test,0.283922,0.424011,-0.336284,...,-0.051959,-0.064873,0.391529,0.204624,-0.427636,0.237936,0.082126,0.37474,-0.056254,-0.061668
4,C-06,4.88,149.0,positive,1,Staurosporine,test,0.446401,0.397404,-0.490245,...,0.022113,0.026482,0.358817,0.204848,-0.696369,0.20369,0.002269,0.184915,0.00807,-0.073982


In [8]:
# Attach the whole-image final features to each bulk row, matching on dose and
# well. A left join keeps every bulk row even when no whole-image row matches.
merge_keys = ["Metadata_dose", "Metadata_Well"]
bulk_df = bulk_df.merge(whole_image_final_df, how="left", on=merge_keys)
bulk_df.head()

Unnamed: 0,Metadata_Well,Metadata_dose,Metadata_number_of_singlecells,Metadata_apoptosis_ground_truth,Metadata_plate,Metadata_compound,Metadata_control,Cells_AreaShape_Compactness_CP,Cells_AreaShape_Eccentricity_CP,Cells_AreaShape_Extent_CP,...,Terminal_Nuclei_Texture_Correlation_AnnexinV_3_00_256,Terminal_Nuclei_Texture_Correlation_AnnexinV_3_01_256,Terminal_Nuclei_Texture_Correlation_AnnexinV_3_02_256,Terminal_Nuclei_Texture_Correlation_AnnexinV_3_03_256,Terminal_Nuclei_Texture_Correlation_DNA_3_02_256,Terminal_Nuclei_Texture_DifferenceVariance_AnnexinV_3_01_256,Terminal_Nuclei_Texture_InverseDifferenceMoment_AnnexinV_3_03_256,Terminal_Nuclei_Texture_InverseDifferenceMoment_DNA_3_03_256,Terminal_Nuclei_Texture_SumAverage_AnnexinV_3_00_256,Terminal_Nuclei_Texture_SumAverage_DNA_3_01_256
0,C-02,0.0,175.0,control,1,Staurosporine,negative,0.045544,0.423336,0.05703,...,-0.533153,-0.65405,-0.435819,-0.635175,0.643979,-0.08743,0.527117,0.47445,-0.01127,0.347586
1,C-03,0.61,179.0,negative,1,Staurosporine,test,0.119876,0.39144,-0.130041,...,-0.808012,-0.65405,-0.832951,-0.666071,0.643979,-0.666506,0.176932,0.47445,-0.02013,0.347586
2,C-04,1.22,174.0,negative,1,Staurosporine,test,0.203221,0.351816,-0.23049,...,-0.727441,-0.65405,-0.832951,-0.666071,0.643979,-0.623357,0.140025,0.47445,-0.02013,0.347586
3,C-05,2.44,175.0,negative,1,Staurosporine,test,0.283922,0.424011,-0.336284,...,-0.625957,-0.65405,-0.832951,-0.666071,0.643979,-0.607609,0.219762,0.47445,-0.015241,0.347586
4,C-06,4.88,149.0,positive,1,Staurosporine,test,0.446401,0.397404,-0.490245,...,-0.593535,-0.65405,-0.583079,-0.666071,0.643979,-0.250069,0.447626,0.47445,-0.02013,0.347586


In [9]:
# Unique (dose, well) pairs present in the merged bulk frame, re-indexed from 0.
dose_wells = (
    bulk_df[["Metadata_dose", "Metadata_Well"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [10]:
# there are 10 doses, with three wells each
# one randomly chosen well per dose is held out for testing;
# every well that is not selected is used for training
#
# The generator is seeded so the split is identical on every
# Restart & Run All; an unseeded draw yields a different split on each run.
split_seed = 0
split_rng = np.random.default_rng(split_seed)

test_wells = []
for dose in dose_wells["Metadata_dose"].unique():
    wells = dose_wells.loc[
        dose_wells["Metadata_dose"] == dose, "Metadata_Well"
    ].tolist()
    selected_well = split_rng.choice(wells)
    print(f"Selected well {selected_well} for dose {dose}")
    # store plain Python strings rather than numpy string scalars
    test_wells.append(str(selected_well))

train_wells = dose_wells.loc[
    ~dose_wells["Metadata_Well"].isin(test_wells), "Metadata_Well"
].tolist()

Selected well D-02 for dose 0.0
Selected well C-03 for dose 0.61
Selected well C-04 for dose 1.22
Selected well C-05 for dose 2.44
Selected well D-06 for dose 4.88
Selected well E-08 for dose 19.53
Selected well E-09 for dose 39.06
Selected well E-10 for dose 78.13
Selected well C-11 for dose 156.25
Selected well D-07 for dose 9.77


In [11]:
# Split the merged profiles by well membership and re-index each part from 0.
in_train_wells = bulk_df["Metadata_Well"].isin(train_wells)
in_test_wells = bulk_df["Metadata_Well"].isin(test_wells)
train_df = bulk_df.loc[in_train_wells].reset_index(drop=True)
test_df = bulk_df.loc[in_test_wells].reset_index(drop=True)

# Persist both splits as parquet files (without the index) in the splits directory.
train_df_file_path = data_splits_dir / "train.parquet"
test_df_file_path = data_splits_dir / "test.parquet"
train_df.to_parquet(train_df_file_path, index=False)
test_df.to_parquet(test_df_file_path, index=False)

In [12]:
# Report the training split's (rows, columns) shape, then preview its first rows.
train_shape = train_df.shape
print("Train data shape: ", train_shape)
train_df.head(n=5)

Train data shape:  (20, 2858)


Unnamed: 0,Metadata_Well,Metadata_dose,Metadata_number_of_singlecells,Metadata_apoptosis_ground_truth,Metadata_plate,Metadata_compound,Metadata_control,Cells_AreaShape_Compactness_CP,Cells_AreaShape_Eccentricity_CP,Cells_AreaShape_Extent_CP,...,Terminal_Nuclei_Texture_Correlation_AnnexinV_3_00_256,Terminal_Nuclei_Texture_Correlation_AnnexinV_3_01_256,Terminal_Nuclei_Texture_Correlation_AnnexinV_3_02_256,Terminal_Nuclei_Texture_Correlation_AnnexinV_3_03_256,Terminal_Nuclei_Texture_Correlation_DNA_3_02_256,Terminal_Nuclei_Texture_DifferenceVariance_AnnexinV_3_01_256,Terminal_Nuclei_Texture_InverseDifferenceMoment_AnnexinV_3_03_256,Terminal_Nuclei_Texture_InverseDifferenceMoment_DNA_3_03_256,Terminal_Nuclei_Texture_SumAverage_AnnexinV_3_00_256,Terminal_Nuclei_Texture_SumAverage_DNA_3_01_256
0,C-02,0.0,175.0,control,1,Staurosporine,negative,0.045544,0.423336,0.05703,...,-0.4321,-0.587543,-0.440194,-0.486455,0.670119,-0.103772,0.463732,0.521993,-0.073682,0.416877
1,C-06,4.88,134.0,negative,1,Staurosporine,test,-0.853027,-0.335576,0.908984,...,-0.594116,-0.648602,-0.60168,-0.665333,0.670119,-0.662763,0.175557,0.521993,-0.108781,0.416877
2,C-08,19.53,133.0,negative,1,Staurosporine,test,-0.946101,-0.547828,1.173377,...,-0.5842,-0.580215,-0.466295,-0.665333,0.670119,0.048387,0.436127,0.521993,-0.099644,0.416877
3,C-09,39.06,88.0,positive,1,Staurosporine,positive,-0.983683,-0.845926,1.188321,...,-0.223819,-0.343047,-0.069238,-0.485761,-0.68228,0.087037,0.501674,0.11508,0.019048,1.008354
4,C-10,78.13,111.0,positive,1,Staurosporine,test,-1.073107,-1.094211,1.275899,...,-0.034566,-0.254892,0.156787,-0.33671,-0.601644,-0.183362,0.438211,-0.123443,0.121158,2.583897


In [13]:
# Make sure well C-07 is present in the in-memory test set: when no test row
# comes from C-07, its rows are taken from bulk_df and appended.
# NOTE(review): test.parquet is written before this cell runs and is not
# rewritten here, so rows appended below are not in the saved file — confirm
# whether that is intended.
# NOTE(review): if C-07 is among the training wells, its rows end up in both
# train_df and test_df — confirm.
forced_test_well = "C-07"
if not test_df["Metadata_Well"].eq(forced_test_well).any():
    forced_rows = bulk_df.loc[bulk_df["Metadata_Well"] == forced_test_well]
    test_df = pd.concat(
        [test_df, forced_rows.reset_index(drop=True)], ignore_index=True
    )
print("Test data shape: ", test_df.shape)

Test data shape:  (10, 2858)


In [14]:
# Build a lookup table that records, for every well, which split it belongs to.
# The table has two columns: Metadata_Well and data_split ("train" or "test").
train_well_df = pd.DataFrame(train_wells, columns=["Metadata_Well"]).assign(
    data_split="train"
)
test_well_df = pd.DataFrame(test_wells, columns=["Metadata_Well"]).assign(
    data_split="test"
)
# Training wells come first, followed by the test wells.
train_test_well_df = pd.concat([train_well_df, test_well_df], axis=0)

# Save the well-to-split table as parquet (without the index).
train_test_well_file_path = data_splits_dir / "train_test_wells.parquet"
train_test_well_df.to_parquet(train_test_well_file_path, index=False)