In [None]:
import pathlib

import joblib
import numpy as np
import pandas as pd
import pycytominer

In [2]:
# --- Input / output locations, resolved relative to the working directory ---
# NOTE: despite the "_dir" suffix, these two variables point at model *files*;
# the names are kept because later cells reference them.
model_file_dir = pathlib.Path(
    "../models/multi_regression_model_ntrees_100.joblib"
).resolve()
shuffled_model_file_dir = pathlib.Path(
    "../models/shuffled_multi_regression_model_ntrees_100.joblib"
).resolve()
train_test_wells_path = pathlib.Path(
    "../data_splits/train_test_wells.parquet"
).resolve()

predictions_save_path = pathlib.Path(
    "../results/predicted_terminal_profiles_from_all_time_points.parquet"
).resolve()

profile_data_path = pathlib.Path(
    "../../data/CP_scDINO_features/combined_CP_scDINO_norm_fs_aggregated.parquet"
).resolve()

# strict=True makes a missing column list fail here instead of at read time.
terminal_columns_path = pathlib.Path("../results/terminal_columns.txt").resolve(
    strict=True
)
# One output column name per line of the text file.
terminal_column_names = [
    line.strip() for line in terminal_columns_path.read_text().splitlines()
]

# --- Load the well-level split assignment and the profiles ---
data_split_df = pd.read_parquet(train_test_wells_path)
df = pd.read_parquet(profile_data_path)

# Columns are partitioned purely by whether their name contains "Metadata".
metadata_cols = [cols for cols in df.columns if "Metadata" in cols]
features_cols = [cols for cols in df.columns if "Metadata" not in cols]

# --- Collapse to one median feature profile per (well, time point) ---
aggregate_df = pycytominer.aggregate(
    population_df=df,
    strata=["Metadata_Well", "Metadata_Time"],
    features=features_cols,
    operation="median",
)

# Re-attach metadata: keep a single metadata row per (well, time point) and
# join it back onto the aggregated features using the same two keys.
metadata_df = (
    df[metadata_cols]
    .drop_duplicates(subset=["Metadata_Well", "Metadata_Time"])
    .reset_index(drop=True)
)
aggregate_df = pd.merge(
    metadata_df, aggregate_df, on=["Metadata_Well", "Metadata_Time"]
)
print(aggregate_df.shape)
aggregate_df.head()

(390, 2425)


Unnamed: 0,Metadata_Well,Metadata_Time,Metadata_dose,Metadata_plate,Metadata_number_of_singlecells,Metadata_compound,Metadata_control,Metadata_ImageNumber,Metadata_FOV,Metadata_Cells_Number_Object_Number,...,channel_DNA_cls_feature_91_scDINO,channel_DNA_cls_feature_92_scDINO,channel_DNA_cls_feature_93_scDINO,channel_DNA_cls_feature_94_scDINO,channel_DNA_cls_feature_95_scDINO,channel_DNA_cls_feature_96_scDINO,channel_DNA_cls_feature_97_scDINO,channel_DNA_cls_feature_98_scDINO,channel_DNA_cls_feature_99_scDINO,channel_DNA_cls_feature_9_scDINO
0,C-02,0.0,0.0,1,192,Staurosporine,negative,1,3,18.0,...,0.229693,-0.149462,0.096256,-0.093128,0.044964,0.164799,0.055467,0.005083,-0.242997,0.102387
1,C-02,1.0,0.0,1,185,Staurosporine,negative,2,3,10.0,...,-0.073834,0.133605,-0.037798,0.005786,-0.243411,0.357136,-0.009882,0.177561,-0.066328,-0.071994
2,C-02,10.0,0.0,1,173,Staurosporine,negative,11,3,8.0,...,-0.040882,-0.081997,0.103874,0.148222,-0.595298,0.27276,-0.000339,0.206419,-0.037941,-0.095736
3,C-02,11.0,0.0,1,174,Staurosporine,negative,12,3,2.0,...,-0.06713,0.04054,0.169835,0.272656,-0.523787,0.283778,0.10986,0.27764,0.116211,-0.166754
4,C-02,12.0,0.0,1,174,Staurosporine,negative,13,3,3.0,...,0.033469,0.06268,0.357989,0.291235,-0.693899,0.37725,0.338003,0.342171,0.058405,-0.129908


In [3]:
# Label every aggregated profile with the train/test assignment of its well
# and place that label in the first column of the frame.
well_to_split = data_split_df.set_index("Metadata_Well")["data_split"]
data_split = (
    aggregate_df["Metadata_Well"].map(well_to_split).rename("Metadata_data_split")
)
# Drop a stale label column first so the cell can be re-run safely.
if "Metadata_data_split" in aggregate_df.columns:
    del aggregate_df["Metadata_data_split"]
aggregate_df.insert(0, "Metadata_data_split", data_split)

# Time points are compared numerically later on, so store them as floats.
aggregate_df["Metadata_Time"] = aggregate_df["Metadata_Time"].astype(float)

# Show which split labels are present.
aggregate_df["Metadata_data_split"].unique()

array(['test', 'train'], dtype=object)

In [4]:
# Preview the first 15 aggregated profiles to eyeball the split label,
# well and time-point columns before relabelling.
aggregate_df.head(15)

Unnamed: 0,Metadata_data_split,Metadata_Well,Metadata_Time,Metadata_dose,Metadata_plate,Metadata_number_of_singlecells,Metadata_compound,Metadata_control,Metadata_ImageNumber,Metadata_FOV,...,channel_DNA_cls_feature_91_scDINO,channel_DNA_cls_feature_92_scDINO,channel_DNA_cls_feature_93_scDINO,channel_DNA_cls_feature_94_scDINO,channel_DNA_cls_feature_95_scDINO,channel_DNA_cls_feature_96_scDINO,channel_DNA_cls_feature_97_scDINO,channel_DNA_cls_feature_98_scDINO,channel_DNA_cls_feature_99_scDINO,channel_DNA_cls_feature_9_scDINO
0,test,C-02,0.0,0.0,1,192,Staurosporine,negative,1,3,...,0.229693,-0.149462,0.096256,-0.093128,0.044964,0.164799,0.055467,0.005083,-0.242997,0.102387
1,test,C-02,1.0,0.0,1,185,Staurosporine,negative,2,3,...,-0.073834,0.133605,-0.037798,0.005786,-0.243411,0.357136,-0.009882,0.177561,-0.066328,-0.071994
2,test,C-02,10.0,0.0,1,173,Staurosporine,negative,11,3,...,-0.040882,-0.081997,0.103874,0.148222,-0.595298,0.27276,-0.000339,0.206419,-0.037941,-0.095736
3,test,C-02,11.0,0.0,1,174,Staurosporine,negative,12,3,...,-0.06713,0.04054,0.169835,0.272656,-0.523787,0.283778,0.10986,0.27764,0.116211,-0.166754
4,test,C-02,12.0,0.0,1,174,Staurosporine,negative,13,3,...,0.033469,0.06268,0.357989,0.291235,-0.693899,0.37725,0.338003,0.342171,0.058405,-0.129908
5,test,C-02,2.0,0.0,1,193,Staurosporine,negative,3,3,...,-0.046514,0.170776,0.089048,0.103085,-0.318048,0.227679,0.011619,0.135376,-0.095054,-0.128167
6,test,C-02,3.0,0.0,1,181,Staurosporine,negative,4,3,...,-0.017163,0.114952,0.118392,0.082461,-0.270823,0.340168,0.008793,0.147805,-0.133435,-0.130127
7,test,C-02,4.0,0.0,1,181,Staurosporine,negative,5,3,...,-0.037211,0.029985,0.092362,-0.042179,-0.391595,0.230738,-0.124246,0.204051,-0.036376,-0.144271
8,test,C-02,5.0,0.0,1,183,Staurosporine,negative,6,3,...,-0.1699,0.115446,0.115627,-0.092404,-0.437182,0.33079,-0.017349,0.196531,-0.048901,-0.120669
9,test,C-02,6.0,0.0,1,181,Staurosporine,negative,7,3,...,-0.106118,0.063911,0.161602,-0.044504,-0.396253,0.240502,-0.024445,0.185498,-0.040558,-0.102551


In [5]:
# Relabel profiles from "train" wells at any time point other than the last
# one as "non_trained_pair"; every other label is left unchanged.
# 12 is the last time point.
TERMINAL_TIME_POINT = 12.0

# Vectorised mask instead of a row-wise apply: same labels, without building
# a Series for every row of a very wide frame.
is_non_terminal_train = (aggregate_df["Metadata_data_split"] == "train") & (
    aggregate_df["Metadata_Time"] != TERMINAL_TIME_POINT
)
aggregate_df["Metadata_data_split"] = aggregate_df["Metadata_data_split"].mask(
    is_non_terminal_train, "non_trained_pair"
)

In [6]:
# Load the fitted model from disk.
model = joblib.load(model_file_dir)

# Separate the metadata from the model inputs; a column counts as metadata
# when its name contains "Metadata_".
metadata_columns = [name for name in aggregate_df.columns if "Metadata_" in name]
metadata_df = aggregate_df[metadata_columns]
features = aggregate_df.drop(columns=metadata_columns)

# Predict the terminal feature space for every aggregated profile.
predictions = model.predict(features)
predictions_df = pd.DataFrame(predictions, columns=terminal_column_names)

# Put the metadata in front of the predictions. The columns end up in reverse
# order (last metadata column first), and rows are matched on the index of
# the prediction frame.
leading_metadata = metadata_df[metadata_columns[::-1]].reindex(predictions_df.index)
predictions_df = pd.concat([leading_metadata, predictions_df], axis=1)

# Flag these rows as coming from the non-shuffled model.
predictions_df["shuffled"] = False

In [7]:
# Load the shuffled-baseline model from disk.
shuffled_model = joblib.load(shuffled_model_file_dir)

# Fixed seed so the shuffled baseline (and the file saved from it) is
# identical on every Restart & Run All.
SHUFFLE_SEED = 0
shuffle_rng = np.random.default_rng(SHUFFLE_SEED)

metadata_columns = [x for x in aggregate_df.columns if "Metadata_" in x]
feature_columns = [x for x in aggregate_df.columns if "Metadata_" not in x]

# Permute each feature column independently, which breaks the association
# between features within a row. Metadata columns are left untouched: they
# are dropped before prediction, and the metadata attached to the output
# comes from the un-shuffled frame.
shuffled_profile_df = aggregate_df.copy()
for col in feature_columns:
    shuffled_profile_df[col] = shuffle_rng.permutation(
        shuffled_profile_df[col].to_numpy()
    )

# remove metadata columns
features = shuffled_profile_df.drop(columns=metadata_columns)
metadata_df = aggregate_df[metadata_columns]

# predict the terminal feature space
predictions = shuffled_model.predict(features)
shuffled_predictions_df = pd.DataFrame(predictions, columns=terminal_column_names)

# Insert the (un-shuffled) metadata in front of the predictions. Inserting at
# position 0 each time leaves the metadata columns in reverse order.
for col in metadata_columns:
    shuffled_predictions_df.insert(0, col, metadata_df[col])
shuffled_predictions_df["shuffled"] = True

In [8]:
# Stack the real and shuffled-baseline predictions row-wise; the "shuffled"
# column tells the two sets apart.
prediction_frames = [predictions_df, shuffled_predictions_df]
final_predictions_df = pd.concat(prediction_frames, axis="index")

# Persist without the row index, then preview the first rows.
final_predictions_df.to_parquet(predictions_save_path, index=False)
final_predictions_df.head()

Unnamed: 0,Metadata_image_path,Metadata_original_index,Metadata_Nuclei_AreaShape_BoundingBoxArea,Metadata_distance,Metadata_coordinates_y,Metadata_parent_id,Metadata_parent_track_id,Metadata_id,Metadata_x,Metadata_y,...,Terminal_Texture_DifferenceVariance_AnnexinV_3_02_256,Terminal_Texture_DifferenceVariance_DNA_3_00_256,Terminal_Texture_InfoMeas1_AnnexinV_3_03_256,Terminal_Texture_InfoMeas1_DNA_3_01_256,Terminal_Texture_InfoMeas2_AnnexinV_3_03_256,Terminal_Texture_InfoMeas2_DNA_3_00_256,Terminal_Texture_InverseDifferenceMoment_AnnexinV_3_00_256,Terminal_Texture_SumVariance_AnnexinV_3_03_256,Terminal_Texture_SumVariance_DNA_3_02_256,shuffled
0,../0.pre-process_images/data/processed_images/...,53685,-0.583917,0.40744,"[1114.0, 90.0]",-1.0,-1,1000017.0,1114.0,90.0,...,1.135598,0.343934,0.068354,-0.959864,0.437552,2.457649,-0.748607,-0.006133,1.238921,False
1,../0.pre-process_images/data/processed_images/...,54566,0.054783,0.560859,"[1762.0, 87.0]",-1.0,-1,2000016.0,1762.0,87.0,...,1.106629,0.218864,-0.397224,-0.250449,0.286604,2.312183,1.421078,0.020617,-0.421854,False
2,../0.pre-process_images/data/processed_images/...,55122,0.86203,0.11412,"[215.0, 97.0]",10000010.0,-1,11000011.0,215.0,97.0,...,1.087481,0.173915,-0.446475,-0.583135,0.263619,-0.541023,1.501801,0.260832,-0.014531,False
3,../0.pre-process_images/data/processed_images/...,54286,1.922095,0.543618,"[361.0, 58.0]",11000007.0,-1,12000009.0,361.0,58.0,...,1.126041,-0.773366,-0.412851,-0.554986,0.274789,0.167624,1.35607,0.27829,0.165282,False
4,../0.pre-process_images/data/processed_images/...,54424,0.033493,0.491229,"[1815.0, 95.0]",12000011.0,-1,13000013.0,1815.0,95.0,...,0.313455,0.416276,0.100353,-0.563842,0.32672,0.342416,1.479265,0.444493,-0.119535,False
