In [1]:
import pathlib

import joblib
import numpy as np
import pandas as pd
import pycytominer

In [2]:
train_test_wells_path = pathlib.Path(
    "../data_splits/train_test_wells.parquet"
).resolve()

predictions_save_path = pathlib.Path(
    "../results/predicted_terminal_profiles_from_all_time_points.parquet"
).resolve()

profile_data_path = pathlib.Path(
    "../../data/CP_scDINO_features/combined_CP_scDINO_norm_fs_aggregated.parquet"
).resolve()
terminal_column_names = pathlib.Path("../results/terminal_columns.txt").resolve(
    strict=True
)
terminal_column_names = [
    line.strip() for line in terminal_column_names.read_text().splitlines()
]

data_split_df = pd.read_parquet(train_test_wells_path)
df = pd.read_parquet(profile_data_path)
metadata_cols = [cols for cols in df.columns if "Metadata" in cols]
features_cols = [cols for cols in df.columns if "Metadata" not in cols]
features_cols = features_cols
aggregate_df = pycytominer.aggregate(
    population_df=df,
    strata=["Metadata_Well", "Metadata_Time"],
    features=features_cols,
    operation="median",
)


metadata_df = df[metadata_cols]
metadata_df = metadata_df.drop_duplicates(subset=["Metadata_Well", "Metadata_Time"])
metadata_df = metadata_df.reset_index(drop=True)
aggregate_df = pd.merge(
    metadata_df, aggregate_df, on=["Metadata_Well", "Metadata_Time"]
)
print(aggregate_df.shape)
aggregate_df.head()

(389, 2396)


Unnamed: 0,Metadata_Well,Metadata_Time,Metadata_dose,Metadata_number_of_singlecells,Metadata_plate,Metadata_compound,Metadata_control,Cells_AreaShape_Compactness_CP,Cells_AreaShape_Eccentricity_CP,Cells_AreaShape_Extent_CP,...,channel_DNA_cls_feature_91_scDINO,channel_DNA_cls_feature_92_scDINO,channel_DNA_cls_feature_93_scDINO,channel_DNA_cls_feature_94_scDINO,channel_DNA_cls_feature_95_scDINO,channel_DNA_cls_feature_96_scDINO,channel_DNA_cls_feature_97_scDINO,channel_DNA_cls_feature_98_scDINO,channel_DNA_cls_feature_99_scDINO,channel_DNA_cls_feature_9_scDINO
0,C-02,0.0,0.0,183.0,1,Staurosporine,negative,-0.07711,0.475957,0.091597,...,0.229725,-0.149357,0.096118,-0.092986,0.045031,0.164684,0.055407,0.004974,-0.242772,0.102413
1,C-02,1.0,0.0,180.0,1,Staurosporine,negative,-0.296513,0.28499,0.226765,...,-0.073892,0.133531,-0.0378,0.005971,-0.243315,0.357192,-0.009844,0.177669,-0.066157,-0.072011
2,C-02,10.0,0.0,173.0,1,Staurosporine,negative,-0.004368,0.455522,-0.063581,...,-0.026653,-0.161838,0.110146,0.150975,-0.567474,0.259897,-0.010044,0.20663,-0.001741,-0.023353
3,C-02,11.0,0.0,174.0,1,Staurosporine,negative,0.187565,0.547188,-0.132059,...,-0.092784,0.05755,0.197094,0.240283,-0.608368,0.200364,0.065202,0.29587,0.237916,-0.137402
4,C-02,12.0,0.0,154.0,1,Staurosporine,negative,0.035033,0.554286,-0.355493,...,-0.093603,-0.027693,0.373045,-0.008521,-0.774369,0.443792,0.342298,0.417925,0.019141,-0.034194


In [11]:
models_path = pathlib.Path("../models/").resolve(strict=True)
models = pathlib.Path(models_path).glob("*.joblib")
models_dict = {
    "model_name": [],
    "model_path": [],
    "shuffled": [],
    "feature": [],
}

for model_path in models:
    print(model_path.name)
    # print(model_path.name.split("singlefeature")[1].strip(".joblib").strip("_"))
    models_dict["model_name"].append(model_path.name)
    models_dict["model_path"].append(model_path)
    models_dict["shuffled"].append(
        "shuffled" if "shuffled" in model_path.name else "not_shuffled"
    )
    models_dict["feature"].append(
        model_path.name.split("singlefeature")[1].strip(".joblib").strip("_")
        if "singlefeature" in model_path.name
        else "all_terminal_features"
    )

train_shuffled_elastic_net_model_singlefeature_Terminal_Cytoplasm_Intensity_MaxIntensity_AnnexinV.joblib
train_elastic_net_model_singlefeature_Terminal_Cytoplasm_Intensity_MaxIntensity_AnnexinV.joblib
train_elastic_net_model_singlefeature_Terminal_Cells_Intensity_MaxIntensityEdge_AnnexinV.joblib
train_shuffled_elastic_net_model_singlefeature_Terminal_Cells_Intensity_MaxIntensityEdge_AnnexinV.joblib
train_shuffled_elastic_net_model_all_terminal_features.joblib
train_elastic_net_model_all_terminal_features.joblib


In [12]:
# map the train/test wells to the aggregate data
aggregate_df["Metadata_data_split"] = aggregate_df["Metadata_Well"].map(
    data_split_df.set_index("Metadata_Well")["data_split"]
)
data_split = aggregate_df.pop("Metadata_data_split")
aggregate_df.insert(0, "Metadata_data_split", data_split)
aggregate_df["Metadata_Time"] = aggregate_df["Metadata_Time"].astype(float)
# drop NaN values in the terminal columns
aggregate_df = aggregate_df.dropna(subset="Metadata_data_split")
aggregate_df["Metadata_data_split"].unique()

array(['train', 'test'], dtype=object)

In [13]:
# if the data_split is train and the time is not 12 then set to non_trained_pair
# where 12 is the last time point
aggregate_df["Metadata_data_split"] = aggregate_df.apply(
    lambda x: (
        "non_trained_pair"
        if (x["Metadata_data_split"] == "train" and x["Metadata_Time"] != 12.0)
        else x["Metadata_data_split"]
    ),
    axis=1,
)

In [14]:
metadata_columns = [x for x in aggregate_df.columns if "metadata" in x.lower()]
aggregate_features_df = aggregate_df.drop(columns=metadata_columns, errors="ignore")

In [15]:
models_dict["model_name"]

['train_shuffled_elastic_net_model_singlefeature_Terminal_Cytoplasm_Intensity_MaxIntensity_AnnexinV.joblib',
 'train_elastic_net_model_singlefeature_Terminal_Cytoplasm_Intensity_MaxIntensity_AnnexinV.joblib',
 'train_elastic_net_model_singlefeature_Terminal_Cells_Intensity_MaxIntensityEdge_AnnexinV.joblib',
 'train_shuffled_elastic_net_model_singlefeature_Terminal_Cells_Intensity_MaxIntensityEdge_AnnexinV.joblib',
 'train_shuffled_elastic_net_model_all_terminal_features.joblib',
 'train_elastic_net_model_all_terminal_features.joblib']

In [19]:
aggregate_df

Unnamed: 0,Metadata_data_split,Metadata_Well,Metadata_Time,Metadata_dose,Metadata_number_of_singlecells,Metadata_plate,Metadata_compound,Metadata_control,Cells_AreaShape_Compactness_CP,Cells_AreaShape_Eccentricity_CP,...,channel_DNA_cls_feature_91_scDINO,channel_DNA_cls_feature_92_scDINO,channel_DNA_cls_feature_93_scDINO,channel_DNA_cls_feature_94_scDINO,channel_DNA_cls_feature_95_scDINO,channel_DNA_cls_feature_96_scDINO,channel_DNA_cls_feature_97_scDINO,channel_DNA_cls_feature_98_scDINO,channel_DNA_cls_feature_99_scDINO,channel_DNA_cls_feature_9_scDINO
0,non_trained_pair,C-02,0.0,0.0,183.0,1,Staurosporine,negative,-0.077110,0.475957,...,0.229725,-0.149357,0.096118,-0.092986,0.045031,0.164684,0.055407,0.004974,-0.242772,0.102413
1,non_trained_pair,C-02,1.0,0.0,180.0,1,Staurosporine,negative,-0.296513,0.284990,...,-0.073892,0.133531,-0.037800,0.005971,-0.243315,0.357192,-0.009844,0.177669,-0.066157,-0.072011
2,non_trained_pair,C-02,10.0,0.0,173.0,1,Staurosporine,negative,-0.004368,0.455522,...,-0.026653,-0.161838,0.110146,0.150975,-0.567474,0.259897,-0.010044,0.206630,-0.001741,-0.023353
3,non_trained_pair,C-02,11.0,0.0,174.0,1,Staurosporine,negative,0.187565,0.547188,...,-0.092784,0.057550,0.197094,0.240283,-0.608368,0.200364,0.065202,0.295870,0.237916,-0.137402
4,train,C-02,12.0,0.0,154.0,1,Staurosporine,negative,0.035033,0.554286,...,-0.093603,-0.027693,0.373045,-0.008521,-0.774369,0.443792,0.342298,0.417925,0.019141,-0.034194
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
384,test,E-11,5.0,156.25,180.0,1,Staurosporine,test,-0.777405,-0.709105,...,-0.220834,0.182320,-0.155501,0.247160,0.735149,0.010485,-0.120143,-0.159440,-0.193372,-0.154826
385,test,E-11,6.0,156.25,130.0,1,Staurosporine,test,-0.806110,-0.837921,...,-0.397589,0.022287,-0.427573,0.074307,1.095767,-0.264573,-0.048047,-0.421024,-0.220308,-0.011652
386,test,E-11,7.0,156.25,127.0,1,Staurosporine,test,-0.819519,-0.969240,...,-0.521659,0.734587,-0.052496,0.345912,0.866253,-0.187128,-0.184685,-0.310724,-0.152335,-0.077080
387,test,E-11,8.0,156.25,122.0,1,Staurosporine,test,-0.939584,-1.125350,...,-0.736949,0.609114,0.059758,0.393962,1.762298,-0.199162,-0.128697,-0.320877,-0.096763,-0.015893


In [16]:
results_dict = {}
for i, model_name in enumerate(models_dict["feature"]):
    model = joblib.load(models_dict["model_path"][i])
    if models_dict["feature"][i] != "all_terminal_features":
        print(models_dict["feature"][i])
        predicted_df = pd.DataFrame(
            model.predict(aggregate_features_df),
            columns=[models_dict["feature"][i]],
        )
    else:
        print("all_terminal_features")
        predicted_df = pd.DataFrame(
            model.predict(aggregate_features_df),
            columns=terminal_column_names,
        )
    predicted_df[metadata_columns] = aggregate_df[metadata_columns]
    predicted_df["shuffled"] = models_dict["shuffled"][i]
    # drop nan value
    predicted_df = predicted_df.dropna()

    # check if a key for the feature already exists in results_dict
    if f"{models_dict['feature'][i]}" in results_dict:
        temporary_df = pd.concat(
            [results_dict[f"{models_dict['feature'][i]}"], predicted_df],
            ignore_index=True,
            sort=False,
        )
        results_dict[f"{models_dict['feature'][i]}"] = temporary_df
    else:
        results_dict[f"{models_dict['feature'][i]}"] = predicted_df

    print(results_dict[f"{models_dict['feature'][i]}"].shape)

Terminal_Cytoplasm_Intensity_MaxIntensity_AnnexinV
(365, 10)
Terminal_Cytoplasm_Intensity_MaxIntensity_AnnexinV
(730, 10)
Terminal_Cells_Intensity_MaxIntensityEdge_AnnexinV
(365, 10)
Terminal_Cells_Intensity_MaxIntensityEdge_AnnexinV
(730, 10)
all_terminal_features
(365, 526)
all_terminal_features
(730, 526)


In [17]:
for model in results_dict.keys():
    save_path = pathlib.Path(f"../results/{model}.parquet").resolve()
    save_path.parent.mkdir(parents=True, exist_ok=True)
    results_dict[model].to_parquet(save_path, index=False)