In [1]:
import pathlib

import joblib
import numpy as np
import pandas as pd

In [2]:
# load the training data
profile_file_dir = pathlib.Path(
    "../../data/CP_scDINO_features/combined_CP_scDINO_norm_fs_aggregated.parquet"
).resolve(strict=True)
model_file_dir = pathlib.Path(
    "../models/multi_regression_model_ntrees_1000.joblib"
).resolve(strict=True)
shuffled_model_file_dir = pathlib.Path(
    "../models/shuffled_multi_regression_model_ntrees_1000.joblib"
).resolve(strict=True)
terminal_column_names = pathlib.Path("../results/terminal_columns.txt").resolve(
    strict=True
)
predictions_save_path = pathlib.Path(
    "../results/predicted_terminal_profiles.parquet"
).resolve()
terminal_column_names = [
    line.strip() for line in terminal_column_names.read_text().splitlines()
]
results_dir = pathlib.Path("../results/").resolve()
results_dir.mkdir(parents=True, exist_ok=True)
profile_df = pd.read_parquet(profile_file_dir)
print(profile_df.shape)
profile_df.head()

(142040, 2425)


Unnamed: 0,Metadata_Well,Metadata_Time,Metadata_dose,Metadata_plate,Metadata_number_of_singlecells,Metadata_compound,Metadata_control,Metadata_ImageNumber,Metadata_FOV,Metadata_Cells_Number_Object_Number,...,channel_DNA_cls_feature_91_scDINO,channel_DNA_cls_feature_92_scDINO,channel_DNA_cls_feature_93_scDINO,channel_DNA_cls_feature_94_scDINO,channel_DNA_cls_feature_95_scDINO,channel_DNA_cls_feature_96_scDINO,channel_DNA_cls_feature_97_scDINO,channel_DNA_cls_feature_98_scDINO,channel_DNA_cls_feature_99_scDINO,channel_DNA_cls_feature_9_scDINO
0,C-02,0.0,0.0,1,192,Staurosporine,negative,1,3,18.0,...,0.229693,-0.149462,0.096256,-0.093128,0.044964,0.164799,0.055467,0.005083,-0.242997,0.102387
1,C-02,0.0,0.0,1,192,Staurosporine,negative,1,3,21.0,...,0.229693,-0.149462,0.096256,-0.093128,0.044964,0.164799,0.055467,0.005083,-0.242997,0.102387
2,C-02,0.0,0.0,1,192,Staurosporine,negative,1,3,27.0,...,0.229693,-0.149462,0.096256,-0.093128,0.044964,0.164799,0.055467,0.005083,-0.242997,0.102387
3,C-02,0.0,0.0,1,192,Staurosporine,negative,1,3,38.0,...,0.229693,-0.149462,0.096256,-0.093128,0.044964,0.164799,0.055467,0.005083,-0.242997,0.102387
4,C-02,0.0,0.0,1,192,Staurosporine,negative,1,3,41.0,...,0.229693,-0.149462,0.096256,-0.093128,0.044964,0.164799,0.055467,0.005083,-0.242997,0.102387


## Get the non-shuffled predictions

In [3]:
# load the model
model = joblib.load(model_file_dir)
shuffled_model = joblib.load(shuffled_model_file_dir)

metadata_columns = [x for x in profile_df.columns if "Metadata_" in x]
# remove metadata columns
features = profile_df.drop(columns=metadata_columns)
metadata_df = profile_df[metadata_columns]
# predict the terminal feature space
predictions = model.predict(features)
predictions_df = pd.DataFrame(predictions, columns=terminal_column_names)
# insert the metadata columns
for col in metadata_columns:
    predictions_df.insert(0, col, metadata_df[col])
predictions_df["shuffled"] = False

## Get the shuffled predictions

In [4]:
# load the model
shuffled_model = joblib.load(shuffled_model_file_dir)

metadata_columns = [x for x in profile_df.columns if "Metadata_" in x]
shuffled_profile_df = profile_df.copy()
for col in shuffled_profile_df.columns:
    shuffled_profile_df[col] = np.random.permutation(shuffled_profile_df[col])
# remove metadata columns
features = shuffled_profile_df.drop(columns=metadata_columns)
metadata_df = profile_df[metadata_columns]


# predict the terminal feature space
predictions = shuffled_model.predict(features)
shuffled_predictions_df = pd.DataFrame(predictions, columns=terminal_column_names)
# insert the metadata columns
for col in metadata_columns:
    shuffled_predictions_df.insert(0, col, metadata_df[col])
shuffled_predictions_df["shuffled"] = True

In [5]:
final_predictions_df = pd.concat([predictions_df, shuffled_predictions_df], axis=0)
# save the predictions
final_predictions_df.to_parquet(predictions_save_path, index=False)
final_predictions_df

Unnamed: 0,Metadata_image_path,Metadata_original_index,Metadata_Nuclei_AreaShape_BoundingBoxArea,Metadata_distance,Metadata_coordinates_y,Metadata_parent_id,Metadata_parent_track_id,Metadata_id,Metadata_x,Metadata_y,...,Terminal_Texture_DifferenceVariance_AnnexinV_3_02_256,Terminal_Texture_DifferenceVariance_DNA_3_00_256,Terminal_Texture_InfoMeas1_AnnexinV_3_03_256,Terminal_Texture_InfoMeas1_DNA_3_01_256,Terminal_Texture_InfoMeas2_AnnexinV_3_03_256,Terminal_Texture_InfoMeas2_DNA_3_00_256,Terminal_Texture_InverseDifferenceMoment_AnnexinV_3_00_256,Terminal_Texture_SumVariance_AnnexinV_3_03_256,Terminal_Texture_SumVariance_DNA_3_02_256,shuffled
0,../0.pre-process_images/data/processed_images/...,53685,-0.583917,0.407440,"[1114.0, 90.0]",-1.0,-1,1000017.0,1114.0,90.0,...,1.114376,0.358004,0.124886,-0.983630,0.400730,2.402267,-0.751686,0.096900,1.042224,False
1,../0.pre-process_images/data/processed_images/...,53686,-2.201958,0.405451,"[1007.0, 91.0]",-1.0,-1,1000019.0,1007.0,91.0,...,1.114376,0.358004,0.124886,-0.983630,0.400730,2.402267,-0.751686,0.096900,1.042224,False
2,../0.pre-process_images/data/processed_images/...,53687,-0.839397,0.477388,"[455.0, 126.0]",-1.0,-1,1000023.0,455.0,126.0,...,1.114376,0.358004,0.124886,-0.983630,0.400730,2.402267,-0.751686,0.096900,1.042224,False
3,../0.pre-process_images/data/processed_images/...,53689,0.041477,0.439914,"[489.0, 212.0]",-1.0,-1,1000033.0,489.0,212.0,...,1.114376,0.358004,0.124886,-0.983630,0.400730,2.402267,-0.751686,0.096900,1.042224,False
4,../0.pre-process_images/data/processed_images/...,53690,-0.729399,0.487140,"[1121.0, 230.0]",-1.0,-1,1000037.0,1121.0,230.0,...,1.114376,0.358004,0.124886,-0.983630,0.400730,2.402267,-0.751686,0.096900,1.042224,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142035,../0.pre-process_images/data/processed_images/...,88979,-1.350358,0.275464,"[1720.0, 1657.0]",9000120.0,-1,10000123.0,1720.0,1657.0,...,0.506177,-0.273741,-0.595206,-2.645686,0.584688,0.752218,-0.089401,0.460364,1.834157,True
142036,../0.pre-process_images/data/processed_images/...,88980,-1.071813,0.438745,"[591.0, 1750.0]",9000125.0,-1,10000128.0,591.0,1750.0,...,0.496251,-0.290017,-0.577522,-2.705288,0.642291,0.653935,-0.055990,0.440545,1.877803,True
142037,../0.pre-process_images/data/processed_images/...,88981,-1.595193,0.502916,"[1800.0, 1762.0]",9000127.0,-1,10000129.0,1800.0,1762.0,...,0.513810,-0.254332,-0.608967,-2.488247,0.543390,0.722138,-0.149686,0.451874,1.916069,True
142038,../0.pre-process_images/data/processed_images/...,88983,-1.315761,0.515235,"[60.0, 1799.0]",9000131.0,-1,10000134.0,60.0,1799.0,...,0.515279,-0.275084,-0.562313,-2.306159,0.543688,0.712340,-0.067451,0.251544,1.812782,True
