In [1]:
import logging
import pathlib
import pprint
import sys

import joblib
import numpy as np
import optuna
import pandas as pd
import toml
import torch
from optuna.samplers import RandomSampler

sys.path.append("../ML_utils/")

from create_optimized_model import optimized_model_create
from extract_best_trial import extract_best_trial_params
from objective_creation import objective_model_optimizer
from parameter_set import parameter_set
from parameters import Parameters

try:
    cfg = get_ipython().config
    in_notebook = True
except NameError:
    in_notebook = False
if in_notebook:
    from tqdm.notebook import tqdm
else:
    from tqdm import tqdm

In [2]:
# read in the data
sc_file_path = pathlib.Path("../results/cleaned_sc_profile.parquet").resolve(
    strict=True
)
sc_endpoint_file_path = pathlib.Path(
    "../results/cleaned_endpoint_sc_profile.parquet"
).resolve(strict=True)

data_split_file_path = pathlib.Path("../results/data_splits.parquet").resolve(
    strict=True
)

sc_profile = pd.read_parquet(sc_file_path)
sc_endpoint_profile = pd.read_parquet(sc_endpoint_file_path)
data_split_df = pd.read_parquet(data_split_file_path)
print(f"sc_profile shape: {sc_profile.shape}")
print(f"sc_endpoint_profile shape: {sc_endpoint_profile.shape}")
print(f"data_split_df shape: {data_split_df.shape}")
data_split_df.head()

sc_profile shape: (182804, 2376)
sc_endpoint_profile shape: (11340, 368)
data_split_df shape: (14926, 3)


Unnamed: 0,index,data_split,data_x_or_y
0,7440,train_gt,X
1,7461,train_gt,X
2,7463,train_gt,X
3,7468,train_gt,X
4,7479,train_gt,X


In [3]:
# keep only the last timepoint
sc_profile["Metadata_Time"] = sc_profile["Metadata_Time"].astype("float64")
sc_profile = sc_profile[
    sc_profile["Metadata_Time"] == sc_profile["Metadata_Time"].max()
]
# drop Na values
sc_profile.dropna(inplace=True)
print(f"sc_profile shape after dropping NaN: {sc_profile.shape}")
sc_endpoint_profile.dropna(inplace=True)
print(f"sc_endpoint_profile shape after dropping NaN: {sc_endpoint_profile.shape}")
# hardcode the features that should exist in the y data
# this will be replaced in the future by an arg or config passed through
selected_y_features = ["Cells_Intensity_MeanIntensityEdge_AnnexinV"]
metadata_y_features = [x for x in sc_endpoint_profile.columns if "Metadata_" in x]
sc_endpoint_profile = sc_endpoint_profile[metadata_y_features + selected_y_features]
print(
    f"sc_endpoint_profile shape after selecting features: {sc_endpoint_profile.shape}"
)

sc_profile shape after dropping NaN: (14237, 2376)
sc_endpoint_profile shape after dropping NaN: (11136, 368)
sc_endpoint_profile shape after selecting features: (11136, 26)


In [4]:
profile_data_splits_df = data_split_df.loc[data_split_df["data_x_or_y"] == "X"]
profile_data_splits_df
endpoint_data_splits_df = data_split_df.loc[data_split_df["data_x_or_y"] == "y"]
endpoint_data_splits_df
# replace the index with the index column
profile_data_splits_df = profile_data_splits_df.set_index(
    "index", drop=True, verify_integrity=True
)
endpoint_data_splits_df = endpoint_data_splits_df.set_index(
    "index", drop=True, verify_integrity=True
)
# remove the index name from profile_data_splits_df
profile_data_splits_df.index.name = None
# remove the index name from endpoint_data_splits_df
endpoint_data_splits_df.index.name = None

In [5]:
# get only indexes from sc_profile that are in the train_gt split
train_gt_X = sc_profile.loc[
    sc_profile.index.isin(
        profile_data_splits_df.loc[
            profile_data_splits_df["data_split"] == "train_gt"
        ].index
    )
]
val_gt_X = sc_profile.loc[
    sc_profile.index.isin(
        profile_data_splits_df.loc[
            profile_data_splits_df["data_split"] == "val_gt"
        ].index
    )
]
train_gt_y = sc_endpoint_profile.loc[
    sc_endpoint_profile.index.isin(
        endpoint_data_splits_df.loc[
            endpoint_data_splits_df["data_split"] == "train_gt"
        ].index
    )
]
val_gt_y = sc_endpoint_profile.loc[
    sc_endpoint_profile.index.isin(
        endpoint_data_splits_df.loc[
            endpoint_data_splits_df["data_split"] == "val_gt"
        ].index
    )
]

# assertion checks
assert train_gt_X.shape[0] == train_gt_y.shape[0]
assert val_gt_X.shape[0] == val_gt_y.shape[0]
assert train_gt_X.shape[1] == val_gt_X.shape[1]
assert train_gt_y.shape[1] == val_gt_y.shape[1]

In [6]:
# get metadata
metadata_X_cols = [x for x in train_gt_X.columns if "Metadata_" in x]
metadata_y_cols = [x for x in train_gt_y.columns if "Metadata_" in x]


train_gt_X_metadata = train_gt_X[metadata_X_cols]
train_gt_X.drop(columns=metadata_X_cols, inplace=True)
train_gt_y_metadata = train_gt_y[metadata_y_cols]
train_gt_y.drop(columns=metadata_y_cols, inplace=True)
val_gt_X_metadata = val_gt_X[metadata_X_cols]
val_gt_X.drop(columns=metadata_X_cols, inplace=True)
val_gt_y_metadata = val_gt_y[metadata_y_cols]
val_gt_y.drop(columns=metadata_y_cols, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_gt_X.drop(columns=metadata_X_cols, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_gt_y.drop(columns=metadata_y_cols, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_gt_X.drop(columns=metadata_X_cols, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_g

In [7]:
# shuffle the data
shuffled_train_gt_X = train_gt_X.copy()
for col in shuffled_train_gt_X.columns:
    if col.startswith("Metadata_"):
        continue
    shuffled_train_gt_X[col] = np.random.permutation(shuffled_train_gt_X[col].values)
shuffled_val_gt_X = val_gt_X.copy()
for col in shuffled_val_gt_X.columns:
    if col.startswith("Metadata_"):
        continue
    shuffled_val_gt_X[col] = np.random.permutation(shuffled_val_gt_X[col].values)

In [8]:
# number of input features
n_features = train_gt_X.shape[1]
# number of output features
n_outputs = train_gt_y.shape[1]
# number of metadata features
n_metadata_features = train_gt_X_metadata.shape[1]

print(f"n_features: {n_features}")
print(f"n_outputs: {n_outputs}")
print(f"n_metadata_features: {n_metadata_features}")

n_features: 2338
n_outputs: 1
n_metadata_features: 38


In [9]:
params = Parameters()
ml_configs = toml.load("../ML_utils/regression_class_config.toml")
mlp_params = parameter_set(params, ml_configs)
mlp_params.IN_FEATURES = n_features
mlp_params.OUT_FEATURES = n_outputs

In [10]:
print(train_gt_X.shape, train_gt_y.shape)
print(val_gt_X.shape, val_gt_y.shape)

(367, 2338) (367, 1)
(49, 2338) (49, 1)


In [11]:
X_train = torch.tensor(train_gt_X.values, dtype=torch.float32)
y_train = torch.tensor(train_gt_y.values, dtype=torch.float32)
X_val = torch.tensor(val_gt_X.values, dtype=torch.float32)
y_val = torch.tensor(val_gt_y.values, dtype=torch.float32)

In [12]:
# get the dtypes of the data
print(f"X_train dtypes: {X_train.dtype}")
print(f"y_train dtypes: {y_train.dtype}")
print(f"X_val dtypes: {X_val.dtype}")
print(f"y_val dtypes: {y_val.dtype}")

X_train dtypes: torch.float32
y_train dtypes: torch.float32
X_val dtypes: torch.float32
y_val dtypes: torch.float32


In [13]:
# produce data objects for train, val and test datasets
train_data = torch.utils.data.TensorDataset(X_train, y_train)
val_data = torch.utils.data.TensorDataset(X_val, y_val)


# convert data class into a dataloader to be compatible with pytorch
train_loader = torch.utils.data.DataLoader(
    dataset=train_data, batch_size=mlp_params.HYPERPARAMETER_BATCH_SIZE, shuffle=True
)
valid_loader = torch.utils.data.DataLoader(
    dataset=val_data, batch_size=mlp_params.HYPERPARAMETER_BATCH_SIZE, shuffle=False
)

In [14]:
logger = logging.getLogger()
logger.setLevel(logging.INFO)

pathlib.Path("../logs").mkdir(parents=True, exist_ok=True)
# Create a file handler
file_handler = logging.FileHandler("../logs/optuna_log.txt")
file_handler.setLevel(logging.INFO)

# Create a formatter and set it to the handler
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
file_handler.setFormatter(formatter)

# Add the handler to the logger
logger.addHandler(file_handler)

# Optional: Set Optuna to use this logger
optuna.logging.set_verbosity(optuna.logging.INFO)
optuna.logging.enable_propagation()

In [15]:
# wrap the objective function inside of a lambda function to pass args...
objective_lambda_func = lambda trial: objective_model_optimizer(
    train_loader,
    valid_loader,
    trial=trial,
    params=params,
    metric=mlp_params.METRIC,
    return_info=False,
)
# Study is the object for model optimization
study = optuna.create_study(
    direction=f"{mlp_params.DIRECTION}",
    sampler=RandomSampler(),
    study_name="live_cell_AnnexinV_prediction",
)
# Here I apply the optimize function of the study to the objective function
# This optimizes each parameter specified to be optimized from the defined search space
study.optimize(objective_lambda_func, n_trials=mlp_params.N_TRIALS)
# Prints out the best trial's optimized parameters
objective_model_optimizer(
    train_loader,
    valid_loader,
    trial=study.best_trial,
    params=params,
    metric=mlp_params.METRIC,
    return_info=True,
)

[I 2025-05-13 08:54:08,637] A new study created in memory with name: live_cell_AnnexinV_prediction
[I 2025-05-13 08:54:09,514] Trial 0 finished with value: 0.6374997544288635 and parameters: {'n_layers': 1, 'n_units_l0': 4, 'dropout_0': 0.4660576954504388, 'learning_rate': 0.051391619561266456, 'optimizer': 'SGD'}. Best is trial 0 with value: 0.6374997544288635.
[I 2025-05-13 08:54:09,730] Trial 1 finished with value: 26.295696195364 and parameters: {'n_layers': 3, 'n_units_l0': 3, 'dropout_0': 0.3264689170479415, 'n_units_l1': 6, 'dropout_1': 0.25544510305757945, 'n_units_l2': 10, 'dropout_2': 0.7041916432577999, 'learning_rate': 0.03856842333097559, 'optimizer': 'RMSprop'}. Best is trial 0 with value: 0.6374997544288635.
[I 2025-05-13 08:54:09,913] Trial 2 finished with value: 845.9318025493621 and parameters: {'n_layers': 1, 'n_units_l0': 3, 'dropout_0': 0.7994234878697071, 'learning_rate': 0.08606655540890426, 'optimizer': 'RMSprop'}. Best is trial 0 with value: 0.6374997544288635.

0.5476478981971741

In [16]:
model_name = "Cells_Intensity_MeanIntensityEdge_AnnexinV"
param_dict = extract_best_trial_params(study.best_params, params, model_name=model_name)


untrained_model_archetecture_only = optimized_model_create(
    params=params,
    model_name=model_name,
)
# save the blank model architecture
model_path = f"../models/{model_name}.pt"

In [17]:
import optuna.visualization as vis

# Assuming `study` is your Optuna study object
fig = vis.plot_optimization_history(study)
fig.show()