## Pipeline for supervised modeling

In [None]:
import os

# Check if it's in the correct directory
print("Current working directory:", os.getcwd())
path = os.path.abspath(os.path.join(os.getcwd(), '..', 'path.py'))
%run $path

##### Configure notebook

In [None]:
# Dataset splits (CSV files)
_mico_dir = "../data/in_vitro/mico"
train_file = f"{_mico_dir}/train.csv"
val_file = f"{_mico_dir}/val.csv"
test_file = f"{_mico_dir}/test.csv"

# Dataloader settings
batch_size = 256

# Modeling settings
architecture_type = "eegnn"
task_type = "classification"
use_uncertainty = False
n_trials = 0
calibration = False

# Where the trained model and its hyperparameters are saved
filename = "model"
model_directory = "../output/models"
params_directory = "../output/params"

# Figure outputs: fig_path1 is a file, fig_path2/3/4 point to the figures folder
_figures_dir = "../output/figures"
fig_path1 = f"{_figures_dir}/probabilities.png"
fig_path2 = _figures_dir
fig_path3 = _figures_dir
fig_path4 = _figures_dir

# Embedding plots along the epochs
method = "tSNE"
emb_path = "../output/embeddings"

##### Load data

In [None]:
from load_data import load_data

# load_data returns a (smiles, labels) pair for each split file.
train_smiles, y_train = load_data(train_file)
val_smiles, y_val = load_data(val_file)
test_smiles, y_test = load_data(test_file)

# Report how many samples each split contains.
for split_name, split_smiles in (
    ("Training", train_smiles),
    ("Validation", val_smiles),
    ("Test", test_smiles),
):
    print(f"{split_name} data: {len(split_smiles)} samples")

In [None]:
# Optional, currently disabled alternative to the loading cell above.
# In addition to loading the three splits, it converts the labels with
# to_float_array, checks the training SMILES with check_invalid_smiles, and runs
# check_empty_samples_all_datasets / analyze_all_datasets on all splits.
# Uncomment the whole cell to use it.

# from load_data import load_data, get_task_names, analyze_all_datasets, check_empty_samples_all_datasets, check_invalid_smiles, to_float_array

# train_smiles, y_train = load_data(train_file)
# val_smiles, y_val = load_data(val_file)
# test_smiles, y_test = load_data(test_file)
# y_train = to_float_array(y_train)
# y_val = to_float_array(y_val)
# y_test = to_float_array(y_test)
# print(f"Training data: {len(train_smiles)} samples")
# print(f"Validation data: {len(val_smiles)} samples")
# print(f"Test data: {len(test_smiles)} samples")

# # Data analysis
# task_names = get_task_names(train_file)
# check_invalid_smiles(train_smiles)

# check_empty_samples_all_datasets(
#     train_data=(train_smiles, y_train),
#     val_data=(val_smiles, y_val),
#     test_data=(test_smiles, y_test)
# )

# analyze_all_datasets(
#     train_data=(train_smiles, y_train),
#     val_data=(val_smiles, y_val),
#     test_data=(test_smiles, y_test),
#     task_names=task_names
# )

##### Building molecular graphs in data loaders

In [None]:
from loaders import graph_loader, graph_info

# Build one loader per split from the SMILES and their labels.
# NOTE(review): the seed is fixed to 42 here rather than set in the configuration cell.
train_loader, val_loader, test_loader = graph_loader(
    train_smiles, val_smiles, test_smiles,
    y_train, y_val, y_test,
    batch_size=batch_size,
    seed=42,
)

# graph_info reports the node/edge feature sizes and the task count for the training loader.
node_dim, edge_dim, num_tasks = graph_info(train_loader)
for label, value in (
    ("Max number of atom features", node_dim),
    ("Max number of bond features", edge_dim),
    ("Number of tasks", num_tasks),
):
    print(f"{label}: {value}")

##### Starting optuna optimization

In [None]:
from optimizer import objective
from params import initialize_optuna


def run_trial(trial):
    """Evaluate a single Optuna trial with the notebook's loaders and model settings."""
    return objective(
        trial,
        node_dim,
        edge_dim,
        train_loader,
        val_loader,
        num_tasks,
        architecture_type=architecture_type,
        task_type=task_type,
        use_uncertainty=use_uncertainty,
    )


# NOTE(review): the trial budget is fixed at 50 here; the `n_trials` value from the
# configuration cell is not used by this cell. Confirm which one is intended.
study = initialize_optuna()
study.optimize(run_trial, n_trials=50)

# Keep the winning hyperparameters and trial for the cells below.
best_params = study.best_params
best_trial = study.best_trial
print("Best hyperparameters:", best_params)

##### Run best model found in optuna optimization

In [None]:
from optimizer import retrain
from save import save_model, save_params

# Model settings shared with the optimization step.
model_settings = dict(
    architecture_type=architecture_type,
    task_type=task_type,
    use_uncertainty=use_uncertainty,
)

# Retrain with the best hyperparameters found by the study.
best_params = study.best_params
model, best_val_loss, min_val_loss, train_losses, val_losses = retrain(
    best_params,
    node_dim,
    edge_dim,
    train_loader,
    val_loader,
    num_tasks,
    **model_settings,
)

# Persist the trained model and its hyperparameters under the same base filename.
save_model(model, model_directory, filename)
save_params(best_params, params_directory, filename)

##### Statistical analysis of best model

In [None]:
from utils import device
from statistical import ClassificationEvaluator

# The three splits the evaluator receives.
evaluation_loaders = dict(
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
)

# NOTE(review): calibration is switched on explicitly here; the `calibration` flag
# from the configuration cell is not consulted. Confirm which one is intended.
ClassificationEvaluator(
    model=model,
    device=device,
    **evaluation_loaders,
    calibration=True,
)

##### Plot the confusion matrices and probability histograms

In [None]:
from utils import device
from densities import plot_confusion_matrix_and_histograms

# Thresholds file passed to the plotting helper.
threshold_json = "../output/calibration/thresholds.json"

# All three splits are passed together, in train/val/test order.
all_loaders = [train_loader, val_loader, test_loader]
plot_confusion_matrix_and_histograms(
    model, all_loaders, device,
    thresholds_json=threshold_json,
    out_path=fig_path4,
)

##### Plot the probability densities

In [None]:
from utils import device
from densities import plot_probability

# Plot for the test split only, with a fixed threshold of 0.5; output goes to fig_path1.
plot_probability(model, test_loader, device, threshold=0.5, out_path=fig_path1)

##### Visualize embeddings along the epochs

In [None]:
from embeddings import visualize_embeddings

# Embedding selection: source folder, epoch 45, projection method and task index 0.
embedding_view = dict(in_path=emb_path, epoch=45, method=method, task_index=0)
visualize_embeddings(**embedding_view, out_path=fig_path2)

In [None]:
from embeddings import visualize_kde_dim

# Same selection as the embedding plot: source folder, epoch 45, method and task index 0.
kde_view = dict(in_path=emb_path, epoch=45, method=method, task_index=0)
visualize_kde_dim(**kde_view, out_path=fig_path3)

#### Save a backup of the model results

In [None]:
import os

from backup_results import execute_backup

# Destination folder for the backup. Set the HOLISTICGNN_BACKUP_DIR environment
# variable to point it at a folder on your machine; when it is unset, the original
# machine-specific path is used so existing behaviour is unchanged.
folder_path = os.environ.get(
    "HOLISTICGNN_BACKUP_DIR",
    r"C:\Users\gusta\Documents\holisticGNN\results",
)

# Label for this backup. Include the essential details, e.g. architecture,
# task type, batch size, uncertainty.
# NOTE(review): this label says "mpnn" and "30trials", while the configuration cell
# sets architecture_type='eegnn' and the optimization cell runs 50 trials. Confirm
# the label before running.
experiment_name = "mpnn_30trials_b256"

# Execute backup
backup = execute_backup(folder_path, experiment_name)

print("\n✅ Backup done!")