In [None]:
"""
In this notebook we show the workflow for model training, calculation of prediction differences, and QoL export of obtained models.
"""

In [1]:
import os
import sys
sys.path.insert(0, '..')

from itertools import product

import joblib
from joblib import Parallel, delayed

from src.analysis.differences import *
from src.ml.optim import *
from src.ml.evaluate import ensemble_evaluate
from src.ml.transform import prepare_transformer

  from .autonotebook import tqdm as notebook_tqdm


### Parameters

In [11]:
# --- Stratification ---
sign_names = ['Sex', 'Age', 'Weight']  # names of variables for result stratification

# --- Input data ---
input_dir = '../data/carbide/'  # directory with CARBIDE datasets and descriptors

# --- Task definition ---
model_name = 'XGBClassifier'  # model to use: 'XGBClassifier', 'LogisticRegression', 'RandomForestClassifier'
dataset_type = 'primary'  # inclusion criteria to use: 'primary', 'secondary'
pt_set = 'Cred'  # cardiotoxicity definition to use: 'Cvas', 'Card', 'Cred'
dpa_metric = 'prr'  # DPA metric to use: 'prr', 'ror', 'ic'

# --- Features and hyperparameter search ---
desc_col = 'RDKit'  # name of a column from descriptors.joblib to use
selection_metric = 'HarmRS'  # passed to the hyperparameter search as `selection_metric`
n_trials = 4  # number of Optuna trials; set to 4 for demonstration purposes
n_jobs = 8  # number of CPUs used during model training

# Directory for saving results.
# The `{dataset_type}_{pt_set}_{dpa_metric}` order must match the `ensemble_dir`
# pattern used in the "Prediction differences" section; otherwise the models
# trained here are written to a directory that section never reads.
output_dir = f'../results/training/{dataset_type}_{pt_set}_{dpa_metric}/{model_name}/{desc_col}/'

os.makedirs(output_dir, exist_ok=True)

### Training

In [12]:
# Build input paths and load the data ONCE: none of this depends on the test
# fold, so re-reading and re-merging the files on every iteration is wasted work.
data_path = os.path.join(input_dir, dataset_type, pt_set, f'carbide_{dpa_metric}.joblib')
desc_path = os.path.join(input_dir, 'descriptors.joblib')

data = joblib.load(data_path)
desc = joblib.load(desc_path)

# Combine data with the selected descriptor type (inner join on SMILES)
df = data.merge(desc[["SMILES", desc_col]], on='SMILES', how='inner')

# Each fold is held out once as the test set; the remaining folds are used for training.
for test_fold in [0, 1, 2, 3, 4]:
    # Split data into training and test sets (both are new frames; `df` is left untouched)
    train_df = df[df['Fold'] != test_fold].reset_index(drop=True)
    test_df = df[df['Fold'] == test_fold].reset_index(drop=True)

    # Prepare training data
    train_manager = DatasetManager(
        df=train_df,
        desc_col=desc_col,
        sign_names=sign_names
    )

    # Prepare a fresh DataTransformer for the given descriptors on every fold
    transformer = prepare_transformer(desc_col)

    # Perform hyperparameter optimization on the training set
    ensemble = optuna_hyperparameter_search(
        model_class=get_model(model_name),
        dataset_manager=train_manager,
        transformer=transformer,
        test_fold=test_fold,
        selection_metric=selection_metric,  # see src/training.py:inner_score for supported metrics
        n_trials=n_trials,
        n_jobs=n_jobs,
        save_dir=output_dir,
    )

    # Evaluate the trained ensemble on the held-out test fold
    pred_df, test_scores = ensemble_evaluate(
        ensemble=ensemble,
        test_df=test_df,
        desc_col=desc_col,
        sign_names=sign_names
    )

    # Build output paths (one pair of files per test fold)
    pred_path = os.path.join(output_dir, f'preds_tf_{test_fold}.joblib')  # predictions on the test fold
    scores_path = os.path.join(output_dir, f'scores_tf_{test_fold}.joblib')  # scores on the test fold

    # Save
    joblib.dump(pred_df, pred_path)
    joblib.dump(test_scores, scores_path)

[I 2025-12-04 10:27:36,585] A new study created in memory with name: no-name-7b7557c2-50c2-4665-82fb-21a617af478e


Beginning optuna optimization search


[I 2025-12-04 10:27:40,743] Trial 0 finished with value: 0.55958 and parameters: {'n_estimators': 700, 'max_depth': 2, 'learning_rate': 0.039210787183641065, 'max_leaves': 11, 'subsample': 0.7454482164982754, 'colsample_bytree': 0.7245591708395565, 'gamma': 0.1925194758682508, 'reg_alpha': 0.009797586247415335, 'reg_lambda': 3.8716113077675374}. Best is trial 0 with value: 0.55958.
[I 2025-12-04 10:27:44,001] Trial 1 finished with value: 0.5424 and parameters: {'n_estimators': 400, 'max_depth': 7, 'learning_rate': 0.06776595536852902, 'max_leaves': 0, 'subsample': 0.7526962679911986, 'colsample_bytree': 0.9280389289004496, 'gamma': 4.2644988098846435, 'reg_alpha': 1.467336934263922, 'reg_lambda': 3.926073220138631}. Best is trial 0 with value: 0.55958.
[I 2025-12-04 10:27:49,547] Trial 2 finished with value: 0.55553 and parameters: {'n_estimators': 725, 'max_depth': 8, 'learning_rate': 0.05575599690305941, 'max_leaves': 9, 'subsample': 0.9339749817492353, 'colsample_bytree': 0.74897121

Optuna search finished successfully


[I 2025-12-04 10:27:52,932] A new study created in memory with name: no-name-e6ba10dd-2602-4e9b-9873-1fe743a4dc69


Beginning optuna optimization search


[I 2025-12-04 10:27:56,339] Trial 0 finished with value: 0.59036 and parameters: {'n_estimators': 425, 'max_depth': 11, 'learning_rate': 0.09338696964703463, 'max_leaves': 12, 'subsample': 0.9926513527866478, 'colsample_bytree': 0.8224292875318433, 'gamma': 1.5782629497242544, 'reg_alpha': 3.2572462655607843, 'reg_lambda': 4.185769196992629}. Best is trial 0 with value: 0.59036.
[I 2025-12-04 10:27:58,257] Trial 1 finished with value: 0.44461 and parameters: {'n_estimators': 100, 'max_depth': 4, 'learning_rate': 0.01786182652844155, 'max_leaves': 3, 'subsample': 0.504860649695358, 'colsample_bytree': 0.5675703866070958, 'gamma': 0.5519320507862246, 'reg_alpha': 3.3256091117237006, 'reg_lambda': 1.2071613002946235}. Best is trial 0 with value: 0.59036.
[I 2025-12-04 10:28:02,404] Trial 2 finished with value: 0.58494 and parameters: {'n_estimators': 475, 'max_depth': 4, 'learning_rate': 0.02999464464539614, 'max_leaves': 5, 'subsample': 0.9711798193020624, 'colsample_bytree': 0.823111870

Optuna search finished successfully


[I 2025-12-04 10:28:06,136] A new study created in memory with name: no-name-0c06a496-dd75-43ae-be38-19afe1726cca


Beginning optuna optimization search


[I 2025-12-04 10:28:11,376] Trial 0 finished with value: 0.52699 and parameters: {'n_estimators': 975, 'max_depth': 11, 'learning_rate': 0.024820249095131253, 'max_leaves': 3, 'subsample': 0.6421239779677705, 'colsample_bytree': 0.5918852547621426, 'gamma': 1.8775244510236484, 'reg_alpha': 0.9193962028025604, 'reg_lambda': 1.8704531863528167}. Best is trial 0 with value: 0.52699.
[I 2025-12-04 10:28:15,396] Trial 1 finished with value: 0.56872 and parameters: {'n_estimators': 325, 'max_depth': 12, 'learning_rate': 0.05099754612698431, 'max_leaves': 11, 'subsample': 0.5790524340212291, 'colsample_bytree': 0.9580148938498383, 'gamma': 2.3764992203823176, 'reg_alpha': 0.4825741404891959, 'reg_lambda': 1.6390795686601405}. Best is trial 1 with value: 0.56872.
[I 2025-12-04 10:28:18,556] Trial 2 finished with value: 0.0 and parameters: {'n_estimators': 475, 'max_depth': 9, 'learning_rate': 0.005239109579599908, 'max_leaves': 1, 'subsample': 0.7501135183289785, 'colsample_bytree': 0.95621642

Optuna search finished successfully


[I 2025-12-04 10:28:23,401] A new study created in memory with name: no-name-1eb4c635-736d-4646-8b0c-ed80d9459b9d


Beginning optuna optimization search


[I 2025-12-04 10:28:27,846] Trial 0 finished with value: 0.56192 and parameters: {'n_estimators': 650, 'max_depth': 2, 'learning_rate': 0.025341398397950358, 'max_leaves': 11, 'subsample': 0.7642510532032534, 'colsample_bytree': 0.6695234885323815, 'gamma': 1.2286707111404793, 'reg_alpha': 4.818086177973642, 'reg_lambda': 2.0265188307678574}. Best is trial 0 with value: 0.56192.
[I 2025-12-04 10:28:32,182] Trial 1 finished with value: 0.50544 and parameters: {'n_estimators': 950, 'max_depth': 12, 'learning_rate': 0.0065597205216349685, 'max_leaves': 2, 'subsample': 0.9912853677370743, 'colsample_bytree': 0.5702160600408741, 'gamma': 1.9606989891192594, 'reg_alpha': 4.437811425886843, 'reg_lambda': 3.912336422696154}. Best is trial 0 with value: 0.56192.
[I 2025-12-04 10:28:37,599] Trial 2 finished with value: 0.56241 and parameters: {'n_estimators': 725, 'max_depth': 11, 'learning_rate': 0.027350804603778783, 'max_leaves': 5, 'subsample': 0.8955834208698787, 'colsample_bytree': 0.72976

Optuna search finished successfully


[I 2025-12-04 10:28:44,595] A new study created in memory with name: no-name-75eaacb3-bdc7-40dd-925f-1ae96be753ea


Beginning optuna optimization search


[I 2025-12-04 10:28:50,821] Trial 0 finished with value: 0.51673 and parameters: {'n_estimators': 550, 'max_depth': 6, 'learning_rate': 0.0057501472095830815, 'max_leaves': 11, 'subsample': 0.8193895361759361, 'colsample_bytree': 0.8128500185276206, 'gamma': 3.3876657198718467, 'reg_alpha': 1.6583478326569068, 'reg_lambda': 2.6545043636918546}. Best is trial 0 with value: 0.51673.
[I 2025-12-04 10:28:55,357] Trial 1 finished with value: 0.48604 and parameters: {'n_estimators': 100, 'max_depth': 11, 'learning_rate': 0.008282213986677167, 'max_leaves': 0, 'subsample': 0.8707885764655081, 'colsample_bytree': 0.9413171127151838, 'gamma': 0.29503166084661303, 'reg_alpha': 2.540048517414725, 'reg_lambda': 4.884513741151618}. Best is trial 0 with value: 0.51673.
[I 2025-12-04 10:28:57,462] Trial 2 finished with value: 0.44535 and parameters: {'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.013856738154791126, 'max_leaves': 9, 'subsample': 0.7689817312055482, 'colsample_bytree': 0.8393

Optuna search finished successfully


### Prediction differences

In [14]:
"""
The trained models are not provided because their total size is large (about 50 GB after compression).
"""

# Dataset configuration used below to locate the input data and the trained ensembles
dataset_type, pt_set, dpa_metric = 'primary', 'Cred', 'prr'

# Grid of model / descriptor combinations to process
models = ['XGBClassifier', 'RandomForestClassifier', 'LogisticRegression']
descriptors = ['CDDD', 'MACCS', 'Morgan', 'Klek', 'RDKit', 'ChemBERTa']

# Directory that receives one `*_pred_diff.joblib` file per combination
save_dir = '../results/differences/'
os.makedirs(save_dir, exist_ok=True)

def process_combination(model, descriptor):
    """Run `data_difference` for one (model, descriptor) pair and return its result.

    The dataset, ensemble and output paths are built from the notebook-level
    `dataset_type`, `pt_set`, `dpa_metric` and `save_dir` variables.

    Args:
        model: Model name used as a path component of the ensemble directory.
        descriptor: Descriptor name used as a path component of the ensemble directory.

    Returns:
        Whatever `data_difference` returns for this combination.
    """
    dataset_path = f'../data/carbide/{dataset_type}/{pt_set}/carbide_{dpa_metric}.joblib'
    ensemble_dir = f'../results/training/{dataset_type}_{pt_set}_{dpa_metric}/{model}/{descriptor}/'
    save_path = os.path.join(save_dir, f'{model}_{descriptor}_pred_diff.joblib')

    return data_difference(
        dataset_path=dataset_path,
        desc_path='../data/carbide/descriptors.joblib',
        ensemble_dir=ensemble_dir,
        save_path=save_path,
    )

# Every (model, descriptor) pair, with models varying slowest
combinations = [(model, descriptor) for model in models for descriptor in descriptors]

# Process all pairs in parallel; n_jobs=-1 asks joblib to use all available CPUs
diffs = Parallel(n_jobs=-1, verbose=1)(
    delayed(process_combination)(*pair) for pair in combinations
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
100%|██████████| 252/252 [00:03<00:00, 80.26it/s]
100%|██████████| 225/225 [00:03<00:00, 73.91it/s]
100%|██████████| 227/227 [00:03<00:00, 72.49it/s]
100%|██████████| 234/234 [00:03<00:00, 71.05it/s]
100%|██████████| 220/220 [00:02<00:00, 78.43it/s]
