In [18]:
import numpy as np
import pandas as pd
from pathlib import Path
from lightning import pytorch as pl
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold
from chemprop import data, featurizers, models, nn, utils
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from lightning.pytorch.callbacks import ModelCheckpoint
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from openpyxl import Workbook
import os
from pathlib import Path
import sys
print('Python:',sys.version)

Python: 3.11.10 (main, Oct  3 2024, 07:29:13) [GCC 11.2.0]


# Full dataset

In [2]:
df = pd.read_excel("scCO2_all_data(curated).xlsx")

In [3]:
df_rem = df[~df['Name'].isin(['Water', 'Iodine'])] #remove water and iodine from the data set

In [4]:
smiles_column = ['SMILES_Canonical']
target_columns = ['Lg(y)']
descriptor_columns = ['T (K)', 'P (bar)', 'Melting Point', 'dHvap (kJ/mol)', 'g (gcm3)', 'dG', 'MolWt', 'MolLogP', 'TPSA', 'NR', 'Bj', 'HA', 'HD', 'RC', 'SP', 'NA', 'NAr', 'Hat']

In [5]:
smis = df_rem.loc[:, smiles_column].values.flatten().astype(str)
ys = df_rem.loc[:, target_columns].values
extra_datapoint_descriptors = df_rem[descriptor_columns].values

In [6]:
all_data = [
    data.MoleculeDatapoint(Chem.MolFromSmiles(smi), y, x_d=X_d)
    for smi, y, X_d in zip(smis, ys, extra_datapoint_descriptors)
]

In [7]:
mols = [d.mol for d in all_data]

In [8]:
#Strict split SMILES. D-MPNN with thermodynamic properties

In [9]:
class ScaffoldGroupKFold:
    def __init__(self, n_splits=5, shuffle=False, random_state=None):
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.random_state = random_state

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.n_splits

    def split(self, X, y=None, groups=None):
        unique_groups = np.unique(groups)
        if self.shuffle:
            np.random.seed(self.random_state)
            np.random.shuffle(unique_groups)
        
        fold_size = len(unique_groups) // self.n_splits
        
        for i in range(self.n_splits):
            if i == self.n_splits - 1:
                test_groups = unique_groups[i * fold_size:]
            else:
                test_groups = unique_groups[i * fold_size: (i + 1) * fold_size]
            
            test_indices = np.where(np.isin(groups, test_groups))[0]
            train_indices = np.where(~np.isin(groups, test_groups))[0]
            
            yield train_indices, test_indices

In [10]:
k_splits = ScaffoldGroupKFold(n_splits=5, shuffle=True, random_state=4321)
output_path = Path.cwd() / "hup" / "crossval_results_scCO2.xlsx"

In [11]:
def calculate_aard(true_values, predicted_values):
    true_values = np.array(true_values)  
    predicted_values = np.array(predicted_values)
    return 100 * np.mean(np.abs((true_values - predicted_values) / true_values))

def calculate_r2(true_values, predicted_values):
    true_values = np.array(true_values)  
    predicted_values = np.array(predicted_values)
    return r2_score(true_values, predicted_values)

def calculate_rmse(true_values, predicted_values):
    return np.sqrt(mean_squared_error(true_values, predicted_values))

def calculate_mae(true_values, predicted_values):
    return mean_absolute_error(true_values, predicted_values)

In [12]:
fold_results = {"RMSE": [], "MAE": [], "R²": [], "AARD": []}
all_test_smiles = []
all_test_targets = []
all_test_predictions = []

with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
    pd.DataFrame({"Placeholder": []}).to_excel(writer, sheet_name="Init", index=False)

    for fold_idx, (train_indices, test_indices) in enumerate(k_splits.split(mols, groups=df_rem['SMILES_Canonical'])):
        print(f"Starting fold {fold_idx + 1}/5")

        train_data, _, test_data = data.split_data_by_indices(all_data, [train_indices], None, [test_indices])
        train_data, val_data = train_test_split(train_data[0], test_size=0.05, random_state=fold_idx)

        featurizer = featurizers.SimpleMoleculeMolGraphFeaturizer()
        train_dset = data.MoleculeDataset(train_data, featurizer)
        scaler = train_dset.normalize_targets()

        val_dset = data.MoleculeDataset(val_data, featurizer)
        val_dset.normalize_targets(scaler)

        test_dset = data.MoleculeDataset(test_data[0], featurizer)
        
        targets_scaler = train_dset.normalize_targets()
        extra_datapoint_descriptors_scaler = train_dset.normalize_inputs("X_d")
        val_dset.normalize_targets(targets_scaler)
        val_dset.normalize_inputs("X_d", extra_datapoint_descriptors_scaler)
        
        train_dset.cache = True
        val_dset.cache = True

        
        train_loader = data.build_dataloader(train_dset, shuffle=True)
        val_loader = data.build_dataloader(val_dset, shuffle=False)
        test_loader = data.build_dataloader(test_dset, shuffle=False)

        # optimized by Ray Tune
        ffn_hidden_dim = 2400
        message_hidden_dim = 300
        depth = 2
        ffn_num_layers = 1
        batch_norm = True
        metric_list = [nn.metrics.RMSE(), nn.metrics.MAE()]
        descriptor_columns = extra_datapoint_descriptors.shape[1]

        mp = nn.BondMessagePassing(d_h=message_hidden_dim)
        agg = nn.MeanAggregation()
        output_transform = nn.UnscaleTransform.from_standard_scaler(scaler)
        ffn = nn.RegressionFFN(
            output_transform=output_transform,
            input_dim=message_hidden_dim + descriptor_columns,
            hidden_dim=ffn_hidden_dim,
            n_layers=ffn_num_layers
        )
        X_d_transform = nn.ScaleTransform.from_standard_scaler(extra_datapoint_descriptors_scaler)

        model = models.MPNN(
            mp, agg, ffn, batch_norm, metric_list, X_d_transform=X_d_transform
        )
        
        # Checkpointing
        checkpoint_dir = Path(f"/home/lab101/temp/mdm/bot/chek/fold_{fold_idx}")
        checkpoint_dir.mkdir(parents=True, exist_ok=True)

        checkpointing = ModelCheckpoint(
            dirpath=checkpoint_dir,
            filename="best-{epoch}-{val_loss:.2f}",
            monitor="val_loss",
            mode="min",
            save_weights_only=True
        )

        trainer = pl.Trainer(accelerator="auto", devices=1, max_epochs=15, callbacks=[checkpointing])
        trainer.fit(model, train_loader, val_loader)

        predictions = trainer.predict(model, test_loader)
        test_smiles = [Chem.MolToSmiles(d.mol) for d in test_data[0]]
        test_targets = [d.y[0] for d in test_data[0]]
        fold_predictions = np.concatenate(predictions).flatten().tolist()

        all_test_smiles.extend(test_smiles)
        all_test_targets.extend(test_targets)
        all_test_predictions.extend(fold_predictions)

        rmse = calculate_rmse(test_targets, fold_predictions)
        mae = calculate_mae(test_targets, fold_predictions)
        r2 = r2_score(test_targets, fold_predictions)
        aard = calculate_aard(test_targets, fold_predictions)

        fold_results["RMSE"].append(rmse)
        fold_results["MAE"].append(mae)
        fold_results["R²"].append(r2)
        fold_results["AARD"].append(aard)

        df_fold = pd.DataFrame({"SMILES": test_smiles, "True_Target": test_targets, "Predicted_Target": fold_predictions})
        df_fold.to_excel(writer, sheet_name=f"Fold_{fold_idx + 1}", index=False)

        print(f"Fold {fold_idx + 1}:")
        print(f"  RMSE: {rmse}")
        print(f"  MAE: {mae}")
        print(f"  R²: {r2}")
        print(f"  AARD, %: {aard}")

mean_rmse = np.mean(fold_results["RMSE"])
std_rmse = np.std(fold_results["RMSE"])
mean_mae = np.mean(fold_results["MAE"])
std_mae = np.std(fold_results["MAE"])
mean_r2 = np.mean(fold_results["R²"])
std_r2 = np.std(fold_results["R²"])
mean_aard = np.mean(fold_results["AARD"])
std_aard = np.std(fold_results["AARD"])

print("\nFinal cross-validation results:")
print(f"  RMSE: {mean_rmse:.4f} ± {std_rmse:.4f}")
print(f"  MAE: {mean_mae:.4f} ± {std_mae:.4f}")
print(f"  R²: {mean_r2:.4f} ± {std_r2:.4f}")
print(f"  AARD, %: {mean_aard:.4f} ± {std_aard:.4f}")

df_results = pd.DataFrame({
    "SMILES": all_test_smiles,
    "True_Target": all_test_targets,
    "Predicted_Target": all_test_predictions
})
with pd.ExcelWriter(output_path, engine="openpyxl", mode="a") as writer:
    df_results.to_excel(writer, sheet_name="All_Folds", index=False)

Starting fold 1/5


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3070') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /home/lab101/temp/mdm/bot/chek/fold_0 exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]
Loading `train_dataloader` to estimate number of stepping batches.
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider in

Sanity Checking DataLoader 0:   0%|                       | 0/2 [00:00<?, ?it/s]

/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.


Epoch 0: 100%|█| 362/362 [00:30<00:00, 12.02it/s, v_num=515, train_loss_step=0.4
Validation: |                                             | 0/? [00:00<?, ?it/s][A
Validation:   0%|                                        | 0/19 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                           | 0/19 [00:00<?, ?it/s][A
Validation DataLoader 0:   5%|█                  | 1/19 [00:00<00:00, 22.85it/s][A
Validation DataLoader 0:  11%|██                 | 2/19 [00:00<00:01, 12.44it/s][A
Validation DataLoader 0:  16%|███                | 3/19 [00:00<00:01, 14.83it/s][A
Validation DataLoader 0:  21%|████               | 4/19 [00:00<00:01, 10.83it/s][A
Validation DataLoader 0:  26%|█████              | 5/19 [00:00<00:01, 11.53it/s][A
Validation DataLoader 0:  32%|██████             | 6/19 [00:00<00:01, 12.33it/s][A
Validation DataLoader 0:  37%|███████            | 7/19 [00:00<00:01, 11.13it/s][A
Validation DataLoader 0:  42%|████████           | 8/19 [00:00<00:01, 10.66it/s

Validation DataLoader 0:  37%|███████            | 7/19 [00:00<00:00, 12.91it/s][A
Validation DataLoader 0:  42%|████████           | 8/19 [00:00<00:00, 13.99it/s][A
Validation DataLoader 0:  47%|█████████          | 9/19 [00:00<00:00, 12.85it/s][A
Validation DataLoader 0:  53%|█████████▍        | 10/19 [00:00<00:00, 13.41it/s][A
Validation DataLoader 0:  58%|██████████▍       | 11/19 [00:00<00:00, 12.97it/s][A
Validation DataLoader 0:  63%|███████████▎      | 12/19 [00:00<00:00, 12.60it/s][A
Validation DataLoader 0:  68%|████████████▎     | 13/19 [00:00<00:00, 13.18it/s][A
Validation DataLoader 0:  74%|█████████████▎    | 14/19 [00:01<00:00, 12.77it/s][A
Validation DataLoader 0:  79%|██████████████▏   | 15/19 [00:01<00:00, 12.91it/s][A
Validation DataLoader 0:  84%|███████████████▏  | 16/19 [00:01<00:00, 13.04it/s][A
Validation DataLoader 0:  89%|████████████████  | 17/19 [00:01<00:00, 12.76it/s][A
Validation DataLoader 0:  95%|█████████████████ | 18/19 [00:01<00:00, 13.14i

`Trainer.fit` stopped: `max_epochs=15` reached.


Epoch 14: 100%|█| 362/362 [00:31<00:00, 11.67it/s, v_num=515, train_loss_step=0.

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.



Predicting DataLoader 0: 100%|██████████████████| 99/99 [00:28<00:00,  3.53it/s]
Fold 1:
  RMSE: 0.7359468796836246
  MAE: 0.5146870836839329
  R²: 0.7365713390236263
  AARD, %: 16.371155975542155
Starting fold 2/5


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /home/lab101/temp/mdm/bot/chek/fold_1 exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]
Loading `train_dataloader` to estimate number of stepping batches.
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.

  | Name            | Type               | Params | Mode 
---------------------------------------------------------------
0 | message_passing | BondMessagePassing | 227 K  | train
1 | agg             | MeanAggregation    | 0      | train
2 | bn    

Sanity Checking DataLoader 0:   0%|                       | 0/2 [00:00<?, ?it/s]

/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.


Epoch 0: 100%|█| 342/342 [00:26<00:00, 12.98it/s, v_num=516, train_loss_step=0.1
Validation: |                                             | 0/? [00:00<?, ?it/s][A
Validation:   0%|                                        | 0/18 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                           | 0/18 [00:00<?, ?it/s][A
Validation DataLoader 0:   6%|█                  | 1/18 [00:00<00:01, 10.10it/s][A
Validation DataLoader 0:  11%|██                 | 2/18 [00:00<00:01, 10.31it/s][A
Validation DataLoader 0:  17%|███▏               | 3/18 [00:00<00:01, 10.70it/s][A
Validation DataLoader 0:  22%|████▏              | 4/18 [00:00<00:01, 10.21it/s][A
Validation DataLoader 0:  28%|█████▎             | 5/18 [00:00<00:01, 10.47it/s][A
Validation DataLoader 0:  33%|██████▎            | 6/18 [00:00<00:01, 10.59it/s][A
Validation DataLoader 0:  39%|███████▍           | 7/18 [00:00<00:01, 10.81it/s][A
Validation DataLoader 0:  44%|████████▍          | 8/18 [00:00<00:00, 11.10it/s

Validation DataLoader 0:  83%|███████████████   | 15/18 [00:01<00:00, 14.20it/s][A
Validation DataLoader 0:  89%|████████████████  | 16/18 [00:01<00:00, 13.57it/s][A
Validation DataLoader 0:  94%|█████████████████ | 17/18 [00:01<00:00, 13.17it/s][A
Validation DataLoader 0: 100%|██████████████████| 18/18 [00:01<00:00, 13.20it/s][A
Epoch 9: 100%|█| 342/342 [00:27<00:00, 12.31it/s, v_num=516, train_loss_step=0.0[A
Validation: |                                             | 0/? [00:00<?, ?it/s][A
Validation:   0%|                                        | 0/18 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                           | 0/18 [00:00<?, ?it/s][A
Validation DataLoader 0:   6%|█                  | 1/18 [00:00<00:00, 33.14it/s][A
Validation DataLoader 0:  11%|██                 | 2/18 [00:00<00:01, 11.87it/s][A
Validation DataLoader 0:  17%|███▏               | 3/18 [00:00<00:01, 10.63it/s][A
Validation DataLoader 0:  22%|████▏              | 4/18 [00:00<00:01, 10.64i

`Trainer.fit` stopped: `max_epochs=15` reached.


Epoch 14: 100%|█| 342/342 [00:28<00:00, 11.89it/s, v_num=516, train_loss_step=0.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|████████████████| 119/119 [00:34<00:00,  3.48it/s]
Fold 2:
  RMSE: 0.752903956375176
  MAE: 0.5255281520096076
  R²: 0.71948744646873
  AARD, %: 15.193474498115423
Starting fold 3/5


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /home/lab101/temp/mdm/bot/chek/fold_2 exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]
Loading `train_dataloader` to estimate number of stepping batches.
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.

  | Name            | Type               | Params | Mode 
---------------------------------------------------------------
0 | message_passing | BondMessagePassing | 227 K  | train
1 | agg             | MeanAggregation    | 0      | train
2 | bn    

Sanity Checking DataLoader 0:   0%|                       | 0/2 [00:00<?, ?it/s]

/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.


Epoch 0: 100%|█| 372/372 [00:29<00:00, 12.48it/s, v_num=517, train_loss_step=0.2
Validation: |                                             | 0/? [00:00<?, ?it/s][A
Validation:   0%|                                        | 0/20 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                           | 0/20 [00:00<?, ?it/s][A
Validation DataLoader 0:   5%|▉                  | 1/20 [00:00<00:00, 81.11it/s][A
Validation DataLoader 0:  10%|█▉                 | 2/20 [00:00<00:01, 11.58it/s][A
Validation DataLoader 0:  15%|██▊                | 3/20 [00:00<00:01, 11.28it/s][A
Validation DataLoader 0:  20%|███▊               | 4/20 [00:00<00:01, 10.71it/s][A
Validation DataLoader 0:  25%|████▊              | 5/20 [00:00<00:01, 11.14it/s][A
Validation DataLoader 0:  30%|█████▋             | 6/20 [00:00<00:01, 11.15it/s][A
Validation DataLoader 0:  35%|██████▋            | 7/20 [00:00<00:01, 10.78it/s][A
Validation DataLoader 0:  40%|███████▌           | 8/20 [00:00<00:01, 11.13it/s

Validation:   0%|                                        | 0/20 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                           | 0/20 [00:00<?, ?it/s][A
Validation DataLoader 0:   5%|▉                  | 1/20 [00:00<00:01, 18.34it/s][A
Validation DataLoader 0:  10%|█▉                 | 2/20 [00:00<00:01, 13.95it/s][A
Validation DataLoader 0:  15%|██▊                | 3/20 [00:00<00:01, 11.43it/s][A
Validation DataLoader 0:  20%|███▊               | 4/20 [00:00<00:01, 11.64it/s][A
Validation DataLoader 0:  25%|████▊              | 5/20 [00:00<00:01, 10.73it/s][A
Validation DataLoader 0:  30%|█████▋             | 6/20 [00:00<00:01, 10.39it/s][A
Validation DataLoader 0:  35%|██████▋            | 7/20 [00:00<00:01, 10.17it/s][A
Validation DataLoader 0:  40%|███████▌           | 8/20 [00:00<00:01, 10.36it/s][A
Validation DataLoader 0:  45%|████████▌          | 9/20 [00:00<00:01, 10.24it/s][A
Validation DataLoader 0:  50%|█████████         | 10/20 [00:00<00:00, 10.25i

`Trainer.fit` stopped: `max_epochs=15` reached.


Epoch 14: 100%|█| 372/372 [00:32<00:00, 11.39it/s, v_num=517, train_loss_step=0.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████████████| 88/88 [00:24<00:00,  3.63it/s]
Fold 3:
  RMSE: 0.7543385639133463
  MAE: 0.5657988883879834
  R²: 0.7160864754482843
  AARD, %: 15.684987339036462
Starting fold 4/5


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /home/lab101/temp/mdm/bot/chek/fold_3 exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]
Loading `train_dataloader` to estimate number of stepping batches.
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.

  | Name            | Type               | Params | Mode 
---------------------------------------------------------------
0 | message_passing | BondMessagePassing | 227 K  | train
1 | agg             | MeanAggregation    | 0      | train
2 | bn    

Sanity Checking: |                                        | 0/? [00:00<?, ?it/s]

/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.


Epoch 0: 100%|█| 366/366 [00:29<00:00, 12.25it/s, v_num=518, train_loss_step=0.2
Validation: |                                             | 0/? [00:00<?, ?it/s][A
Validation:   0%|                                        | 0/20 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                           | 0/20 [00:00<?, ?it/s][A
Validation DataLoader 0:   5%|▉                  | 1/20 [00:00<00:00, 19.29it/s][A
Validation DataLoader 0:  10%|█▉                 | 2/20 [00:00<00:01, 13.40it/s][A
Validation DataLoader 0:  15%|██▊                | 3/20 [00:00<00:01, 16.62it/s][A
Validation DataLoader 0:  20%|███▊               | 4/20 [00:00<00:01, 15.80it/s][A
Validation DataLoader 0:  25%|████▊              | 5/20 [00:00<00:00, 15.41it/s][A
Validation DataLoader 0:  30%|█████▋             | 6/20 [00:00<00:00, 15.95it/s][A
Validation DataLoader 0:  35%|██████▋            | 7/20 [00:00<00:00, 15.31it/s][A
Validation DataLoader 0:  40%|███████▌           | 8/20 [00:00<00:00, 15.39it/s

Validation:   0%|                                        | 0/20 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                           | 0/20 [00:00<?, ?it/s][A
Validation DataLoader 0:   5%|▉                  | 1/20 [00:00<00:01, 17.97it/s][A
Validation DataLoader 0:  10%|█▉                 | 2/20 [00:00<00:01, 14.71it/s][A
Validation DataLoader 0:  15%|██▊                | 3/20 [00:00<00:00, 17.15it/s][A
Validation DataLoader 0:  20%|███▊               | 4/20 [00:00<00:01, 15.81it/s][A
Validation DataLoader 0:  25%|████▊              | 5/20 [00:00<00:01, 13.29it/s][A
Validation DataLoader 0:  30%|█████▋             | 6/20 [00:00<00:01, 12.99it/s][A
Validation DataLoader 0:  35%|██████▋            | 7/20 [00:00<00:01, 12.20it/s][A
Validation DataLoader 0:  40%|███████▌           | 8/20 [00:00<00:00, 12.22it/s][A
Validation DataLoader 0:  45%|████████▌          | 9/20 [00:00<00:00, 11.70it/s][A
Validation DataLoader 0:  50%|█████████         | 10/20 [00:00<00:00, 11.67i

`Trainer.fit` stopped: `max_epochs=15` reached.


Epoch 14: 100%|█| 366/366 [00:31<00:00, 11.57it/s, v_num=518, train_loss_step=0.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████████████| 94/94 [00:22<00:00,  4.13it/s]
Fold 4:
  RMSE: 0.7149880218649636
  MAE: 0.5273999504984003
  R²: 0.7570270085508407
  AARD, %: 18.14667120617781
Starting fold 5/5


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /home/lab101/temp/mdm/bot/chek/fold_4 exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]
Loading `train_dataloader` to estimate number of stepping batches.
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.

  | Name            | Type               | Params | Mode 
---------------------------------------------------------------
0 | message_passing | BondMessagePassing | 227 K  | train
1 | agg             | MeanAggregation    | 0      | train
2 | bn    

Sanity Checking DataLoader 0:   0%|                       | 0/2 [00:00<?, ?it/s]

/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.


Epoch 0: 100%|█| 379/379 [00:30<00:00, 12.35it/s, v_num=519, train_loss_step=0.1
Validation: |                                             | 0/? [00:00<?, ?it/s][A
Validation:   0%|                                        | 0/20 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                           | 0/20 [00:00<?, ?it/s][A
Validation DataLoader 0:   5%|▉                  | 1/20 [00:00<00:00, 32.28it/s][A
Validation DataLoader 0:  10%|█▉                 | 2/20 [00:00<00:01, 16.32it/s][A
Validation DataLoader 0:  15%|██▊                | 3/20 [00:00<00:01, 16.23it/s][A
Validation DataLoader 0:  20%|███▊               | 4/20 [00:00<00:01, 15.29it/s][A
Validation DataLoader 0:  25%|████▊              | 5/20 [00:00<00:01, 14.98it/s][A
Validation DataLoader 0:  30%|█████▋             | 6/20 [00:00<00:00, 14.51it/s][A
Validation DataLoader 0:  35%|██████▋            | 7/20 [00:00<00:00, 13.77it/s][A
Validation DataLoader 0:  40%|███████▌           | 8/20 [00:00<00:00, 13.57it/s

Validation:   0%|                                        | 0/20 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                           | 0/20 [00:00<?, ?it/s][A
Validation DataLoader 0:   5%|▉                  | 1/20 [00:00<00:03,  6.30it/s][A
Validation DataLoader 0:  10%|█▉                 | 2/20 [00:00<00:02,  7.88it/s][A
Validation DataLoader 0:  15%|██▊                | 3/20 [00:00<00:01,  8.86it/s][A
Validation DataLoader 0:  20%|███▊               | 4/20 [00:00<00:01,  9.30it/s][A
Validation DataLoader 0:  25%|████▊              | 5/20 [00:00<00:01,  9.27it/s][A
Validation DataLoader 0:  30%|█████▋             | 6/20 [00:00<00:01,  9.38it/s][A
Validation DataLoader 0:  35%|██████▋            | 7/20 [00:00<00:01,  9.50it/s][A
Validation DataLoader 0:  40%|███████▌           | 8/20 [00:00<00:01,  9.61it/s][A
Validation DataLoader 0:  45%|████████▌          | 9/20 [00:00<00:01,  9.68it/s][A
Validation DataLoader 0:  50%|█████████         | 10/20 [00:00<00:00, 10.05i

`Trainer.fit` stopped: `max_epochs=15` reached.


Epoch 14: 100%|█| 379/379 [00:33<00:00, 11.36it/s, v_num=519, train_loss_step=0.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████████████| 80/80 [00:22<00:00,  3.51it/s]
Fold 5:
  RMSE: 0.8665760187567846
  MAE: 0.6573915761676988
  R²: 0.5904694231228433
  AARD, %: 19.76875547047459

Final cross-validation results:
  RMSE: 0.7650 ± 0.0528
  MAE: 0.5582 ± 0.0525
  R²: 0.7039 ± 0.0586
  AARD, %: 17.0330 ± 1.6949


In [13]:
#Scaffold split. D-MPNN with thermodynamic properties

In [15]:
def get_scaffold(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    return MurckoScaffold.MurckoScaffoldSmiles(mol=mol)

df_rem['scaffold'] = df_rem['SMILES_Canonical'].apply(get_scaffold)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rem['scaffold'] = df_rem['SMILES_Canonical'].apply(get_scaffold)


In [16]:
df_rem['scaffold_group'] = df_rem['scaffold'].astype('category').cat.codes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rem['scaffold_group'] = df_rem['scaffold'].astype('category').cat.codes


In [17]:
fold_results = {"RMSE": [], "MAE": [], "R²": [], "AARD": []}
all_test_smiles = []
all_test_targets = []
all_test_predictions = []

with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
    pd.DataFrame({"Placeholder": []}).to_excel(writer, sheet_name="Init", index=False)

    for fold_idx, (train_indices, test_indices) in enumerate(k_splits.split(mols, groups=df_rem['scaffold_group'])):
        print(f"Starting fold {fold_idx + 1}/5")

        train_data, _, test_data = data.split_data_by_indices(all_data, [train_indices], None, [test_indices])
        train_data, val_data = train_test_split(train_data[0], test_size=0.05, random_state=fold_idx)

        featurizer = featurizers.SimpleMoleculeMolGraphFeaturizer()
        train_dset = data.MoleculeDataset(train_data, featurizer)
        scaler = train_dset.normalize_targets()

        val_dset = data.MoleculeDataset(val_data, featurizer)
        val_dset.normalize_targets(scaler)

        test_dset = data.MoleculeDataset(test_data[0], featurizer)
        
        targets_scaler = train_dset.normalize_targets()
        extra_datapoint_descriptors_scaler = train_dset.normalize_inputs("X_d")
        val_dset.normalize_targets(targets_scaler)
        val_dset.normalize_inputs("X_d", extra_datapoint_descriptors_scaler)
        
        train_dset.cache = True
        val_dset.cache = True

        
        train_loader = data.build_dataloader(train_dset, shuffle=True)
        val_loader = data.build_dataloader(val_dset, shuffle=False)
        test_loader = data.build_dataloader(test_dset, shuffle=False)

        # optimized by Ray Tune
        ffn_hidden_dim = 2400
        message_hidden_dim = 300
        depth = 2
        ffn_num_layers = 1
        batch_norm = True
        metric_list = [nn.metrics.RMSE(), nn.metrics.MAE()]
        descriptor_columns = extra_datapoint_descriptors.shape[1]

        mp = nn.BondMessagePassing(d_h=message_hidden_dim)
        agg = nn.MeanAggregation()
        output_transform = nn.UnscaleTransform.from_standard_scaler(scaler)
        ffn = nn.RegressionFFN(
            output_transform=output_transform,
            input_dim=message_hidden_dim + descriptor_columns,
            hidden_dim=ffn_hidden_dim,
            n_layers=ffn_num_layers
        )
        X_d_transform = nn.ScaleTransform.from_standard_scaler(extra_datapoint_descriptors_scaler)

        model = models.MPNN(
            mp, agg, ffn, batch_norm, metric_list, X_d_transform=X_d_transform
        )
        
        # Checkpointing
        checkpoint_dir = Path(f"/home/lab101/temp/mdm/bot/chek/fold_{fold_idx}")
        checkpoint_dir.mkdir(parents=True, exist_ok=True)

        checkpointing = ModelCheckpoint(
            dirpath=checkpoint_dir,
            filename="best-{epoch}-{val_loss:.2f}",
            monitor="val_loss",
            mode="min",
            save_weights_only=True
        )

        trainer = pl.Trainer(accelerator="auto", devices=1, max_epochs=15, callbacks=[checkpointing])
        trainer.fit(model, train_loader, val_loader)

        predictions = trainer.predict(model, test_loader)
        test_smiles = [Chem.MolToSmiles(d.mol) for d in test_data[0]]
        test_targets = [d.y[0] for d in test_data[0]]
        fold_predictions = np.concatenate(predictions).flatten().tolist()

        all_test_smiles.extend(test_smiles)
        all_test_targets.extend(test_targets)
        all_test_predictions.extend(fold_predictions)

        rmse = calculate_rmse(test_targets, fold_predictions)
        mae = calculate_mae(test_targets, fold_predictions)
        r2 = r2_score(test_targets, fold_predictions)
        aard = calculate_aard(test_targets, fold_predictions)

        fold_results["RMSE"].append(rmse)
        fold_results["MAE"].append(mae)
        fold_results["R²"].append(r2)
        fold_results["AARD"].append(aard)

        df_fold = pd.DataFrame({"SMILES": test_smiles, "True_Target": test_targets, "Predicted_Target": fold_predictions})
        df_fold.to_excel(writer, sheet_name=f"Fold_{fold_idx + 1}", index=False)

        print(f"Fold {fold_idx + 1}:")
        print(f"  RMSE: {rmse}")
        print(f"  MAE: {mae}")
        print(f"  R²: {r2}")
        print(f"  AARD, %: {aard}")

mean_rmse = np.mean(fold_results["RMSE"])
std_rmse = np.std(fold_results["RMSE"])
mean_mae = np.mean(fold_results["MAE"])
std_mae = np.std(fold_results["MAE"])
mean_r2 = np.mean(fold_results["R²"])
std_r2 = np.std(fold_results["R²"])
mean_aard = np.mean(fold_results["AARD"])
std_aard = np.std(fold_results["AARD"])

print("\nFinal cross-validation results:")
print(f"  RMSE: {mean_rmse:.4f} ± {std_rmse:.4f}")
print(f"  MAE: {mean_mae:.4f} ± {std_mae:.4f}")
print(f"  R²: {mean_r2:.4f} ± {std_r2:.4f}")
print(f"  AARD, %: {mean_aard:.4f} ± {std_aard:.4f}")

df_results = pd.DataFrame({
    "SMILES": all_test_smiles,
    "True_Target": all_test_targets,
    "Predicted_Target": all_test_predictions
})
with pd.ExcelWriter(output_path, engine="openpyxl", mode="a") as writer:
    df_results.to_excel(writer, sheet_name="All_Folds", index=False)

Starting fold 1/5


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /home/lab101/temp/mdm/bot/chek/fold_0 exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]
Loading `train_dataloader` to estimate number of stepping batches.
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.

  | Name            | Type               | Params | Mode 
---------------------------------------------------------------
0 | message_passing | BondMessagePassing | 227 K  | train
1 | agg             | MeanAggregation    | 0      | train
2 | bn    

Sanity Checking DataLoader 0:   0%|                       | 0/2 [00:00<?, ?it/s]

/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.


Epoch 0: 100%|█| 400/400 [00:33<00:00, 12.11it/s, v_num=520, train_loss_step=0.1
Validation: |                                             | 0/? [00:00<?, ?it/s][A
Validation:   0%|                                        | 0/22 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                           | 0/22 [00:00<?, ?it/s][A
Validation DataLoader 0:   5%|▊                  | 1/22 [00:00<00:00, 32.61it/s][A
Validation DataLoader 0:   9%|█▋                 | 2/22 [00:00<00:01, 19.74it/s][A
Validation DataLoader 0:  14%|██▌                | 3/22 [00:00<00:01, 18.96it/s][A
Validation DataLoader 0:  18%|███▍               | 4/22 [00:00<00:00, 18.17it/s][A
Validation DataLoader 0:  23%|████▎              | 5/22 [00:00<00:00, 21.64it/s][A
Validation DataLoader 0:  27%|█████▏             | 6/22 [00:00<00:00, 22.02it/s][A
Validation DataLoader 0:  32%|██████             | 7/22 [00:00<00:00, 23.11it/s][A
Validation DataLoader 0:  36%|██████▉            | 8/22 [00:00<00:00, 24.83it/s

Validation DataLoader 0:  41%|███████▊           | 9/22 [00:00<00:01, 12.59it/s][A
Validation DataLoader 0:  45%|████████▏         | 10/22 [00:00<00:00, 12.20it/s][A
Validation DataLoader 0:  50%|█████████         | 11/22 [00:00<00:00, 12.73it/s][A
Validation DataLoader 0:  55%|█████████▊        | 12/22 [00:00<00:00, 12.71it/s][A
Validation DataLoader 0:  59%|██████████▋       | 13/22 [00:00<00:00, 13.19it/s][A
Validation DataLoader 0:  64%|███████████▍      | 14/22 [00:01<00:00, 12.43it/s][A
Validation DataLoader 0:  68%|████████████▎     | 15/22 [00:01<00:00, 12.23it/s][A
Validation DataLoader 0:  73%|█████████████     | 16/22 [00:01<00:00, 12.56it/s][A
Validation DataLoader 0:  77%|█████████████▉    | 17/22 [00:01<00:00, 12.35it/s][A
Validation DataLoader 0:  82%|██████████████▋   | 18/22 [00:01<00:00, 12.71it/s][A
Validation DataLoader 0:  86%|███████████████▌  | 19/22 [00:01<00:00, 12.40it/s][A
Validation DataLoader 0:  91%|████████████████▎ | 20/22 [00:01<00:00, 12.19i

Validation DataLoader 0:  95%|█████████████████▏| 21/22 [00:01<00:00, 11.85it/s][A
Validation DataLoader 0: 100%|██████████████████| 22/22 [00:01<00:00, 12.11it/s][A
Epoch 14: 100%|█| 400/400 [00:35<00:00, 11.30it/s, v_num=520, train_loss_step=0.[A

`Trainer.fit` stopped: `max_epochs=15` reached.


Epoch 14: 100%|█| 400/400 [00:35<00:00, 11.22it/s, v_num=520, train_loss_step=0.

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]





/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████████████| 58/58 [00:17<00:00,  3.31it/s]
Fold 1:
  RMSE: 0.8065657725087948
  MAE: 0.5622378136020394
  R²: 0.5817379571807655
  AARD, %: 14.033200979652413
Starting fold 2/5


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /home/lab101/temp/mdm/bot/chek/fold_1 exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]
Loading `train_dataloader` to estimate number of stepping batches.
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.

  | Name            | Type               | Params | Mode 
---------------------------------------------------------------
0 | message_passing | BondMessagePassing | 227 K  | train
1 | agg             | MeanAggregation    | 0      | train
2 | bn    

Sanity Checking DataLoader 0:   0%|                       | 0/2 [00:00<?, ?it/s]

/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.


Epoch 0: 100%|█| 272/272 [00:20<00:00, 13.08it/s, v_num=521, train_loss_step=0.2
Validation: |                                             | 0/? [00:00<?, ?it/s][A
Validation:   0%|                                        | 0/15 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                           | 0/15 [00:00<?, ?it/s][A
Validation DataLoader 0:   7%|█▎                 | 1/15 [00:00<00:00, 15.41it/s][A
Validation DataLoader 0:  13%|██▌                | 2/15 [00:00<00:01, 12.71it/s][A
Validation DataLoader 0:  20%|███▊               | 3/15 [00:00<00:01, 10.99it/s][A
Validation DataLoader 0:  27%|█████              | 4/15 [00:00<00:00, 12.99it/s][A
Validation DataLoader 0:  33%|██████▎            | 5/15 [00:00<00:00, 12.51it/s][A
Validation DataLoader 0:  40%|███████▌           | 6/15 [00:00<00:00, 11.23it/s][A
Validation DataLoader 0:  47%|████████▊          | 7/15 [00:00<00:00, 11.13it/s][A
Validation DataLoader 0:  53%|██████████▏        | 8/15 [00:00<00:00, 11.74it/s

Validation DataLoader 0:   7%|█▎                 | 1/15 [00:00<00:00, 22.62it/s][A
Validation DataLoader 0:  13%|██▌                | 2/15 [00:00<00:00, 17.73it/s][A
Validation DataLoader 0:  20%|███▊               | 3/15 [00:00<00:00, 12.11it/s][A
Validation DataLoader 0:  27%|█████              | 4/15 [00:00<00:00, 13.69it/s][A
Validation DataLoader 0:  33%|██████▎            | 5/15 [00:00<00:00, 13.26it/s][A
Validation DataLoader 0:  40%|███████▌           | 6/15 [00:00<00:00, 13.05it/s][A
Validation DataLoader 0:  47%|████████▊          | 7/15 [00:00<00:00, 12.04it/s][A
Validation DataLoader 0:  53%|██████████▏        | 8/15 [00:00<00:00, 11.59it/s][A
Validation DataLoader 0:  60%|███████████▍       | 9/15 [00:00<00:00, 11.25it/s][A
Validation DataLoader 0:  67%|████████████      | 10/15 [00:00<00:00, 11.34it/s][A
Validation DataLoader 0:  73%|█████████████▏    | 11/15 [00:00<00:00, 11.42it/s][A
Validation DataLoader 0:  80%|██████████████▍   | 12/15 [00:01<00:00, 11.96i

`Trainer.fit` stopped: `max_epochs=15` reached.


Epoch 14: 100%|█| 272/272 [00:23<00:00, 11.81it/s, v_num=521, train_loss_step=0.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|████████████████| 193/193 [00:56<00:00,  3.40it/s]
Fold 2:
  RMSE: 1.0013225467305402
  MAE: 0.7425764964623582
  R²: 0.5852653218248836
  AARD, %: 24.775962665597227
Starting fold 3/5


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /home/lab101/temp/mdm/bot/chek/fold_2 exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]
Loading `train_dataloader` to estimate number of stepping batches.
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.

  | Name            | Type               | Params | Mode 
---------------------------------------------------------------
0 | message_passing | BondMessagePassing | 227 K  | train
1 | agg             | MeanAggregation    | 0      | train
2 | bn    

Sanity Checking DataLoader 0:   0%|                       | 0/2 [00:00<?, ?it/s]

/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.


Epoch 0: 100%|█| 413/413 [00:33<00:00, 12.37it/s, v_num=522, train_loss_step=0.1
Validation: |                                             | 0/? [00:00<?, ?it/s][A
Validation:   0%|                                        | 0/22 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                           | 0/22 [00:00<?, ?it/s][A
Validation DataLoader 0:   5%|▊                  | 1/22 [00:00<00:01, 10.55it/s][A
Validation DataLoader 0:   9%|█▋                 | 2/22 [00:00<00:01, 11.61it/s][A
Validation DataLoader 0:  14%|██▌                | 3/22 [00:00<00:01, 11.21it/s][A
Validation DataLoader 0:  18%|███▍               | 4/22 [00:00<00:01, 10.59it/s][A
Validation DataLoader 0:  23%|████▎              | 5/22 [00:00<00:01, 10.90it/s][A
Validation DataLoader 0:  27%|█████▏             | 6/22 [00:00<00:01, 10.85it/s][A
Validation DataLoader 0:  32%|██████             | 7/22 [00:00<00:01, 11.62it/s][A
Validation DataLoader 0:  36%|██████▉            | 8/22 [00:00<00:01, 11.72it/s

Validation DataLoader 0:  41%|███████▊           | 9/22 [00:00<00:01, 10.46it/s][A
Validation DataLoader 0:  45%|████████▏         | 10/22 [00:00<00:01, 10.44it/s][A
Validation DataLoader 0:  50%|█████████         | 11/22 [00:01<00:01, 10.97it/s][A
Validation DataLoader 0:  55%|█████████▊        | 12/22 [00:01<00:00, 10.63it/s][A
Validation DataLoader 0:  59%|██████████▋       | 13/22 [00:01<00:00, 10.77it/s][A
Validation DataLoader 0:  64%|███████████▍      | 14/22 [00:01<00:00, 10.42it/s][A
Validation DataLoader 0:  68%|████████████▎     | 15/22 [00:01<00:00, 10.49it/s][A
Validation DataLoader 0:  73%|█████████████     | 16/22 [00:01<00:00, 10.71it/s][A
Validation DataLoader 0:  77%|█████████████▉    | 17/22 [00:01<00:00, 10.62it/s][A
Validation DataLoader 0:  82%|██████████████▋   | 18/22 [00:01<00:00, 10.68it/s][A
Validation DataLoader 0:  86%|███████████████▌  | 19/22 [00:01<00:00, 10.92it/s][A
Validation DataLoader 0:  91%|████████████████▎ | 20/22 [00:01<00:00, 11.05i

Validation DataLoader 0:  95%|█████████████████▏| 21/22 [00:01<00:00, 13.05it/s][A
Validation DataLoader 0: 100%|██████████████████| 22/22 [00:01<00:00, 13.44it/s][A
Epoch 14: 100%|█| 413/413 [00:37<00:00, 11.11it/s, v_num=522, train_loss_step=0.[A

`Trainer.fit` stopped: `max_epochs=15` reached.


Epoch 14: 100%|█| 413/413 [00:37<00:00, 11.07it/s, v_num=522, train_loss_step=0.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████████████| 44/44 [00:12<00:00,  3.42it/s]
Fold 3:
  RMSE: 0.8597167776181781
  MAE: 0.6519894305400177
  R²: 0.5073707699913869
  AARD, %: 16.292025460525625
Starting fold 4/5


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /home/lab101/temp/mdm/bot/chek/fold_3 exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]
Loading `train_dataloader` to estimate number of stepping batches.
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.

  | Name            | Type               | Params | Mode 
---------------------------------------------------------------
0 | message_passing | BondMessagePassing | 227 K  | train
1 | agg             | MeanAggregation    | 0      | train
2 | bn    

Sanity Checking: |                                        | 0/? [00:00<?, ?it/s]

/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.


Epoch 0: 100%|█| 401/401 [00:31<00:00, 12.80it/s, v_num=523, train_loss_step=0.3
Validation: |                                             | 0/? [00:00<?, ?it/s][A
Validation:   0%|                                        | 0/22 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                           | 0/22 [00:00<?, ?it/s][A
Validation DataLoader 0:   5%|▊                  | 1/22 [00:00<00:01, 14.71it/s][A
Validation DataLoader 0:   9%|█▋                 | 2/22 [00:00<00:01, 16.48it/s][A
Validation DataLoader 0:  14%|██▌                | 3/22 [00:00<00:01, 11.10it/s][A
Validation DataLoader 0:  18%|███▍               | 4/22 [00:00<00:01, 11.63it/s][A
Validation DataLoader 0:  23%|████▎              | 5/22 [00:00<00:01, 11.74it/s][A
Validation DataLoader 0:  27%|█████▏             | 6/22 [00:00<00:01, 12.01it/s][A
Validation DataLoader 0:  32%|██████             | 7/22 [00:00<00:01, 12.99it/s][A
Validation DataLoader 0:  36%|██████▉            | 8/22 [00:00<00:01, 12.24it/s

Validation DataLoader 0:  41%|███████▊           | 9/22 [00:00<00:01, 12.06it/s][A
Validation DataLoader 0:  45%|████████▏         | 10/22 [00:00<00:00, 12.25it/s][A
Validation DataLoader 0:  50%|█████████         | 11/22 [00:00<00:00, 12.88it/s][A
Validation DataLoader 0:  55%|█████████▊        | 12/22 [00:00<00:00, 12.59it/s][A
Validation DataLoader 0:  59%|██████████▋       | 13/22 [00:01<00:00, 12.40it/s][A
Validation DataLoader 0:  64%|███████████▍      | 14/22 [00:01<00:00, 12.23it/s][A
Validation DataLoader 0:  68%|████████████▎     | 15/22 [00:01<00:00, 11.69it/s][A
Validation DataLoader 0:  73%|█████████████     | 16/22 [00:01<00:00, 11.78it/s][A
Validation DataLoader 0:  77%|█████████████▉    | 17/22 [00:01<00:00, 11.53it/s][A
Validation DataLoader 0:  82%|██████████████▋   | 18/22 [00:01<00:00, 11.69it/s][A
Validation DataLoader 0:  86%|███████████████▌  | 19/22 [00:01<00:00, 11.42it/s][A
Validation DataLoader 0:  91%|████████████████▎ | 20/22 [00:01<00:00, 11.74i

Validation DataLoader 0:  95%|█████████████████▏| 21/22 [00:01<00:00, 12.99it/s][A
Validation DataLoader 0: 100%|██████████████████| 22/22 [00:01<00:00, 12.94it/s][A
Epoch 14: 100%|█| 401/401 [00:34<00:00, 11.49it/s, v_num=523, train_loss_step=0.[A

`Trainer.fit` stopped: `max_epochs=15` reached.


Epoch 14: 100%|█| 401/401 [00:35<00:00, 11.43it/s, v_num=523, train_loss_step=0.

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.



Predicting DataLoader 0: 100%|██████████████████| 57/57 [00:14<00:00,  3.82it/s]
Fold 4:
  RMSE: 0.7892371570276975
  MAE: 0.5784803904567712
  R²: 0.5955764205932439
  AARD, %: 13.075967833304608
Starting fold 5/5


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /home/lab101/temp/mdm/bot/chek/fold_4 exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]
Loading `train_dataloader` to estimate number of stepping batches.
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.

  | Name            | Type               | Params | Mode 
---------------------------------------------------------------
0 | message_passing | BondMessagePassing | 227 K  | train
1 | agg             | MeanAggregation    | 0      | train
2 | bn    

Sanity Checking DataLoader 0:   0%|                       | 0/2 [00:00<?, ?it/s]

/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.


Epoch 0: 100%|█| 334/334 [00:27<00:00, 12.10it/s, v_num=524, train_loss_step=0.2
Validation: |                                             | 0/? [00:00<?, ?it/s][A
Validation:   0%|                                        | 0/18 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                           | 0/18 [00:00<?, ?it/s][A
Validation DataLoader 0:   6%|█                  | 1/18 [00:00<00:01, 13.24it/s][A
Validation DataLoader 0:  11%|██                 | 2/18 [00:00<00:01, 10.59it/s][A
Validation DataLoader 0:  17%|███▏               | 3/18 [00:00<00:01, 10.65it/s][A
Validation DataLoader 0:  22%|████▏              | 4/18 [00:00<00:01,  9.98it/s][A
Validation DataLoader 0:  28%|█████▎             | 5/18 [00:00<00:01, 10.71it/s][A
Validation DataLoader 0:  33%|██████▎            | 6/18 [00:00<00:00, 12.17it/s][A
Validation DataLoader 0:  39%|███████▍           | 7/18 [00:00<00:00, 11.06it/s][A
Validation DataLoader 0:  44%|████████▍          | 8/18 [00:00<00:00, 11.46it/s

Validation DataLoader 0:  83%|███████████████   | 15/18 [00:01<00:00, 11.06it/s][A
Validation DataLoader 0:  89%|████████████████  | 16/18 [00:01<00:00, 11.15it/s][A
Validation DataLoader 0:  94%|█████████████████ | 17/18 [00:01<00:00, 11.34it/s][A
Validation DataLoader 0: 100%|██████████████████| 18/18 [00:01<00:00, 11.34it/s][A
Epoch 9: 100%|█| 334/334 [00:28<00:00, 11.92it/s, v_num=524, train_loss_step=0.0[A
Validation: |                                             | 0/? [00:00<?, ?it/s][A
Validation:   0%|                                        | 0/18 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                           | 0/18 [00:00<?, ?it/s][A
Validation DataLoader 0:   6%|█                  | 1/18 [00:00<00:00, 26.84it/s][A
Validation DataLoader 0:  11%|██                 | 2/18 [00:00<00:01, 15.08it/s][A
Validation DataLoader 0:  17%|███▏               | 3/18 [00:00<00:01, 14.49it/s][A
Validation DataLoader 0:  22%|████▏              | 4/18 [00:00<00:01, 13.11i

`Trainer.fit` stopped: `max_epochs=15` reached.


Epoch 14: 100%|█| 334/334 [00:28<00:00, 11.89it/s, v_num=524, train_loss_step=0.

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.



Predicting DataLoader 0: 100%|████████████████| 128/128 [00:29<00:00,  4.33it/s]
Fold 5:
  RMSE: 0.9141963702446654
  MAE: 0.679938863693288
  R²: 0.5328537399088338
  AARD, %: 23.03348859847843

Final cross-validation results:
  RMSE: 0.8742 ± 0.0772
  MAE: 0.6430 ± 0.0664
  R²: 0.5606 ± 0.0343
  AARD, %: 18.2421 ± 4.7719


# Drug-like dataset

In [19]:
df = pd.read_excel("Drug-like_data.xlsx")

In [20]:
smiles_column = ['SMILES_Canonical']
target_columns = ['Lg(y)']
descriptor_columns = ['T (K)', 'P (bar)', 'Melting Point', 'dHvap (kJ/mol)', 'g (gcm3)', 'dG', 'MolWt', 'MolLogP', 'TPSA', 'NR', 'Bj', 'HA', 'HD', 'RC', 'SP', 'NA', 'NAr', 'Hat']

In [21]:
smis = df.loc[:, smiles_column].values.flatten().astype(str)
ys = df.loc[:, target_columns].values
extra_datapoint_descriptors = df[descriptor_columns].values

In [22]:
all_data = [
    data.MoleculeDatapoint(Chem.MolFromSmiles(smi), y, x_d=X_d)
    for smi, y, X_d in zip(smis, ys, extra_datapoint_descriptors)
]

In [23]:
mols = [d.mol for d in all_data]

In [24]:
#Strict split SMILES. D-MPNN with thermodynamic properties

In [25]:
class ScaffoldGroupKFold:
    def __init__(self, n_splits=5, shuffle=False, random_state=None):
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.random_state = random_state

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.n_splits

    def split(self, X, y=None, groups=None):
        unique_groups = np.unique(groups)
        if self.shuffle:
            np.random.seed(self.random_state)
            np.random.shuffle(unique_groups)
        
        fold_size = len(unique_groups) // self.n_splits
        
        for i in range(self.n_splits):
            if i == self.n_splits - 1:
                test_groups = unique_groups[i * fold_size:]
            else:
                test_groups = unique_groups[i * fold_size: (i + 1) * fold_size]
            
            test_indices = np.where(np.isin(groups, test_groups))[0]
            train_indices = np.where(~np.isin(groups, test_groups))[0]
            
            yield train_indices, test_indices

In [26]:
k_splits = ScaffoldGroupKFold(n_splits=5, shuffle=True, random_state=4321)
output_path = Path.cwd() / "hup" / "crossval_results_scCO2.xlsx"

In [27]:
def calculate_aard(true_values, predicted_values):
    true_values = np.array(true_values)  
    predicted_values = np.array(predicted_values)
    return 100 * np.mean(np.abs((true_values - predicted_values) / true_values))

def calculate_r2(true_values, predicted_values):
    true_values = np.array(true_values)  
    predicted_values = np.array(predicted_values)
    return r2_score(true_values, predicted_values)

def calculate_rmse(true_values, predicted_values):
    return np.sqrt(mean_squared_error(true_values, predicted_values))

def calculate_mae(true_values, predicted_values):
    return mean_absolute_error(true_values, predicted_values)

In [28]:
fold_results = {"RMSE": [], "MAE": [], "R²": [], "AARD": []}
all_test_smiles = []
all_test_targets = []
all_test_predictions = []

with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
    pd.DataFrame({"Placeholder": []}).to_excel(writer, sheet_name="Init", index=False)

    for fold_idx, (train_indices, test_indices) in enumerate(k_splits.split(mols, groups=df['SMILES_Canonical'])):
        print(f"Starting fold {fold_idx + 1}/5")

        train_data, _, test_data = data.split_data_by_indices(all_data, [train_indices], None, [test_indices])
        train_data, val_data = train_test_split(train_data[0], test_size=0.05, random_state=fold_idx)

        featurizer = featurizers.SimpleMoleculeMolGraphFeaturizer()
        train_dset = data.MoleculeDataset(train_data, featurizer)
        scaler = train_dset.normalize_targets()

        val_dset = data.MoleculeDataset(val_data, featurizer)
        val_dset.normalize_targets(scaler)

        test_dset = data.MoleculeDataset(test_data[0], featurizer)
        
        targets_scaler = train_dset.normalize_targets()
        extra_datapoint_descriptors_scaler = train_dset.normalize_inputs("X_d")
        val_dset.normalize_targets(targets_scaler)
        val_dset.normalize_inputs("X_d", extra_datapoint_descriptors_scaler)
        
        train_dset.cache = True
        val_dset.cache = True

        
        train_loader = data.build_dataloader(train_dset, shuffle=True)
        val_loader = data.build_dataloader(val_dset, shuffle=False)
        test_loader = data.build_dataloader(test_dset, shuffle=False)

        # optimized by Ray Tune
        ffn_hidden_dim = 2400
        message_hidden_dim = 300
        depth = 2
        ffn_num_layers = 1
        batch_norm = True
        metric_list = [nn.metrics.RMSE(), nn.metrics.MAE()]
        descriptor_columns = extra_datapoint_descriptors.shape[1]

        mp = nn.BondMessagePassing(d_h=message_hidden_dim)
        agg = nn.MeanAggregation()
        output_transform = nn.UnscaleTransform.from_standard_scaler(scaler)
        ffn = nn.RegressionFFN(
            output_transform=output_transform,
            input_dim=message_hidden_dim + descriptor_columns,
            hidden_dim=ffn_hidden_dim,
            n_layers=ffn_num_layers
        )
        X_d_transform = nn.ScaleTransform.from_standard_scaler(extra_datapoint_descriptors_scaler)

        model = models.MPNN(
            mp, agg, ffn, batch_norm, metric_list, X_d_transform=X_d_transform
        )
        
        # Checkpointing
        checkpoint_dir = Path(f"/home/lab101/temp/mdm/bot/chek/fold_{fold_idx}")
        checkpoint_dir.mkdir(parents=True, exist_ok=True)

        checkpointing = ModelCheckpoint(
            dirpath=checkpoint_dir,
            filename="best-{epoch}-{val_loss:.2f}",
            monitor="val_loss",
            mode="min",
            save_weights_only=True
        )

        trainer = pl.Trainer(accelerator="auto", devices=1, max_epochs=15, callbacks=[checkpointing])
        trainer.fit(model, train_loader, val_loader)

        predictions = trainer.predict(model, test_loader)
        test_smiles = [Chem.MolToSmiles(d.mol) for d in test_data[0]]
        test_targets = [d.y[0] for d in test_data[0]]
        fold_predictions = np.concatenate(predictions).flatten().tolist()

        all_test_smiles.extend(test_smiles)
        all_test_targets.extend(test_targets)
        all_test_predictions.extend(fold_predictions)

        rmse = calculate_rmse(test_targets, fold_predictions)
        mae = calculate_mae(test_targets, fold_predictions)
        r2 = r2_score(test_targets, fold_predictions)
        aard = calculate_aard(test_targets, fold_predictions)

        fold_results["RMSE"].append(rmse)
        fold_results["MAE"].append(mae)
        fold_results["R²"].append(r2)
        fold_results["AARD"].append(aard)

        df_fold = pd.DataFrame({"SMILES": test_smiles, "True_Target": test_targets, "Predicted_Target": fold_predictions})
        df_fold.to_excel(writer, sheet_name=f"Fold_{fold_idx + 1}", index=False)

        print(f"Fold {fold_idx + 1}:")
        print(f"  RMSE: {rmse}")
        print(f"  MAE: {mae}")
        print(f"  R²: {r2}")
        print(f"  AARD, %: {aard}")

mean_rmse = np.mean(fold_results["RMSE"])
std_rmse = np.std(fold_results["RMSE"])
mean_mae = np.mean(fold_results["MAE"])
std_mae = np.std(fold_results["MAE"])
mean_r2 = np.mean(fold_results["R²"])
std_r2 = np.std(fold_results["R²"])
mean_aard = np.mean(fold_results["AARD"])
std_aard = np.std(fold_results["AARD"])

print("\nFinal cross-validation results:")
print(f"  RMSE: {mean_rmse:.4f} ± {std_rmse:.4f}")
print(f"  MAE: {mean_mae:.4f} ± {std_mae:.4f}")
print(f"  R²: {mean_r2:.4f} ± {std_r2:.4f}")
print(f"  AARD, %: {mean_aard:.4f} ± {std_aard:.4f}")

df_results = pd.DataFrame({
    "SMILES": all_test_smiles,
    "True_Target": all_test_targets,
    "Predicted_Target": all_test_predictions
})
with pd.ExcelWriter(output_path, engine="openpyxl", mode="a") as writer:
    df_results.to_excel(writer, sheet_name="All_Folds", index=False)

Starting fold 1/5


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /home/lab101/temp/mdm/bot/chek/fold_0 exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]
Loading `train_dataloader` to estimate number of stepping batches.
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.

  | Name            | Type               | Params | Mode 
---------------------------------------------------------------
0 | message_passing | BondMessagePassing | 227 K  | train
1 | agg             | MeanAggregation    | 0      | train
2 | bn    

Sanity Checking: |                                        | 0/? [00:00<?, ?it/s]

/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.


Epoch 0: 100%|█| 238/238 [00:14<00:00, 15.97it/s, v_num=525, train_loss_step=0.0
Validation: |                                             | 0/? [00:00<?, ?it/s][A
Validation:   0%|                                        | 0/13 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                           | 0/13 [00:00<?, ?it/s][A
Validation DataLoader 0:   8%|█▍                 | 1/13 [00:00<00:00, 27.32it/s][A
Validation DataLoader 0:  15%|██▉                | 2/13 [00:00<00:00, 23.11it/s][A
Validation DataLoader 0:  23%|████▍              | 3/13 [00:00<00:00, 31.66it/s][A
Validation DataLoader 0:  31%|█████▊             | 4/13 [00:00<00:00, 21.25it/s][A
Validation DataLoader 0:  38%|███████▎           | 5/13 [00:00<00:00, 22.26it/s][A
Validation DataLoader 0:  46%|████████▊          | 6/13 [00:00<00:00, 22.27it/s][A
Validation DataLoader 0:  54%|██████████▏        | 7/13 [00:00<00:00, 19.53it/s][A
Validation DataLoader 0:  62%|███████████▋       | 8/13 [00:00<00:00, 18.07it/s

Validation DataLoader 0:  31%|█████▊             | 4/13 [00:00<00:00, 18.97it/s][A
Validation DataLoader 0:  38%|███████▎           | 5/13 [00:00<00:00, 18.61it/s][A
Validation DataLoader 0:  46%|████████▊          | 6/13 [00:00<00:00, 17.36it/s][A
Validation DataLoader 0:  54%|██████████▏        | 7/13 [00:00<00:00, 16.89it/s][A
Validation DataLoader 0:  62%|███████████▋       | 8/13 [00:00<00:00, 17.20it/s][A
Validation DataLoader 0:  69%|█████████████▏     | 9/13 [00:00<00:00, 17.82it/s][A
Validation DataLoader 0:  77%|█████████████▊    | 10/13 [00:00<00:00, 18.29it/s][A
Validation DataLoader 0:  85%|███████████████▏  | 11/13 [00:00<00:00, 17.24it/s][A
Validation DataLoader 0:  92%|████████████████▌ | 12/13 [00:00<00:00, 17.01it/s][A
Validation DataLoader 0: 100%|██████████████████| 13/13 [00:00<00:00, 17.62it/s][A
Epoch 12: 100%|█| 238/238 [00:15<00:00, 15.08it/s, v_num=525, train_loss_step=0.[A
Validation: |                                             | 0/? [00:00<?, ?i

`Trainer.fit` stopped: `max_epochs=15` reached.


Epoch 14: 100%|█| 238/238 [00:15<00:00, 15.24it/s, v_num=525, train_loss_step=0.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████████████| 61/61 [00:15<00:00,  4.04it/s]
Fold 1:
  RMSE: 0.7509111990264223
  MAE: 0.5474730388968528
  R²: 0.7036070508380569
  AARD, %: 17.966594789205917
Starting fold 2/5


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /home/lab101/temp/mdm/bot/chek/fold_1 exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]
Loading `train_dataloader` to estimate number of stepping batches.
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.

  | Name            | Type               | Params | Mode 
---------------------------------------------------------------
0 | message_passing | BondMessagePassing | 227 K  | train
1 | agg             | MeanAggregation    | 0      | train
2 | bn    

Sanity Checking DataLoader 0:   0%|                       | 0/2 [00:00<?, ?it/s]

/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.


Epoch 0: 100%|█| 237/237 [00:15<00:00, 15.47it/s, v_num=526, train_loss_step=0.1
Validation: |                                             | 0/? [00:00<?, ?it/s][A
Validation:   0%|                                        | 0/13 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                           | 0/13 [00:00<?, ?it/s][A
Validation DataLoader 0:   8%|█▍                 | 1/13 [00:00<00:00, 16.56it/s][A
Validation DataLoader 0:  15%|██▉                | 2/13 [00:00<00:00, 19.54it/s][A
Validation DataLoader 0:  23%|████▍              | 3/13 [00:00<00:00, 15.15it/s][A
Validation DataLoader 0:  31%|█████▊             | 4/13 [00:00<00:00, 15.29it/s][A
Validation DataLoader 0:  38%|███████▎           | 5/13 [00:00<00:00, 16.03it/s][A
Validation DataLoader 0:  46%|████████▊          | 6/13 [00:00<00:00, 15.13it/s][A
Validation DataLoader 0:  54%|██████████▏        | 7/13 [00:00<00:00, 16.12it/s][A
Validation DataLoader 0:  62%|███████████▋       | 8/13 [00:00<00:00, 15.85it/s

Validation DataLoader 0:  31%|█████▊             | 4/13 [00:00<00:00, 18.50it/s][A
Validation DataLoader 0:  38%|███████▎           | 5/13 [00:00<00:00, 19.06it/s][A
Validation DataLoader 0:  46%|████████▊          | 6/13 [00:00<00:00, 15.56it/s][A
Validation DataLoader 0:  54%|██████████▏        | 7/13 [00:00<00:00, 16.76it/s][A
Validation DataLoader 0:  62%|███████████▋       | 8/13 [00:00<00:00, 14.53it/s][A
Validation DataLoader 0:  69%|█████████████▏     | 9/13 [00:00<00:00, 15.23it/s][A
Validation DataLoader 0:  77%|█████████████▊    | 10/13 [00:00<00:00, 15.47it/s][A
Validation DataLoader 0:  85%|███████████████▏  | 11/13 [00:00<00:00, 15.76it/s][A
Validation DataLoader 0:  92%|████████████████▌ | 12/13 [00:00<00:00, 16.87it/s][A
Validation DataLoader 0: 100%|██████████████████| 13/13 [00:00<00:00, 16.66it/s][A
Epoch 12: 100%|█| 237/237 [00:15<00:00, 15.54it/s, v_num=526, train_loss_step=0.[A
Validation: |                                             | 0/? [00:00<?, ?i

`Trainer.fit` stopped: `max_epochs=15` reached.


Epoch 14: 100%|█| 237/237 [00:15<00:00, 14.89it/s, v_num=526, train_loss_step=0.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████████████| 61/61 [00:13<00:00,  4.39it/s]
Fold 2:
  RMSE: 0.7686560331266027
  MAE: 0.5858805020820527
  R²: 0.7467185385192198
  AARD, %: 21.681065538144725
Starting fold 3/5


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /home/lab101/temp/mdm/bot/chek/fold_2 exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]
Loading `train_dataloader` to estimate number of stepping batches.
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.

  | Name            | Type               | Params | Mode 
---------------------------------------------------------------
0 | message_passing | BondMessagePassing | 227 K  | train
1 | agg             | MeanAggregation    | 0      | train
2 | bn    

Sanity Checking DataLoader 0:   0%|                       | 0/2 [00:00<?, ?it/s]

/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.


Epoch 0: 100%|█| 229/229 [00:13<00:00, 16.61it/s, v_num=527, train_loss_step=0.1
Validation: |                                             | 0/? [00:00<?, ?it/s][A
Validation:   0%|                                        | 0/13 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                           | 0/13 [00:00<?, ?it/s][A
Validation DataLoader 0:   8%|█▍                 | 1/13 [00:00<00:00, 32.14it/s][A
Validation DataLoader 0:  15%|██▉                | 2/13 [00:00<00:00, 22.49it/s][A
Validation DataLoader 0:  23%|████▍              | 3/13 [00:00<00:00, 19.54it/s][A
Validation DataLoader 0:  31%|█████▊             | 4/13 [00:00<00:00, 17.99it/s][A
Validation DataLoader 0:  38%|███████▎           | 5/13 [00:00<00:00, 15.08it/s][A
Validation DataLoader 0:  46%|████████▊          | 6/13 [00:00<00:00, 15.48it/s][A
Validation DataLoader 0:  54%|██████████▏        | 7/13 [00:00<00:00, 15.33it/s][A
Validation DataLoader 0:  62%|███████████▋       | 8/13 [00:00<00:00, 16.10it/s

Validation DataLoader 0:  31%|█████▊             | 4/13 [00:00<00:00, 15.17it/s][A
Validation DataLoader 0:  38%|███████▎           | 5/13 [00:00<00:00, 14.11it/s][A
Validation DataLoader 0:  46%|████████▊          | 6/13 [00:00<00:00, 15.49it/s][A
Validation DataLoader 0:  54%|██████████▏        | 7/13 [00:00<00:00, 14.90it/s][A
Validation DataLoader 0:  62%|███████████▋       | 8/13 [00:00<00:00, 15.79it/s][A
Validation DataLoader 0:  69%|█████████████▏     | 9/13 [00:00<00:00, 15.55it/s][A
Validation DataLoader 0:  77%|█████████████▊    | 10/13 [00:00<00:00, 14.85it/s][A
Validation DataLoader 0:  85%|███████████████▏  | 11/13 [00:00<00:00, 15.11it/s][A
Validation DataLoader 0:  92%|████████████████▌ | 12/13 [00:00<00:00, 14.83it/s][A
Validation DataLoader 0: 100%|██████████████████| 13/13 [00:00<00:00, 15.11it/s][A
Epoch 12: 100%|█| 229/229 [00:14<00:00, 16.12it/s, v_num=527, train_loss_step=0.[A
Validation: |                                             | 0/? [00:00<?, ?i

`Trainer.fit` stopped: `max_epochs=15` reached.


Epoch 14: 100%|█| 229/229 [00:14<00:00, 15.37it/s, v_num=527, train_loss_step=0.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████████████| 70/70 [00:17<00:00,  4.07it/s]
Fold 3:
  RMSE: 0.685027953090186
  MAE: 0.5172412221704291
  R²: 0.7724636042505589
  AARD, %: 15.131037239692338
Starting fold 4/5


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /home/lab101/temp/mdm/bot/chek/fold_3 exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]
Loading `train_dataloader` to estimate number of stepping batches.
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.

  | Name            | Type               | Params | Mode 
---------------------------------------------------------------
0 | message_passing | BondMessagePassing | 227 K  | train
1 | agg             | MeanAggregation    | 0      | train
2 | bn    

Sanity Checking DataLoader 0:  50%|███████       | 1/2 [00:00<00:00, 108.40it/s]

/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.


Epoch 0: 100%|█| 234/234 [00:14<00:00, 15.90it/s, v_num=528, train_loss_step=0.4
Validation: |                                             | 0/? [00:00<?, ?it/s][A
Validation:   0%|                                        | 0/13 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                           | 0/13 [00:00<?, ?it/s][A
Validation DataLoader 0:   8%|█▍                 | 1/13 [00:00<00:00, 16.42it/s][A
Validation DataLoader 0:  15%|██▉                | 2/13 [00:00<00:00, 16.58it/s][A
Validation DataLoader 0:  23%|████▍              | 3/13 [00:00<00:00, 11.30it/s][A
Validation DataLoader 0:  31%|█████▊             | 4/13 [00:00<00:00, 12.10it/s][A
Validation DataLoader 0:  38%|███████▎           | 5/13 [00:00<00:00, 11.97it/s][A
Validation DataLoader 0:  46%|████████▊          | 6/13 [00:00<00:00, 12.73it/s][A
Validation DataLoader 0:  54%|██████████▏        | 7/13 [00:00<00:00, 13.65it/s][A
Validation DataLoader 0:  62%|███████████▋       | 8/13 [00:00<00:00, 13.11it/s

Validation DataLoader 0:  31%|█████▊             | 4/13 [00:00<00:00, 16.97it/s][A
Validation DataLoader 0:  38%|███████▎           | 5/13 [00:00<00:00, 15.03it/s][A
Validation DataLoader 0:  46%|████████▊          | 6/13 [00:00<00:00, 16.52it/s][A
Validation DataLoader 0:  54%|██████████▏        | 7/13 [00:00<00:00, 16.42it/s][A
Validation DataLoader 0:  62%|███████████▋       | 8/13 [00:00<00:00, 14.73it/s][A
Validation DataLoader 0:  69%|█████████████▏     | 9/13 [00:00<00:00, 14.99it/s][A
Validation DataLoader 0:  77%|█████████████▊    | 10/13 [00:00<00:00, 15.45it/s][A
Validation DataLoader 0:  85%|███████████████▏  | 11/13 [00:00<00:00, 15.46it/s][A
Validation DataLoader 0:  92%|████████████████▌ | 12/13 [00:00<00:00, 14.48it/s][A
Validation DataLoader 0: 100%|██████████████████| 13/13 [00:00<00:00, 15.45it/s][A
Epoch 12: 100%|█| 234/234 [00:14<00:00, 16.02it/s, v_num=528, train_loss_step=0.[A
Validation: |                                             | 0/? [00:00<?, ?i

`Trainer.fit` stopped: `max_epochs=15` reached.


Epoch 14: 100%|█| 234/234 [00:15<00:00, 15.31it/s, v_num=528, train_loss_step=0.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████████████| 65/65 [00:15<00:00,  4.28it/s]
Fold 4:
  RMSE: 0.7440093994567862
  MAE: 0.5700443848355783
  R²: 0.6766723735113068
  AARD, %: 15.387348775150187
Starting fold 5/5


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /home/lab101/temp/mdm/bot/chek/fold_4 exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]
Loading `train_dataloader` to estimate number of stepping batches.
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.

  | Name            | Type               | Params | Mode 
---------------------------------------------------------------
0 | message_passing | BondMessagePassing | 227 K  | train
1 | agg             | MeanAggregation    | 0      | train
2 | bn    

Sanity Checking:   0%|                                    | 0/2 [00:00<?, ?it/s]

/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.


Epoch 0: 100%|█| 242/242 [00:16<00:00, 14.94it/s, v_num=529, train_loss_step=0.1
Validation: |                                             | 0/? [00:00<?, ?it/s][A
Validation:   0%|                                        | 0/13 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                           | 0/13 [00:00<?, ?it/s][A
Validation DataLoader 0:   8%|█▍                 | 1/13 [00:00<00:00, 22.31it/s][A
Validation DataLoader 0:  15%|██▉                | 2/13 [00:00<00:00, 15.30it/s][A
Validation DataLoader 0:  23%|████▍              | 3/13 [00:00<00:00, 14.72it/s][A
Validation DataLoader 0:  31%|█████▊             | 4/13 [00:00<00:00, 13.66it/s][A
Validation DataLoader 0:  38%|███████▎           | 5/13 [00:00<00:00, 15.94it/s][A
Validation DataLoader 0:  46%|████████▊          | 6/13 [00:00<00:00, 16.92it/s][A
Validation DataLoader 0:  54%|██████████▏        | 7/13 [00:00<00:00, 14.26it/s][A
Validation DataLoader 0:  62%|███████████▋       | 8/13 [00:00<00:00, 14.51it/s

Validation DataLoader 0:  31%|█████▊             | 4/13 [00:00<00:00, 14.92it/s][A
Validation DataLoader 0:  38%|███████▎           | 5/13 [00:00<00:00, 14.75it/s][A
Validation DataLoader 0:  46%|████████▊          | 6/13 [00:00<00:00, 16.96it/s][A
Validation DataLoader 0:  54%|██████████▏        | 7/13 [00:00<00:00, 15.69it/s][A
Validation DataLoader 0:  62%|███████████▋       | 8/13 [00:00<00:00, 15.13it/s][A
Validation DataLoader 0:  69%|█████████████▏     | 9/13 [00:00<00:00, 15.42it/s][A
Validation DataLoader 0:  77%|█████████████▊    | 10/13 [00:00<00:00, 15.11it/s][A
Validation DataLoader 0:  85%|███████████████▏  | 11/13 [00:00<00:00, 14.64it/s][A
Validation DataLoader 0:  92%|████████████████▌ | 12/13 [00:00<00:00, 14.38it/s][A
Validation DataLoader 0: 100%|██████████████████| 13/13 [00:00<00:00, 14.26it/s][A
Epoch 12: 100%|█| 242/242 [00:16<00:00, 15.12it/s, v_num=529, train_loss_step=0.[A
Validation: |                                             | 0/? [00:00<?, ?i

`Trainer.fit` stopped: `max_epochs=15` reached.


Epoch 14: 100%|█| 242/242 [00:17<00:00, 13.96it/s, v_num=529, train_loss_step=0.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████████████| 56/56 [00:12<00:00,  4.58it/s]
Fold 5:
  RMSE: 0.6685365068959032
  MAE: 0.4940107034571468
  R²: 0.7623929450218354
  AARD, %: 15.413898174092754

Final cross-validation results:
  RMSE: 0.7234 ± 0.0393
  MAE: 0.5429 ± 0.0336
  R²: 0.7324 ± 0.0365
  AARD, %: 17.1160 ± 2.5055


In [29]:
#Scaffold split. D-MPNN with thermodynamic properties

In [30]:
def get_scaffold(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    return MurckoScaffold.MurckoScaffoldSmiles(mol=mol)

df['scaffold'] = df['SMILES_Canonical'].apply(get_scaffold)

In [31]:
df['scaffold_group'] = df['scaffold'].astype('category').cat.codes

In [32]:
fold_results = {"RMSE": [], "MAE": [], "R²": [], "AARD": []}
all_test_smiles = []
all_test_targets = []
all_test_predictions = []

with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
    pd.DataFrame({"Placeholder": []}).to_excel(writer, sheet_name="Init", index=False)

    for fold_idx, (train_indices, test_indices) in enumerate(k_splits.split(mols, groups=df['scaffold_group'])):
        print(f"Starting fold {fold_idx + 1}/5")

        train_data, _, test_data = data.split_data_by_indices(all_data, [train_indices], None, [test_indices])
        train_data, val_data = train_test_split(train_data[0], test_size=0.05, random_state=fold_idx)

        featurizer = featurizers.SimpleMoleculeMolGraphFeaturizer()
        train_dset = data.MoleculeDataset(train_data, featurizer)
        scaler = train_dset.normalize_targets()

        val_dset = data.MoleculeDataset(val_data, featurizer)
        val_dset.normalize_targets(scaler)

        test_dset = data.MoleculeDataset(test_data[0], featurizer)
        
        targets_scaler = train_dset.normalize_targets()
        extra_datapoint_descriptors_scaler = train_dset.normalize_inputs("X_d")
        val_dset.normalize_targets(targets_scaler)
        val_dset.normalize_inputs("X_d", extra_datapoint_descriptors_scaler)
        
        train_dset.cache = True
        val_dset.cache = True

        
        train_loader = data.build_dataloader(train_dset, shuffle=True)
        val_loader = data.build_dataloader(val_dset, shuffle=False)
        test_loader = data.build_dataloader(test_dset, shuffle=False)

        # optimized by Ray Tune
        ffn_hidden_dim = 2400
        message_hidden_dim = 300
        depth = 2
        ffn_num_layers = 1
        batch_norm = True
        metric_list = [nn.metrics.RMSE(), nn.metrics.MAE()]
        descriptor_columns = extra_datapoint_descriptors.shape[1]

        mp = nn.BondMessagePassing(d_h=message_hidden_dim)
        agg = nn.MeanAggregation()
        output_transform = nn.UnscaleTransform.from_standard_scaler(scaler)
        ffn = nn.RegressionFFN(
            output_transform=output_transform,
            input_dim=message_hidden_dim + descriptor_columns,
            hidden_dim=ffn_hidden_dim,
            n_layers=ffn_num_layers
        )
        X_d_transform = nn.ScaleTransform.from_standard_scaler(extra_datapoint_descriptors_scaler)

        model = models.MPNN(
            mp, agg, ffn, batch_norm, metric_list, X_d_transform=X_d_transform
        )
        
        # Checkpointing
        checkpoint_dir = Path(f"/home/lab101/temp/mdm/bot/chek/fold_{fold_idx}")
        checkpoint_dir.mkdir(parents=True, exist_ok=True)

        checkpointing = ModelCheckpoint(
            dirpath=checkpoint_dir,
            filename="best-{epoch}-{val_loss:.2f}",
            monitor="val_loss",
            mode="min",
            save_weights_only=True
        )

        trainer = pl.Trainer(accelerator="auto", devices=1, max_epochs=15, callbacks=[checkpointing])
        trainer.fit(model, train_loader, val_loader)

        predictions = trainer.predict(model, test_loader)
        test_smiles = [Chem.MolToSmiles(d.mol) for d in test_data[0]]
        test_targets = [d.y[0] for d in test_data[0]]
        fold_predictions = np.concatenate(predictions).flatten().tolist()

        all_test_smiles.extend(test_smiles)
        all_test_targets.extend(test_targets)
        all_test_predictions.extend(fold_predictions)

        rmse = calculate_rmse(test_targets, fold_predictions)
        mae = calculate_mae(test_targets, fold_predictions)
        r2 = r2_score(test_targets, fold_predictions)
        aard = calculate_aard(test_targets, fold_predictions)

        fold_results["RMSE"].append(rmse)
        fold_results["MAE"].append(mae)
        fold_results["R²"].append(r2)
        fold_results["AARD"].append(aard)

        df_fold = pd.DataFrame({"SMILES": test_smiles, "True_Target": test_targets, "Predicted_Target": fold_predictions})
        df_fold.to_excel(writer, sheet_name=f"Fold_{fold_idx + 1}", index=False)

        print(f"Fold {fold_idx + 1}:")
        print(f"  RMSE: {rmse}")
        print(f"  MAE: {mae}")
        print(f"  R²: {r2}")
        print(f"  AARD, %: {aard}")

mean_rmse = np.mean(fold_results["RMSE"])
std_rmse = np.std(fold_results["RMSE"])
mean_mae = np.mean(fold_results["MAE"])
std_mae = np.std(fold_results["MAE"])
mean_r2 = np.mean(fold_results["R²"])
std_r2 = np.std(fold_results["R²"])
mean_aard = np.mean(fold_results["AARD"])
std_aard = np.std(fold_results["AARD"])

print("\nFinal cross-validation results:")
print(f"  RMSE: {mean_rmse:.4f} ± {std_rmse:.4f}")
print(f"  MAE: {mean_mae:.4f} ± {std_mae:.4f}")
print(f"  R²: {mean_r2:.4f} ± {std_r2:.4f}")
print(f"  AARD, %: {mean_aard:.4f} ± {std_aard:.4f}")

df_results = pd.DataFrame({
    "SMILES": all_test_smiles,
    "True_Target": all_test_targets,
    "Predicted_Target": all_test_predictions
})
with pd.ExcelWriter(output_path, engine="openpyxl", mode="a") as writer:
    df_results.to_excel(writer, sheet_name="All_Folds", index=False)

Starting fold 1/5


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /home/lab101/temp/mdm/bot/chek/fold_0 exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]
Loading `train_dataloader` to estimate number of stepping batches.
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.

  | Name            | Type               | Params | Mode 
---------------------------------------------------------------
0 | message_passing | BondMessagePassing | 227 K  | train
1 | agg             | MeanAggregation    | 0      | train
2 | bn    

Sanity Checking DataLoader 0:   0%|                       | 0/2 [00:00<?, ?it/s]

/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.


Epoch 0: 100%|█| 249/249 [00:15<00:00, 16.31it/s, v_num=530, train_loss_step=0.1
Validation: |                                             | 0/? [00:00<?, ?it/s][A
Validation:   0%|                                        | 0/14 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                           | 0/14 [00:00<?, ?it/s][A
Validation DataLoader 0:   7%|█▎                 | 1/14 [00:00<00:00, 23.78it/s][A
Validation DataLoader 0:  14%|██▋                | 2/14 [00:00<00:00, 19.89it/s][A
Validation DataLoader 0:  21%|████               | 3/14 [00:00<00:00, 20.66it/s][A
Validation DataLoader 0:  29%|█████▍             | 4/14 [00:00<00:00, 20.64it/s][A
Validation DataLoader 0:  36%|██████▊            | 5/14 [00:00<00:00, 18.98it/s][A
Validation DataLoader 0:  43%|████████▏          | 6/14 [00:00<00:00, 16.51it/s][A
Validation DataLoader 0:  50%|█████████▌         | 7/14 [00:00<00:00, 14.66it/s][A
Validation DataLoader 0:  57%|██████████▊        | 8/14 [00:00<00:00, 14.56it/s

Validation DataLoader 0:  79%|██████████████▏   | 11/14 [00:00<00:00, 18.14it/s][A
Validation DataLoader 0:  86%|███████████████▍  | 12/14 [00:00<00:00, 17.53it/s][A
Validation DataLoader 0:  93%|████████████████▋ | 13/14 [00:00<00:00, 16.44it/s][A
Validation DataLoader 0: 100%|██████████████████| 14/14 [00:00<00:00, 16.49it/s][A
Epoch 11: 100%|█| 249/249 [00:16<00:00, 15.10it/s, v_num=530, train_loss_step=0.[A
Validation: |                                             | 0/? [00:00<?, ?it/s][A
Validation:   0%|                                        | 0/14 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                           | 0/14 [00:00<?, ?it/s][A
Validation DataLoader 0:   7%|█▎                 | 1/14 [00:00<00:00, 17.85it/s][A
Validation DataLoader 0:  14%|██▋                | 2/14 [00:00<00:00, 20.22it/s][A
Validation DataLoader 0:  21%|████               | 3/14 [00:00<00:00, 19.25it/s][A
Validation DataLoader 0:  29%|█████▍             | 4/14 [00:00<00:00, 17.91i

`Trainer.fit` stopped: `max_epochs=15` reached.


Epoch 14: 100%|█| 249/249 [00:18<00:00, 13.61it/s, v_num=530, train_loss_step=0.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████████████| 49/49 [00:10<00:00,  4.53it/s]
Fold 1:
  RMSE: 0.9186784585628996
  MAE: 0.7217965874445207
  R²: 0.6199559544696587
  AARD, %: 28.12889249653054
Starting fold 2/5


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /home/lab101/temp/mdm/bot/chek/fold_1 exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]
Loading `train_dataloader` to estimate number of stepping batches.
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.

  | Name            | Type               | Params | Mode 
---------------------------------------------------------------
0 | message_passing | BondMessagePassing | 227 K  | train
1 | agg             | MeanAggregation    | 0      | train
2 | bn    

Sanity Checking: |                                        | 0/? [00:00<?, ?it/s]

/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.


Epoch 0: 100%|█| 168/168 [00:10<00:00, 15.99it/s, v_num=531, train_loss_step=0.4
Validation: |                                             | 0/? [00:00<?, ?it/s][A
Validation:   0%|                                         | 0/9 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                            | 0/9 [00:00<?, ?it/s][A
Validation DataLoader 0:  11%|██▏                 | 1/9 [00:00<00:00, 14.13it/s][A
Validation DataLoader 0:  22%|████▍               | 2/9 [00:00<00:00, 14.55it/s][A
Validation DataLoader 0:  33%|██████▋             | 3/9 [00:00<00:00, 16.29it/s][A
Validation DataLoader 0:  44%|████████▉           | 4/9 [00:00<00:00, 14.43it/s][A
Validation DataLoader 0:  56%|███████████         | 5/9 [00:00<00:00, 14.18it/s][A
Validation DataLoader 0:  67%|█████████████▎      | 6/9 [00:00<00:00, 13.61it/s][A
Validation DataLoader 0:  78%|███████████████▌    | 7/9 [00:00<00:00, 13.18it/s][A
Validation DataLoader 0:  89%|█████████████████▊  | 8/9 [00:00<00:00, 13.93it/s

Validation DataLoader 0: 100%|████████████████████| 9/9 [00:00<00:00, 13.35it/s][A
Epoch 14: 100%|█| 168/168 [00:12<00:00, 13.73it/s, v_num=531, train_loss_step=0.[A

`Trainer.fit` stopped: `max_epochs=15` reached.


Epoch 14: 100%|█| 168/168 [00:12<00:00, 13.45it/s, v_num=531, train_loss_step=0.

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.



Predicting DataLoader 0: 100%|████████████████| 134/134 [00:31<00:00,  4.25it/s]
Fold 2:
  RMSE: 0.8943573249188447
  MAE: 0.717761459051818
  R²: 0.5968603177038554
  AARD, %: 21.004770877534348
Starting fold 3/5


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /home/lab101/temp/mdm/bot/chek/fold_2 exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]
Loading `train_dataloader` to estimate number of stepping batches.
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.

  | Name            | Type               | Params | Mode 
---------------------------------------------------------------
0 | message_passing | BondMessagePassing | 227 K  | train
1 | agg             | MeanAggregation    | 0      | train
2 | bn    

Sanity Checking DataLoader 0:  50%|███████▌       | 1/2 [00:00<00:00, 28.70it/s]

/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.


Epoch 0: 100%|█| 257/257 [00:15<00:00, 16.21it/s, v_num=532, train_loss_step=0.2
Validation: |                                             | 0/? [00:00<?, ?it/s][A
Validation:   0%|                                        | 0/14 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                           | 0/14 [00:00<?, ?it/s][A
Validation DataLoader 0:   7%|█▎                 | 1/14 [00:00<00:00, 41.88it/s][A
Validation DataLoader 0:  14%|██▋                | 2/14 [00:00<00:00, 25.04it/s][A
Validation DataLoader 0:  21%|████               | 3/14 [00:00<00:00, 28.90it/s][A
Validation DataLoader 0:  29%|█████▍             | 4/14 [00:00<00:00, 25.13it/s][A
Validation DataLoader 0:  36%|██████▊            | 5/14 [00:00<00:00, 18.62it/s][A
Validation DataLoader 0:  43%|████████▏          | 6/14 [00:00<00:00, 18.70it/s][A
Validation DataLoader 0:  50%|█████████▌         | 7/14 [00:00<00:00, 18.93it/s][A
Validation DataLoader 0:  57%|██████████▊        | 8/14 [00:00<00:00, 19.40it/s

Validation DataLoader 0:  79%|██████████████▏   | 11/14 [00:00<00:00, 17.27it/s][A
Validation DataLoader 0:  86%|███████████████▍  | 12/14 [00:00<00:00, 17.44it/s][A
Validation DataLoader 0:  93%|████████████████▋ | 13/14 [00:00<00:00, 17.55it/s][A
Validation DataLoader 0: 100%|██████████████████| 14/14 [00:00<00:00, 17.70it/s][A
Epoch 11: 100%|█| 257/257 [00:15<00:00, 17.02it/s, v_num=532, train_loss_step=0.[A
Validation: |                                             | 0/? [00:00<?, ?it/s][A
Validation:   0%|                                        | 0/14 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                           | 0/14 [00:00<?, ?it/s][A
Validation DataLoader 0:   7%|█▎                 | 1/14 [00:00<00:00, 19.96it/s][A
Validation DataLoader 0:  14%|██▋                | 2/14 [00:00<00:00, 26.67it/s][A
Validation DataLoader 0:  21%|████               | 3/14 [00:00<00:00, 20.66it/s][A
Validation DataLoader 0:  29%|█████▍             | 4/14 [00:00<00:00, 21.30i

`Trainer.fit` stopped: `max_epochs=15` reached.


Epoch 14: 100%|█| 257/257 [00:16<00:00, 15.21it/s, v_num=532, train_loss_step=0.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████████████| 40/40 [00:09<00:00,  4.10it/s]
Fold 3:
  RMSE: 0.6730270006904463
  MAE: 0.4959186462825931
  R²: 0.6878077791195952
  AARD, %: 14.193400792400586
Starting fold 4/5


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /home/lab101/temp/mdm/bot/chek/fold_3 exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]
Loading `train_dataloader` to estimate number of stepping batches.
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.

  | Name            | Type               | Params | Mode 
---------------------------------------------------------------
0 | message_passing | BondMessagePassing | 227 K  | train
1 | agg             | MeanAggregation    | 0      | train
2 | bn    

Sanity Checking: |                                        | 0/? [00:00<?, ?it/s]

/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.


Epoch 0: 100%|█| 246/246 [00:15<00:00, 16.20it/s, v_num=533, train_loss_step=0.1
Validation: |                                             | 0/? [00:00<?, ?it/s][A
Validation:   0%|                                        | 0/13 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                           | 0/13 [00:00<?, ?it/s][A
Validation DataLoader 0:   8%|█▍                 | 1/13 [00:00<00:00, 14.44it/s][A
Validation DataLoader 0:  15%|██▉                | 2/13 [00:00<00:00, 19.81it/s][A
Validation DataLoader 0:  23%|████▍              | 3/13 [00:00<00:00, 18.11it/s][A
Validation DataLoader 0:  31%|█████▊             | 4/13 [00:00<00:00, 14.20it/s][A
Validation DataLoader 0:  38%|███████▎           | 5/13 [00:00<00:00, 14.70it/s][A
Validation DataLoader 0:  46%|████████▊          | 6/13 [00:00<00:00, 14.80it/s][A
Validation DataLoader 0:  54%|██████████▏        | 7/13 [00:00<00:00, 14.79it/s][A
Validation DataLoader 0:  62%|███████████▋       | 8/13 [00:00<00:00, 15.29it/s

Validation DataLoader 0:  31%|█████▊             | 4/13 [00:00<00:00, 17.85it/s][A
Validation DataLoader 0:  38%|███████▎           | 5/13 [00:00<00:00, 18.14it/s][A
Validation DataLoader 0:  46%|████████▊          | 6/13 [00:00<00:00, 18.79it/s][A
Validation DataLoader 0:  54%|██████████▏        | 7/13 [00:00<00:00, 18.88it/s][A
Validation DataLoader 0:  62%|███████████▋       | 8/13 [00:00<00:00, 19.44it/s][A
Validation DataLoader 0:  69%|█████████████▏     | 9/13 [00:00<00:00, 19.54it/s][A
Validation DataLoader 0:  77%|█████████████▊    | 10/13 [00:00<00:00, 20.26it/s][A
Validation DataLoader 0:  85%|███████████████▏  | 11/13 [00:00<00:00, 20.39it/s][A
Validation DataLoader 0:  92%|████████████████▌ | 12/13 [00:00<00:00, 19.90it/s][A
Validation DataLoader 0: 100%|██████████████████| 13/13 [00:00<00:00, 20.74it/s][A
Epoch 12: 100%|█| 246/246 [00:15<00:00, 15.45it/s, v_num=533, train_loss_step=0.[A
Validation: |                                             | 0/? [00:00<?, ?i

`Trainer.fit` stopped: `max_epochs=15` reached.


Epoch 14: 100%|█| 246/246 [00:16<00:00, 15.05it/s, v_num=533, train_loss_step=0.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████████████| 52/52 [00:12<00:00,  4.20it/s]
Fold 4:
  RMSE: 0.7404537335491891
  MAE: 0.5520953639540059
  R²: 0.7092164109817813
  AARD, %: 21.04399016689529
Starting fold 5/5


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /home/lab101/temp/mdm/bot/chek/fold_4 exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]
Loading `train_dataloader` to estimate number of stepping batches.
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.

  | Name            | Type               | Params | Mode 
---------------------------------------------------------------
0 | message_passing | BondMessagePassing | 227 K  | train
1 | agg             | MeanAggregation    | 0      | train
2 | bn    

Sanity Checking: |                                        | 0/? [00:00<?, ?it/s]

/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.


Epoch 0: 100%|█| 260/260 [00:15<00:00, 17.14it/s, v_num=534, train_loss_step=0.0
Validation: |                                             | 0/? [00:00<?, ?it/s][A
Validation:   0%|                                        | 0/14 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                           | 0/14 [00:00<?, ?it/s][A
Validation DataLoader 0:   7%|█▎                 | 1/14 [00:00<00:00, 19.46it/s][A
Validation DataLoader 0:  14%|██▋                | 2/14 [00:00<00:00, 29.92it/s][A
Validation DataLoader 0:  21%|████               | 3/14 [00:00<00:00, 21.43it/s][A
Validation DataLoader 0:  29%|█████▍             | 4/14 [00:00<00:00, 19.05it/s][A
Validation DataLoader 0:  36%|██████▊            | 5/14 [00:00<00:00, 19.57it/s][A
Validation DataLoader 0:  43%|████████▏          | 6/14 [00:00<00:00, 20.69it/s][A
Validation DataLoader 0:  50%|█████████▌         | 7/14 [00:00<00:00, 19.32it/s][A
Validation DataLoader 0:  57%|██████████▊        | 8/14 [00:00<00:00, 18.91it/s

Validation DataLoader 0:  79%|██████████████▏   | 11/14 [00:00<00:00, 15.82it/s][A
Validation DataLoader 0:  86%|███████████████▍  | 12/14 [00:00<00:00, 15.77it/s][A
Validation DataLoader 0:  93%|████████████████▋ | 13/14 [00:00<00:00, 16.55it/s][A
Validation DataLoader 0: 100%|██████████████████| 14/14 [00:00<00:00, 16.48it/s][A
Epoch 11: 100%|█| 260/260 [00:15<00:00, 16.26it/s, v_num=534, train_loss_step=0.[A
Validation: |                                             | 0/? [00:00<?, ?it/s][A
Validation:   0%|                                        | 0/14 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                           | 0/14 [00:00<?, ?it/s][A
Validation DataLoader 0:   7%|█▎                 | 1/14 [00:00<00:00, 36.09it/s][A
Validation DataLoader 0:  14%|██▋                | 2/14 [00:00<00:00, 20.92it/s][A
Validation DataLoader 0:  21%|████               | 3/14 [00:00<00:00, 19.10it/s][A
Validation DataLoader 0:  29%|█████▍             | 4/14 [00:00<00:00, 16.65i

`Trainer.fit` stopped: `max_epochs=15` reached.


Epoch 14: 100%|█| 260/260 [00:16<00:00, 16.00it/s, v_num=534, train_loss_step=0.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]
/home/lab101/anaconda3/envs/chemp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████████████| 37/37 [00:09<00:00,  3.97it/s]
Fold 5:
  RMSE: 0.7895075064775768
  MAE: 0.5975756309140653
  R²: 0.664042825829974
  AARD, %: 15.280018559141432

Final cross-validation results:
  RMSE: 0.8032 ± 0.0924
  MAE: 0.6170 ± 0.0899
  R²: 0.6556 ± 0.0417
  AARD, %: 19.9302 ± 4.9829
