<a href="https://colab.research.google.com/github/MZiaAfzal71/Edge-Aware-GNN/blob/main/Models/Chemprop_D_MPNN_Scaffold_for_ESOL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### ⚠️ Runtime Configuration (Important)

To speed up model training, it is recommended to use a GPU runtime in Google Colab.

Go to:
- **Runtime → Change runtime type**
- Set **Hardware accelerator** to **GPU**
- Select **T4 GPU**

Using a GPU significantly reduces training time for Chemprop models.


# Chemprop on ESOL — Scaffold Split (Structure Only)

This notebook evaluates Chemprop under a **Bemis–Murcko scaffold-based split**
to assess generalization to unseen chemical scaffolds.

In this experiment:
- Molecules are represented **only by SMILES-derived molecular graphs**
- **No molecular descriptors** are used
- The dataset is split using a **predefined scaffold split**
  (Train / Validation / Test)
- An **ensemble of 10 Chemprop models** is trained
- Each model is trained for **50 epochs**

Scaffold splitting provides a more chemically realistic evaluation
compared to random splits and tests the robustness of learned
structure–property relationships.


In [None]:
# 1️⃣ Fetch data: clone the Edge-Aware-GNN repository into the current directory
!git clone https://github.com/MZiaAfzal71/Edge-Aware-GNN.git

In [None]:
# 2️⃣ Change the working directory to the "ESOL Dataset" folder of the cloned repository
#    (the backslash escapes the space in the folder name)
%cd Edge-Aware-GNN/ESOL\ Dataset

In [None]:
# 3Ô∏è‚É£ Install rdkit and PyG
!pip install rdkit chemprop

In [None]:
#  4Ô∏è‚É£ Imports
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
import random
import copy
import os

import torch
from lightning import pytorch as pl
from lightning.pytorch.callbacks import ModelCheckpoint

from chemprop import data, featurizers, models, nn
from chemprop.models import save_model, load_model
from chemprop.data import split_data_by_indices, make_split_indices


from rdkit import Chem
from rdkit.Chem import Descriptors

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [None]:
# 5️⃣ Set random seeds for reproducibility across Python, NumPy, and PyTorch

def set_seed(seed=42):
    """Seed every random number generator used in this notebook.

    Applies ``seed`` to Python's ``random`` module, NumPy's global generator,
    PyTorch's CPU generator and all CUDA generators, then switches cuDNN to
    deterministic mode with auto-tuning (benchmark) turned off.

    Args:
        seed: Integer seed shared by all generators. Defaults to 42.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

In [None]:
# 6️⃣ Seed the RNGs, read the scaffold-annotated ESOL table, and extract SMILES strings and targets

# Fix all random number generators before any stochastic step further down.
set_seed(42)

# Hook tqdm into pandas.
tqdm.pandas()

# Input file and the two columns consumed by the model.
file_path = "delaney-processed-scaffold.csv"
smiles_col = "smiles"
target_col = "measured log solubility in mols per litre"

df = pd.read_csv(file_path)

# Double brackets select a one-column frame, so `ys` is 2-D with shape (n_rows, 1).
ys = df[[target_col]].values
# One SMILES entry per row.
smis_col = df[smiles_col].values


In [None]:
# 8️⃣ Read the predefined scaffold assignment, build one Chemprop datapoint per molecule,
#    and partition the datapoints into train/validation/test sets

# Row labels of each partition, taken from the 'BM-Scaffold' column of the table.
split_col = df['BM-Scaffold']
train_ind, val_ind, test_ind = (
    split_col.loc[split_col.eq(label)].index
    for label in ("Train", "Valid", "Test")
)

# One datapoint per (SMILES, target row) pair, in the same order as the table rows.
all_data = list(map(data.MoleculeDatapoint.from_smi, smis_col, ys))

# Molecule object attached to each datapoint.
mols = [datapoint.mol for datapoint in all_data]

# Each index set is wrapped in a list, so every returned element is itself a list
# holding a single partition (hence the `[0]` indexing in later cells).
partition_indices = ([train_ind], [val_ind], [test_ind])
train_data, val_data, test_data = split_data_by_indices(all_data, *partition_indices)

In [None]:
# 9️⃣ Featurize molecules, normalize targets, build data loaders, and create an ensemble of
#    independently initialised MPNN regressors

featurizer = featurizers.SimpleMoleculeMolGraphFeaturizer()

# The target scaler is fitted on the training split only and then reused for validation.
train_dset = data.MoleculeDataset(train_data[0], featurizer)
scaler = train_dset.normalize_targets()

val_dset = data.MoleculeDataset(val_data[0], featurizer)
val_dset.normalize_targets(scaler)

# Test targets are left in their original units (no normalize_targets call).
test_dset = data.MoleculeDataset(test_data[0], featurizer)

train_loader = data.build_dataloader(train_dset)
val_loader = data.build_dataloader(val_dset, shuffle=False)
test_loader = data.build_dataloader(test_dset, shuffle=False)

batch_norm = True
n_models = 10


def build_mpnn():
    """Create one MPNN whose sub-modules are not shared with any other model.

    Every call instantiates new message-passing, aggregation, output-transform,
    feed-forward and metric modules. Reusing a single set of module instances for
    all ensemble members would make them share the same parameters, so "training
    the next member" would merely continue training the previous one.

    Returns:
        A new ``models.MPNN`` regression model configured with ``batch_norm``.
    """
    mp = nn.BondMessagePassing()
    agg = nn.MeanAggregation()
    # The output transform is built from the training-set scaler fitted above.
    output_transform = nn.UnscaleTransform.from_standard_scaler(scaler)
    ffn = nn.RegressionFFN(input_dim=mp.output_dim, output_transform=output_transform)
    # Only the first metric is used for training and early stopping
    metric_list = [nn.metrics.RMSE(), nn.metrics.MAE(), nn.metrics.R2Score()]
    return models.MPNN(mp, agg, ffn, batch_norm=batch_norm, metrics=metric_list)


ensemble = [build_mpnn() for _ in range(n_models)]


In [None]:
# 🔟 Fit every ensemble member with PyTorch Lightning and, after each fit, reload the
#    checkpoint that achieved the lowest validation loss

trained_ensemble = []
for i, model in enumerate(ensemble):
    # Each member writes its checkpoints to its own directory.
    out_dir = Path(f"chemprop_model/ch_pt_random_{i}")
    out_dir.mkdir(parents=True, exist_ok=True)

    # Keep the checkpoint with the minimum `val_loss`, plus the most recent one.
    check_pointing = ModelCheckpoint(
        dirpath=out_dir,
        filename="best-{epoch}-{val_loss:.3f}",
        monitor="val_loss",
        mode="min",
        save_last=True,
    )

    trainer = pl.Trainer(
        max_epochs=50,  # number of epochs to train for
        accelerator="auto",
        devices=1,
        logger=False,
        enable_progress_bar=True,
        enable_checkpointing=True,  # checkpoints are written to `out_dir` by the callback below
        callbacks=[check_pointing],
    )
    trainer.fit(model, train_loader, val_loader)

    # Rebuild the member from its best checkpoint rather than keeping the final-epoch weights.
    best_model_path = check_pointing.best_model_path
    best_model = type(model).load_from_checkpoint(best_model_path, weights_only=False)
    trained_ensemble.append(best_model)



In [None]:
# 1️⃣1️⃣ Generate and collect per-model predictions on the training set and on the
#      combined validation/test set

# Unshuffled loader so that prediction order matches the order of `train_data[0]`.
trained_dataset = data.MoleculeDataset(train_data[0], featurizer)
trained_dataloader = data.build_dataloader(trained_dataset, shuffle=False)

# Validation molecules first, then test molecules. Any ground-truth array compared
# against these predictions must be concatenated in the same order.
combined_val_data = [val_data[0] + test_data[0]]
# Use the same featurizer object as for training instead of the dataset's implicit default,
# so that a change of featurizer is applied consistently everywhere.
combined_val_dataset = data.MoleculeDataset(combined_val_data[0], featurizer)
combined_val_dataloader = data.build_dataloader(combined_val_dataset, shuffle=False)

# Dedicated inference trainer: avoids depending on the `trainer` variable left over
# from the last iteration of the training loop.
predict_trainer = pl.Trainer(
    logger=False,
    enable_checkpointing=False,
    enable_progress_bar=True,
    accelerator="auto",
    devices=1,
)

train_prediction = []
val_prediction = []

for model in trained_ensemble:
    # `predict` returns one tensor per batch; concatenate them into a single tensor.
    train_prediction.append(torch.concat(predict_trainer.predict(model, trained_dataloader)))
    val_prediction.append(torch.concat(predict_trainer.predict(model, combined_val_dataloader)))


In [None]:
# 1️⃣2️⃣ Score every ensemble member on the training set and on the combined
#      validation/test set, then save the results table

# Ground truth in the same row order as the prediction datasets
# (validation rows first, then test rows, for the combined set).
train_y_true = ys[train_ind].ravel()
val_y_true = ys[np.concatenate([val_ind, test_ind])].ravel()


def _regression_metrics(y_true, y_pred):
    """Return ``(rmse, r2, mae)`` for one set of predictions.

    scikit-learn metrics take ``(y_true, y_pred)`` in that order. RMSE and MAE are
    symmetric in their arguments, but R² is not (its denominator is the variance
    of ``y_true``), so swapping the arguments yields a wrong R².

    Args:
        y_true: Ground-truth targets (array-like).
        y_pred: Predictions as a tensor or array-like; flattened to 1-D.

    Returns:
        Tuple ``(rmse, r2, mae)``.

    Raises:
        ValueError: If the flattened inputs differ in length.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = torch.as_tensor(y_pred).detach().cpu().numpy().astype(float).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true has {y_true.shape[0]} values but y_pred has {y_pred.shape[0]}"
        )
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    return rmse, r2, mae


fold_results = []
for i, (t_pred, v_pred) in enumerate(zip(train_prediction, val_prediction)):
    train_rmse, train_r2, train_mae = _regression_metrics(train_y_true, t_pred)
    val_rmse, val_r2, val_mae = _regression_metrics(val_y_true, v_pred)

    fold_results.append({
        "Ensemble": i + 1,
        "best_train_rmse": train_rmse,
        "best_train_r2": train_r2,
        "best_train_mae": train_mae,
        "best_val_rmse": val_rmse,
        "best_val_r2": val_r2,
        "best_val_mae": val_mae
    })

chemprop_res_df = pd.DataFrame(fold_results)
chemprop_res_df.to_csv("chemprop ensemble scaffold.csv", index=False)

In [None]:
# 1Ô∏è‚É£3Ô∏è‚É£

In [None]:
# 1Ô∏è‚É£4Ô∏è‚É£

In [None]:
# 1Ô∏è‚É£5Ô∏è‚É£

In [None]:
# 1Ô∏è‚É£6Ô∏è‚É£

In [None]:
# 1Ô∏è‚É£7Ô∏è‚É£

In [None]:
# 1Ô∏è‚É£8Ô∏è‚É£

In [None]:
# 1Ô∏è‚É£9Ô∏è‚É£