<a href="https://colab.research.google.com/github/MZiaAfzal71/Edge-Aware-GNN/blob/main/Models/Chemprop_D_MPNN_Repeated_KFold_for_ESOL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### ⚠️ Runtime Configuration (Important)

To speed up model training, it is recommended to use a GPU runtime in Google Colab.

Go to:
- **Runtime → Change runtime type**
- Set **Hardware accelerator** to **GPU**
- Select **T4 GPU**

Using a GPU significantly reduces training time for Chemprop models.


# Chemprop on ESOL — Repeated 5×5 Cross-Validation (Structure Only)

This notebook trains and evaluates a **Chemprop message-passing neural network (MPNN)**
for predicting aqueous solubility on the **ESOL (Delaney) dataset** using
**structure-only molecular representations**.

In this experiment:
- Molecules are represented **exclusively by SMILES-derived molecular graphs**
- **No molecular descriptors** are included
- Model evaluation is performed using a **5×5 repeated cross-validation** strategy
- Each fold and repeat uses identical data splits for fair comparison
- A single Chemprop model is trained for each split, and the checkpoint with the lowest validation loss is used for evaluation

This notebook establishes a **structure-only Chemprop baseline**
under a robust repeated cross-validation protocol.


In [None]:
# 1️⃣ Fetch data
!git clone https://github.com/MZiaAfzal71/Edge-Aware-GNN.git

In [None]:
# 2️⃣ Change the current working directory
%cd Edge-Aware-GNN/ESOL\ Dataset

In [None]:
# 3️⃣ Install rdkit and chemprop
!pip install rdkit chemprop

In [None]:
# 4️⃣ Imports
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
import random
import copy
import os

import torch
from lightning import pytorch as pl
from lightning.pytorch.callbacks import ModelCheckpoint

from chemprop import data, featurizers, models, nn
from chemprop.models import save_model, load_model
from chemprop.data import split_data_by_indices, make_split_indices


from rdkit import Chem
from rdkit.Chem import Descriptors

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import RepeatedKFold

In [None]:
# 5️⃣ Set random seeds for reproducibility across Python, NumPy, and PyTorch

def set_seed(seed=42):
    """Seed every random number generator used by the notebook.

    Applies ``seed`` to Python's ``random`` module, NumPy's global RNG and
    PyTorch (CPU and all CUDA devices), then switches cuDNN to deterministic
    mode with benchmarking disabled.

    Args:
        seed: Integer seed shared by all generators.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Deterministic kernels on, auto-tuner off.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

In [None]:
# 6️⃣ Utility function for repeated k-fold cross-validation

def run_repeated_kfold_cv(
    df,
    n_splits=5,
    n_repeats=5,
    seed=42
):
    """Lazily generate repeated k-fold splits over the rows of ``df``.

    Args:
        df: Any sized object; only ``len(df)`` is used.
        n_splits: Number of folds per repeat.
        n_repeats: Number of times the k-fold partitioning is repeated.
        seed: ``random_state`` handed to ``RepeatedKFold``.

    Yields:
        ``(repeat, fold, train_idx, val_idx)`` tuples, where ``repeat`` and
        ``fold`` are 0-based and derived from the running split counter, and
        the two index arrays are positional row indices.
    """
    splitter = RepeatedKFold(
        n_splits=n_splits,
        n_repeats=n_repeats,
        random_state=seed,
    )
    row_positions = np.arange(len(df))

    for counter, (train_idx, val_idx) in enumerate(splitter.split(row_positions)):
        # Quotient is the repeat number, remainder is the fold within it.
        repeat, fold = divmod(counter, n_splits)
        yield repeat, fold, train_idx, val_idx


In [None]:
# 7️⃣ Run repeated k-fold cross-validation for a Chemprop MPNN model, training on molecular graphs and reporting RMSE, R², and MAE for each repeat–fold split

def _regression_scores(y_true, y_pred):
    """Return ``(rmse, r2, mae)`` for one set of predictions.

    Ground truth is passed first, matching the ``(y_true, y_pred)`` argument
    order of the scikit-learn metric functions. The order matters for R²,
    which is not symmetric in its arguments.
    """
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    return rmse, r2, mae


def run_chemprop_repeated_fold(
    df,
    smiles_col='smiles',
    target_col='measured log solubility in mols per litre',
    epochs=50
):
    """Train and score one Chemprop MPNN per repeated k-fold split.

    For every split produced by ``run_repeated_kfold_cv(df)`` a fresh model is
    trained for at most ``epochs`` epochs, the checkpoint with the lowest
    ``val_loss`` is reloaded, and RMSE / R² / MAE are computed on both the
    training and validation subsets against the raw target values in ``df``.

    Checkpoints are written below
    ``chemprop_model/ch_pt_repeat_<repeat>_fold_<fold>``.

    Args:
        df: Table with one row per molecule.
        smiles_col: Name of the column holding SMILES strings.
        target_col: Name of the column holding the regression target.
        epochs: Maximum number of training epochs per split.

    Returns:
        A list with one dict per split holding the 1-based ``Repeat`` and
        ``Fold`` numbers and the keys ``best_train_rmse``, ``best_train_r2``,
        ``best_train_mae``, ``best_val_rmse``, ``best_val_r2`` and
        ``best_val_mae``.
    """
    # Everything below is identical for all splits, so build it only once.
    smis = df[smiles_col].values
    ys = df[[target_col]].values
    # Positional ground truth: the split indices are row positions, so index a
    # plain array instead of relying on the DataFrame's index labels.
    y_all = df[target_col].to_numpy()
    all_data = [
        data.MoleculeDatapoint.from_smi(smi, y)
        for smi, y in zip(smis, ys)
    ]
    featurizer = featurizers.SimpleMoleculeMolGraphFeaturizer()

    fold_results = []

    for repeat, fold, train_idx, val_idx in run_repeated_kfold_cv(df):
        train_data = [all_data[idx] for idx in train_idx]
        val_data = [all_data[idx] for idx in val_idx]

        # Fit the target scaler on the training subset only and reuse it for
        # the validation subset so no validation statistics leak into training.
        train_dset = data.MoleculeDataset(train_data, featurizer)
        scaler = train_dset.normalize_targets()

        val_dset = data.MoleculeDataset(val_data, featurizer)
        val_dset.normalize_targets(scaler)

        train_loader = data.build_dataloader(train_dset)
        # Unshuffled loaders keep predictions aligned with train_idx / val_idx.
        train_loader_pred = data.build_dataloader(train_dset, shuffle=False)
        val_loader = data.build_dataloader(val_dset, shuffle=False)

        mp = nn.BondMessagePassing()
        agg = nn.MeanAggregation()

        # The output transform built from the training scaler is attached to
        # the predictor head.
        output_transform = nn.UnscaleTransform.from_standard_scaler(scaler)
        ffn = nn.RegressionFFN(
            input_dim=mp.output_dim,
            output_transform=output_transform,
        )

        # Only the first metric is used for training and early stopping.
        metric_list = [nn.metrics.RMSE(), nn.metrics.MAE(), nn.metrics.R2Score()]

        # NOTE(review): the original code set a local `batch_norm = True` but
        # never passed it to MPNN, so the model has always been built with
        # MPNN's default batch-norm setting. That behaviour is kept here;
        # confirm whether batch normalisation was actually intended.
        mpnn = models.MPNN(mp, agg, ffn, metrics=metric_list)

        # Directory for storing this split's checkpoints.
        out_dir = Path(f"chemprop_model/ch_pt_repeat_{repeat}_fold_{fold}")
        out_dir.mkdir(parents=True, exist_ok=True)

        check_pointing = ModelCheckpoint(
            dirpath=out_dir,
            filename="best-{epoch}-{val_loss:.3f}",
            monitor="val_loss",  # checkpoint selection criterion
            mode="min",  # lower validation loss is better
            save_last=True,  # also keep the most recent checkpoint
        )

        trainer = pl.Trainer(
            logger=False,
            enable_checkpointing=True,  # checkpoints go to `out_dir` via the callback
            enable_progress_bar=True,
            accelerator="auto",
            devices=1,
            max_epochs=epochs,
            callbacks=[check_pointing],
        )
        trainer.fit(mpnn, train_loader, val_loader)

        # Evaluate the best checkpoint rather than the final-epoch weights.
        best_model_path = check_pointing.best_model_path
        mpnn = models.MPNN.load_from_checkpoint(
            best_model_path,
            weights_only=False,
        )

        # Flatten the (n, 1) prediction tensors into 1-D CPU arrays so they
        # line up with the 1-D ground-truth vectors.
        train_predictions = (
            torch.concat(trainer.predict(mpnn, train_loader_pred))
            .cpu().numpy().reshape(-1)
        )
        val_predictions = (
            torch.concat(trainer.predict(mpnn, val_loader))
            .cpu().numpy().reshape(-1)
        )

        train_rmse, train_r2, train_mae = _regression_scores(
            y_all[train_idx], train_predictions
        )
        val_rmse, val_r2, val_mae = _regression_scores(
            y_all[val_idx], val_predictions
        )

        fold_results.append({
            "Repeat": repeat+1,
            "Fold" : fold+1,
            "best_train_rmse": train_rmse,
            "best_train_r2": train_r2,
            "best_train_mae": train_mae,
            "best_val_rmse": val_rmse,
            "best_val_r2": val_r2,
            "best_val_mae": val_mae
        })

    return fold_results



In [None]:
# 8️⃣ Define the dataset path and SMILES/target column names, fix the random seeds, and load the dataset

# Dataset location (relative to the current working directory) and the
# names of the columns holding the SMILES strings and the regression target.
file_path = "delaney-processed-scaffold.csv"
smiles_col = "smiles"
target_col = "measured log solubility in mols per litre"

# Register tqdm's pandas integration.
tqdm.pandas()

# Fix the RNG state before any data handling or training happens.
set_seed(42)

# Read the full table into memory.
df = pd.read_csv(file_path)

In [None]:
# 9️⃣ Execute repeated k-fold Chemprop training, collect fold-wise performance metrics, and save the results to a CSV file

# Pass the column names configured in the data-loading cell explicitly so
# that changing them there actually affects training (they were previously
# ignored in favour of the function's identical hard-coded defaults).
results = run_chemprop_repeated_fold(
    df,
    smiles_col=smiles_col,
    target_col=target_col,
)

# One row per repeat/fold split; written to the current working directory.
res_df = pd.DataFrame(results)
res_df.to_csv("chemprop repeated kfold.csv", index=False)

In [None]:
# 🔟

In [None]:
# 1️⃣1️⃣

In [None]:
# 1️⃣2️⃣

In [None]:
# 1️⃣3️⃣

In [None]:
# 1️⃣4️⃣

In [None]:
# 1️⃣5️⃣

In [None]:
# 1️⃣6️⃣

In [None]:
# 1️⃣7️⃣

In [None]:
# 1️⃣8️⃣

In [None]:
# 1️⃣9️⃣