# Step Forward Cross Validation for Bioactivity Prediction

**"Traditional random data splits produce similar molecules between training and test sets, conflicting with the reality of VS libraries which mostly contain structurally distinct compounds. Scaffold split, grouping molecules by shared core structure, is widely considered to reflect this real-world scenario."** -- https://arxiv.org/pdf/2406.00873

## Implementation of Scaffold Split Cross Validation Class

In [1]:
from collections import defaultdict

import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold

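`ScaffoldSplitCV` groups molecules by their Murcko scaffold, shuffles the scaffold groups, and then walks through them in order: a whole group is added to the training set if it still fits within the target training size (`frac_train` of the data); otherwise the whole group is assigned to the test set. A scaffold therefore never appears in both sets, and the realised training fraction can end up slightly below the target.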

In [2]:
class ScaffoldSplitCV:
    """Repeated scaffold-based train/test splitter.

    Molecules are grouped by their Murcko scaffold. For every fold the groups
    are shuffled with a fold-specific seed and then assigned greedily: a whole
    group joins the training set if it still fits within the training budget
    (``floor(frac_train * n_molecules)``), otherwise the whole group goes to
    the test set. A scaffold is therefore never shared between the two sets.

    Args:
        smiles_col: Name of the DataFrame column holding the SMILES strings.
        n_folds: Number of (train, test) splits produced by :meth:`split`.
        frac_train: Target fraction of molecules in the training set.
        seed: Base seed; fold ``i`` (1-based) is shuffled with ``i * seed``.
            A seed of 0 makes every fold identical.
        include_chirality: Forwarded to RDKit when generating scaffolds.
    """

    # np.random.RandomState only accepts seeds in [0, 2**32 - 1].
    _SEED_MODULUS = 2 ** 32

    def __init__(self, smiles_col="standardized_smiles", n_folds=10, frac_train=0.9, seed=69420,
                 include_chirality=False):
        self.smiles_col = smiles_col
        self.frac_train = frac_train
        self.seed = seed
        self.n_folds = n_folds
        self.include_chirality = include_chirality

    def split(self, df):
        """Yield ``n_folds`` pairs of positional index arrays ``(train, test)``.

        Args:
            df: Object supporting ``df[smiles_col].tolist()`` (e.g. a pandas
                DataFrame) whose ``smiles_col`` column contains SMILES strings.

        Yields:
            Tuples ``(train_indices, test_indices)`` of integer NumPy arrays
            holding row positions (not index labels).

        Raises:
            ValueError: If a SMILES string cannot be parsed.
        """
        smiles_list = df[self.smiles_col].tolist()
        # Scaffolds do not depend on the fold, so group the molecules once
        # instead of re-parsing every SMILES string for each fold.
        scaffold_groups = self._group_by_scaffold(smiles_list, self.include_chirality)
        n_total = len(smiles_list)
        # Start at 1 so the first fold does not collapse to seed 0; run up to
        # and including n_folds so that exactly n_folds splits are produced.
        for i in range(1, self.n_folds + 1):
            fold_seed = (i * self.seed) % self._SEED_MODULUS
            yield self._assign_groups(scaffold_groups, n_total, self.frac_train, fold_seed)

    def _scaffold_split(self, smiles_list, frac_train, seed, include_chirality):
        """Produce one scaffold split of ``smiles_list`` for the given seed."""
        scaffold_groups = self._group_by_scaffold(smiles_list, include_chirality)
        return self._assign_groups(scaffold_groups, len(smiles_list), frac_train, seed)

    def _group_by_scaffold(self, smiles_list, include_chirality):
        """Return lists of positions sharing a scaffold, in first-seen order."""
        scaffold_to_indices = defaultdict(list)
        for idx, smiles in enumerate(smiles_list):
            scaffold = self._generate_scaffold(smiles, include_chirality)
            scaffold_to_indices[scaffold].append(idx)
        return list(scaffold_to_indices.values())

    @staticmethod
    def _assign_groups(scaffold_groups, n_total, frac_train, seed):
        """Shuffle the groups and greedily fill the training set."""
        # Shuffle a shallow copy so the caller's group order stays intact and
        # every fold starts from the same ordering.
        shuffled_groups = list(scaffold_groups)
        np.random.RandomState(seed).shuffle(shuffled_groups)

        n_train = int(np.floor(frac_train * n_total))

        train_indices = []
        test_indices = []
        for group in shuffled_groups:
            # Every group is tested individually: a small group may still be
            # added to train after a larger one was sent to test.
            if len(train_indices) + len(group) <= n_train:
                train_indices.extend(group)
            else:
                test_indices.extend(group)

        # Explicit integer dtype: an empty list would otherwise become a
        # float64 array, which cannot be used for indexing.
        return np.array(train_indices, dtype=int), np.array(test_indices, dtype=int)

    @staticmethod
    def _generate_scaffold(smiles, include_chirality=False):
        """Return the Murcko scaffold SMILES of ``smiles``.

        Raises:
            ValueError: If RDKit cannot parse ``smiles``.
        """
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ValueError(f"Invalid SMILES string: {smiles}")
        scaffold = MurckoScaffold.MurckoScaffoldSmiles(mol=mol, includeChirality=include_chirality)
        return scaffold

In [3]:
# Build the splitter with its default settings
# (smiles_col="standardized_smiles", n_folds=10, frac_train=0.9, seed=69420).
scaffold_cv = ScaffoldSplitCV()

In [4]:
# Load the processed dataset (IC50 data for target CHEMBL1865, judging by the
# file name). The splitter reads SMILES from the "standardized_smiles" column
# by default, so that column is expected to be present -- verify against the CSV.
df = pd.read_csv("../benchmark/data/processed/target_CHEMBL1865-1.IC50.csv")

#### Number of Train & Test Molecules Across Folds

In [5]:
# Print, for every fold, the absolute size of the train and test partitions
# together with their share of all molecules in that fold.
for fold_no, (train_idx, test_idx) in enumerate(scaffold_cv.split(df), start=1):
    n_train = len(train_idx)
    n_test = len(test_idx)
    n_all = n_train + n_test
    print(
        f"Fold [{fold_no}] => "
        f"Train [{n_train} ({n_train / n_all * 100:.2f}%)] + "
        f"Test [{n_test} ({n_test / n_all * 100:.2f}%)]"
    )

