# Step Forward Cross Validation for Bioactivity Prediction

**"Traditional random data splits produce similar molecules between training and test sets, conflicting with the reality of VS libraries which mostly contain structurally distinct compounds. Scaffold split, grouping molecules by shared core structure, is widely considered to reflect this real-world scenario."** -- https://arxiv.org/pdf/2406.00873

## Implementation of RandomSplitCV

In [1]:
import numpy as np
import pandas as pd

In [3]:
class RandomSplitCV:
    """Repeated random train/test splitter.

    Every fold is an independent, seeded shuffle of all row positions that is
    cut into a leading train part and a trailing test part. Because each fold
    reshuffles the full data set, test sets of different folds may overlap.

    Args:
        frac_train: Fraction of the rows assigned to the train part of each
            fold; the train size is ``floor(frac_train * len(df))``.
        n_folds: Number of (train, test) splits produced by ``split``.
        seed: Base seed; fold ``i`` (1-based) is shuffled with ``seed * i``.
    """

    def __init__(self, frac_train=0.9, n_folds=10, seed=69420):
        self.frac_train = frac_train
        self.n_folds = n_folds
        self.seed = seed

    def split(self, df):
        """Yield ``n_folds`` pairs of positional index arrays.

        Args:
            df: Any sized object (only ``len(df)`` is used), typically a
                DataFrame.

        Yields:
            Tuples ``(train_indices, test_indices)`` of NumPy integer arrays
            that together cover ``0 .. len(df) - 1`` exactly once.
        """
        n_total = len(df)
        indices = np.arange(n_total)
        # The train size does not depend on the fold, so compute it once.
        n_train = int(np.floor(self.frac_train * n_total))

        # Folds are numbered 1..n_folds inclusive so that exactly `n_folds`
        # splits are produced (an exclusive upper bound of `n_folds` would
        # drop the last one). Starting at 1 also avoids `seed * 0`, which
        # would ignore the configured seed for the first fold.
        # NOTE: with seed == 0 every fold gets the same seed and is identical.
        for i in range(1, self.n_folds + 1):
            fold_seed = self.seed * i
            rng = np.random.RandomState(fold_seed)
            shuffled_indices = rng.permutation(indices)

            train_indices = shuffled_indices[:n_train]
            test_indices = shuffled_indices[n_train:]

            yield train_indices, test_indices

In [4]:
# Build the splitter with its default settings (frac_train=0.9, n_folds=10, seed=69420).
random_cv = RandomSplitCV()

In [5]:
# Load the processed CSV into a DataFrame; the path is relative to the
# notebook's working directory.
# NOTE(review): the file name suggests Ki measurements for target CHEMBL203 — confirm.
df = pd.read_csv("../benchmark/data/processed/target_CHEMBL203-1.Ki.csv")

In [6]:
# Print, for every fold produced by the splitter, the number of train and
# test rows together with their share of the fold's total row count.
for i, (train_idx, test_idx) in enumerate(random_cv.split(df)):
    n_train_rows = len(train_idx)
    n_test_rows = len(test_idx)
    n_rows = n_train_rows + n_test_rows
    print(
        f"Fold [{i + 1}] => "
        f"Train [{n_train_rows} ({n_train_rows / n_rows * 100:.2f}%)] + "
        f"Test [{n_test_rows} ({n_test_rows / n_rows * 100:.2f}%)]"
    )

