# Step Forward Cross Validation for Bioactivity Prediction

## Implementation of RandomSplitCV

In [1]:
import numpy as np
import pandas as pd

In [2]:
class RandomSplitCV:
    """Repeated random train/test splitter.

    Each fold is an independent random shuffle of the row positions, cut into
    a leading training part and a trailing test part. Because every fold is
    shuffled on its own, test sets of different folds may overlap.

    Parameters
    ----------
    frac_train : float
        Fraction of the rows placed in the training part of each fold. The
        training size is ``floor(frac_train * n_rows)``.
    n_folds : int
        Number of (train, test) splits produced by :meth:`split`.
    seed : int
        Base seed. Fold ``i`` (1-based) is shuffled with
        ``np.random.RandomState(seed * i)``, so the splits are reproducible.
        Note that ``seed=0`` gives every fold the same seed and therefore
        the same split.
    """

    def __init__(self, frac_train=0.9, n_folds=10, seed=69420):
        self.frac_train = frac_train
        self.n_folds = n_folds
        self.seed = seed

    def split(self, df):
        """Yield ``n_folds`` pairs of positional index arrays.

        Parameters
        ----------
        df : sized object
            Only ``len(df)`` is used; the returned indices are positions in
            ``range(len(df))``.

        Yields
        ------
        (train_indices, test_indices) : tuple of numpy.ndarray
            Disjoint integer arrays that together cover every position once.
        """
        n_total = len(df)
        indices = np.arange(n_total)
        # The train size does not depend on the fold, so compute it once.
        n_train = int(np.floor(self.frac_train * n_total))

        # Folds are numbered 1..n_folds inclusive. Starting at 1 keeps the
        # per-fold seed (seed * i) from collapsing to 0 on the first fold;
        # the upper bound is n_folds + 1 so that exactly n_folds splits are
        # produced.
        for i in range(1, self.n_folds + 1):
            rng = np.random.RandomState(self.seed * i)
            shuffled_indices = rng.permutation(indices)

            yield shuffled_indices[:n_train], shuffled_indices[n_train:]

In [3]:
# Build the splitter with its default arguments (no overrides passed).
random_cv = RandomSplitCV()

In [4]:
# Load the dataset from a path relative to the notebook's working directory.
# NOTE(review): judging by the filename this is processed IC50 data for
# ChEMBL target CHEMBL1865 -- confirm the schema against the benchmark docs.
df = pd.read_csv("../benchmark/data/processed/target_CHEMBL1865-1.IC50.csv")

In [5]:
# Report, for every split, the absolute size of the train and test parts and
# their share of the fold as a percentage.
for i, (train_idx, test_idx) in enumerate(random_cv.split(df)):
    train_count = len(train_idx)
    test_count = len(test_idx)
    fold_size = train_count + test_count
    print(
        f"Fold [{i + 1}] => "
        f"Train [{train_count} ({train_count / fold_size * 100:.2f}%)] + "
        f"Test [{test_count} ({test_count / fold_size * 100:.2f}%)]"
    )

