# Step Forward Cross Validation for Bioactivity Prediction

## Benchmark for hERG, MAP14K and VEGFR2 for 3 fingerprints (ECFP4, RDKit and AtomPair)

In [None]:
import os

os.chdir('../sfcv/')
from datasplit import SortedStepForwardCV, UnsortedStepForwardCV, ScaffoldSplitCV, RandomSplitCV

os.chdir('../notebook/')

In [33]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from concurrent.futures import ProcessPoolExecutor

### Fingerprint Calculation

In [None]:
ecfp4gen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
rdkgen = rdFingerprintGenerator.GetRDKitFPGenerator(fpSize=2048)
apgen = rdFingerprintGenerator.GetAtomPairGenerator(fpSize=2048)

In [None]:
def compute_ecfp4(smiles: str) -> np.ndarray | None:
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        return ecfp4gen.GetFingerprintAsNumPy(mol)

In [None]:
def compute_rdkit_fp(smiles: str) -> np.ndarray | None:
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        return rdkgen.GetFingerprintAsNumPy(mol)

In [None]:
def compute_atompair_fp(smiles: str) -> np.ndarray | None:
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        return apgen.GetFingerprintAsNumPy(mol)

#### Since, we'll be training on these fingerprints, precomputing these fingerprints and saving them will save some time.

In [None]:
molecule_set = set()

for fname in os.listdir("../benchmark/data/processed"):
    if fname.endswith(".csv"):
        df = pd.read_csv(f"../benchmark/data/processed/{fname}")
        molecule_set |= set(df["standardized_smiles"].unique())

In [None]:
len(molecule_set)

In [None]:
smi2ecfp4 = {}
smi2atompair = {}
smi2rdkit = {}

In [None]:
for smi in tqdm(molecule_set, desc="Computing Fingerprints"):
    smi2ecfp4[smi] = compute_ecfp4(smi)
    smi2atompair[smi] = compute_atompair_fp(smi)
    smi2rdkit[smi] = compute_rdkit_fp(smi)

## Saving the split columns

In [None]:
os.makedirs("../benchmark/data/final/", exist_ok=True)

In [None]:
cv_splitters = {
    "RandomSplit": RandomSplitCV(frac_train=0.9, n_folds=10, seed=69420),
    "ScaffoldSplit": ScaffoldSplitCV(smiles_col='standardized_smiles', n_folds=10, frac_train=0.9, seed=69420,
                                     include_chirality=False),
    "SortedStepForward_LogD": SortedStepForwardCV(sorting_col="LogD", ideal=2, n_bins=10, ascending=False),
    "SortedStepForward_LogP": SortedStepForwardCV(sorting_col="LogP", ideal=2, n_bins=10, ascending=False),
    "SortedStepForward_MCE18": SortedStepForwardCV(sorting_col="MCE18", n_bins=10, ascending=True),
    "UnsortedStepForward": UnsortedStepForwardCV(n_bins=10, random_state=69420)
}

In [None]:
def add_cv_split_columns(df, cv_splitters):
    df = df.copy()
    for split_name, cv_splitter in cv_splitters.items():
        for fold_idx, (train_idx, test_idx) in enumerate(cv_splitter.split(df), start=1):
            col_name = f"{split_name}_Fold_{fold_idx}"
            df[col_name] = None
            df.loc[train_idx, col_name] = "Train"
            df.loc[test_idx, col_name] = "Test"
    return df

In [None]:
for fname in tqdm(os.listdir('../benchmark/data/processed/'), desc="Processing Splits"):
    if os.path.exists(f"../benchmark/data/final/{fname}"):
        continue
    if fname.endswith('.csv'):
        df = pd.read_csv(f"../benchmark/data/processed/{fname}")
        df = add_cv_split_columns(df, cv_splitters)
        df.to_csv(f"../benchmark/data/final/{fname}")

## Models

In [None]:
def mlp_regressor_factory(n_train, random_state=42):
    n_hidden = min(25, int(np.sqrt(n_train)))
    return MLPRegressor(
        hidden_layer_sizes=(n_hidden,), random_state=random_state, max_iter=1000
    )

In [None]:
def xgb_regressor_factory(n_train, random_state=42):
    n_estimators = min(25, int(np.sqrt(n_train)))
    return XGBRegressor(n_estimators=n_estimators, random_state=random_state)

In [None]:
def rf_regressor_factory(n_train, random_state=42):
    n_trees = min(25, int(np.sqrt(n_train)))
    return RandomForestRegressor(n_estimators=n_trees, random_state=random_state)

In [None]:
regressor_factories = [rf_regressor_factory, xgb_regressor_factory, mlp_regressor_factory]

In [None]:
def process_regressor(regressor_factory, X_train, y_train, fingerprint_vals):
    regressor = regressor_factory(len(X_train))
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(np.vstack(fingerprint_vals))
    identifier = getattr(regressor_factory, '__name__', str(regressor_factory))
    return identifier, y_pred

## Bulk Tanimoto Similarity

In [None]:
def bulk_tanimoto_similarity(mol_fp: np.ndarray, list_of_fps: np.ndarray) -> np.ndarray:
    intersection = np.sum(list_of_fps & mol_fp, axis=1)
    union = np.sum(list_of_fps | mol_fp, axis=1)
    return intersection / union

## Let's Train the models

In [None]:
os.makedirs("../benchmark/data/results/", exist_ok=True)

In [30]:
fp2map = {"ECFP4": smi2ecfp4,
          "RDKitFP": smi2rdkit,
          "AtomPairsFP": smi2atompair}

In [31]:
for fname in tqdm(os.listdir("../benchmark/data/final/"), desc="Training"):
    if fname.endswith(".csv"):
        df = pd.read_csv(f"../benchmark/data/final/{fname}")

        fold_cols = [i for i in df.columns if "_Fold_" in i]
        for fold_col in tqdm(fold_cols, desc="Processing Splits and Folds"):
            test_mask = df[fold_col] == "Test"
            train_mask = df[fold_col] == "Train"

            for fp, mapping in fp2map.items():
                df[fp] = df["standardized_smiles"].map(mapping)
                test_fps = np.vstack(df.loc[test_mask, fp].values)
                train_fps = np.vstack(df.loc[train_mask, fp].values)
                max_tcs = [max(bulk_tanimoto_similarity(test_fp, train_fps)) for test_fp in test_fps]
                df.loc[test_mask, f"{fold_col}_{fp}_Tc"] = max_tcs

                X_train = np.vstack(df.loc[df[fold_col] == "Train", fp].values)
                y_train = df.loc[df[fold_col] == "Train", "pchembl_value"].values
                X = np.vstack(df[fp].values)

                for regressor_factory in regressor_factories:
                    model_name, preds = process_regressor(regressor_factory, X_train, y_train, X)
                    df[f'{fold_col}_{fp}_{model_name}'] = preds
        df.to_csv(f"../benchmark/data/results/{fname}")

Training:   0%|          | 0/67 [00:00<?, ?it/s]

Processing Splits and Folds:   2%|▏         | 1/54 [00:29<25:42, 29.10s/it]
Training:   0%|          | 0/67 [00:29<?, ?it/s]


KeyboardInterrupt: 

In [32]:
def process_single_file(fname):
    # Read the CSV file
    df = pd.read_csv(f"../benchmark/data/final/{fname}")

    # Identify fold columns
    fold_cols = [col for col in df.columns if "_Fold_" in col]

    for fold_col in fold_cols:
        test_mask = df[fold_col] == "Test"
        train_mask = df[fold_col] == "Train"

        for fp, mapping in fp2map.items():
            df[fp] = df["standardized_smiles"].map(mapping)
            test_fps = np.vstack(df.loc[test_mask, fp].values)
            train_fps = np.vstack(df.loc[train_mask, fp].values)

            max_tcs = [max(bulk_tanimoto_similarity(test_fp, train_fps)) for test_fp in test_fps]
            df.loc[test_mask, f"{fold_col}_{fp}_Tc"] = max_tcs

            X_train = np.vstack(df.loc[train_mask, fp].values)
            y_train = df.loc[train_mask, "pchembl_value"].values
            X_full = np.vstack(df[fp].values)

            for regressor_factory in regressor_factories:
                model_name, preds = process_regressor(regressor_factory, X_train, y_train, X_full)
                df[f'{fold_col}_{fp}_{model_name}'] = preds

    df.to_csv(f"../benchmark/data/results/{fname}")
    return fname

In [34]:
def main():
    files = [fname for fname in os.listdir("../benchmark/data/final/") if fname.endswith(".csv")]
    with ProcessPoolExecutor() as executor:
        list(tqdm(executor.map(process_single_file, files), total=len(files), desc="Processing Files"))

In [35]:
if __name__ == "__main__":
    main()

Processing Files:   0%|          | 0/67 [00:00<?, ?it/s]Process SpawnProcess-1:
Process SpawnProcess-3:
Process SpawnProcess-2:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.11/3.11.11/Frameworks/Python.framework/Versions/3.11/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/homebrew/Cellar/python@3.11/3.11.11/Frameworks/Python.framework/Versions/3.11/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/homebrew/Cellar/python@3.11/3.11.11/Frameworks/Python.framework/Versions/3.11/lib/python3.11/concurrent/futures/process.py", line 249, in _process_worker
    call_item = call_queue.get(block=True)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/Cellar/python@3.11/3.11.11/Frameworks/Python.framework/Versions/3.11/lib/python3.11/multiprocessing/queues.py", line 122

BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.