# Step Forward Cross Validation for Bioactivity Prediction

## Create a table extracting relevant information from the benchmark results

In [4]:
import os
import re

import numpy as np
import pandas as pd
from sklearn.metrics import r2_score, root_mean_squared_error, mean_absolute_error
from tqdm import tqdm

In [6]:
# Test compounds whose "<split>_Fold_<n>_<fp>_Tc" value is below this cutoff are
# counted as "novel".
# NOTE(review): "Tc" presumably is a Tanimoto similarity to the training set - confirm.
NOVELTY_TC = 0.55
# A prediction counts as "within error range" when |predicted - measured| <= this.
ERROR_TOLERANCE = 0.5

data_dir = "../benchmark/data/results"
output_path = "../benchmark/results/tables/results.csv"

# Sorted so the row order of the output table is reproducible; os.listdir()
# returns entries in arbitrary order.
fnames = sorted(
    os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith(".csv")
)

# Targets for which "discovery" compounds are those with a LOW pchembl_value
# (< 5.2) instead of the default high-value criterion (> 7).
low_pchembl_targets = {
    "target_CHEMBL240-1.IC50",
    "target_CHEMBL240-2.IC50",
    "target_CHEMBL240-3.IC50",
}

# Prediction columns are named "<split_type>_Fold_<fold_no>_<fp>_<model>_regressor_factory".
# Compiled once because the same pattern is applied to every column of every file.
pred_col_pattern = re.compile(r"(.*?)_Fold_(\d+)_(.*?)_(.*?)_regressor_factory")

results = []

for fname in tqdm(fnames):
    df = pd.read_csv(fname)
    pred_cols = [i for i in df.columns if i.endswith("_regressor_factory")]
    # Strip only the trailing extension; str.replace would also remove any
    # ".csv" occurring elsewhere in the file name.
    target = os.path.splitext(os.path.basename(fname))[0]
    if target in low_pchembl_targets:
        print(f"{target} processed with pchembl_value < 5.2")
        discovery_func = lambda d: d["pchembl_value"] < 5.2
    else:
        discovery_func = lambda d: d["pchembl_value"] > 7

    # Several prediction columns share the same split column, so the train/test
    # subsets are computed once per split and reused.
    split_cache = {}
    for pred_col in pred_cols:
        match = pred_col_pattern.search(pred_col)
        if match is None:
            # Fail with a message that names the culprit instead of an opaque
            # AttributeError on `None.groups()`.
            raise ValueError(
                f"Prediction column {pred_col!r} in {fname!r} does not follow the "
                "'<split>_Fold_<n>_<fingerprint>_<model>_regressor_factory' naming scheme"
            )
        split_type, fold_no, fp, model = match.groups()
        split_key = f"{split_type}_Fold_{fold_no}"

        if split_key in split_cache:
            df_test, df_train = split_cache[split_key]
        else:
            mask_test = df[split_key] == "Test"
            mask_train = df[split_key] == "Train"
            df_test = df[mask_test]
            df_train = df[mask_train]
            split_cache[split_key] = (df_test, df_train)

        n_test = df_test.shape[0]
        n_train = df_train.shape[0]

        train_mean_pchembl = df_train["pchembl_value"].mean()
        test_mean_pchembl = df_test["pchembl_value"].mean()

        # Novelty is fingerprint-specific: the column name includes `fp`.
        novelty_col = f"{split_type}_Fold_{fold_no}_{fp}_Tc"
        novelty_mask = df_test[novelty_col] < NOVELTY_TC
        n_novel = novelty_mask.sum()

        discovery_mask = discovery_func(df_test)
        total_discovery = discovery_mask.sum()

        novel_discovery_mask = novelty_mask & discovery_mask

        err_mask = np.abs(df_test[pred_col] - df_test["pchembl_value"]) <= ERROR_TOLERANCE

        novel_disc_within_err = (novel_discovery_mask & err_mask).sum()
        disc_within_err = (discovery_mask & err_mask).sum()

        y_true = df_test["pchembl_value"].values
        y_pred = df_test[pred_col].values
        r2 = r2_score(y_true, y_pred)
        rmse = root_mean_squared_error(y_true, y_pred)

        # Mean absolute error restricted to novel test compounds; undefined
        # (NaN) when the fold contains none.
        if n_novel > 0:
            novelty_err = mean_absolute_error(
                df_test.loc[novelty_mask, "pchembl_value"],
                df_test.loc[novelty_mask, pred_col],
            )
        else:
            novelty_err = np.nan

        # Fraction of discovery compounds predicted within the error tolerance;
        # reported as 0 when the test set has no discovery compounds.
        discovery_yield = disc_within_err / total_discovery if total_discovery > 0 else 0

        results.append({
            "target": target,
            "split_type": split_type,
            "fold_no": fold_no,
            "fingerprint": fp,
            "model": model,
            "r2": r2,
            "rmse": rmse,
            "novelty_err": novelty_err,
            "discovery_yield": discovery_yield,
            "number of discovery compounds": total_discovery,
            "number of discovery compounds within error range": disc_within_err,
            "number of novel compounds": n_novel,
            "number of novel discovery compounds": novel_discovery_mask.sum(),
            "number of novel discovery compounds within error range": novel_disc_within_err,
            "number of test compounds": n_test,
            "number of train compounds": n_train,
            "mean train pchembl value": train_mean_pchembl,
            "mean test pchembl value": test_mean_pchembl,
        })

results_df = pd.DataFrame(results)
# Create the output directory if needed so a missing folder does not discard
# the whole run at the final write.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
results_df.to_csv(output_path, index=False)

 25%|██▌       | 17/67 [00:07<00:21,  2.28it/s]

target_CHEMBL240-1.IC50processed with pchembl_value < 5.2


 94%|█████████▍| 63/67 [00:28<00:01,  2.45it/s]

target_CHEMBL240-2.IC50processed with pchembl_value < 5.2


 96%|█████████▌| 64/67 [00:28<00:01,  2.46it/s]

target_CHEMBL240-3.IC50processed with pchembl_value < 5.2


100%|██████████| 67/67 [00:29<00:00,  2.25it/s]
