# Step Forward Cross Validation for Bioactivity Prediction

## Comparing the number of compounds before and after standardization and deduplication

In [1]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [2]:
# Row count of every standardized CSV, keyed by the file name with its
# ".csv" suffix stripped.
std_dir = '../benchmark/data/standardized'
std_csv_names = [name for name in os.listdir(std_dir) if name.endswith('.csv')]

after_std = {
    name[:-4]: len(pd.read_csv(f'{std_dir}/{name}')["standardized_smiles"])
    for name in std_csv_names
}

In [3]:
# Row count of every raw CSV, keyed by the file name with its ".csv"
# suffix stripped.
raw_dir = '../benchmark/data/raw'
raw_csv_names = [name for name in os.listdir(raw_dir) if name.endswith('.csv')]

before_std = {
    name[:-4]: len(pd.read_csv(f'{raw_dir}/{name}')["canonical_smiles"])
    for name in raw_csv_names
}

In [4]:
# Build one row per count dict, labelled with its final column name, then
# transpose so that each target becomes a row and the two labels become columns.
df = pd.DataFrame(
    [before_std, after_std],
    index=["Before Standardization", "After Standardization"],
).T.rename_axis("Target")
df

Unnamed: 0_level_0,Before Standardization,After Standardization
Target,Unnamed: 1_level_1,Unnamed: 2_level_1
target_CHEMBL220-1.IC50,1582,1200
target_CHEMBL3130-1.IC50,987,938
target_CHEMBL5145-1.IC50,221,209
target_CHEMBL4235-4.IC50,252,214
target_CHEMBL206-1.IC50,489,471
...,...,...
target_CHEMBL2039-3.IC50,484,454
target_CHEMBL240-2.IC50,249,241
target_CHEMBL240-3.IC50,670,603
target_CHEMBL222-2.IC50,214,194


In [5]:
# Make sure both output directories exist before anything is written to them.
for out_dir in ('../benchmark/results/tables', '../benchmark/results/figures'):
    os.makedirs(out_dir, exist_ok=True)

In [6]:
# Write the index as well: it holds the target names (index name "Target").
# With index=False the file contained only the two count columns, so no row
# could be matched back to its target.
df.to_csv('../benchmark/results/tables/before_and_after_standardization.csv')

## Plotting this Relationship

In [None]:
# Parity plot: one point per target, count before vs. count after
fig, ax = plt.subplots(figsize=(6, 6))
sns.scatterplot(
    data=df,
    x="Before Standardization",
    y="After Standardization",
    ax=ax,
)

# Dashed y = x diagonal; points on it kept every molecule
upper = max(df["Before Standardization"].max(), df["After Standardization"].max())
ax.plot([0, upper], [0, upper], linestyle="--", color="red", label="No Change")

# Axis labels, legend and title
ax.set_xlabel("No. of Molecules Before Standardization")
ax.set_ylabel("No. of Molecules After Standardization")
ax.legend()
ax.set_title("Change in Number of Molecules Before & After Standardization")

# Save the figure to disk, then render it in the notebook
fig.savefig('../benchmark/results/figures/before_and_after_standardization.svg')
plt.show()