In [None]:
from IPython.display import display, HTML
# Inject CSS that forces elements with class `container` to 100% width,
# so the notebook page uses the full browser window.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
import os
import pickle as pkl

import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

In [None]:
from astartes import train_val_test_split

Define helper functions

In [None]:
# SMILES parser settings reused when the reaction SMILES are parsed for
# featurization; removeHs=False tells RDKit not to strip hydrogens on parsing.
params = Chem.SmilesParserParams()
params.removeHs = False

Function taken from Chemprop: https://github.com/chemprop/chemprop/blob/master/chemprop/features/features_generators.py

In [None]:
MORGAN_RADIUS = 2
MORGAN_NUM_BITS = 2048


def morgan_counts_features_generator(
    mol,
    radius=MORGAN_RADIUS,
    num_bits=MORGAN_NUM_BITS,
):
    """
    Generates a counts-based Morgan fingerprint for a molecule.

    :param mol: A molecule (i.e., either a SMILES string or an RDKit molecule).
                SMILES strings are parsed with RDKit's default parser settings;
                pass an RDKit molecule to control parsing yourself.
    :param radius: Morgan fingerprint radius.
    :param num_bits: Number of bits in Morgan fingerprint.
    :return: A 1D numpy array containing the counts-based Morgan fingerprint.
    :raises ValueError: If ``mol`` is None or is a SMILES string that RDKit
                        cannot parse.
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses such as numpy.str_, which appear when iterating string arrays.
    if isinstance(mol, str):
        smiles = mol
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # MolFromSmiles signals a parse failure by returning None.
            raise ValueError(f"Could not parse SMILES: {smiles!r}")
    elif mol is None:
        raise ValueError("mol must be a SMILES string or an RDKit molecule, got None")

    features_vec = AllChem.GetHashedMorganFingerprint(mol, radius, nBits=num_bits)
    # One-element placeholder: RDKit's ConvertToNumpyArray resizes the
    # destination array to the fingerprint length before filling it.
    features = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(features_vec, features)

    return features

# Read in the data
- This CSV file was taken directly from [Zenodo](https://zenodo.org/record/6618262#.Y-ZRzMHMLUI), which stores the data from the following publication: Kevin A. Spiekermann, Lagnajit Pattanaik, and William H. Green. "High Accuracy Barrier Heights, Enthalpies, and Rate Coefficients for Chemical Reactions". In: Sci. Data 9.1 (2022), pp. 1–12. [link](https://www.nature.com/articles/s41597-022-01529-6)

In [None]:
# Load the reaction dataset into a DataFrame; the path is relative to the
# directory the notebook is run from.
CSV_PATH = 'ccsdtf12_dz.csv'
df = pd.read_csv(CSV_PATH)

In [None]:
# Summary statistics of the loaded table, as a quick sanity check.
df.describe()

# Random splits

In [None]:
# Five random 85:5:10 train/val/test partitions of the row positions of `df`,
# one per seed 0-4. Only the index arrays are kept; the three leading return
# values (the split copies of the input array) are discarded.
sampler = "random"
RANDOM_SPLITS = []
for seed in range(5):
    (
        _,
        _,
        _,
        train_indices,
        val_indices,
        test_indices,
    ) = train_val_test_split(
        np.arange(len(df)),
        sampler=sampler,
        random_state=seed,
        train_size=0.85,
        val_size=0.05,
        test_size=0.1,
        return_indices=True,
    )
    # report the split sizes plus the first val/test index to tell seeds apart
    print(
        *(len(indices) for indices in (train_indices, val_indices, test_indices)),
        f"first val index {val_indices[0]}",
        f"first test index {test_indices[0]}",
    )
    RANDOM_SPLITS += [[train_indices, val_indices, test_indices]]

In [None]:
# Persist the random splits as a pickled list of [train, val, test] index triples.
RANDOM_SPLITS_PATH = 'RDB7_splits/RDB7_splits_random.pkl'
# Create the output folder if needed so the cell also runs on a fresh checkout.
os.makedirs(os.path.dirname(RANDOM_SPLITS_PATH), exist_ok=True)
with open(RANDOM_SPLITS_PATH, 'wb') as f:
    pkl.dump(RANDOM_SPLITS, f)

# Scaffold splits

In [None]:
# Five scaffold-based 85:5:10 train/val/test partitions of the `rsmi` SMILES
# column, one per seed 0-4. The per-split label arrays are unpacked but only
# the index arrays are stored.
sampler = "scaffold"
SCAFFOLD_SPLITS = []
for seed in range(5):
    (
        _,
        _,
        _,
        train_labels,
        val_labels,
        test_labels,
        train_indices,
        val_indices,
        test_indices,
    ) = train_val_test_split(
        df.rsmi.values,
        sampler=sampler,
        random_state=seed,
        train_size=0.85,
        val_size=0.05,
        test_size=0.1,
        return_indices=True,
    )
    # report the split sizes plus the first val/test index to tell seeds apart
    print(
        *(len(indices) for indices in (train_indices, val_indices, test_indices)),
        f"first val index {val_indices[0]}",
        f"first test index {test_indices[0]}",
    )
    SCAFFOLD_SPLITS += [[train_indices, val_indices, test_indices]]

In [None]:
# Persist the scaffold splits as a pickled list of [train, val, test] index triples.
SCAFFOLD_SPLITS_PATH = 'RDB7_splits/RDB7_splits_scaffold.pkl'
# Create the output folder if needed so the cell also runs on a fresh checkout.
os.makedirs(os.path.dirname(SCAFFOLD_SPLITS_PATH), exist_ok=True)
with open(SCAFFOLD_SPLITS_PATH, 'wb') as f:
    pkl.dump(SCAFFOLD_SPLITS, f)

# KMeans

Featurize the data using counts-based Morgan fingerprints with standard settings (radius 2, 2048 bits).

In [None]:
# Feature matrix with one row per entry of `df` and one column per fingerprint bit.
morgan_fps = np.zeros((len(df), MORGAN_NUM_BITS))
# enumerate() yields the positional row number, so the fill stays correct even
# if `df` does not carry a default 0..n-1 index (iterrows() yields index labels).
for i, rsmi in enumerate(df.rsmi):
    # parse with the shared parser settings (`params`, hydrogens not removed)
    rmol = Chem.MolFromSmiles(rsmi, params)
    if rmol is None:
        # MolFromSmiles returns None on a parse failure; fail with the offending row
        raise ValueError(f"Could not parse SMILES at row {i}: {rsmi!r}")
    morgan_fps[i, :] = morgan_counts_features_generator(rmol)

- Random seeds 1 and 4 produce the same validation and test splits here, so the seeds 0, 1, 2, 3, 4 cannot be used.
- However, random seeds 1 and 5 also produce the same test set.
- We therefore use seeds 0, 3, 6, 9, 12.

In [None]:
# Five KMeans-based 85:5:10 train/val/test partitions of the Morgan fingerprint
# matrix (20 clusters), using seeds 0, 3, 6, 9, 12. The six leading return
# values are discarded; only the index arrays are stored.
sampler = "kmeans"
KMEANS_SPLITS = []
for seed in range(0, 15, 3):
    (
        _,
        _,
        _,
        _,
        _,
        _,
        train_indices,
        val_indices,
        test_indices,
    ) = train_val_test_split(
        morgan_fps,
        sampler=sampler,
        hopts={"n_clusters": 20},
        random_state=seed,
        train_size=0.85,
        val_size=0.05,
        test_size=0.1,
        return_indices=True,
    )
    # report the split sizes plus the first val/test index to tell seeds apart
    print(
        *(len(indices) for indices in (train_indices, val_indices, test_indices)),
        f"first val index {val_indices[0]}",
        f"first test index {test_indices[0]}",
    )
    KMEANS_SPLITS += [[train_indices, val_indices, test_indices]]

In [None]:
# Persist the KMeans splits as a pickled list of [train, val, test] index triples.
KMEANS_SPLITS_PATH = 'RDB7_splits/RDB7_splits_kmeans.pkl'
# Create the output folder if needed so the cell also runs on a fresh checkout.
os.makedirs(os.path.dirname(KMEANS_SPLITS_PATH), exist_ok=True)
with open(KMEANS_SPLITS_PATH, 'wb') as f:
    pkl.dump(KMEANS_SPLITS, f)