In [None]:
from IPython.display import display, HTML
# Inject a CSS rule so the notebook's `.container` element uses the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
import os
import pickle as pkl

import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

In [None]:
from astartes import train_val_test_split

In [None]:
# SMILES parser settings shared by the featurization step below:
# removeHs is switched off so hydrogens written in the SMILES are not stripped on parsing.
params = Chem.SmilesParserParams()
params.removeHs = False

# Read in the data
- This CSV file was taken directly from [Zenodo](https://zenodo.org/record/6618262#.Y-ZRzMHMLUI), which hosts the data from the following publication: Kevin A. Spiekermann, Lagnajit Pattanaik, and William H. Green. "High Accuracy Barrier Heights, Enthalpies, and Rate Coefficients for Chemical Reactions". In: Sci. Data 9.1 (2022), pp. 1–12. [link](https://www.nature.com/articles/s41597-022-01529-6)

In [None]:
# Path to the dataset CSV, relative to the notebook's working directory.
CSV_PATH = 'ccsdtf12_dz.csv'
# Load the whole table; later cells read the `rsmi` column from it.
df = pd.read_csv(CSV_PATH)

In [None]:
# Quick sanity check: summary statistics of the loaded table.
df.describe()

# Random splits

In [None]:
# Build five random 85:5:10 (train:val:test) splits, one per seed.
# Only the returned index arrays are kept; the split "data" itself is just
# the row positions 0..len(df)-1.
RANDOM_SPLITS = []
sampler = "random"
row_positions = np.arange(len(df))
for seed in range(5):
    (
        _,
        _,
        _,
        train_indices,
        val_indices,
        test_indices,
    ) = train_val_test_split(
        row_positions,
        train_size=0.85,
        val_size=0.05,
        test_size=0.1,
        sampler=sampler,
        random_state=seed,
        return_indices=True,
    )
    RANDOM_SPLITS.append([train_indices, val_indices, test_indices])

In [None]:
# Persist the random splits. Create the output directory first so that a
# fresh checkout (where RDB7_splits/ may not exist yet) does not fail in open().
os.makedirs('RDB7_splits', exist_ok=True)
with open('RDB7_splits/RDB7_splits_random.pkl', 'wb') as f:
    pkl.dump(RANDOM_SPLITS, f)

# Scaffold splits

In [None]:
# Build five scaffold-based 85:5:10 (train:val:test) splits, one per seed.
# The sampler is fed the reactant SMILES column; besides the index arrays it
# also returns per-split labels, which are unpacked but not stored.
SCAFFOLD_SPLITS = []
sampler = "scaffold"
reactant_smiles = df.rsmi.values
for seed in range(5):
    _, _, _, train_labels, val_labels, test_labels, train_indices, val_indices, test_indices = train_val_test_split(
        reactant_smiles,
        train_size=0.85,
        val_size=0.05,
        test_size=0.1,
        sampler=sampler,
        random_state=seed,
        return_indices=True,
    )
    SCAFFOLD_SPLITS.append([train_indices, val_indices, test_indices])

In [None]:
# Persist the scaffold splits. Create the output directory first so that a
# fresh checkout (where RDB7_splits/ may not exist yet) does not fail in open().
os.makedirs('RDB7_splits', exist_ok=True)
with open('RDB7_splits/RDB7_splits_scaffold.pkl', 'wb') as f:
    pkl.dump(SCAFFOLD_SPLITS, f)

# KMeans

Featurize the data using counts-based Morgan fingerprints with standard settings (radius 2, 2048 bits).

Function taken from Chemprop: https://github.com/chemprop/chemprop/blob/master/chemprop/features/features_generators.py

In [None]:
# Default Morgan fingerprint settings; used as the defaults of the generator
# function below and as the column count of the feature matrix.
MORGAN_RADIUS = 2
MORGAN_NUM_BITS = 2048

def morgan_counts_features_generator(
    mol,
    radius=MORGAN_RADIUS,
    num_bits=MORGAN_NUM_BITS,
):
    """
    Generates a counts-based Morgan fingerprint for a molecule.

    :param mol: A molecule (i.e., either a SMILES or an RDKit molecule).
        A SMILES string is parsed with RDKit's default parser settings.
    :param radius: Morgan fingerprint radius.
    :param num_bits: Number of bits in Morgan fingerprint.
    :return: A 1D numpy array containing the counts-based Morgan fingerprint.
    :raises ValueError: If ``mol`` is None or is a SMILES that cannot be parsed.
    """
    # isinstance (instead of an exact type comparison) also accepts str
    # subclasses such as numpy.str_, e.g. elements of a numpy string array.
    if isinstance(mol, str):
        smiles = mol
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # MolFromSmiles signals a parse failure by returning None.
            raise ValueError(f"Could not parse SMILES: {smiles!r}")
    elif mol is None:
        raise ValueError("mol is None; expected a SMILES string or an RDKit molecule")

    features_vec = AllChem.GetHashedMorganFingerprint(mol, radius, nBits=num_bits)
    # Placeholder buffer: ConvertToNumpyArray resizes it to the fingerprint length.
    features = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(features_vec, features)

    return features

In [None]:
# Feature matrix: one counts-based Morgan fingerprint per reactant SMILES.
morgan_fps = np.zeros((len(df), MORGAN_NUM_BITS))
# enumerate gives the positional row number, so filling the matrix does not
# depend on the DataFrame's index labels (iterrows yields labels, which only
# coincide with positions for a default RangeIndex).
for i, rsmi in enumerate(df.rsmi.values):
    # Parse with the shared parser params (removeHs disabled).
    rmol = Chem.MolFromSmiles(rsmi, params)
    morgan_fps[i, :] = morgan_counts_features_generator(rmol)

In [None]:
KMEANS_SPLITS = []
sampler = "kmeans"
seed = 0
K = 20
n_init = 10

# Cluster the fingerprints. The split produced here is not used directly:
# later cells only need the per-sample cluster labels and indices.
kmeans_hopts = {"n_clusters": K, "n_init": n_init}
(
    _,
    _,
    _,
    train_labels,
    val_labels,
    test_labels,
    train_indices,
    val_indices,
    test_indices,
) = train_val_test_split(
    morgan_fps,
    train_size=0.85,
    val_size=0.05,
    test_size=0.1,
    sampler=sampler,
    hopts=kmeans_hopts,
    random_state=seed,
    return_indices=True,
)

In [None]:
# Join the per-split cluster labels into one array, in train, val, test order
# (the same order used for `indices`, so the two arrays line up element-wise).
labels = np.concatenate((train_labels, val_labels, test_labels))
# Display the shape as a sanity check.
labels.shape

In [None]:
# Join the per-split sample indices into one array, in train, val, test order
# (the same order used for `labels`, so the two arrays line up element-wise).
indices = np.concatenate((train_indices, val_indices, test_indices))
# Display the shape as a sanity check.
indices.shape

In [None]:
# Map every cluster id (0..K-1) to the list of sample indices assigned to it.
clusters2indices = {cluster_id: [] for cluster_id in range(K)}
for sample_idx, cluster_id in zip(indices, labels):
    clusters2indices[cluster_id].append(sample_idx)

In [None]:
# Fraction of the full dataset that falls into each cluster.
clusters2sizes = {
    cluster_id: len(members) / len(df)
    for cluster_id, members in clusters2indices.items()
}
# Report the clusters from the smallest to the largest share.
sorted_clusters2sizes = dict(sorted(clusters2sizes.items(), key=lambda pair: pair[1]))
for key, value in sorted_clusters2sizes.items():
    print(f'cluster {key} has {value * 100:0.1f} % of the data')

In [None]:
# Hand-picked (validation clusters, test clusters) pairs, one per split,
# chosen to aim for 85:5:10 splits; every remaining cluster goes to training.
val_test_cluster_indices = [
    ([17],  [15, 16]),
    ([13],  [3,  10]),
    ([0],   [18, 19]),
    ([8],   [12,  9]),
    ([14],  [5,   8]),
]


def _collect_cluster_samples(group_name, cluster_keys, unassigned_clusters):
    """
    Gather the sample indices of the given clusters and print their sizes.

    :param group_name: Label used in the printed report (e.g. 'Validation').
    :param cluster_keys: Cluster ids to pull into this group.
    :param unassigned_clusters: Set of cluster ids not yet assigned to a group;
        each given key is removed from it in place (KeyError if it is absent,
        e.g. because the same cluster was already used in this split).
    :return: List with the sample indices of all requested clusters.
    """
    collected = []
    for cluster_key in cluster_keys:
        members = clusters2indices[cluster_key]
        collected.extend(members)
        print(f'{group_name} group is {cluster_key} with {len(members)} samples i.e. {len(members)/len(df)*100:.1f}%')
        unassigned_clusters.remove(cluster_key)
    # With several clusters in one group, also report the combined size.
    if len(cluster_keys) > 1:
        print(f'{group_name} group is {set(cluster_keys)} with {len(collected)} samples i.e. {len(collected)/len(df)*100:.1f}%')
    print()
    return collected


KMEANS_SPLITS = []
for val_keys, test_keys in val_test_cluster_indices:
    # start each split with every cluster unassigned
    indices_set = set(clusters2sizes.keys())
    print('*'*88)

    val_indices = _collect_cluster_samples('Validation', val_keys, indices_set)
    test_indices = _collect_cluster_samples('Testing', test_keys, indices_set)

    # training indices is the remaining clusters
    train_indices = [
        sample_idx
        for cluster_key in indices_set
        for sample_idx in clusters2indices[cluster_key]
    ]
    print(f'Training groups are {indices_set} with {len(train_indices)} i.e. {len(train_indices)/len(df)*100:.1f}% samples')

    # make sure this adds up to the total
    n_assigned = len(train_indices) + len(val_indices) + len(test_indices)
    assert n_assigned == len(df)
    KMEANS_SPLITS.append([train_indices, val_indices, test_indices])

In [None]:
# Persist the k-means splits. Create the output directory first so that a
# fresh checkout (where RDB7_splits/ may not exist yet) does not fail in open().
os.makedirs('RDB7_splits', exist_ok=True)
with open('RDB7_splits/RDB7_splits_kmeans.pkl', 'wb') as f:
    pkl.dump(KMEANS_SPLITS, f)