In [1]:
from IPython.display import display, HTML
# Inject a CSS rule that sets the notebook's `.container` element to 100% width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import os
import pickle as pkl

import numpy as np
import pandas as pd
from astartes import train_val_test_split
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.svm import LinearSVR

Define helper functions

In [3]:
# SMILES parser options with hydrogen removal switched off, so explicit H atoms
# written in a SMILES string are kept when it is parsed with these params.
# NOTE(review): in the visible notebook `params` is referenced only by the
# commented-out featurization code in the KMeans section.
params = Chem.SmilesParserParams()
params.removeHs = False

Function taken from Chemprop: https://github.com/chemprop/chemprop/blob/master/chemprop/features/features_generators.py

In [4]:
MORGAN_RADIUS = 2
MORGAN_NUM_BITS = 2048


def morgan_counts_features_generator(mol,
                                     radius=MORGAN_RADIUS,
                                     num_bits=MORGAN_NUM_BITS):
    """
    Generates a counts-based Morgan fingerprint for a molecule.

    :param mol: A molecule (i.e., either a SMILES or an RDKit molecule).
                A SMILES string is parsed with ``Chem.MolFromSmiles`` using its
                default settings; pass an RDKit molecule to control parsing.
    :param radius: Morgan fingerprint radius.
    :param num_bits: Number of bits in Morgan fingerprint.
    :return: A 1D numpy array containing the counts-based Morgan fingerprint.
    :raises ValueError: If ``mol`` is a SMILES string that RDKit cannot parse.
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses such as numpy.str_, which is what NumPy string arrays yield.
    if isinstance(mol, str):
        smiles = mol
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # MolFromSmiles signals a parse failure by returning None; fail here
            # with a clear message instead of deep inside the fingerprint call.
            raise ValueError(f'Could not parse SMILES: {smiles!r}')

    features_vec = AllChem.GetHashedMorganFingerprint(mol, radius, nBits=num_bits)

    # Placeholder array handed to RDKit as the conversion target.
    # NOTE(review): ConvertToNumpyArray is expected to resize it to the
    # fingerprint length -- confirm against the installed RDKit version.
    features = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(features_vec, features)

    return features

# Read in the data
- This CSV file was taken directly from [Zenodo](https://zenodo.org/record/6618262#.Y-ZRzMHMLUI), which hosts the data from the following publication: Kevin A. Spiekermann, Lagnajit Pattanaik, and William H. Green. "High Accuracy Barrier Heights, Enthalpies, and Rate Coefficients for Chemical Reactions". In: Sci. Data 9.1 (2022), pp. 1–12. [link](https://www.nature.com/articles/s41597-022-01529-6)

In [5]:
# Relative path: the CSV is looked up in the notebook's working directory.
CSV_PATH = 'ccsdtf12_dz.csv'
# `df` is the table that every splitting cell below reads from.
df = pd.read_csv(CSV_PATH)
# Bare last expression so the notebook renders the frame as the cell output.
df

Unnamed: 0,idx,rsmi,psmi,dE0,dHrxn298,rmg_family
0,0,[C:1]([c:2]1[n:3][o:4][n:5][n:6]1)([H:7])([H:8...,[C:1]([C:2]([N:3]=[O:4])=[N+:6]=[N-:5])([H:7])...,48.61085,26.77621,
1,1,[C:1]([c:2]1[n:3][o:4][n:5][n:6]1)([H:7])([H:8...,[C:1]([N:3]=[C:2]=[N:6][N:5]=[O:4])([H:7])([H:...,74.02980,28.79099,
2,2,[C:1]([O:2][C:3]([C:4]([O:5][H:13])([H:11])[H:...,[C:1]1([H:6])([H:7])[O:2][C:3]([H:9])([H:10])[...,97.42200,12.60220,
3,3,[C:1]([O:2][C:3]([C:4]([O:5][H:13])([H:11])[H:...,[C:1]([O:2][H:13])([H:6])([H:7])[H:8].[C:3]1([...,75.25375,28.98589,
4,4,[C:1]([O:2][C:3]([C:4]([O:5][H:13])([H:11])[H:...,[C:1]([O:2][H:13])([H:6])([H:7])[H:8].[C:3]([C...,72.16356,1.41779,
...,...,...,...,...,...,...
11921,11956,[C:1]([C@@:2]([O:3][H:12])([C:4]([O:5][C:6](=[...,[C:1]([C:2][C:4]([O:5][C:6](=[O:7])[H:15])([H:...,75.56813,79.63518,
11922,11957,[C:1]([C@@:2]([O:3][H:12])([C:4]([O:5][C:6](=[...,[C:1]([C@@:2]1([H:11])[O:3][C@:6]([O:7][H:12])...,42.41621,5.79695,
11923,11958,[C:1]([C@@:2]([O:3][H:12])([C:4]([O:5][C:6](=[...,[C:1]([C@@:2]([O:3][H:12])([C:4](=[O:5])[H:14]...,72.75039,30.54744,
11924,11959,[C:1]([C@@:2]([O:3][H:12])([C:4]([O:5][C:6](=[...,[C:1](=[C:2]([C:4]([O:5][C:6](=[O:7])[H:15])([...,65.83112,14.48350,"1,3_Insertion_ROR"


In [6]:
# Summary statistics for the numeric columns of `df`.
df.describe()

Unnamed: 0,idx,dE0,dHrxn298
count,11926.0,11926.0,11926.0
mean,5974.624266,80.060894,35.416565
std,3449.834665,21.849569,30.21347
min,0.0,9.41876,-113.99216
25%,2989.25,65.0516,14.471478
50%,5972.5,78.67034,33.09714
75%,8955.75,93.110327,58.510087
max,11960.0,195.59815,172.16572


# Random splits

In [7]:
# Five random 85:5:10 train/val/test partitions, one per seed (0-4).
# Only positions are split (np.arange over the rows of df), so the index arrays
# returned via return_indices=True are what gets stored.
sampler = 'random'
RANDOM_SPLITS = []
for seed in range(5):
    _, _, _, train_indices, val_indices, test_indices = train_val_test_split(
        np.arange(len(df)),
        sampler=sampler,
        random_state=seed,
        train_size=0.85,
        val_size=0.05,
        test_size=0.1,
        return_indices=True,
    )
    # Report the split sizes plus the first val/test index for this seed.
    print(
        len(train_indices),
        len(val_indices),
        len(test_indices),
        f'first val index {val_indices[0]}',
        f'first test index {test_indices[0]}',
    )
    RANDOM_SPLITS.append([train_indices, val_indices, test_indices])

10137 596 1193 first val index 10945 first test index 11433
10137 596 1193 first val index 2433 first test index 4596
10137 596 1193 first val index 8701 first test index 6429
10137 596 1193 first val index 8144 first test index 10592
10137 596 1193 first val index 9083 first test index 1793


In [8]:
# Persist the random splits as a pickled list of [train, val, test] index lists.
# open(..., 'wb') does not create missing parent directories, so make sure the
# output folder exists first; otherwise a fresh checkout fails here with
# FileNotFoundError.
os.makedirs('RDB7_splits', exist_ok=True)
with open('RDB7_splits/RDB7_splits_random.pkl', 'wb') as f:
    pkl.dump(RANDOM_SPLITS, f)

# Scaffold splits

In [9]:
# Five scaffold-based 85:5:10 train/val/test partitions of the reactant SMILES,
# one per seed (0-4).
sampler = 'scaffold'
SCAFFOLD_SPLITS = []
for seed in range(5):
    # Nine values come back here; the three *_labels values are unpacked but
    # not used further in this cell -- only the index arrays are stored.
    (
        _, _, _,
        train_labels, val_labels, test_labels,
        train_indices, val_indices, test_indices,
    ) = train_val_test_split(
        df.rsmi.values,
        sampler=sampler,
        random_state=seed,
        train_size=0.85,
        val_size=0.05,
        test_size=0.1,
        return_indices=True,
    )
    # Report the split sizes plus the first val/test index for this seed.
    print(
        len(train_indices),
        len(val_indices),
        len(test_indices),
        f'first val index {val_indices[0]}',
        f'first test index {test_indices[0]}',
    )
    SCAFFOLD_SPLITS.append([train_indices, val_indices, test_indices])

/Users/kevin/Dropbox (MIT)/code/astartes/astartes/samplers/extrapolation/scaffold.py:47: NoMatchingScaffold: No matching scaffold was found for the 3271 molecules corresponding to indices {8192, 8193, 2, 3, 4, 5, 6, 8194, 8215, 8216, 8217, 8218, 8219, 8220, 8221, 8222, 8223, 8224, 8225, 8226, 8227, 8228, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 126, 127, 128, 129, 130, 8329, 8330, 8331, 8332, 8333, 8334, 8335, 8336, 8337, 8338, 8339, 8340, 8341, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 8379, 8380, 8381, 8382, 194, 195, 196, 197, 198, 199, 200, 201, 210, 211, 212, 213, 227, 228, 229, 230, 231, 240, 241, 242, 243, 244, 245, 246, 247, 248, 272, 273, 274, 8470, 8471, 8472, 8473, 8474, 8475, 8476, 8477, 8478, 8479, 293, 294, 295, 296, 297, 298, 299, 335, 336, 337, 338, 8541, 8542, 8543, 8544, 854

10138 596 1192 first val index 9000 first test index 6699
10138 596 1192 first val index 7878 first test index 4886
10138 596 1192 first val index 7737 first test index 2904
10138 596 1192 first val index 3633 first test index 11346
10138 596 1192 first val index 9637 first test index 10602


In [10]:
# Persist the scaffold splits as a pickled list of [train, val, test] index lists.
# open(..., 'wb') does not create missing parent directories, so make sure the
# output folder exists first; otherwise a fresh checkout fails here with
# FileNotFoundError.
os.makedirs('RDB7_splits', exist_ok=True)
with open('RDB7_splits/RDB7_splits_scaffold.pkl', 'wb') as f:
    pkl.dump(SCAFFOLD_SPLITS, f)

# KMeans

### Featurize the data using Morgan fingerprints with standard settings

In [16]:
# NOTE(review): the manual fingerprint loop below is left commented out; the
# KMeans cell that follows passes the reactant SMILES directly to
# `train_val_test_split_molecules` instead of a precomputed feature matrix.
# morgan_fps = np.zeros((len(df), 2048))
# for i, row in df.iterrows():
#     rmol = Chem.MolFromSmiles(row.rsmi, params)
#     morgan = morgan_counts_features_generator(rmol)
#     morgan_fps[i, :] = morgan
from astartes.molecules import train_val_test_split_molecules

- Random seeds 1 and 4 produce the same validation and test splits here, so the seeds 0, 1, 2, 3, 4 used in the sections above cannot be reused.
- However, random seeds 1 and 5 also produce the same test set...
- So let's use seeds 0, 3, 6, 9, and 12 instead.

In [17]:
# Five KMeans-based 85:5:10 train/val/test partitions of the reactant SMILES,
# using seeds 0, 3, 6, 9, 12 and 20 clusters.
sampler = 'kmeans'
KMEANS_SPLITS = []
for seed in range(0, 15, 3):
    # Only the last three of the nine returned values (the index arrays) are kept.
    (
        _, _, _,
        _, _, _,
        train_indices, val_indices, test_indices,
    ) = train_val_test_split_molecules(
        df.rsmi.values,
        sampler=sampler,
        hopts={"n_clusters": 20},
        random_state=seed,
        train_size=0.85,
        val_size=0.05,
        test_size=0.1,
        return_indices=True,
    )
    # Report the split sizes plus the first val/test index for this seed.
    print(
        len(train_indices),
        len(val_indices),
        len(test_indices),
        f'first val index {val_indices[0]}',
        f'first test index {test_indices[0]}',
    )
    KMEANS_SPLITS.append([train_indices, val_indices, test_indices])

  warn(


10423 526 977 first val index 214 first test index 240


  warn(


10345 539 1042 first val index 2 first test index 0


  warn(


10373 405 1148 first val index 0 first test index 502


  warn(


10350 468 1108 first val index 502 first test index 194
10248 493 1185 first val index 49 first test index 60


  warn(


In [18]:
# Persist the KMeans splits as a pickled list of [train, val, test] index lists.
# open(..., 'wb') does not create missing parent directories, so make sure the
# output folder exists first; otherwise a fresh checkout fails here with
# FileNotFoundError.
os.makedirs('RDB7_splits', exist_ok=True)
with open('RDB7_splits/RDB7_splits_kmeans.pkl', 'wb') as f:
    pkl.dump(KMEANS_SPLITS, f)