## Import Packages

In [6]:
import os
import random
import sys

import matplotlib.pyplot as plt
import numpy as np
import rdkit.Chem as Chem
import rdkit.Chem.AllChem as AllChem
import rdkit.Chem.Fragments as Fragments
import torch
from rdkit.Chem import PandasTools

# Put the parent of the current working directory on the import path;
# presumably that is where the project's `utils` package lives (verify against
# the repository layout).
curr_dir = os.path.abspath(".")
util_dir = os.path.dirname(curr_dir)
# Re-running this cell in the same kernel must not add duplicate entries.
if util_dir not in sys.path:
    sys.path.append(util_dir)
# NOTE(review): the recorded output of this cell is
# "ModuleNotFoundError: No module named 'utils.kmeans'", i.e. a `utils` package
# was found but it had no `kmeans` submodule. Confirm the module exists under
# `util_dir` (or that no other `utils` package shadows it) before relying on it.
from utils.kmeans import KMeans

ModuleNotFoundError: No module named 'utils.kmeans'

## Curate Dataset

In [2]:
# Source SDF with the training molecules (path is relative to the notebook).
infile = "../data/combined_training_datasets_unique.sdf"

# Dataset name: the file name with its directory and extension stripped.
infile_basename = os.path.basename(infile)
name = os.path.splitext(infile_basename)[0]

# Load the SDF into a DataFrame (molecules end up in the "ROMol" column) and
# preview the first rows.
all_df = PandasTools.LoadSDF(infile)
all_df.head()



Unnamed: 0,pKa,marvin_pKa,marvin_atom,marvin_pKa_type,original_dataset,ID,ROMol
0,6.21,6.09,10,basic,['chembl25'],1702768,<rdkit.Chem.rdchem.Mol object at 0x7f8259d81690>
1,7.46,8.2,9,basic,['chembl25'],273537,<rdkit.Chem.rdchem.Mol object at 0x7f8259d820a0>
2,4.2,3.94,9,basic,['datawarrior'],7175,<rdkit.Chem.rdchem.Mol object at 0x7f8259d82110>
3,3.73,5.91,8,acidic,['datawarrior'],998,<rdkit.Chem.rdchem.Mol object at 0x7f8259d82180>
4,11.0,8.94,13,basic,['chembl25'],560562,<rdkit.Chem.rdchem.Mol object at 0x7f8259d821f0>


In [3]:
# Names of the fragment functions exposed by rdkit.Chem.Fragments. Only
# callables whose name starts with "fr_" are kept, so unrelated module
# attributes can never slip into the feature set. dir() returns names sorted
# alphabetically, which keeps the feature order stable between runs.
patterns = [
    patstr
    for patstr in dir(Fragments)
    if patstr.startswith("fr_") and callable(getattr(Fragments, patstr))
]
print(f"Number of fragment patterns: {len(patterns)}")

# Resolve each name to its function once, so featurization does not repeat the
# attribute lookup for every molecule. PATTERNS[i] corresponds to patterns[i].
PATTERNS = [getattr(Fragments, patstr) for patstr in patterns]

def featurize(mol):
    """Build the feature vector of a molecule.

    Every callable in the module-level ``PATTERNS`` list is applied to ``mol``
    and the results are collected in the same order as ``PATTERNS``.

    Args:
        mol: The object handed to each callable in ``PATTERNS``.

    Returns:
        list: One value per entry of ``PATTERNS``.
    """
    fragment_counts = []
    for count_fn in PATTERNS:
        fragment_counts.append(count_fn(mol))
    return fragment_counts

# pKa cutoff used to turn the measured pKa into a binary label.
PKA_THRESHOLD = 7

# Feature matrix: one row of fragment features per molecule. Iterating over the
# "ROMol" column directly avoids constructing a Series for every row, which is
# what DataFrame.iterrows() does.
X = np.array([featurize(mol) for mol in all_df["ROMol"]], dtype=float)
# Keep X two-dimensional even when the dataset is empty, so X.shape[1] is
# always defined for downstream cells.
X = X.reshape(-1, len(PATTERNS))

# pKa values as a column vector; dtype=float also converts values that may
# have been loaded from the SDF as strings.
pka_values = np.array(all_df["pKa"].tolist(), dtype=float).reshape(-1, 1)
# Binary target: 1 where pKa < PKA_THRESHOLD, otherwise 0.
Y = np.where(pka_values < PKA_THRESHOLD, 1, 0)

assert X.shape[0] == Y.shape[0], "X and Y must have one row per molecule"

Number of fragment patterns: 85


## Hyperparameters

In [4]:
# Seed every random number generator used in this notebook (Python's `random`,
# NumPy's global generator and PyTorch) so that results are reproducible.
# `torch` must be imported in the import cell at the top of the notebook.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

datadir = "."
# presumably the held-out fraction passed to random_split -- confirm at the call site
ratio = 0.1
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Number of molecules and number of features per molecule (X is 2-D).
nsamples, ndim = X.shape

## Split dataset

In [5]:
def random_split(X, Y, ratio):
    """Randomly split ``X`` and ``Y`` into a training part and a held-out part.

    ``int(nsamples * ratio)`` row indices are drawn without replacement from
    NumPy's global random generator (seed it with ``np.random.seed`` for a
    reproducible split); all remaining rows form the training part.

    Args:
        X: Array indexed by sample along its first axis.
        Y: Array with the same number of rows as ``X``.
        ratio: Fraction of the samples to hold out.

    Returns:
        tuple: ``((X_train, Y_train), (X_test, Y_test))``. Training rows keep
        their original relative order; held-out rows are in the order drawn.

    Raises:
        ValueError: If ``X`` and ``Y`` differ in their number of rows, or (from
            ``np.random.choice``) if the held-out size is negative or larger
            than the number of samples.
    """
    nsamples = X.shape[0]
    if Y.shape[0] != nsamples:
        raise ValueError(
            f"X and Y must have the same number of rows, got {nsamples} and {Y.shape[0]}"
        )

    val_size = int(nsamples * ratio)
    val_indices = np.random.choice(nsamples, val_size, replace=False)

    # A boolean mask yields the training indices in ascending order. Iterating a
    # Python set (as a set difference would) gives no ordering guarantee, which
    # makes the row order of the training data depend on hashing details.
    held_out = np.zeros(nsamples, dtype=bool)
    held_out[val_indices] = True
    train_indices = np.flatnonzero(~held_out)

    X_train, Y_train = X[train_indices], Y[train_indices]
    X_test, Y_test = X[val_indices], Y[val_indices]
    return (X_train, Y_train), (X_test, Y_test)

## Model

## Training

## Analysis