In [1]:
import pickle
import glob, os
import numpy as np
from rewards.properties import qed as mol_qed, drd2 as mol_drd2, logP as mol_logP
from coma.properties import penalized_logp
from rdkit import Chem
import tqdm
import networkx as nx
from rdkit.Chem import Descriptors

from multiprocess import Pool
# Module-level pool of 12 worker processes; the scoring helpers defined in
# later cells submit their work to it through pool.map.
pool = Pool(12)

def qed(smile):
    """Parse ``smile`` with RDKit and return the ``mol_qed`` score of the molecule."""
    molecule = Chem.MolFromSmiles(smile)
    return mol_qed(molecule)

def drd2(smile):
    """Parse ``smile`` with RDKit and return the ``mol_drd2`` score of the molecule."""
    molecule = Chem.MolFromSmiles(smile)
    return mol_drd2(molecule)

def logP(smile):
    """Parse ``smile`` with RDKit and return the ``mol_logP`` score of the molecule."""
    molecule = Chem.MolFromSmiles(smile)
    return mol_logP(molecule)

  LARGE_SPARSE_SUPPORTED = LooseVersion(scipy_version) >= '0.14.0'
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, positive=False):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  max_n_alphas=1000, n_jobs=None, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://num

In [2]:
# Experiment selector. PROPERTY_NAME is interpolated into the results/ paths
# read and written by later cells, and SCORING_FT is the callable mapped over
# SMILES strings to compute the property. Keep exactly one pair active; the
# other pairs are left commented out as alternatives.
# Note that both logp04 and logp06 use penalized_logp as the scoring function.

# PROPERTY_NAME = "drd2"
# SCORING_FT = drd2

# PROPERTY_NAME = "qed"
# SCORING_FT = qed

# PROPERTY_NAME = "logp04"
# SCORING_FT = penalized_logp

PROPERTY_NAME = "logp06"
SCORING_FT = penalized_logp

In [3]:
# Collect every path inside the evaluation directory of the selected property
# and report how many were found.
result_glob = f"results/eval_on_coma_{PROPERTY_NAME}/*"
files = glob.glob(result_glob)
print(len(files))

800


# Select the top molecules ranked by the weighted score x*sim + y*prop

In [4]:
from rewards.drd2_scorer import fingerprints_from_mol
from functools import partial
# NOTE: pickle.load can execute arbitrary code contained in the file, so only
# load a classifier file that comes from a trusted source.
# The context manager closes the file handle as soon as the model is loaded.
with open("rewards/clf_py36.pkl", 'rb') as clf_file:
    clf = pickle.load(clf_file)

def sm_fp(sm):
    """Parse the SMILES string ``sm`` and return ``fingerprints_from_mol`` of it."""
    molecule = Chem.MolFromSmiles(sm)
    return fingerprints_from_mol(molecule)

def get_drd2_on_fp(clf, fp):
    """Return the classifier's second-column probability for each fingerprint.

    Args:
        clf: Object exposing ``predict_proba``.
        fp: Array-like holding one or more fingerprints; it is reshaped into
            rows of 2048 values before being passed to the classifier.

    Returns:
        Column 1 of ``clf.predict_proba`` evaluated on the reshaped batch.
    """
    batch = np.array(fp).reshape(-1, 2048)
    probabilities = clf.predict_proba(batch)
    return probabilities[:, 1]
    
    
def get_drd2_on_smile_arr_mp(smiles_arr):
    """Score an array of SMILES with the DRD2 classifier using the worker pool.

    Fingerprints are computed in this process in batches of 10 SMILES; each
    batch is then sent to the pool, where ``get_drd2_on_fp`` runs the
    classifier on it.

    Args:
        smiles_arr: Array of SMILES strings (must expose ``.shape`` and slicing).

    Returns:
        The per-batch results concatenated into a single array.
    """
    batch_size = 10
    fp_batches = []
    for start in range(0, smiles_arr.shape[0], batch_size):
        chunk = smiles_arr[start:start + batch_size]
        fp_batches.append([sm_fp(sm) for sm in chunk])
    score_batch = partial(get_drd2_on_fp, clf)
    per_batch = pool.map(score_batch, fp_batches, chunksize=1000 // batch_size)
    return np.concatenate(per_batch)
    
    
    
# Timing run on a single drd2 result file. The path is hard-coded and does not
# follow PROPERTY_NAME.
samples = pickle.load(open("results/eval_on_coma_drd2/0.pickle", 'rb'))

# Presumably "traj" maps each key to a SMILES string -- TODO confirm the schema.
traj_d = samples["traj"]
all_keys = np.array(list(traj_d.keys()))
# Look up every key to get an array of the stored values.
smiles_arr = np.vectorize(traj_d.get)(all_keys)

# Time the pooled DRD2 scoring on the first 1000 entries.
%time res = get_drd2_on_smile_arr_mp(smiles_arr[:1000])

CPU times: user 449 ms, sys: 32.9 ms, total: 481 ms
Wall time: 4.7 s


In [8]:
def get_prop_multiprocess(smiles_arr):
    """Compute the selected property for every SMILES using the worker pool.

    When ``PROPERTY_NAME`` is ``"drd2"`` the batched classifier path
    (``get_drd2_on_smile_arr_mp``) is used; otherwise ``SCORING_FT`` is mapped
    over the SMILES in chunks of 100.

    Returns:
        Array with one property value per input SMILES.
    """
    if PROPERTY_NAME != "drd2":
        scores = pool.map(SCORING_FT, smiles_arr, chunksize=100)
        return np.array(scores)
    return get_drd2_on_smile_arr_mp(smiles_arr)
    

def _select_top_unique(smiles_arr, sim_arr, prop_arr, sc, pc, topk):
    """Pick up to ``topk`` distinct SMILES in descending ``sc*sim + pc*prop`` order."""
    results = {"sim": [], "prop": [], "smiles": []}
    scores = sc * sim_arr + pc * prop_arr
    seen = set()

    # A stable descending sort visits ties in ascending index order, which is
    # the order repeated argmax calls would select them in. Walking the sorted
    # indices once always terminates, even when fewer than ``topk`` distinct
    # SMILES exist, and needs no sentinel value that a real score could beat.
    for idx in np.argsort(-scores, kind="stable"):
        if len(results["smiles"]) >= topk:
            break
        smile = smiles_arr[idx]
        if smile in seen:
            # Same molecule reached again with a lower (or equal) score.
            continue
        seen.add(smile)
        results["sim"].append(sim_arr[idx])
        results["prop"].append(prop_arr[idx])
        results["smiles"].append(smile)

    return results


def get_top_mols(file, sim_coeffs=(), prop_coeffs=(), topk=20):
    """Yield the top-scoring distinct molecules of one result file per coefficient pair.

    The pickle at ``file`` must hold a dict with a ``"traj"`` mapping
    (key -> SMILES) and a ``"sim"`` mapping (key -> similarity or ``None``).
    Entries whose similarity is ``None`` are dropped, the property of the
    remaining SMILES is computed once with ``get_prop_multiprocess``, and for
    every ``(sc, pc)`` pair the molecules are ranked by ``sc * sim + pc * prop``.

    Args:
        file: Path of the pickled result file. NOTE: ``pickle.load`` can
            execute arbitrary code, so only pass files from a trusted source.
        sim_coeffs: Similarity weights, one per combination.
        prop_coeffs: Property weights, paired positionally with ``sim_coeffs``.
        topk: Maximum number of distinct SMILES returned per combination.

    Yields:
        One dict per coefficient pair with parallel lists under ``"sim"``,
        ``"prop"`` and ``"smiles"``, best score first. Fewer than ``topk``
        entries are returned when the file holds fewer distinct SMILES.
    """
    # The context manager closes the handle even if unpickling fails.
    with open(file, 'rb') as handle:
        file_data = pickle.load(handle)

    # Get info and sim mappings
    traj_d, sim_d = file_data["traj"], file_data["sim"]

    # Some similarities are None - remove those entries. Filtering in plain
    # Python avoids depending on NumPy's elementwise comparison against None.
    smiles, sims = [], []
    for key in traj_d:
        sim = sim_d.get(key)
        if sim is not None:
            smiles.append(traj_d[key])
            sims.append(sim)
    smiles_arr = np.array(smiles)
    sim_arr = np.array(sims, dtype=float)

    # The property does not depend on the coefficients, so compute it once;
    # skip the worker pool entirely when nothing survived the filter.
    if smiles:
        prop_arr = np.asarray(get_prop_multiprocess(smiles_arr), dtype=float)
    else:
        prop_arr = np.empty(0, dtype=float)

    for sc, pc in zip(sim_coeffs, prop_coeffs):
        yield _select_top_unique(smiles_arr, sim_arr, prop_arr, sc, pc, topk)

# Coefficients for sim and prop
# sim_coeff_list = [1, 1, 0, 1]
# prop_coeff_list = [1, 2, 1, 0]
sim_coeff_list = [30, 35, 40]
prop_coeff_list = [1, 1, 1]


def result_file_name(x, y):
    """Return the result key / file stem for one (sim, prop) coefficient pair."""
    return f"{x}sim_{y}{PROPERTY_NAME}"


# Result dict: one accumulator per coefficient pair, filled across all files.
result_dict = {
    result_file_name(x, y): {"sim": [], "prop": [], "smiles": []}
    for x, y in zip(sim_coeff_list, prop_coeff_list)
}
print(result_dict)

# Make dir (exist_ok avoids the separate check-then-create step)
folder = f"results/best_{PROPERTY_NAME}"
os.makedirs(folder, exist_ok=True)

for file in tqdm.tqdm(files):
    # get_top_mols yields one result dict per coefficient pair, in the same
    # order as the coefficient lists, so the three sequences can be zipped.
    top_per_pair = get_top_mols(file, sim_coeff_list, prop_coeff_list, topk=20)
    for sc, pc, d in zip(sim_coeff_list, prop_coeff_list, top_per_pair):
        for key in d:
            result_dict[result_file_name(sc, pc)][key].extend(d[key])

for filename_base in result_dict:
    result_file_path = os.path.join(folder, f"{filename_base}.pickle")
    # Close the handle explicitly so the pickle is fully written to disk
    # before the path is reported as dumped.
    with open(result_file_path, 'wb') as out_file:
        pickle.dump(result_dict[filename_base], out_file)
    print(f"Dumped {result_file_path}.")

{'30sim_1logp06': {'sim': [], 'prop': [], 'smiles': []}, '35sim_1logp06': {'sim': [], 'prop': [], 'smiles': []}, '40sim_1logp06': {'sim': [], 'prop': [], 'smiles': []}}


100%|████████████████████████████████████████████████████████████| 800/800 [2:06:28<00:00,  9.49s/it]


Dumped results/best_logp06/30sim_1logp06.pickle.
Dumped results/best_logp06/35sim_1logp06.pickle.
Dumped results/best_logp06/40sim_1logp06.pickle.


# Results

In [10]:
# Baseline rows: a method name followed by seven space-separated values per line.
coma_results = '''COMA 0.7142 1.000 0.799 0.792 0.329 0.999 0.366
VJTNN 0.6787 0.999 0.803 0.797 0.344 0.696 0.433
VJTNN+GAN 0.7137 0.999 0.787 0.780 0.324 0.885 0.507
CORE 0.6947 0.999 0.800 0.793 0.338 0.779 0.459
HierG2G 0.6922 0.981 0.680 0.673 0.258 0.971 0.590
HierG2G+BT 0.6783 1.000 0.808 0.802 0.336 0.671 0.453
JTVAE 0.4077 0.935 0.091 0.084 0.402 0.934 0.000
UGMMT 0.7088 0.999 0.737 0.730 0.261 0.996 0.530'''

# Re-emit each row tab-separated. Split on line boundaries, not on the
# letter "n", so every row is handled as its own record.
for row in coma_results.splitlines():
    row = row.split(" ")
    print("\t".join(row))

COMA	0.7142	1.000	0.799	0.792	0.329	0.999	0.366
VJTNN	0.6787	0.999	0.803	0.797	0.344	0.696	0.433
VJTNN+GAN	0.7137	0.999	0.787	0.780	0.324	0.885	0.507
CORE	0.6947	0.999	0.800	0.793	0.338	0.779	0.459
HierG2G	0.6922	0.981	0.680	0.673	0.258	0.971	0.590
HierG2G+BT	0.6783	1.000	0.808	0.802	0.336	0.671	0.453
JTVAE	0.4077	0.935	0.091	0.084	0.402	0.934	0.000
UGMMT	0.7088	0.999	0.737	0.730	0.261	0.996	0.530


In [9]:
import pickle
import numpy as np
import glob
import tqdm
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect
from rdkit import Chem
from multiprocess import Pool

# Rebinds the module-level `pool` (first created in the opening cell) to a
# fresh 12-process pool; `diversity` below submits its work to it.
pool = Pool(12)

class FastTanimotoOneToBulk:
    """Callable computing the Tanimoto similarity of one SMILES against a fixed set.

    The reference SMILES are fingerprinted once at construction time; calling
    the instance with a query SMILES returns one similarity per reference.
    """

    def __init__(self, bs):
        """Store the reference SMILES ``bs`` and stack their fingerprints row-wise."""
        self.bs = bs
        reference_rows = [self._fingerprints_from_smi(smi) for smi in self.bs]
        self.b_fps = np.vstack(reference_rows)

    def __call__(self, a):
        """Return the array of similarities between SMILES ``a`` and every reference."""
        query_fp = self._fingerprints_from_smi(a)
        # Tanimoto = |intersection| / |union| of the set bits, per reference row.
        shared_bits = (query_fp & self.b_fps).sum(axis=1)
        union_bits = (query_fp | self.b_fps).sum(axis=1)
        return shared_bits / union_bits

    def _fingerprints_from_smi(self, smi):
        """Return the 2048-bit Morgan fingerprint of ``smi`` as a boolean array."""
        molecule = Chem.MolFromSmiles(smi)
        bit_vect = GetMorganFingerprintAsBitVect(molecule, 2, nBits=2048, useChirality=False)
        bit_string = bit_vect.ToBitString()
        return np.array([char == '1' for char in bit_string])

def diversity(smiles):
    """Return one minus the mean pairwise Tanimoto similarity of ``smiles``.

    Every molecule is compared against the whole list through the worker
    pool (with a tqdm progress bar); the self-comparisons are then removed
    from the average.

    Args:
        smiles: Sequence of SMILES strings.

    Returns:
        ``1 - mean similarity`` over all pairs of distinct positions.
    """
    scorer = FastTanimotoOneToBulk(smiles)
    n_mols = len(smiles)
    row_means = [
        row.mean()
        for row in tqdm.tqdm(pool.imap(scorer, smiles, chunksize=200), total=n_mols)
    ]

    # Average over all rows, self-similarities included.
    mean_sim = np.mean(row_means)

    # Each row holds one extra sim(mol, mol) == 1 term; take it out so that
    # only comparisons between different positions remain.
    mean_sim = (mean_sim * n_mols - 1) / (n_mols - 1)

    return 1 - mean_sim

# CHANGE THESE TWO
PROPERTY_NAME = "logp06"
SCORING_FT = penalized_logp

# The test-set mean does not depend on the result file, so score the test
# molecules once instead of once per file. The context manager closes the file.
with open(f"datasets/coma/{PROPERTY_NAME}/rdkit_test.txt", 'r') as test_file:
    test_smiles = [line.strip("\n") for line in test_file]
mean_test_property = np.mean([SCORING_FT(smi) for smi in test_smiles])

for file in glob.glob(f"results/best_{PROPERTY_NAME}/*"):
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(file, 'rb') as result_file:
        dicty = pickle.load(result_file)
    # Columns: [0] mean of columns 1-6, [2] mean property, [3] column 2 minus
    # the test-set mean, [4] mean similarity, [6] diversity; columns [1] and
    # [5] are fixed at 1.0.
    results = [0, 1.0, np.mean(dicty["prop"]), 0, np.mean(dicty["sim"]), 1.0, np.mean(diversity(dicty["smiles"]))]
    results[3] = results[2] - mean_test_property
    results[0] = np.mean(results[1:])
    print("\t".join([file] + list(map(lambda x: str(round(x, 3)), results))))

100%|██████████████████████████████████████████████████████████| 16000/16000 [02:44<00:00, 97.52it/s]


results/best_logp06/35sim_1logp06.pickle	1.507	1.0	1.522	4.266	0.373	1.0	0.879


100%|██████████████████████████████████████████████████████████| 16000/16000 [02:44<00:00, 97.14it/s]


results/best_logp06/20sim_1logp06.pickle	2.397	1.0	4.253	6.998	0.267	1.0	0.867


100%|██████████████████████████████████████████████████████████| 16000/16000 [02:44<00:00, 97.41it/s]


results/best_logp06/15sim_1logp06.pickle	2.774	1.0	5.42	8.165	0.2	1.0	0.858


100%|██████████████████████████████████████████████████████████| 16000/16000 [02:44<00:00, 97.46it/s]


results/best_logp06/30sim_1logp06.pickle	1.708	1.0	2.135	4.879	0.354	1.0	0.877


100%|██████████████████████████████████████████████████████████| 16000/16000 [02:44<00:00, 97.13it/s]


results/best_logp06/40sim_1logp06.pickle	1.372	1.0	1.112	3.856	0.384	1.0	0.881


100%|██████████████████████████████████████████████████████████| 16000/16000 [02:44<00:00, 96.99it/s]


results/best_logp06/10sim_1logp06.pickle	3.004	1.0	6.144	8.888	0.144	1.0	0.851


100%|██████████████████████████████████████████████████████████| 16000/16000 [02:44<00:00, 97.40it/s]


results/best_logp06/25sim_1logp06.pickle	2.012	1.0	3.068	5.812	0.32	1.0	0.873
