In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
import os
import sys
sys.path.append("..")  # add top folder to path

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import moses
import gentrl

import tinymolecule as tiny

## Initializing the encoder multilayer perceptron (MLP) and decoder MLP to form the variational autoencoder
We note that the latent size for decoder is twice as small because we're drawing a sample from a distribution, whereas for the encoder, we get two values: mean and log variance. The MLPs are relatively small neural networks, by default there are two hidden layers, each with 100 neurons, and output layer (latent space) is 10 neurons. The decoder shares a similar structure but in reverse: the input from latent space is 5 neurons, with two hidden layers of 100 neurons.

In [2]:
enc = tiny.LinearEncoder(num_hidden_layers=2, latent_size=10)  # mu and sigma
dec = tiny.LinearDecoder(num_hidden_layers=2, latent_size=5)  # a single sample
vae = tiny.TinyVAE(enc, dec)

In [3]:
vae

TinyVAE(
  (encoder): LinearEncoder(
    (layers): ModuleList(
      (0): Linear(in_features=50, out_features=100, bias=True)
      (1): Linear(in_features=100, out_features=100, bias=True)
      (2): Linear(in_features=100, out_features=100, bias=True)
      (3): Linear(in_features=100, out_features=10, bias=True)
    )
    (dropout): ModuleList(
      (0): Dropout(p=0.85, inplace=False)
      (1): Dropout(p=0.85, inplace=False)
      (2): Dropout(p=0.85, inplace=False)
    )
  )
  (decoder): LinearDecoder(
    (layers): ModuleList(
      (0): Linear(in_features=5, out_features=100, bias=True)
      (1): Linear(in_features=100, out_features=100, bias=True)
      (2): Linear(in_features=100, out_features=100, bias=True)
      (3): Linear(in_features=100, out_features=50, bias=True)
    )
    (dropout): ModuleList(
      (0): Dropout(p=0.85, inplace=False)
      (1): Dropout(p=0.85, inplace=False)
      (2): Dropout(p=0.85, inplace=False)
    )
  )
)

## Loading in assay data on CCR4
In the future, we will filter this data based on an IC50 threshold to pick out only relevant molecules.

In [61]:
data_dir = "/Users/Munchic/Developer/Capstone/tinymolecule/data/ccr4_ic50_meta.csv"
assays = pd.read_csv(data_dir)
assays.head()

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,bao_endpoint,bao_format,bao_label,canonical_smiles,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,31863,[],CHEMBL663853,Inhibitory concentration against human DNA top...,B,BAO_0000190,BAO_0000357,single protein format,c1ccc(-c2nc3c(-c4nc5ccccc5o4)cccc3o2)cc1,...,Homo sapiens,DNA topoisomerase II alpha,9606.0,,,IC50,uM,UO_0000065,,100.0
1,,31864,[],CHEMBL872937,In vivo inhibitory activity against human Hepa...,B,BAO_0000190,BAO_0000218,organism-based format,Cc1ccc2oc(-c3cccc(N4C(=O)c5ccc(C(=O)O)cc5C4=O)...,...,Homo sapiens,Heparanase,9606.0,,,IC50,uM,UO_0000065,,2.5
2,,31865,[],CHEMBL693237,In vivo concentration required against angioge...,F,BAO_0000190,BAO_0000218,organism-based format,Cc1ccc2oc(-c3cccc(N4C(=O)c5ccc(C(=O)O)cc5C4=O)...,...,,NON-PROTEIN TARGET,,,,IC50,uM,UO_0000065,,50.0
3,,31866,[],CHEMBL872937,In vivo inhibitory activity against human Hepa...,B,BAO_0000190,BAO_0000218,organism-based format,COc1ccccc1-c1ccc2oc(-c3ccc(OC)c(N4C(=O)c5ccc(C...,...,Homo sapiens,Heparanase,9606.0,,,IC50,uM,UO_0000065,,9.0
4,Not Determined,31867,[],CHEMBL693238,In vivo concentration required against angioge...,F,BAO_0000190,BAO_0000218,organism-based format,COc1ccccc1-c1ccc2oc(-c3ccc(OC)c(N4C(=O)c5ccc(C...,...,,NON-PROTEIN TARGET,,,,IC50,uM,,,


In [64]:
assays.columns

Index(['activity_comment', 'activity_id', 'activity_properties',
       'assay_chembl_id', 'assay_description', 'assay_type', 'bao_endpoint',
       'bao_format', 'bao_label', 'canonical_smiles', 'data_validity_comment',
       'data_validity_description', 'document_chembl_id', 'document_journal',
       'document_year', 'ligand_efficiency', 'molecule_chembl_id',
       'molecule_pref_name', 'parent_molecule_chembl_id', 'pchembl_value',
       'potential_duplicate', 'qudt_units', 'record_id', 'relation', 'src_id',
       'standard_flag', 'standard_relation', 'standard_text_value',
       'standard_type', 'standard_units', 'standard_upper_value',
       'standard_value', 'target_chembl_id', 'target_organism',
       'target_pref_name', 'target_tax_id', 'text_value', 'toid', 'type',
       'units', 'uo_units', 'upper_value', 'value'],
      dtype='object')

In [84]:
ic50_data = assays[assays["type"] == "IC50"]
ic50_nM_data = ic50_data[ic50_data["standard_units"] == "nM"]
ic50_nM_data.to_csv("../data/ccr4_ic50.csv")

In [179]:
a = pd.read_csv("/Users/Munchic/Developer/Capstone/tinymolecule/data/ccr4_ic50_train.csv", usecols=['SMILES'], squeeze=True).astype(str).tolist()
len(a)

5831

In [182]:
vocab.c2i

{'#': 0,
 '(': 1,
 ')': 2,
 '+': 3,
 '-': 4,
 '.': 5,
 '/': 6,
 '1': 7,
 '2': 8,
 '3': 9,
 '4': 10,
 '5': 11,
 '6': 12,
 '7': 13,
 '8': 14,
 '=': 15,
 '@': 16,
 'B': 17,
 'C': 18,
 'F': 19,
 'H': 20,
 'I': 21,
 'K': 22,
 'N': 23,
 'O': 24,
 'P': 25,
 'S': 26,
 '[': 27,
 '\\': 28,
 ']': 29,
 'a': 30,
 'c': 31,
 'i': 32,
 'l': 33,
 'n': 34,
 'o': 35,
 'r': 36,
 's': 37,
 '<bos>': 38,
 '<eos>': 39,
 '<pad>': 40,
 '<unk>': 41}

In [174]:
a = pd.read_csv("/Users/Munchic/Developer/Capstone/tinymolecule/data/ccr4_ic50.csv")
a.g

Unnamed: 0.1,Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,bao_endpoint,bao_format,bao_label,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,0,,31863,[],CHEMBL663853,Inhibitory concentration against human DNA top...,B,BAO_0000190,BAO_0000357,single protein format,...,Homo sapiens,DNA topoisomerase II alpha,9606.0,,,IC50,uM,UO_0000065,,100.0
1,1,,31864,[],CHEMBL872937,In vivo inhibitory activity against human Hepa...,B,BAO_0000190,BAO_0000218,organism-based format,...,Homo sapiens,Heparanase,9606.0,,,IC50,uM,UO_0000065,,2.5
2,2,,31865,[],CHEMBL693237,In vivo concentration required against angioge...,F,BAO_0000190,BAO_0000218,organism-based format,...,,NON-PROTEIN TARGET,,,,IC50,uM,UO_0000065,,50.0
3,3,,31866,[],CHEMBL872937,In vivo inhibitory activity against human Hepa...,B,BAO_0000190,BAO_0000218,organism-based format,...,Homo sapiens,Heparanase,9606.0,,,IC50,uM,UO_0000065,,9.0
4,5,,31868,[],CHEMBL760688,Inhibitory activity against Palmitoyl-CoA oxid...,B,BAO_0000190,BAO_0000357,single protein format,...,Rattus norvegicus,Palmitoyl-CoA oxidase,10116.0,,,IC50,uM,UO_0000065,,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5899,28602,,60465,[],CHEMBL755012,Activity against opioid receptor in guinea pig...,F,BAO_0000190,BAO_0000221,tissue-based format,...,Homo sapiens,Opioid receptors; mu & kappa,9606.0,,,IC50,nM,UO_0000065,,6.3
5900,28635,,60498,[],CHEMBL747657,Binding affinity towards Muscarinic acetylchol...,B,BAO_0000190,BAO_0000221,tissue-based format,...,Rattus norvegicus,Muscarinic acetylcholine receptor M1,10116.0,,,IC50,nM,UO_0000065,,23.0
5901,28636,,60499,[],CHEMBL744022,Binding affinity towards muscarinic receptor o...,B,BAO_0000190,BAO_0000221,tissue-based format,...,Rattus norvegicus,Muscarinic acetylcholine receptor M2,10116.0,,,IC50,nM,UO_0000065,,100.0
5902,28639,,60502,[],CHEMBL651151,Concentration required for 50% inhibition of b...,B,BAO_0000190,BAO_0000221,tissue-based format,...,Rattus norvegicus,Angiotensin II receptor,10116.0,,,IC50,uM,UO_0000065,,27.0


In [115]:
ccr4_ic50_nM_smiles = ['SMILES'] + list(ic50_nM_data["canonical_smiles"])

with open("../data/ccr4_ic50_train.txt", "a") as f:
    for entry in ccr4_ic50_nM_smiles:
        if type(entry) == str:
            f.write(entry + '\n')

In [123]:
ccr4_ic50_smiles_df = pd.DataFrame({"SMILES": list(ic50_nM_data["canonical_smiles"])})
ccr4_ic50_smiles_df.to_csv("../data/ccr4_ic50_train.csv")

In [116]:
with open("../data/ccr4_ic50_train.txt", "r") as f:
    print(f.read())

)c2C(=O)c13.Cl
CCN(CC)CCNc1ccc2c3c(nn2CCNCCO)-c2c(O)ccc(O)c2C(=O)c13.Cl
CN(C)CCn1nc2c3c(c(NCCCN)ccc31)C(=O)c1c(O)ccc(O)c1-2.Cl
Nc1cc2c3ccccc3nc-2c[nH]1
CCN1CCC[C@H]1CNC(=O)c1c(OC)ccc(Br)c1O
CCc1ccc2c(c1)c(OC)c(C)c1nc(C(=O)c3ccccc3)cn12
COc1ccc2c(C)nc(O)cc2c1
CCCCC1N(Cc2ccc(-c3ccccc3-c3nn[nH]n3)cc2)C(=O)c2cc(C(C)(C)OC)ccc2N1C
CCCCc1nc(Cl)c(CO)n1Cc1ccc(-c2ccccc2-c2nnn[nH]2)cc1
CC(C)(C)N(CC(=O)N(CC(=O)NO)CC1CCCCC1)C(=O)Nc1ccc(Oc2ccccc2)cc1
O=C(CN(C(=O)CN(C(=O)Nc1ccc(Oc2ccccc2)cc1)C1CCCc2ccccc21)C1CCCc2ccccc21)NO
CCCCN(CC(=O)NO)C(=O)CN(CCCc1ccccc1)C(=O)Nc1ccc(Oc2ccccc2)cc1
O=C(CN(CCCc1ccccc1)C(=O)CN(CCCc1ccccc1)C(=O)Nc1ccc(Oc2ccccc2)cc1)NO
COc1ccc(OC)c(CCNC(=S)Nc2ccc(Br)cn2)c1
COc1ccc(OC)c(CCNC(=S)Nc2ccc(Br)cn2)c1
COc1ccc(OC)c(CCNC(=S)Nc2ccc(Br)cn2)c1
COc1ccc(OC)c(CCNC(=S)Nc2ccc(Br)cn2)c1
COc1ccc(OC)c(CCNC(=S)Nc2ccc(Br)cn2)c1
Fc1ccccc1CC/N=C(\S)Nc1ccc(Br)cn1
Fc1ccccc1CC/N=C(\S)Nc1ccc(Br)cn1
Fc1ccccc1CC/N=C(\S)Nc1ccc(Br)cn1
Fc1ccccc1CC/N=C(\S)Nc1ccc(Br)cn1
Fc1ccccc1CC/N=C(\S)Nc1ccc(Br)cn1
C

In [160]:
ccr4_ic50_smiles_vals = ccr4_ic50_smiles_df[ccr4_ic50_smiles_df['SMILES'].apply(lambda x: isinstance(x, str))]['SMILES'].values
ccr4_ic50_smiles_vals

array(['c1ccc(-c2nc3c(-c4nc5ccccc5o4)cccc3o2)cc1',
       'Cc1ccc2oc(-c3cccc(N4C(=O)c5ccc(C(=O)O)cc5C4=O)c3)nc2c1',
       'Cc1ccc2oc(-c3cccc(N4C(=O)c5ccc(C(=O)O)cc5C4=O)c3)nc2c1', ...,
       'CN1CCN(CC(=O)N2c3ccccc3NC(=O)c3ccccc32)CC1',
       'CCC1Nc2ccccc2C(=O)N1Cc1ccc(-c2ccccc2-c2nn[nH]n2)cc1',
       'CCCCC1Nc2ccc(Cc3ccccc3)cc2C(=O)N1Cc1ccc(-c2ccccc2-c2nn[nH]n2)cc1'],
      dtype=object)

In [161]:
vocab = moses.CharVocab.from_data(ccr4_ic50_smiles_vals)
v

In [164]:
torch.save(vocab, "/Users/Munchic/Developer/Capstone/tinymolecule/data/ccr4_ic50_vocab.pt")

In [139]:
vocab = moses.CharVocab.from_data(ccr4_ic50_smiles_df.values)

TypeError: '<' not supported between instances of 'str' and 'float'

In [171]:
a = pd.read_csv("/Users/Munchic/Developer/Capstone/tinymolecule/data/ccr4_ic50_train.csv", usecols=['SMILES'], squeeze=True).astype(str).tolist()
len(a)

5904

In [156]:
train = moses.get_dataset('train')
vocab = CharVocab.from_data(train)

In [158]:
train

array(['CCCS(=O)c1ccc2[nH]c(=NC(=O)OC)[nH]c2c1',
       'CC(C)(C)C(=O)C(Oc1ccc(Cl)cc1)n1ccnc1',
       'Cc1c(Cl)cccc1Nc1ncccc1C(=O)OCC(O)CO', ...,
       'NC(=O)c1ccc2ccccc2c1Br',
       'CC(=O)Nc1cccc(-c2nc3cc(C)ccc3[nH]c2=O)c1',
       'CC(NC(=O)OC(C)(C)C)c1nc(CO)nn1Cc1ccccc1'], dtype=object)

In [135]:
vocab.c2i

{'#': 0,
 '(': 1,
 ')': 2,
 '-': 3,
 '1': 4,
 '2': 5,
 '3': 6,
 '4': 7,
 '5': 8,
 '6': 9,
 '=': 10,
 'B': 11,
 'C': 12,
 'F': 13,
 'H': 14,
 'N': 15,
 'O': 16,
 'S': 17,
 '[': 18,
 ']': 19,
 'c': 20,
 'l': 21,
 'n': 22,
 'o': 23,
 'r': 24,
 's': 25,
 '<bos>': 26,
 '<eos>': 27,
 '<pad>': 28,
 '<unk>': 29}

In [5]:
# get assays involving molecules without complex chiral centers since gentrl cannot encode them
# for some reason, gentrl doesn't encode selenium or phosphorus so we remove them for now -> will look into this
assays_no_cx_chir = assays[assays["canonical_smiles"].str.contains("@|I|[\+]|\-|\.|\/|Se|P|7|8") == False]
assays_no_cx_chir.head()

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,bao_endpoint,bao_format,bao_label,canonical_smiles,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
17,,31880,[],CHEMBL651396,Effective concentration of compound achieving ...,F,BAO_0000188,BAO_0000219,cell-based format,Cc1cn(C2C=CC(COC(=O)CN3CCNCC3)O2)c(=O)[nH]c1=O,...,Homo sapiens,CCRF-CEM,9606.0,,,EC50,uM,UO_0000065,,0.0636
18,,31881,[],CHEMBL657398,Cytotoxic concentration of compound required t...,F,BAO_0000187,BAO_0000219,cell-based format,Cc1cn(C2C=CC(COC(=O)CN3CCNCC3)O2)c(=O)[nH]c1=O,...,Homo sapiens,CCRF-CEM,9606.0,,,CC50,uM,UO_0000065,,1000.0
19,,31882,[],CHEMBL845142,Selectivity index expressed as the ratio of CC...,F,BAO_0000179,BAO_0000019,assay format,Cc1cn(C2C=CC(COC(=O)CN3CCNCC3)O2)c(=O)[nH]c1=O,...,,Unchecked,,,,Selectivity index,uM,UO_0000064,,15723.0
20,Not Determined,31883,[],CHEMBL752464,Antimycobacterial activity against Mycobacteri...,F,BAO_0000376,BAO_0000218,organism-based format,Cc1cn(C2C=CC(COC(=O)CN3CCNCC3)O2)c(=O)[nH]c1=O,...,Mycobacterium tuberculosis,Mycobacterium tuberculosis,1773.0,,,Inhibition,,,,
21,,31884,[],CHEMBL618375,In vitro hydrolysis in human plasma,A,BAO_0002115,BAO_0000366,cell-free format,Cc1cn(C2C=CC(COC(=O)CN3CCNCC3)O2)c(=O)[nH]c1=O,...,,ADMET,,,,T1/2,min,UO_0000032,,20.0


In [6]:
molecules = gentrl.tokenizer.encode(assays_no_cx_chir["canonical_smiles"])[0].float()  # encode SMIl
molecules[0]  # an example molecule

tensor([ 1., 23., 10., 24., 10., 18., 22., 23.,  3., 23., 13., 23., 23., 22.,
        23.,  9., 23., 22., 13.,  9., 15., 23.,  6., 14., 23., 23.,  6., 23.,
        23., 14., 15.,  9.,  3., 15., 10., 22., 13.,  9., 15.,  7., 18., 21.,
        11., 10., 24., 13.,  9.,  2.,  2.,  2.])

In [60]:
molecules.shape

torch.Size([10067, 50])

## Encoder time!
Let's see how our encoder works. When we pass in this 50-dimensional vector, it will spit out a 10-dimension vector, 5 dimensions to encode a mean, 5 for standard deviation in variational autoencoder's latent space. Essentially, each molecule becomes a distribution, then it's reconstruction is a sample from this distribution.

In [7]:
out = vae(molecules[0:1])
print("reconstructred molecule (untrained VAE):\n", out[0].detach(), "\n")
print("encoded mean (untrained VAE):\n", out[1].detach())
print("encoded standard deviation (untrained VAE):\n", out[2].detach())

reconstructred molecule (untrained VAE):
 tensor([[1.0000e+00, 1.3355e-03, 1.0000e+00, 0.0000e+00, 4.1989e-36, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00, 1.4347e-38,
         1.0000e+00, 2.6623e-35, 1.1020e-03, 1.0000e+00, 0.0000e+00, 0.0000e+00,
         1.0000e+00, 1.0000e+00, 1.0000e+00, 0.0000e+00, 6.1429e-25, 0.0000e+00,
         1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00, 0.0000e+00, 1.0000e+00,
         0.0000e+00, 0.0000e+00, 1.0000e+00, 1.0000e+00, 1.5686e-19, 2.5568e-22,
         1.0000e+00, 1.0000e+00, 6.6948e-18, 1.0000e+00, 1.0000e+00, 4.5150e-10,
         1.0000e+00, 1.2431e-12, 1.0000e+00, 1.0000e+00, 1.0000e+00, 0.0000e+00,
         1.7119e-13, 3.5437e-17]]) 

encoded mean (untrained VAE):
 tensor([[ 12.9422,  -8.8463,  15.6615,   2.8145, -20.8087]])
encoded standard deviation (untrained VAE):
 tensor([[ -3.6916,  14.5477,   0.2585, -23.0162, -18.7246]])


In [8]:
# it should not make any sense as we try to convert this failed reconstructed vector into a SMILES string
int_out = torch.tensor(out[0].detach(), dtype=int)
print("Reconstructed SMILES (doesn't make much sense):", gentrl.tokenizer.decode(int_out))

Reconstructed SMILES (doesn't make much sense): ['']


## Time to train the VAE


In [9]:
molecloader = tiny.dataset.molecloader(molecules)
optimizer = optim.Adam(vae.parameters(), lr=1e-2)
criterion = nn.BCELoss(reduction="sum")

In [10]:
train_loss = tiny.train.simple_train(vae, molecloader, criterion, optimizer, num_epochs=20)

epoch 1/20 started at 0.0000 s
epoch_loss: 7.205274817850362e+23
epoch 2/20 started at 0.4655 s
epoch_loss: 62148057.19907311
epoch 3/20 started at 0.9639 s
epoch_loss: 86906728.23145342
epoch 4/20 started at 1.4073 s
epoch_loss: 213449.64518370808
epoch 5/20 started at 1.8716 s
epoch_loss: 118348.55338876459
epoch 6/20 started at 2.3256 s
epoch_loss: 310054.51866246475
epoch 7/20 started at 2.8156 s
epoch_loss: 6591491.110914399
epoch 8/20 started at 3.3286 s
epoch_loss: 19613124.646462355
epoch 9/20 started at 3.8416 s
epoch_loss: 757912.9861009815
epoch 10/20 started at 4.3547 s
epoch_loss: 1273495183524.467
epoch 11/20 started at 4.8666 s
epoch_loss: 35945.32612706438
epoch 12/20 started at 5.3882 s
epoch_loss: 83309.42953877509
epoch 13/20 started at 5.9440 s
epoch_loss: 4824.1162858794005
epoch 14/20 started at 6.5099 s
epoch_loss: 2338.2558582161046
epoch 15/20 started at 7.0340 s
epoch_loss: 2492.2321027924745
epoch 16/20 started at 7.5549 s
epoch_loss: 3228.391173930108
epoch 

## Results of a trained VAE

In [40]:
# let's try on one molecule
print("actual molecule:\n", molecules[0:1])
print("actual SMILES:", gentrl.tokenizer.decode(molecules[0:1]), "\n")

recon = torch.tensor(vae(molecules[0:1])[0].detach() * 27, dtype=int)
print("reconstructed molecule:\n", recon)
print("reconstructed SMILES:", gentrl.tokenizer.decode(recon), "\n")

actual molecule:
 tensor([[ 1., 23., 10., 24., 10., 18., 22., 23.,  3., 23., 13., 23., 23., 22.,
         23.,  9., 23., 22., 13.,  9., 15., 23.,  6., 14., 23., 23.,  6., 23.,
         23., 14., 15.,  9.,  3., 15., 10., 22., 13.,  9., 15.,  7., 18., 21.,
         11., 10., 24., 13.,  9.,  2.,  2.,  2.]])
actual SMILES: ['Cc1cn(C2C=CC(COC(=O)CN3CCNCC3)O2)c(=O)[nH]c1=O'] 

reconstructed molecule:
 tensor([[ 1, 18, 15, 19, 19, 14, 14, 14, 16, 15, 15, 15, 14, 14, 15, 14, 14, 14,
         14, 13, 13, 13, 13, 12, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10,
         10,  9,  9,  9,  8,  8,  8,  7,  7,  7,  6,  6,  5,  2]])
reconstructed SMILES: ['n)oo3334)))33)3333====#==###]]]]ccccOOO666[[[NNCl'] 



In [57]:
gentrl.tokenizer.decode(molecules[0:1])

['Cc1cn(C2C=CC(COC(=O)CN3CCNCC3)O2)c(=O)[nH]c1=O']

In [58]:
molecules[0:1]

tensor([[ 1., 23., 10., 24., 10., 18., 22., 23.,  3., 23., 13., 23., 23., 22.,
         23.,  9., 23., 22., 13.,  9., 15., 23.,  6., 14., 23., 23.,  6., 23.,
         23., 14., 15.,  9.,  3., 15., 10., 22., 13.,  9., 15.,  7., 18., 21.,
         11., 10., 24., 13.,  9.,  2.,  2.,  2.]])

In [51]:
vae(molecules[0:4])

(tensor([[0.0371, 0.6738, 0.5596, 0.7108, 0.7307, 0.5334, 0.5310, 0.5520, 0.5989,
          0.5779, 0.5747, 0.5562, 0.5529, 0.5246, 0.5623, 0.5379, 0.5303, 0.5372,
          0.5317, 0.5081, 0.4953, 0.5006, 0.5079, 0.4620, 0.4987, 0.4826, 0.4637,
          0.4576, 0.4569, 0.4363, 0.4103, 0.4255, 0.4094, 0.4036, 0.4002, 0.3885,
          0.3966, 0.3632, 0.3575, 0.3369, 0.3151, 0.3179, 0.3192, 0.2866, 0.2682,
          0.2670, 0.2394, 0.2331, 0.2220, 0.0741],
         [0.0371, 0.6738, 0.5596, 0.7108, 0.7307, 0.5334, 0.5310, 0.5520, 0.5989,
          0.5779, 0.5747, 0.5562, 0.5529, 0.5246, 0.5623, 0.5379, 0.5303, 0.5372,
          0.5317, 0.5081, 0.4953, 0.5006, 0.5079, 0.4620, 0.4987, 0.4826, 0.4637,
          0.4576, 0.4569, 0.4363, 0.4103, 0.4255, 0.4094, 0.4036, 0.4002, 0.3885,
          0.3966, 0.3632, 0.3575, 0.3369, 0.3151, 0.3179, 0.3192, 0.2866, 0.2682,
          0.2670, 0.2394, 0.2331, 0.2220, 0.0741],
         [0.0371, 0.6738, 0.5596, 0.7108, 0.7307, 0.5334, 0.5310, 0.5520, 0.59

In [55]:
vae.parameters

TypeError: 'method' object is not iterable

## What's wrong
I ran the reconstructed tensor through a sigmoid function to use for binary cross entropy (typical loss function for classification problems) but it's not comparable to the tensors of natural numbers from the original samples. I should be using evidence lower bound (ELBO) loss.

In [183]:
gentrl.tokenizer.decode(molecules[0:1])

['Cc1cn(C2C=CC(COC(=O)CN3CCNCC3)O2)c(=O)[nH]c1=O']

In [184]:
from copy import deepcopy
a = deepcopy(molecules[0:1])
a[0][1] -= 1

In [185]:
gentrl.tokenizer.decode(a)

['(c1cn(C2C=CC(COC(=O)CN3CCNCC3)O2)c(=O)[nH]c1=O']

In [42]:
gentrl.tokenizer.decode(torch.tensor(vae(molecules[0:100])[0].detach() * 27, dtype=int))

['n)oo3334)))33)3333====#==###]]]]ccccOOO666[[[NNCl',
 'n)oo3334)))33)3333====#==###]]]]ccccOOO666[[[NNCl',
 'n)oo3334)))33)3333====#==###]]]]ccccOOO666[[[NNCl',
 'n)oo3334)))33)3333====#==###]]]]ccccOOO666[[[NNCl',
 'n)oo3334)))33)3333====#==###]]]]ccccOOO666[[[NNCl',
 'n)oo3334)))33)3333====#==###]]]]ccccOOO666[[[NNCl',
 'n)oo3334)))33)3333====#==###]]]]ccccOOO666[[[NNCl',
 'n)oo3334)))33)3333====#==###]]]]ccccOOO666[[[NNCl',
 'n)oo3334)))33)3333====#==###]]]]ccccOOO666[[[NNCl',
 'n)oo3334)))33)3333====#==###]]]]ccccOOO666[[[NNCl',
 'n)oo3334)))33)3333====#==###]]]]ccccOOO666[[[NNCl',
 'n)oo3334)))33)3333====#==###]]]]ccccOOO666[[[NNCl',
 'n)oo3334)))33)3333====#==###]]]]ccccOOO666[[[NNCl',
 'n)oo3334)))33)3333====#==###]]]]ccccOOO666[[[NNCl',
 'n)oo3334)))33)3333====#==###]]]]ccccOOO666[[[NNCl',
 'n)oo3334)))33)3333====#==###]]]]ccccOOO666[[[NNCl',
 'n)oo3334)))33)3333====#==###]]]]ccccOOO666[[[NNCl',
 'n)oo3334)))33)3333====#==###]]]]ccccOOO666[[[NNCl',
 'n)oo3334)))33)3333====#==#

In [187]:
moses.get_all_metrics(gentrl.tokenizer.decode(torch.tensor(vae(molecules[0:1])[0].detach(), dtype=int)))

ZeroDivisionError: division by zero

In [95]:
moses.get_all_metrics(gentrl.tokenizer.decode(molecules[0:1000]))

{'valid': 0.8069999999999999,
 'unique@1000': 0.30607187112763323,
 'unique@10000': 0.30607187112763323,
 'FCD/Test': 19.45183701321632,
 'SNN/Test': 0.4320562639570295,
 'Frag/Test': 0.7859763272904042,
 'Scaf/Test': 0.1744568673045227,
 'FCD/TestSF': 19.83549232718582,
 'SNN/TestSF': 0.4181724801345681,
 'Frag/TestSF': 0.7868037320515826,
 'Scaf/TestSF': 0.016005336136847514,
 'IntDiv': 0.862626757393271,
 'IntDiv2': 0.8331333931598097,
 'Filters': 0.8946716232961586,
 'logP': 0.5536426027350173,
 'SA': 0.1375670362162745,
 'QED': 0.1049100901271577,
 'weight': 39.63517455010296,
 'Novelty': 0.9959514170040485}

In [188]:
all_atoms = "".join(assays_no_cx_chir["canonical_smiles"].tolist())

In [189]:
charvocab = moses.utils.OneHotVocab.from_data(assays_no_cx_chir["canonical_smiles"])

In [190]:
charvocab.vectors

tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.

In [191]:
charvocab.string2ids("CCO[C@H](CNC(=O)c1[nH]c(Br)c(Br)c1Br)c1cc(Br)c(O)c(Br)c1")

[11,
 11,
 15,
 17,
 11,
 28,
 13,
 18,
 1,
 11,
 14,
 11,
 1,
 9,
 15,
 2,
 19,
 3,
 17,
 21,
 13,
 18,
 19,
 1,
 10,
 23,
 2,
 19,
 1,
 10,
 23,
 2,
 19,
 3,
 10,
 23,
 2,
 19,
 3,
 19,
 19,
 1,
 10,
 23,
 2,
 19,
 1,
 15,
 2,
 19,
 1,
 10,
 23,
 2,
 19,
 3]

In [192]:
print(charvocab.string2ids("Cc1cn(C2C=CC(COC(=O)CN3CCNCC3)O2)c(=O)[nH]c1=O"))

[11, 19, 3, 19, 21, 1, 11, 4, 11, 9, 11, 11, 1, 11, 15, 11, 1, 9, 15, 2, 11, 14, 5, 11, 11, 14, 11, 11, 5, 2, 15, 4, 2, 19, 1, 9, 15, 2, 17, 21, 13, 18, 19, 3, 9, 15]


In [125]:
assays_no_cx_chir["canonical_smiles"][18]

'Cc1cn(C2C=CC(COC(=O)CN3CCNCC3)O2)c(=O)[nH]c1=O'

1932

In [117]:
from torch.utils.data import DataLoader
from moses import CharVocab, StringDataset

train = moses.get_dataset('train')

array(['CCCS(=O)c1ccc2[nH]c(=NC(=O)OC)[nH]c2c1',
       'CC(C)(C)C(=O)C(Oc1ccc(Cl)cc1)n1ccnc1',
       'Cc1c(Cl)cccc1Nc1ncccc1C(=O)OCC(O)CO', ...,
       'NC(=O)c1ccc2ccccc2c1Br',
       'CC(=O)Nc1cccc(-c2nc3cc(C)ccc3[nH]c2=O)c1',
       'CC(NC(=O)OC(C)(C)C)c1nc(CO)nn1Cc1ccccc1'], dtype=object)

In [None]:
pd.read_csv("../../moses/da")