# Creating a VAE in Keras to generate molecules

In [1]:
import tensorflow as tf
import numpy as np
from keras import backend as K
from keras.layers import Input, Dense, LSTM, Lambda, Reshape
from keras.models import Model
from keras import objectives
from keras.layers.core import RepeatVector
from keras.losses import MSE

# Model hyper-parameters.
lstm_dim = 64
max_smiles_len = 100
latent_dim = 64
batch_size = 10

# Character vocabulary; a character's position in this list is its one-hot column.
SMILES_CHARS = [
    ' ',
    '#', '%', '(', ')', '+', '-', '.', '/',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    '=', '@',
    'A', 'B', 'C', 'F', 'H', 'I', 'K', 'L', 'M', 'N', 'O', 'P',
    'R', 'S', 'T', 'V', 'X', 'Z',
    '[', '\\', ']',
    'a', 'b', 'c', 'e', 'g', 'i', 'l', 'n', 'o', 'p', 'r', 's',
    't', 'u', '\n',
]

# Model input and output share one shape: (sequence length, vocabulary size).
input_dim = (max_smiles_len, len(SMILES_CHARS))
output_dim = input_dim


# Lookup tables in both directions between characters and column indices.
smi2index = {char: idx for idx, char in enumerate(SMILES_CHARS)}
index2smi = dict(enumerate(SMILES_CHARS))

Using TensorFlow backend.


In [130]:
# Read both SMILES files into lists of lines (each entry keeps its line ending).
with open('smallsmiles.txt') as f:
    small_smiles_as_list = list(f)

with open('smiles.txt') as f:
    full_smiles_as_list = list(f)

def smiles_to_onehot(smiles, max_len = 100):
    """One-hot encode a SMILES string.

    Returns a ``(max_len, len(SMILES_CHARS))`` array of zeros in which row
    ``k`` has a 1 in the column ``smi2index`` assigns to the k-th character
    of ``smiles``. Rows beyond the end of the string stay all zero.
    """
    vocab_size = len(SMILES_CHARS)
    encoded = np.zeros((max_len, vocab_size))
    for position, char in enumerate(smiles):
        column = smi2index[char]
        encoded[position, column] = 1
    return encoded


def smiles_decoder(onehot):
    """Convert a one-hot (or score) matrix back into a string.

    Takes the argmax along the last axis and maps every resulting index
    through ``index2smi``, concatenating the characters in order.
    """
    best_columns = onehot.argmax(axis=-1)
    return ''.join(index2smi[col] for col in best_columns)

# RNN sizes for the decoder and encoder (both 64).
decoded_rnn_size = encoded_rnn_size = 64
# NOTE: this rebinds batch_size, which the configuration cell set to 10.
batch_size = 1


## Model of the encoder

In [3]:
def sampling(args):
    """Draw a latent sample from a (mean, log-variance) pair.

    ``args`` is a two-item sequence ``(z_mean, z_log_var)``. The result is
    ``z_mean + exp(0.5 * z_log_var) * noise``, where ``noise`` comes from
    ``K.random_normal`` and has one row per batch entry and one column per
    latent dimension.
    """
    mean, log_var = args
    n_rows = K.shape(mean)[0]       # dynamic batch size
    n_cols = K.int_shape(mean)[1]   # static latent width
    noise = K.random_normal(shape=(n_rows, n_cols))
    std_dev = K.exp(0.5 * log_var)
    return mean + std_dev * noise

In [159]:
# Encoder: one-hot SMILES -> LSTM summary -> (z_mean, z_var) -> sampled latent z.
x_input = Input(shape=input_dim)
# The recurrent width comes from lstm_dim (currently equal to latent_dim) so
# the LSTM size and the latent size can be tuned independently.
lstm = LSTM(lstm_dim, activation='relu')(x_input)
z_mean = Dense(latent_dim)(lstm)
# z_var is consumed as a log-variance: sampling() applies exp(0.5 * z_var).
z_var = Dense(latent_dim)(lstm)

z = Lambda(sampling, output_shape=(latent_dim,), name='z')([z_mean, z_var])
encoder = Model(x_input, z)

## Model of the decoder

In [160]:
# Decoder: latent vector -> repeated once per sequence position -> LSTM that
# emits one vocabulary-sized vector per position.
latent_inputs = Input(shape=(latent_dim,), name='z_sampling')
# Sequence length and vocabulary size come from the shared configuration so the
# decoder output always matches the shape produced by smiles_to_onehot.
repeated = RepeatVector(max_smiles_len)(latent_inputs)
# NOTE(review): relu outputs are unbounded, while the VAE is later trained with
# binary cross-entropy on one-hot targets; a softmax/sigmoid output head is
# probably intended -- confirm before changing the architecture.
x_2 = LSTM(len(SMILES_CHARS), activation='relu', return_sequences=True)(repeated)
decoder = Model(latent_inputs, x_2)

### Training data

### Small dataset: shape (56, 100, 57)

In [12]:
# One-hot encode every line of the small file and stack the results.
X = np.array([smiles_to_onehot(smi) for smi in small_smiles_as_list])
X.shape

(56, 100, 57)

### Full dataset: shape (115936, 100, 57)

In [6]:
# One-hot encode every line of the full file and stack the results.
X_all = np.array(list(map(smiles_to_onehot, full_smiles_as_list)))
X_all.shape

(115936, 100, 57)

In [7]:
# Work with a prefix of the full dataset; the size is named so it is easy to tune.
# NOTE: this rebinds X, which previously held the small dataset.
n_train_samples = 10000
X = X_all[:n_train_samples]
X.shape

(10000, 100, 57)

### Loss functions

In [136]:
def calculate_loss(x, x_decoded):
    """VAE loss: per-sample reconstruction error plus KL divergence.

    Args:
        x: target one-hot batch, indexed as (batch, position, character).
        x_decoded: decoder output with the same shape as ``x``.

    Returns:
        A tensor with one loss value per sample in the batch.

    Reads the module-level encoder tensors ``z_mean`` and ``z_var``;
    ``z_var`` is treated as a log-variance, as in ``sampling``.
    """
    # Sum the element-wise cross-entropy over positions and characters so the
    # reconstruction term is one number per sample.
    recon = K.sum(K.binary_crossentropy(x, x_decoded), axis=[1, 2])
    # KL(N(z_mean, exp(z_var)) || N(0, I)), reduced over the latent axis only.
    # Keeping it per-sample puts it on the same scale as the reconstruction
    # term and makes its weight independent of the batch size.
    kl = 0.5 * K.sum(K.exp(z_var) + K.square(z_mean) - 1. - z_var, axis=-1)

    return recon + kl

In [9]:
def z_loss(x, x_new):
    """KL-only loss built from the module-level ``z_mean`` and ``z_var`` tensors.

    Both arguments are ignored; they exist only so the function matches the
    two-argument signature expected of a Keras loss.
    """
    kl_terms = 1 + z_var - K.square(z_mean) - K.exp(z_var)
    return - 0.5 * K.mean(kl_terms)

## Testing the decoder

In [104]:
# Compile the decoder on its own with mean-squared-error loss and the Adam optimizer.
decoder.compile(loss=MSE, optimizer='adam')

In [105]:
# `Y` (the latent codes fed to the decoder) is not defined in any cell of this
# notebook, so the cell fails on a fresh kernel. Encoding X is assumed to be
# the intended source -- TODO confirm against the original experiment.
Y = encoder.predict(X)
decoder.fit(Y, X, epochs=6, batch_size=10)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.callbacks.History at 0x260d3f1eef0>

In [114]:
# Three all-zero latent vectors; the width follows latent_dim so it always
# matches the decoder's input layer.
test_input = np.zeros((3, latent_dim))

In [115]:
# Run the decoder on the test latent vectors.
result = decoder.predict(test_input)

In [116]:
# Check the output shape of the decoder prediction.
result.shape

(3, 100, 57)

## Testing the encoder

In [23]:
# Compile the encoder alone with the KL-only loss (z_loss ignores its arguments).
encoder.compile(loss=z_loss, optimizer='adam')

In [24]:
# X is also passed as the target; z_loss does not read it, so only the KL term is optimized.
encoder.fit(X, X, epochs=10, batch_size=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x1f05a0a6be0>

In [26]:
# Slicing with 0:1 (rather than indexing with 0) keeps the leading batch axis,
# so no reshape with hard-coded dimensions is needed.
X1 = X[0:1]

In [27]:
# Encode the single-sample batch; the encoder's output is the sampled latent z.
vec = encoder.predict(X1)
vec

array([[-8.1301773e-01, -2.6299402e-01, -9.8160648e-01, -1.2278748e+00,
        -1.1572304e+00, -1.5604430e+00, -1.3140545e+00, -7.6917696e-01,
        -2.4372327e+00,  1.0624061e+00, -3.0992565e-01,  8.3419043e-01,
         5.6062818e-01, -1.3172098e-01, -9.5717466e-01, -2.1324940e+00,
         5.0561869e-01,  2.7011657e-01,  5.0852150e-01, -1.7143270e+00,
         8.7156326e-01,  7.9136580e-01,  2.8402200e-01,  1.1757556e+00,
        -6.0712612e-01,  2.3732959e-01,  1.5342817e+00,  8.8119239e-01,
         1.3047813e+00, -7.7862233e-01, -1.4512120e-03,  1.3836908e-01,
        -1.2058165e+00, -1.8850985e+00, -7.6009864e-01, -1.7057220e+00,
        -1.0867665e+00, -1.8097383e+00,  1.2657901e+00, -2.4245261e-01,
         1.1391633e+00,  5.5728078e-01, -1.2521151e-01,  3.1110209e-01,
         1.3163338e+00,  2.4253933e+00,  1.1826134e+00,  1.9561082e+00,
         9.5373905e-01, -4.7003338e-01,  1.5187393e+00, -1.9416742e-01,
        -3.8858828e-01,  3.5764900e-01, -8.0885172e-01,  1.90862

## Compile the VAE

In [161]:
# Rebuild the decoder and encoder Model objects from the same tensors used in
# the earlier model-definition cells (same statements as there).
decoder = Model(latent_inputs, x_2)
encoder = Model(x_input, z)

In [162]:
# Chain the two models: input -> encoder (latent z) -> decoder (reconstruction).
outputs = decoder(encoder(x_input))

In [163]:
# End-to-end autoencoder model from the one-hot input to the reconstruction.
vae = Model(x_input,outputs)

In [164]:
# Train with the combined reconstruction + KL loss defined in calculate_loss.
vae.compile(loss=calculate_loss, optimizer='adam')

In [165]:
# Autoencoder training: the input is also the target; 20% of X is held out for validation.
vae.fit(X,X, epochs=3, batch_size=60, validation_split=0.2)

Train on 8000 samples, validate on 2000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x15f0d91ffd0>

In [26]:
# Print the layer-by-layer summary of the combined model.
vae.summary()

Model: "model_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100, 57)           0         
_________________________________________________________________
model_10 (Model)             (None, 64)                39552     
_________________________________________________________________
model_9 (Model)              (None, 100, 57)           27816     
Total params: 67,368
Trainable params: 67,368
Non-trainable params: 0
_________________________________________________________________


In [144]:
# Take the first three samples; the slice already has the batch axis, so no
# reshape with hard-coded dimensions is needed.
X1 = X[0:3]

In [145]:
# Reconstruct the selected samples with the trained VAE.
predicted = vae.predict(X1)

In [146]:
# Decode each predicted matrix back to text and print one string per sample.
for prediction in predicted:
    print(smiles_decoder(prediction))

HHiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii
111SSSSSuuuuuuuuFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
---55555555555555555555555555555555555555555555FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF


In [147]:
# First three lines of the full SMILES file, shown for comparison with the decoded output.
full_smiles_as_list[0:3]

['C[C@@]1(C(=O)C=C(O1)C(=O)[O-])c2ccccc2\n',
 'c1ccc(cc1)C(c2ccccc2)[S@](=O)CC(=O)NO\n',
 'CCC[S@](=O)c1ccc2c(c1)[nH]/c(=N\\C(=O)OC)/[nH]2\n']