In [1]:
import tensorflow as tf
import numpy as np
from keras import backend as K
from keras.layers import Input, Dense, LSTM, Lambda, Reshape
from keras.models import Model
from keras import objectives
from keras.layers.core import RepeatVector
from keras.losses import MSE

lstm_dim = 64
max_smiles_len = 100
latent_dim = 64
batch_size = 10

SMILES_CHARS = [' ',
                  '#', '%', '(', ')', '+', '-', '.', '/',
                  '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                  '=', '@',
                  'A', 'B', 'C', 'F', 'H', 'I', 'K', 'L', 'M', 'N', 'O', 'P',
                  'R', 'S', 'T', 'V', 'X', 'Z',
                  '[', '\\', ']',
                  'a', 'b', 'c', 'e', 'g', 'i', 'l', 'n', 'o', 'p', 'r', 's',
                  't', 'u','\n']

input_dim = (max_smiles_len, len(SMILES_CHARS))
output_dim = (max_smiles_len, len(SMILES_CHARS))


smi2index = dict((c, i) for i, c in enumerate(SMILES_CHARS))
index2smi = dict((i, c) for i, c in enumerate(SMILES_CHARS))

Using TensorFlow backend.


In [2]:
with open('smallsmiles.txt') as f:
    smiles_as_list = f.readlines()


def smiles_to_onehot(smiles, max_len = 100):
    onehot = np.zeros((max_len, len(SMILES_CHARS)))
    for i, c in enumerate(smiles):
        onehot[i, smi2index[c]] = 1
    return onehot


def smiles_decoder(onehot):
    smi = ''
    onehot = onehot.argmax( axis=-1 )
    for i in onehot:
        smi += index2smi[i]
    return smi

decoded_rnn_size = 64
encoded_rnn_size = 64
batch_size = 1


In [3]:
input = Input(shape=input_dim)
lstm = LSTM(latent_dim, activation='relu')(input)
zmean = Dense(latent_dim, name='Z_mean_t')(lstm)
zvar = Dense(latent_dim, name='Z_log_var_t', activation=tf.nn.softplus)(lstm)
z = Lambda(lambda m: m[0] + m[1] * tf.random.normal(tf.shape(m[0])))([zmean, zvar])
# z_reshaped = Reshape((-1, latent_dim))(z)
encoder = Model(input, z)

In [4]:
latent_inputs = Input(shape=(latent_dim,), name='z_sampling')
repeated = RepeatVector(100)(latent_inputs)
x_2 = LSTM(57, activation='relu', return_sequences=True)(repeated)
decoder = Model(latent_inputs, x_2)

In [9]:
X = [smiles_to_onehot(x) for x in smiles_as_list]
X = np.array(X)
X.shape

(56, 100, 57)

In [99]:
Y = np.zeros((56, 64))
Y.shape

(56, 64)

In [104]:
decoder.compile(loss=MSE, optimizer='adam')

In [105]:
decoder.fit(Y, X, epochs=6, batch_size=10)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.callbacks.History at 0x260d3f1eef0>

In [114]:
test_input = np.zeros((3,64))

In [115]:
result = decoder.predict(test_input)

In [116]:
result.shape

(3, 100, 57)

In [34]:
def calculate_loss(x, x_decoded_mean):
    xent_loss = objectives.mse(x, x_decoded_mean)
    kl_loss = - 0.5 * K.mean(1 + zvar - K.square(zmean) - K.exp(zvar))
    loss = xent_loss + kl_loss
    return loss

In [58]:
def z_loss(x, x_new):
    xent_loss = objectives.mse(x, x_new)
    kl_loss = - 0.5 * K.mean(1 + zvar - K.square(zmean) - K.exp(zvar))
    #loss = kl_loss + xent_loss
    return kl_loss

In [20]:
Y = np.zeros((56, 64))
Y.shape

(56, 64)

In [59]:
encoder.compile(loss=z_loss, optimizer='adam')

Tensor("loss_7/lambda_1_loss/z_loss/mul:0", shape=(), dtype=float32)


In [60]:
encoder.fit(X, X, epochs=6, batch_size=10)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.callbacks.History at 0x260d1752fd0>

In [119]:
X1 = X[0:1]
X1 = np.reshape(X1, (1,100,57))

In [120]:
vec = encoder.predict(X1)
vec

array([[ 6.6693872e-03,  6.4322568e-04,  3.8759988e-03, -1.1439424e-02,
         6.6224020e-05,  7.4769338e-03, -1.7660586e-03, -2.1218286e-04,
         8.7804403e-03,  6.2055010e-03, -2.4987780e-03,  6.8087643e-04,
         7.4393756e-05, -4.5133028e-03,  6.6193566e-03,  6.8425620e-03,
         3.5003098e-03,  1.4129945e-03,  1.5073682e-03,  3.1247204e-03,
         8.9625875e-04,  1.9637654e-03,  3.7939402e-03,  2.8203754e-03,
         2.3255171e-03, -1.7197831e-02,  4.1089072e-03, -4.3943855e-03,
         1.6800459e-02, -6.1089383e-03, -4.7918516e-03,  1.6784719e-03,
        -4.5848945e-03, -3.9659161e-03,  6.0304534e-04,  1.3128249e-03,
        -8.6719170e-04, -1.6955063e-03, -7.0038931e-03, -3.5102221e-03,
        -4.3439888e-04, -1.0867801e-02,  2.3273728e-03,  1.2125955e-03,
         4.6385487e-04,  3.1461292e-03, -1.3763383e-03, -8.9177839e-04,
         7.9907216e-03,  2.6354024e-03, -6.4172214e-03, -2.0542890e-03,
         5.5181184e-03, -4.9536675e-04, -2.3948587e-03,  4.85830

In [11]:
X_tens = tf.convert_to_tensor(X)

In [124]:
X_tens.shape

TensorShape([56, 100, 57])

In [12]:
outputs = decoder(encoder(X_tens))
vae = Model(X_tens,outputs)

TypeError: Input 'b' of 'MatMul' Op has type float32 that does not match type float64 of argument 'a'.

In [9]:
# vae_.compile(loss=calculate_loss, optimizer='adam')
# \

# outputs = decoder(encoder(X))
outputs = encoder(X)

vae = Model(input, outputs)


AttributeError: 'NoneType' object has no attribute '_inbound_nodes'