In [1]:
import numpy as np
import csv
from rdkit import Chem
import matplotlib.pyplot as plt
%matplotlib inline

from tensorflow.keras import layers, models, optimizers
from tensorflow.keras.layers import Conv2D, Conv2DTranspose, Dense, Flatten, Dropout, BatchNormalization, Reshape, LeakyReLU
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import to_categorical
import tensorflow as tf

In [2]:
# Vocabulary used to one-hot encode SMILES strings.
#
# The '_' entries are placeholder slots that pad the vocabulary to exactly
# 64 symbols; later cells hard-code 64 as the width of the encoded matrix.
#
# Fix: a comma was missing after the last placeholder, so Python's implicit
# string concatenation fused it with '0' into a single '_0' token and the
# digit '0' was not encodable (KeyError in smiles_encoder).  One placeholder
# is dropped and '0' takes its slot, so the vocabulary is still 64 long and
# every other character keeps the index it had before.
SMILES_CHARS = [
    ' ',
    '#', '%', '(', ')', '+', '-', '.', '/',
    '_', '_', '_', '_', '_', '_', '_',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    '=', '@',
    'A', 'B', 'C', 'F', 'H', 'I', 'K', 'L', 'M', 'N', 'O', 'P',
    'R', 'S', 'T', 'V', 'X', 'Z',
    '[', '\\', ']',
    'a', 'b', 'c', 'e', 'g', 'i', 'l', 'n', 'o', 'p', 'r', 's',
    't', 'u', 'd',
]

# character -> column index (for the repeated '_' placeholder the last slot wins)
smi2index = {c: i for i, c in enumerate(SMILES_CHARS)}
# column index -> character
index2smi = {i: c for i, c in enumerate(SMILES_CHARS)}
def smiles_encoder( smiles, maxlen=1024 ):
    """One-hot encode a SMILES string.

    The input is first round-tripped through RDKit
    (``Chem.MolFromSmiles`` -> ``Chem.MolToSmiles``) and the resulting string
    is what gets encoded, not the raw input.

    Parameters
    ----------
    smiles : str
        SMILES string to encode.
    maxlen : int, optional
        Number of rows in the returned matrix (default 1024).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(maxlen, len(SMILES_CHARS))``.  Row ``i`` holds a
        single 1 in the column of the i-th character; rows beyond the end of
        the string stay all-zero.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles``.
    KeyError
        If the re-written SMILES contains a character not in ``SMILES_CHARS``.
    IndexError
        If the re-written SMILES is longer than ``maxlen``.
    """
    mol = Chem.MolFromSmiles( smiles )
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with a readable message instead of an opaque error in MolToSmiles.
        raise ValueError( 'RDKit could not parse SMILES: {!r}'.format( smiles ) )
    smiles = Chem.MolToSmiles( mol )
    X = np.zeros( ( maxlen, len( SMILES_CHARS ) ) )
    for i, c in enumerate( smiles ):
        X[i, smi2index[c] ] = 1
    return X


In [3]:
# Load SMILES.csv: first column goes into `cid`, second into `smiles`.
cid = []
smiles = []
# newline='' is what the csv module expects so it can handle line endings
# (including newlines inside quoted fields) itself.
with open('SMILES.csv', 'r', newline='') as csvfile:
    reader = csv.reader(csvfile, skipinitialspace=True)
    next(reader, None)  # skip the header row; do not blow up on an empty file
    for row in reader:
        if not row:
            # blank lines are yielded as [] and would break the 2-way unpack
            continue
        num, val = row  # rows with a different column count still fail loudly
        cid.append(num)
        smiles.append(val)

In [4]:
# One-hot encode every SMILES string, then stack the per-molecule matrices
# into a single array (molecule, position, vocabulary symbol).
temp_list = [smiles_encoder(entry) for entry in smiles]
onehot_smiles_list = np.array(temp_list)
print(onehot_smiles_list.shape)

(155, 1024, 64)


In [5]:
# Append a channel axis of size 1 so the Conv2D layers can consume each
# 1024 x 64 one-hot matrix as a single-channel image.
train_x = np.reshape(onehot_smiles_list, (-1, 1024, 64, 1))
print(train_x.shape)

(155, 1024, 64, 1)


In [6]:
# Convolutional autoencoder over the (1024, 64, 1) one-hot SMILES "image".
input_shape = (1024, 64, 1)
input_tensor = layers.Input(input_shape)

# ---- Encoder: four single-filter conv + max-pool stages --------------------
# Each pooling stage halves both spatial dimensions:
# 1024x64 -> 512x32 -> 256x16 -> 128x8 -> 64x4.
conv1 = layers.Conv2D(1, (3,3), padding='same', activation='relu')(input_tensor)
pooling1 = layers.MaxPool2D(name='imlatent_layer')(conv1)

conv2 = layers.Conv2D(1, (3,3), padding='same', activation='relu')(pooling1)
pooling2 = layers.MaxPool2D(name='imlatent_layer2')(conv2)

conv3 = layers.Conv2D(1, (3,3), padding='same', activation='relu')(pooling2)
pooling3 = layers.MaxPool2D(name='imlatent_layer3')(conv3)

conv4 = layers.Conv2D(1, (3,3), padding='same', activation='relu')(pooling3)
pooling4 = layers.MaxPool2D(name='imlatent_layer4')(conv4)

# ---- Bottleneck: 64*4*1 = 256 values -> 128-unit latent vector -------------
flatten1 = layers.Flatten()(pooling4)
# The name 'latent_layer' is looked up later to extract latent features.
dense1 = layers.Dense(128, activation='relu',name='latent_layer')(flatten1)

latent_tensor = dense1

# ---- Decoder: expand back to 64x4x1, then four upsample + deconv stages ----
dense2 = layers.Dense(64 * 4, activation='relu')(latent_tensor)
# Use a Keras Reshape layer rather than a raw tf.reshape op on the symbolic
# tensor: it is the supported way to reshape inside a functional model and
# keeps the model a plain stack of Keras layers.  The target shape excludes
# the batch axis, so this is the same computation as reshaping to
# [-1, 64, 4, 1].
reshaped = layers.Reshape((64, 4, 1))(dense2)

# Each upsampling stage doubles both spatial dimensions:
# 64x4 -> 128x8 -> 256x16 -> 512x32 -> 1024x64.
upsample1 = layers.UpSampling2D()(reshaped)
deconv1 = layers.Conv2DTranspose(1, (3,3), padding='same', activation='sigmoid')(upsample1)

upsample2 = layers.UpSampling2D()(deconv1)
deconv2 = layers.Conv2DTranspose(1, (3,3), padding='same', activation='sigmoid')(upsample2)

upsample3 = layers.UpSampling2D()(deconv2)
deconv3 = layers.Conv2DTranspose(1, (3,3), padding='same', activation='sigmoid')(upsample3)

upsample4 = layers.UpSampling2D()(deconv3)
deconv4 = layers.Conv2DTranspose(1, (3,3), padding='same', activation='sigmoid')(upsample4)

# Final sigmoid map has the same (1024, 64, 1) shape as the input.
output_tensor = deconv4

In [7]:
# Wrap the encoder/decoder graph into a trainable model and configure it for
# mean-squared-error reconstruction with the Adam optimizer.
# NOTE(review): with mostly-zero one-hot targets the 'accuracy' metric is
# presumably dominated by the zeros and says little about reconstruction
# quality -- confirm before relying on it.
ae = Model(inputs=input_tensor, outputs=output_tensor)
ae.compile(
    optimizer='Adam',
    loss='mse',
    metrics=['accuracy'],
)

In [8]:
# Print the layer-by-layer architecture, output shapes and parameter counts.
ae.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 1024, 64, 1)]     0         
_________________________________________________________________
conv2d (Conv2D)              (None, 1024, 64, 1)       10        
_________________________________________________________________
imlatent_layer (MaxPooling2D (None, 512, 32, 1)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 512, 32, 1)        10        
_________________________________________________________________
imlatent_layer2 (MaxPooling2 (None, 256, 16, 1)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 256, 16, 1)        10        
_________________________________________________________________
imlatent_layer3 (MaxPooling2 (None, 128, 8, 1)         0     

In [9]:
# Training hyper-parameters.
epoch = 100
batch_size = 5

# Autoencoder training: the input doubles as its own reconstruction target.
ae.fit(
    x=train_x,
    y=train_x,
    batch_size=batch_size,
    epochs=epoch,
    verbose=2,
)

Train on 155 samples
Epoch 1/100
155/155 - 2s - loss: 0.1143 - accuracy: 0.9982
Epoch 2/100
155/155 - 0s - loss: 0.0786 - accuracy: 0.9989
Epoch 3/100
155/155 - 0s - loss: 0.0541 - accuracy: 0.9992
Epoch 4/100
155/155 - 0s - loss: 0.0389 - accuracy: 0.9992
Epoch 5/100
155/155 - 0s - loss: 0.0289 - accuracy: 0.9992
Epoch 6/100
155/155 - 0s - loss: 0.0222 - accuracy: 0.9992
Epoch 7/100
155/155 - 0s - loss: 0.0177 - accuracy: 0.9992
Epoch 8/100
155/155 - 0s - loss: 0.0144 - accuracy: 0.9992
Epoch 9/100
155/155 - 0s - loss: 0.0121 - accuracy: 0.9992
Epoch 10/100
155/155 - 0s - loss: 0.0103 - accuracy: 0.9992
Epoch 11/100
155/155 - 0s - loss: 0.0089 - accuracy: 0.9992
Epoch 12/100
155/155 - 0s - loss: 0.0078 - accuracy: 0.9992
Epoch 13/100
155/155 - 0s - loss: 0.0070 - accuracy: 0.9992
Epoch 14/100
155/155 - 0s - loss: 0.0062 - accuracy: 0.9992
Epoch 15/100
155/155 - 0s - loss: 0.0057 - accuracy: 0.9992
Epoch 16/100
155/155 - 0s - loss: 0.0052 - accuracy: 0.9992
Epoch 17/100
155/155 - 0s - 

<tensorflow.python.keras.callbacks.History at 0x1f8f9a20048>

In [10]:
# Show the original one-hot matrix of molecule 19 (channel axis dropped).
print(np.reshape(train_x[19], (1024, 64)))

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [11]:
# Run molecule 19 through the autoencoder as a batch of one and print the
# reconstruction as a 1024 x 64 grid of sigmoid scores.
decoded_smiles = ae.predict(np.expand_dims(train_x[19], axis=0))
print(np.reshape(decoded_smiles, (1024, 64)))

[[0.10395217 0.0367756  0.03544835 ... 0.03678493 0.04039227 0.09129008]
 [0.04550865 0.01086783 0.01051379 ... 0.01150884 0.01238969 0.06541866]
 [0.04485118 0.01039741 0.01006857 ... 0.01103273 0.01181365 0.06200372]
 ...
 [0.04730641 0.01111997 0.01062462 ... 0.01163894 0.01274571 0.06732802]
 [0.04800796 0.01162309 0.01107088 ... 0.01215365 0.01338698 0.07118256]
 [0.19461042 0.09298883 0.09219861 ... 0.09751165 0.09917062 0.21479225]]


In [12]:
# Sub-model that shares the autoencoder's input but stops at the 128-unit
# 'latent_layer', so predict() returns the latent vector instead of the
# reconstruction.
feature_layer_model = Model(
    inputs=ae.input,
    outputs=ae.get_layer(name='latent_layer').output,
)
# Latent vector for molecule 0 (fed as a batch of one).
feature_output = feature_layer_model.predict(np.expand_dims(train_x[0], axis=0))

In [13]:
# Show the latent-layer activations currently held in feature_output.
print(feature_output)

[[1.8768187  1.6279799  0.         0.         0.         0.
  1.1966372  1.3011992  0.         0.         1.5726727  1.2234371
  0.         1.8687235  0.         0.26868668 1.4559013  1.3739319
  1.5518051  0.         0.         1.771576   1.2109431  0.
  2.006953   0.         2.039132   1.8487719  0.         1.0133162
  0.         0.         0.         0.         0.         1.4228611
  1.6946957  1.0815735  1.3320941  1.3091203  0.         0.
  1.7096692  0.         0.41279632 1.7198887  1.7899405  0.
  1.7230891  1.3594626  0.         1.5856155  1.3514068  1.6141181
  0.         1.5273019  1.667758   1.04409    0.         0.8885827
  0.04330643 0.         1.6298231  0.         1.5751554  1.78937
  0.         1.5261304  0.         1.6853427  1.6776731  0.07835371
  1.0538318  1.8390489  1.7020702  1.1593007  0.95902085 1.5746633
  0.         1.5979958  1.3469589  0.         1.1969461  1.1993632
  1.7820381  0.9564518  1.7683437  0.         1.6722438  1.8454072
  0.57355523 0.9465299  

In [14]:
# Latent vector for molecule 19 (fed as a batch of one), for comparison with
# the vector of molecule 0.
feature_output = feature_layer_model.predict(np.expand_dims(train_x[19], axis=0))
print(feature_output)

[[1.988765   1.6177822  0.         0.         0.         0.
  1.2413208  1.3086995  0.         0.         1.6321636  1.3439754
  0.         1.9236605  0.         0.2873781  1.512734   1.4536446
  1.5629421  0.         0.         1.830711   1.1838508  0.
  2.1294224  0.         2.1021967  1.9117955  0.         1.0978346
  0.         0.         0.         0.         0.         1.4633055
  1.7524985  1.1352438  1.3715003  1.4306504  0.         0.
  1.6903331  0.         0.413427   1.8000267  1.8571514  0.
  1.7736176  1.3652292  0.         1.5768154  1.3600342  1.6679938
  0.         1.4318469  1.7110913  1.0179607  0.         1.0019001
  0.04684038 0.         1.7469618  0.         1.5985178  1.8635166
  0.         1.5640324  0.         1.722074   1.758937   0.12067952
  1.0569512  1.8594985  1.7612034  1.1793566  0.9308579  1.6308728
  0.         1.6036444  1.3567336  0.         1.2792561  1.3393447
  1.8847947  0.97279084 1.837759   0.         1.7195375  1.9388602
  0.6242478  1.0366404