In [1]:
import numpy as np
import csv
from rdkit import Chem
import matplotlib.pyplot as plt
import openpyxl as xl
%matplotlib inline

from tensorflow.keras import layers, models, optimizers
from tensorflow.keras.layers import Conv2D, Conv2DTranspose, Dense, Flatten, Dropout, BatchNormalization, Reshape, LeakyReLU
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import to_categorical
import tensorflow as tf

In [2]:
# Character vocabulary used to one-hot encode SMILES strings.
# The list holds exactly 64 single-character entries, matching the 64-wide
# input the rest of the notebook reshapes to.  The '_' entries are placeholders
# that only reserve index slots; because they are duplicates, smi2index['_']
# resolves to the last of them.
# NOTE: every entry must be followed by a comma.  A missing comma makes Python
# silently concatenate adjacent string literals (e.g. '_' '0' -> '_0'), which
# removes the second character from the vocabulary.
SMILES_CHARS = [' ',
                  '#', '%', '(', ')', '+', '-', '.', '/', '_', '_', '_', '_', '_', '_', '_',
                  '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                  '=', '@',
                  'A', 'B', 'C', 'F', 'H', 'I', 'K', 'L', 'M', 'N', 'O', 'P',
                  'R', 'S', 'T', 'V', 'X', 'Z',
                  '[', '\\', ']',
                  'a', 'b', 'c', 'e', 'g', 'i', 'l', 'n', 'o', 'p', 'r', 's',
                  't', 'u', 'd']

# Guard against accidental literal concatenation in the list above.
assert all( len( c ) == 1 for c in SMILES_CHARS ), "SMILES_CHARS entries must be single characters"

# Forward (character -> column index) and reverse (column index -> character) lookups.
smi2index = { c: i for i, c in enumerate( SMILES_CHARS ) }
index2smi = { i: c for i, c in enumerate( SMILES_CHARS ) }
def smiles_encoder( smiles, maxlen=1024 ):
    """One-hot encode a SMILES string after round-tripping it through RDKit.

    The input is parsed with ``Chem.MolFromSmiles`` and written back with
    ``Chem.MolToSmiles``; the resulting string is what gets encoded.

    Parameters
    ----------
    smiles : str
        SMILES string to encode.
    maxlen : int, optional
        Number of rows in the output matrix (default 1024).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(maxlen, len(SMILES_CHARS))``.  Row ``i`` has a 1 in
        the column given by ``smi2index`` for the i-th character; rows past the
        end of the string stay all-zero.

    Raises
    ------
    ValueError
        If RDKit does not produce a molecule for ``smiles``.
    IndexError
        If the RDKit-written SMILES is longer than ``maxlen``.
    KeyError
        If the RDKit-written SMILES contains a character missing from ``smi2index``.
    """
    mol = Chem.MolFromSmiles( smiles )
    # MolFromSmiles is expected to hand back None for input it cannot parse;
    # fail here with a readable message instead of deep inside MolToSmiles.
    if mol is None:
        raise ValueError( "RDKit could not parse SMILES: {!r}".format( smiles ) )
    canonical = Chem.MolToSmiles( mol )

    if len( canonical ) > maxlen:
        raise IndexError(
            "SMILES of length {} does not fit in maxlen={}: {!r}".format(
                len( canonical ), maxlen, canonical ) )

    unknown = sorted( set( canonical ) - set( smi2index ) )
    if unknown:
        raise KeyError(
            "characters {} in SMILES {!r} are not in SMILES_CHARS".format( unknown, canonical ) )

    X = np.zeros( ( maxlen, len( SMILES_CHARS ) ) )
    # Set one cell per character: row = position in string, column = vocabulary index.
    columns = [ smi2index[c] for c in canonical ]
    X[ np.arange( len( columns ) ), columns ] = 1
    return X


In [3]:
# Read compound IDs and SMILES strings from a two-column CSV file.
# The first row is treated as a header and skipped.
cid = []
smiles = []
# newline='' is what the csv module asks for so that it can handle line
# endings (including newlines embedded in quoted fields) itself.
with open('SMILES.csv', 'r', newline='') as csvfile:
    reader = csv.reader(csvfile, skipinitialspace=True)
    next(reader, None)  # skip the header; the default avoids StopIteration on an empty file
    for row in reader:
        if not row:
            # csv.reader yields [] for blank lines; they carry no data.
            continue
        # Still strict: a row that is not exactly two columns raises ValueError.
        num, val = row
        cid.append(num)
        smiles.append(val)

In [4]:
# One-hot encode every SMILES string and stack the per-molecule matrices
# into a single array; the printed shape is (n_molecules, maxlen, vocab_size).
temp_list = [smiles_encoder(smi) for smi in smiles]
onehot_smiles_list = np.asarray(temp_list)
print(onehot_smiles_list.shape)

(206, 1024, 64)


In [5]:
# Add a trailing single-channel axis so each encoded molecule has the
# (1024, 64, 1) shape that the Conv2D input layer is declared with.
train_x = np.reshape(onehot_smiles_list, (-1, 1024, 64, 1))
print(train_x.shape)

(206, 1024, 64, 1)


In [20]:
# Convolutional autoencoder graph over the one-hot SMILES matrix, treated as a
# single-channel image of shape (1024, 64, 1).
input_shape = (1024, 64, 1)
input_tensor = layers.Input(input_shape)

# ---- Encoder ---------------------------------------------------------------
# Two conv + 4x4 max-pool stages: (1024, 64) -> (256, 16) -> (64, 4).
conv1 = layers.Conv2D(1, (3,3), padding='same', activation='relu')(input_tensor)
pooling1 = layers.MaxPool2D((4,4),name='imlatent_layer')(conv1)

conv2 = layers.Conv2D(1, (3,3), padding='same', activation='relu')(pooling1)
pooling2 = layers.MaxPool2D((4,4),name='imlatent_layer2')(conv2)

# Flatten the 64 x 4 x 1 map (256 values) and project to the 128-d latent code.
# The layer name 'latent_layer' is looked up later to extract features.
flatten1 = layers.Flatten()(pooling2)
dense1 = layers.Dense(128, activation="relu",name='latent_layer')(flatten1)

latent_tensor = dense1

# ---- Decoder ---------------------------------------------------------------
# Expand the latent code back to 64 * 4 values and restore the (64, 4, 1) map.
dense2 = layers.Dense(64 * 4, activation="sigmoid")(latent_tensor)
# Use the Keras Reshape layer rather than a raw tf.reshape op so the whole
# graph is made of Keras layers (the target shape excludes the batch axis).
reshaped = layers.Reshape((64, 4, 1))(dense2)

# Mirror the encoder: two 4x4 upsampling stages, (64, 4) -> (256, 16) -> (1024, 64).
upsample1 = layers.UpSampling2D((4,4))(reshaped)
deconv1 = layers.Conv2DTranspose(1, (3,3), padding='same', activation='sigmoid')(upsample1)

upsample2 = layers.UpSampling2D((4,4))(deconv1)
deconv2 = layers.Conv2DTranspose(1, (3,3), padding='same', activation='sigmoid')(upsample2)

# Reconstruction has the same shape as the input: (1024, 64, 1).
output_tensor = deconv2

In [21]:
# Assemble the autoencoder from the input/output tensors and configure it for
# training: Adam with learning rate 0.0005, mean-squared-error reconstruction loss.
ae = Model(inputs=input_tensor, outputs=output_tensor)
ae.compile(
    loss='mse',
    optimizer=optimizers.Adam(learning_rate=0.0005),
    metrics=['accuracy'],
)

In [22]:
# Print the layer-by-layer architecture (output shapes and parameter counts).
ae.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 1024, 64, 1)]     0         
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 1024, 64, 1)       10        
_________________________________________________________________
imlatent_layer (MaxPooling2D (None, 256, 16, 1)        0         
_________________________________________________________________
conv2d_5 (Conv2D)            (None, 256, 16, 1)        10        
_________________________________________________________________
imlatent_layer2 (MaxPooling2 (None, 64, 4, 1)          0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 256)               0         
_________________________________________________________________
latent_layer (Dense)         (None, 128)               3289

In [23]:
# Training hyper-parameters.
epoch = 100
batch_size = 32

# Train the autoencoder to reproduce its own input (target == input).
# verbose=2 prints one summary line per epoch.
ae.fit(
    x=train_x,
    y=train_x,
    epochs=epoch,
    batch_size=batch_size,
    verbose=2,
)

Train on 206 samples
Epoch 1/100
206/206 - 1s - loss: 0.2662 - accuracy: 0.0330
Epoch 2/100
206/206 - 0s - loss: 0.2618 - accuracy: 0.0331
Epoch 3/100
206/206 - 0s - loss: 0.2575 - accuracy: 0.0331
Epoch 4/100
206/206 - 0s - loss: 0.2533 - accuracy: 0.0331
Epoch 5/100
206/206 - 0s - loss: 0.2493 - accuracy: 0.3792
Epoch 6/100
206/206 - 0s - loss: 0.2454 - accuracy: 0.9882
Epoch 7/100
206/206 - 0s - loss: 0.2416 - accuracy: 0.9981
Epoch 8/100
206/206 - 0s - loss: 0.2378 - accuracy: 0.9982
Epoch 9/100
206/206 - 0s - loss: 0.2340 - accuracy: 0.9989
Epoch 10/100
206/206 - 0s - loss: 0.2302 - accuracy: 0.9991
Epoch 11/100
206/206 - 0s - loss: 0.2264 - accuracy: 0.9991
Epoch 12/100
206/206 - 0s - loss: 0.2224 - accuracy: 0.9991
Epoch 13/100
206/206 - 0s - loss: 0.2184 - accuracy: 0.9991
Epoch 14/100
206/206 - 0s - loss: 0.2142 - accuracy: 0.9991
Epoch 15/100
206/206 - 0s - loss: 0.2099 - accuracy: 0.9991
Epoch 16/100
206/206 - 0s - loss: 0.2055 - accuracy: 0.9991
Epoch 17/100
206/206 - 0s - 

<tensorflow.python.keras.callbacks.History at 0x2b10a866e88>

In [10]:
# Show the one-hot matrix of sample 19 with the channel axis dropped.
sample_onehot = train_x[19].reshape(1024, 64)
print(sample_onehot)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [11]:
# Run sample 19 through the autoencoder as a batch of one and print the
# reconstruction as a (1024, 64) matrix for comparison with the input.
sample_batch = train_x[19].reshape(1, 1024, 64, 1)
decoded_smiles = ae.predict(sample_batch)
print(decoded_smiles.reshape(1024, 64))

[[0.20712812 0.16648932 0.16051024 ... 0.16687477 0.17921308 0.27409315]
 [0.18912381 0.12427933 0.11485466 ... 0.10564551 0.11985246 0.15845707]
 [0.1936338  0.12629987 0.11290579 ... 0.10500666 0.11653868 0.15540384]
 ...
 [0.22403754 0.16265222 0.14142466 ... 0.12864196 0.13882913 0.16192141]
 [0.24537995 0.17261769 0.1468199  ... 0.1348542  0.14467111 0.16926016]
 [0.35638878 0.3632024  0.32092187 ... 0.311336   0.32527378 0.27170733]]


In [12]:
# Build a sub-model that shares the autoencoder's input but stops at the
# 'latent_layer' Dense layer, so predict() returns the latent code.
latent_output = ae.get_layer('latent_layer').output
feature_layer_model = Model(inputs=ae.input, outputs=latent_output)

# Latent code for sample 5, fed as a batch of one.
sample_input = train_x[5].reshape(1, 1024, 64, 1)
feature_output = feature_layer_model.predict(sample_input)

In [13]:
# Display the latent vector produced by feature_layer_model for the last sample passed to it.
print(feature_output)

[[3.9444644 0.2804415 0.        3.4649694 0.        6.068765  3.4025013
  0.        1.2272887 0.        6.1782007 1.3769165 6.4567766 0.
  0.        0.        0.        0.        0.        8.997435  3.1276824
  5.0074215 0.        0.        0.        0.        3.1346252 0.
  0.        7.8993464 1.2959527 2.9485197 0.        0.        0.
  0.        0.        5.5912127 0.        0.        0.        0.
  4.8017926 0.        1.9556081 4.643879  4.341683  0.        6.5942855
  0.        7.879068  0.        0.        2.0449655 4.94953   0.
  0.        0.        8.4678545 0.        0.        0.        0.
  0.        0.        7.2130604 0.        4.038893  0.        0.
  6.213877  6.525354  7.997054  0.5715022 4.015274  0.        0.
  5.3507576 4.199129  4.4273577 0.        1.75201   7.169049  0.
  0.        7.3910565 0.        6.8272853 3.2000897 0.        3.2368815
  0.        0.        0.        0.        7.431835  0.        8.553125
  0.        1.7989001 5.6844244 0.        8.316126  0.  

In [14]:
# Latent code for sample 19 (batch of one), printed for comparison with
# the code of other samples.
sample_input = train_x[19].reshape(1, 1024, 64, 1)
feature_output = feature_layer_model.predict(sample_input)
print(feature_output)

[[4.05404    0.16977392 0.         3.5206544  0.         6.070969
  3.4253461  0.         1.1486619  0.         6.454553   1.4920875
  6.5996037  0.         0.         0.         0.         0.
  0.         9.219298   3.2673113  5.102209   0.         0.
  0.         0.         3.3190012  0.         0.         8.029146
  1.2464694  2.943461   0.         0.         0.         0.
  0.         5.6530833  0.         0.         0.         0.
  4.862102   0.         1.8952048  4.7464323  4.543423   0.
  6.7621408  0.         7.957645   0.         0.         2.026635
  5.0748463  0.         0.         0.         8.611637   0.
  0.         0.         0.         0.         0.         7.285817
  0.         4.1539283  0.         0.         6.3066792  6.6341586
  8.064503   0.5489042  4.161539   0.         0.         5.545892
  4.3930163  4.464465   0.         1.7219397  7.4374714  0.
  0.         7.406318   0.         6.8944993  3.0676563  0.
  3.3007038  0.         0.         0.         0.        

In [15]:
# Extract the latent vector of every training sample.
# A single batched predict() replaces one predict() call per sample, which
# repeated the per-call overhead for every row of train_x.
all_features = feature_layer_model.predict(train_x)

# Keep the original nesting: each entry is a one-element list wrapping the
# vector, so later cells can keep indexing as featureList[i][0][j].
featureList = [[vector] for vector in all_features.tolist()]

In [16]:
# Show the latent vector of the second compound, then the full list of
# compound IDs, one per output line.
print(featureList[1][0], cid, sep='\n')

[3.9732320308685303, 0.22670704126358032, 0.0, 3.414579153060913, 0.0, 6.002467155456543, 3.442251443862915, 0.0, 1.1483378410339355, 0.0, 6.262481689453125, 1.434977412223816, 6.503018379211426, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 9.041176795959473, 3.202993154525757, 4.975229263305664, 0.0, 0.0, 0.0, 0.0, 3.1910784244537354, 0.0, 0.0, 7.930080413818359, 1.2571386098861694, 2.940086841583252, 0.0, 0.0, 0.0, 0.0, 0.0, 5.579267501831055, 0.0, 0.0, 0.0, 0.0, 4.853625774383545, 0.0, 1.9774384498596191, 4.67396879196167, 4.451536178588867, 0.0, 6.650375843048096, 0.0, 7.941138744354248, 0.0, 0.0, 2.0049214363098145, 4.994362831115723, 0.0, 0.0, 0.0, 8.49977970123291, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 7.21952486038208, 0.0, 4.114166736602783, 0.0, 0.0, 6.212805271148682, 6.5273966789245605, 8.015233039855957, 0.5626670122146606, 4.03936767578125, 0.0, 0.0, 5.349024772644043, 4.277531147003174, 4.485561370849609, 0.0, 1.8250845670700073, 7.214290142059326, 0.0, 0.0, 7.402918338775635, 0.0, 6.813878536

In [17]:
# Create a new workbook and take its default worksheet (titled 'Sheet'),
# then write the header cells for the ID column and the first feature column.
wb = xl.Workbook()
w1 = wb.active
w1.cell(row=1, column=1, value='CID')
w1.cell(row=1, column=2, value='feature')

In [18]:
# Write one worksheet row per compound: column 1 holds the compound ID and
# columns 2.. hold its latent features.  Data rows start at row 2 because
# row 1 is the header.
for offset, feature_rows in enumerate(featureList):
    row_idx = offset + 2
    w1.cell(row=row_idx, column=1, value=cid[offset])
    # Each featureList entry wraps the vector in a one-element list, hence [0].
    # Iterating over the vector itself avoids hard-coding its length (128).
    for col_idx, feature_value in enumerate(feature_rows[0], start=2):
        w1.cell(row=row_idx, column=col_idx, value=feature_value)
        

In [19]:
# Write the workbook (compound IDs plus latent features) to drugFeature.xlsx in the working directory.
wb.save('drugFeature.xlsx')