In [20]:
import pandas as pd
import numpy as np

In [21]:
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
# Build the TensorFlow session configuration in one expression: log which
# device every op is placed on and set the per-process GPU memory fraction
# to 0.2, then register the resulting session with Keras.
config = tf.ConfigProto(
    log_device_placement=True,
    gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.2),
)
set_session(tf.Session(config=config))

In [23]:
# The CSV is read without a header row, so columns are addressed by integer
# position (0, 1, ...) in the cells that follow.
df = pd.read_csv(
    "data/preprocessed_smiles.csv",
    header=None,
)

In [31]:
# Concatenate every entry of column 1 into one long string and derive the
# character vocabulary from that full corpus.
text = "".join(df[1])
print('corpus length:', len(text))
chars = sorted(set(text))
print('total chars:', len(chars))
# Lookup tables in both directions: character -> index and index -> character.
char_indices = dict(zip(chars, range(len(chars))))
indices_char = dict(enumerate(chars))

# cutting the SMILES corpus into the sequences used for training
maxlen = 40
step = 3
# NOTE: `text` is rebound here to the first 100 rows of column 1 only; the
# vocabulary above was built from the whole column, so it may contain
# characters that never occur in this shortened training text.
text = "".join(df[1].iloc[:100])

corpus length: 96434193
total chars: 47


In [32]:
# Slide a window of `maxlen` characters over `text`, advancing `step`
# characters at a time. Each window is one training input and the character
# immediately after it is the prediction target.
window_starts = range(0, len(text) - maxlen, step)
sentences = [text[i: i + maxlen] for i in window_starts]
next_chars = [text[i + maxlen] for i in window_starts]
print('nb sequences:', len(sentences))
print('Vectorization...')
# One-hot encode inputs and targets.
# The `np.bool` alias was deprecated in NumPy 1.20 and removed in 1.24; the
# builtin `bool` yields the same dtype and works on every NumPy version.
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1
print('Vectorization done')

nb sequences: 4374
Vectorization...
Vectorization done


In [33]:
# Show the dimensions of the one-hot input array X.
X.shape

(4374, 40, 47)

In [34]:
# Show the dimensions of the one-hot target array y.
y.shape

(4374, 47)

In [35]:
import keras
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation, BatchNormalization
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Activation, Dropout, Flatten, Dense

In [43]:
# Two stacked LSTM layers, each followed by batch normalisation, ending in a
# softmax over the vocabulary (one output unit per column of `y`).
# NOTE(review): the first BatchNormalization receives the 3-D sequence output
# of an LSTM with return_sequences=True, so axis=1 there presumably refers to
# the time-step axis rather than the LSTM units -- confirm this is intended.
model = Sequential([
    LSTM(256, input_shape=X.shape[1:], return_sequences=True, recurrent_dropout=0.2),
    BatchNormalization(axis=1),
    LSTM(256, return_sequences=False, recurrent_dropout=0.2),
    BatchNormalization(axis=1),
    Dense(y.shape[1]),
    Activation('softmax'),
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


In [44]:
# define the checkpoint
# A weights file is written whenever the monitored training loss improves;
# the epoch number and loss value are embedded in the file name.
filepath = "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = keras.callbacks.ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
# Stop training once the training loss has not improved for 7 epochs.
early_stop = keras.callbacks.EarlyStopping(monitor='loss', verbose=1, patience=7)
# `early_stop` was previously created but never registered, so it had no
# effect; both callbacks are now passed to `model.fit`.
callbacks_list = [checkpoint, early_stop]

In [51]:
# Train on the one-hot windows; the returned History object is kept so the
# per-epoch metrics can be exported afterwards.
l_history = model.fit(
    X,
    y,
    batch_size=256,
    epochs=20,
    callbacks=callbacks_list,
)

Epoch 1/1

Epoch 00001: loss improved from 1.58844 to 1.38855, saving model to weights-improvement-01-1.3886.hdf5


In [52]:
# Persist the per-epoch training metrics, one row per epoch.
# `l_history.history` maps each metric name to a list with one value per
# epoch. The former `index=[0]` only matched a single-epoch run and raised a
# length-mismatch error for longer runs; the default index (0..n_epochs-1)
# produces the same file for one epoch and works for any epoch count.
pd.DataFrame(l_history.history).to_csv("history.csv")

In [44]:
# pick a random seed
start = np.random.randint(0, len(X)-1)
pattern = X[start]
print ("Seed:")
print ("\"", ''.join([indices_char[value.argmax()] for value in pattern]), "\"")
# generate characters
for i in range(1000):
    x = np.reshape(pattern,(1,pattern.shape[0],pattern.shape[1]))
    prediction = model.predict(x, verbose=0)
    index = np.argmax(prediction)
    result = indices_char[index]
    seq_in = [indices_char[value.argmax()]for value in pattern]
    print(result)
    row = np.zeros(len(chars))
    row[index] = 1
    pattern = np.vstack([pattern,row])
    pattern = pattern[1:len(pattern)]
print ("\nDone.")

Seed:
" 5ccccc5CC12CCC3C(CCCCA(Cc4ccccc4)C5=CC(= "
O
)
C
C
C
C
C
C
C
C
C
C
C
C
C
C
C
C
C
C
C
C
C
C
C
C
C
C
C
C
C
C
C
C
C
C
C
C
C
C
C
O
c
1
c
c
c
c
c
1
C
(
=
O
)
N
C
C
C
A
(
C
c
2
c
c
c
c
c
2
)
C
(
=
O
)
c
3
A
A
c
4
c
c
c
c
c
3
4
C
C
N
(
C
C
=
C
(
C
)
C
C
C
C
C
C
C
C
C
C
C
C
C
C
O
)
N
C
(
C
)
C
c
1
c
c
c
c
c
1
C
(
=
O
)
N
C
C
C
A
(
C
c
2
c
c
c
c
c
2
)
C
(
=
O
)
c
3
A
A
c
4
c
c
c
c
c
3
4
C
C
N
(
C
C
=
C
(
C
)
C
C
C
C
C
C
C
C
C
C
C
C
C
C
O
)
N
C
(
C
)
C
c
1
c
c
c
c
c
1
C
(
=
O
)
N
C
C
C
A
(
C
c
2
c
c
c
c
c
2
)
C
(
=
O
)
c
3
A
A
c
4
c
c
c
c
c
3
4
C
C
N
(
C
C
=
C
(
C
)
C
C
C
C
C
C
C
C
C
C
C
C
C
C
O
)
N
C
(
C
)
C
c
1
c
c
c
c
c
1
C
(
=
O
)
N
C
C
C
A
(
C
c
2
c
c
c
c
c
2
)
C
(
=
O
)
c
3
A
A
c
4
c
c
c
c
c
3
4
C
C
N
(
C
C
=
C
(
C
)
C
C
C
C
C
C
C
C
C
C
C
C
C
C
O
)
N
C
(
C
)
C
c
1
c
c
c
c
c
1
C
(
=
O
)
N
C
C
C
A
(
C
c
2
c
c
c
c
c
2
)
C
(
=
O
)
c
3
A
A
c
4
c
c
c
c
c
3
4
C
C
N
(
C
C
=
C
(
C
)
C
C
C
C
C
C
C
C
C
C
C
C
C
C
O
)
N
C
(
C
)
C
c
1
c
c
c
c
c
1
C
(
=
O
)
N
C
C
C
A
(
C
c
2
c
c
c
c
c
2
)
C
(
=
O

In [32]:
# Show the dimensions of the current generation window `pattern`.
pattern.shape

(40, 47)

In [45]:
# Show the sorted list of distinct characters found in the corpus.
chars

['#',
 '%',
 '(',
 ')',
 '.',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 '=',
 'A',
 'B',
 'C',
 'F',
 'I',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'R',
 'S',
 'T',
 'V',
 'X',
 'Z',
 'a',
 'b',
 'c',
 'e',
 'g',
 'i',
 'l',
 'o',
 'p',
 'r',
 's',
 't',
 'u']