In [1]:
import pandas as pd
import numpy as np

In [3]:
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
# Session configuration: log which device each op is placed on.
config = tf.ConfigProto(log_device_placement=True)
# Cap this process at 20% of the GPU memory (0.2 is the per-process fraction).
config.gpu_options.per_process_gpu_memory_fraction = 0.2
# Hand the configured session to Keras so its TensorFlow backend uses it.
set_session(tf.Session(config=config))

In [2]:
import keras
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation, BatchNormalization
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Activation, Dropout, Flatten, Dense

Using TensorFlow backend.


In [12]:
# Load the preprocessed SMILES CSV. The file is read without a header row, so
# columns are labelled 0, 1, ...; rows containing any missing value are dropped.
df = pd.read_csv("preprocessed_smiles.csv",header=None).dropna()

In [13]:
# Join every entry of column 1 into one newline-separated corpus and derive the
# character vocabulary from the *whole* corpus.
text = "\n".join(df[1])
print('corpus length:', len(text))
chars = sorted(set(text))
print('total chars:', len(chars))

# Lookup tables: character -> integer id and integer id -> character.
char_indices = {symbol: position for position, symbol in enumerate(chars)}
indices_char = {position: symbol for position, symbol in enumerate(chars)}

# Window length and stride used when cutting the corpus into training sequences.
maxlen = 40
step = 3

# From here on only the first 2000 entries of column 1 are used as the text;
# the vocabulary above still reflects the full corpus.
text = "\n".join(df[1][:2000])

corpus length: 93318139
total chars: 27


In [14]:
# Cut `text` into overlapping windows of `maxlen` characters, advancing by
# `step` each time. Every window is one input sample; the character that
# immediately follows the window is its prediction target.
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))
print('Vectorization...')
# One-hot encode inputs as (samples, maxlen, vocabulary) and targets as
# (samples, vocabulary). The builtin `bool` is used as dtype because the
# `np.bool` alias was deprecated in NumPy 1.20 and removed in 1.24.
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = True
    y[i, char_indices[next_chars[i]]] = True
print('Vectorization done')

nb sequences: 73230
Vectorization...
Vectorization done


In [30]:
# Character-level network: two stacked 256-unit LSTMs, each followed by batch
# normalisation, then a dense layer with one unit per vocabulary symbol and a
# softmax over those units.
# The layer order and arguments must stay exactly as they are so that
# previously saved weights can still be loaded into this topology.
# NOTE(review): the first BatchNormalization receives the 3-D sequence output
# of the LSTM, so axis=1 there refers to the time-step axis rather than the
# feature axis - confirm this is intended before changing it.
model = Sequential([
    LSTM(
        256,
        input_shape=X.shape[1:3],
        return_sequences=True,
        recurrent_dropout=0.2,
    ),
    BatchNormalization(axis=1),
    LSTM(256, return_sequences=False, recurrent_dropout=0.2),
    BatchNormalization(axis=1),
    Dense(y.shape[1]),
    Activation('softmax'),
])


In [31]:
# Restore trained weights from an HDF5 file into the model defined above.
# NOTE(review): the file name looks like a checkpoint name (epoch 12,
# loss 0.5986) - presumably produced by an earlier training run; confirm.
filename = "biggerdata-weights-improvement-12-0.5986.hdf5"
model.load_weights(filename)
# Compile with categorical cross-entropy and the Adam optimizer so the model
# is ready for predict/fit calls.
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [4]:
# Raw SMILES string that will seed the generator.
seq = "CCN(CC)C(=O)[C@H]1CN(C)[C@@H]2Cc3c[nH]c4cccc(C2=C1)c34"
# Wrap it in a one-row, one-column DataFrame (column label 0) so that the
# pandas-based clean-up below can be applied to it.
df1 = pd.DataFrame([seq])


In [7]:
# Normalise the seed SMILES so that it only contains the single-character
# alphabet listed in `needed_symbs`.
#
# Accept `df1` either as the one-column DataFrame built from `seq` or as the
# Series this cell itself produces, so re-running the cell does not fail.
smiles_column = df1[0] if isinstance(df1, pd.DataFrame) else df1

# Regex -> replacement table (applied with regex=True):
#   * two-letter tokens are collapsed to one symbol each (nH -> A, Cl -> L,
#     Br -> R);
#   * stereo-annotated carbons (C@, C@@, C@H, C@@H) become a plain C - a single
#     pattern is used so the result does not depend on dict iteration order;
#   * bond-direction markers "/" and "\" are removed ('\\\\' is the regex for
#     one literal backslash).
token_map = {
    "nH": "A",
    "Cl": "L",
    "Br": "R",
    "C@@?H?": "C",
    "/": "",
    '\\\\': "",
}
needed_symbs = 'CFLRIONSAcons123456789=#()'

# Apply the token table, then drop every character outside `needed_symbs`.
# `regex=True` is passed explicitly to str.replace: newer pandas versions treat
# the pattern as a literal string by default, which would silently skip the
# filtering step.
df1 = (
    smiles_column
    .replace(to_replace=token_map, regex=True)
    .str.replace("[^" + needed_symbs + "]", "", regex=True)
    .str.strip()
)

In [11]:
# Rebind `seq` to the first element of df1's underlying array
# (presumably the cleaned seed string produced by the clean-up cell - confirm
# that cell has been run first).
seq = df1.values[0]

In [25]:
# Rebuild the vocabulary from the full corpus (column 1 of df) so that the
# character ids are derived from the same data as before.
text = "\n".join(df[1])
print('corpus length:', len(text))
chars = sorted(set(text))
print('total chars:', len(chars))

# Character <-> integer id lookup tables.
char_indices = dict(zip(chars, range(len(chars))))
indices_char = dict(enumerate(chars))

# Window length and stride for cutting the text into fixed-size sequences.
maxlen = 40
step = 3

# Replace the working text with the contents of df1[0], which is what gets
# windowed and encoded next.
text = "".join(df1[0])


corpus length: 93318139
total chars: 27


In [26]:
# Cut the seed `text` into windows of `maxlen` characters (stride `step`) and
# record the character following each window.
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))
# A text of at most `maxlen` characters yields no window at all; fail here with
# a clear message instead of an IndexError when the first window is used later.
if not sentences:
    raise ValueError(
        "seed text has %d characters but more than maxlen=%d are required"
        % (len(text), maxlen)
    )
print('Vectorization...')
# One-hot encode as (samples, maxlen, vocabulary) / (samples, vocabulary).
# The builtin `bool` replaces the `np.bool` alias, which was deprecated in
# NumPy 1.20 and removed in 1.24.
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = True
    y[i, char_indices[next_chars[i]]] = True
print('Vectorization done')

nb sequences: 1
Vectorization...
Vectorization done


In [27]:
# Show the shape of the one-hot input array: (windows, maxlen, len(chars)).
X.shape

(1, 40, 27)

In [28]:
# Use the first one-hot encoded window, shape (maxlen, len(chars)), as the
# initial input pattern for generation.
pattern = X[0]

In [32]:
# Decode the one-hot seed window back to characters and show it.
print ("Seed:")
print ("\"", ''.join([indices_char[value.argmax()] for value in pattern]), "\"")

# Generate 1000 characters greedily: predict the next character from the
# current window, emit it, then slide the window forward by one position.
# Each character is echoed to stdout and also written to "generated_stuff";
# previously the file was opened (and truncated) but never written to.
# Because the writes happen inside the `with` block, the file is closed
# properly even if the loop is interrupted.
# NOTE(review): always taking the argmax makes the output deterministic and
# prone to falling into a repeating cycle - consider sampling if variety is
# wanted.
with open("generated_stuff","w+") as f:
    for _ in range(1000):
        # The model expects a batch axis: (1, maxlen, len(chars)).
        x = np.reshape(pattern,(1,pattern.shape[0],pattern.shape[1]))
        prediction = model.predict(x, verbose=0)
        index = np.argmax(prediction)
        result = indices_char[index]
        print(result,end="")
        f.write(result)
        # One-hot row for the predicted character; append it to the window and
        # drop the oldest time step so the window length stays constant.
        row = np.zeros(len(chars))
        row[index] = 1
        pattern = np.vstack([pattern[1:], row])
    print ("\nDone.")

Seed:
" CCN(CC)C(=O)C1CN(C)C2Cc3cnc4cccc(C2=C1)c "
3cR(C)RR3C
CC(C)CC(LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L)LC(=N)C(CCCL=C(L)L

KeyboardInterrupt: 