In [2]:
from __future__ import print_function
from rdkit import Chem
import numpy as np
import pandas as pd

In [5]:
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto(log_device_placement=True)
config.gpu_options.per_process_gpu_memory_fraction = 0.2
set_session(tf.Session(config=config))

In [3]:
import keras
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation, BatchNormalization
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Activation, Dropout, Flatten, Dense

Using TensorFlow backend.


In [4]:
df = pd.read_csv("NLP_project/preprocessed_smiles.csv",header=None).dropna()

In [5]:
text = "\n".join(df[1])
print('corpus length:', len(text))
chars = sorted(list(set(text)))
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))# cutting the SMILES corpus into the sequences used for training
maxlen = 80
step = 1
text = "\n".join(df[1][:2000])

corpus length: 93318139
total chars: 27


In [7]:
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))
print('Vectorization...')
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1
print('Vectorization done')

nb sequences: 219648
Vectorization...
Vectorization done


In [8]:
def load_model(filename):
    model = Sequential()
    model.add(LSTM(256, input_shape=( maxlen, len(chars)),return_sequences=True,recurrent_dropout=0.2))
    model.add(BatchNormalization(axis=1))
    model.add(LSTM(256,return_sequences=False,recurrent_dropout=0.2))
    model.add(BatchNormalization(axis=1))
    model.add(Dense(len(chars)))
    model.add(Activation('softmax'))
    model.load_weights(filename)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

In [9]:
def decode(seq):
    seq = seq.replace("A","[nH]")
    seq = seq.replace("L","Cl")
    seq = seq.replace("R","Br")
    return seq

In [68]:
def generate(seed="",nseq_max=10):
    model = load_model("NLP_project/checkpoints/cudnn-overfitting-bigdata-weights-improvement-15-0.4576.hdf5")
    
    nseq = 0
    while (nseq<nseq_max):
        pattern = np.zeros((maxlen, len(chars)), dtype=np.bool)
        seed = seed[::-1]
        pattern[-1] = char_indices["\n"]
        for i, char in enumerate(seed):
            pattern[(-2-i)][char_indices[char]] = 1
        seq = []
        for i in range(200):
            x = np.reshape(pattern,(1,pattern.shape[0],pattern.shape[1]))
            prediction = model.predict(x, verbose=0)
            index = np.argmax(prediction)
            result = indices_char[index]
    
            if(result=="\n"):
                break
            seq.append(result)
            row = np.zeros(len(chars))
            row[index] = 1
            pattern = np.vstack([pattern,row])
            pattern = pattern[1:len(pattern)]
        sec = decode(''.join(seq))
        if(len(sec)>1 and Chem.MolFromSmiles(sec) is not None):
            print("Mol no.",nseq,decode(''.join(seq)))
            nseq+=1
        start = np.random.randint(maxlen, len(text) - 3000)
        end = text.index("\n",start)
        seed= text[end-maxlen+1:end]

    print ("\nDone.")

In [69]:
generate("")

Mol no. 0 CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
Mol no. 1 CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
Mol no. 2 CC(=N)NC1CCC2(C)C(CCC3(C)C2CC(NC(=N)C)C4C(CCC34C)NC(=N)C)C1=N
Mol no. 3 CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
Mol no. 4 CC(=CCCC(=CCCC(=CCCC1(C)CCC(=N)C(C)(C)C1)C)C)CCCC(=C)C
Mol no. 5 CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
Mol no. 6 CCCCCCCCC

In [63]:
a = Chem.MolFromSmiles("")

In [64]:
a

<rdkit.Chem.rdchem.Mol at 0x25067d3ca30>