# BUILDING THE GENERATOR
An LSTM-based RNN model serves as the SMILES string generator in this step; subsequently, an RL network will
use it as its initialization step.


In [4]:
import os
import random
import sys

import h5py
import keras
import numpy as np
import pandas as pd
from keras import layers
from matplotlib import pyplot as plt

## The Dataset
The GDBChEMBL SMILES string dataset, obtained from GenBank, was used to train the generator in this step.

In [5]:
# Read the entire corpus in one go. The vocabulary is taken from the raw
# text, so the newline separator is part of it alongside the SMILES symbols.
with open('data/GDBChEMBL.smi', 'r') as smiles_file:
    text = smiles_file.read()

chars = sorted(set(text))    # every distinct character, in sorted order
smiles = text.split('\n')    # one entry per line of the file

# Quick sanity check: corpus size, vocabulary size, and a peek at both.
for summary in (len(smiles), len(chars), smiles[:10], chars[:10]):
    print(summary)

9978096
25
['CC1(O)C([NH3+])C1O', 'NS(=O)(=O)C(O)C([NH3+])C([O-])=O', 'CC1C([NH3+])C1O', 'CS(=O)(=O)NC1C([NH3+])C1O', 'NS(=O)(=O)C1OC1C([O-])=O', 'NS(=O)(=O)CCS(N)(=O)=O', 'NS(=O)(=O)C1OCC([NH3+])C1O', 'CC1CC1C', '[NH3+]C1C(O)C1(O)C([O-])=O', 'CC1CC2C3CC(C)C2C13']
['\n', '#', '(', ')', '+', '-', '1', '2', '3', '4']


In [6]:
# Length of the longest SMILES string in the corpus; it is used further down
# as the time dimension of the one-hot input tensors.
maxlen = len(max(smiles, key=len))
print(maxlen)

57


In [7]:
def reweight_distribution(original_distribution, temperature=0.5):
    """Sharpen or flatten a probability distribution with a temperature.

    Each probability ``p`` is mapped to ``p ** (1 / temperature)`` and the
    result is renormalised so that it sums to 1. Temperatures below 1 make
    the distribution more peaked, temperatures above 1 make it more uniform.

    Args:
        original_distribution: 1-D array-like of non-negative probabilities.
        temperature: Positive scaling factor applied to the log-probabilities.

    Returns:
        The reweighted distribution, summing to 1.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # log(0) is -inf, which correctly maps back to probability 0; silence
    # the divide-by-zero warning NumPy emits for it.
    with np.errstate(divide='ignore'):
        scaled_logs = np.log(original_distribution) / temperature

    # Shift so the largest entry is 0 before exponentiating. Without this,
    # a low temperature drives every exponent to underflow, and the
    # normalisation below becomes 0 / 0 = NaN.
    scaled_logs = scaled_logs - np.max(scaled_logs)

    weights = np.exp(scaled_logs)
    return weights / np.sum(weights)

def sample(preds, temperature=1.0):
    """Draw one index from a probability vector after temperature scaling.

    The probabilities are reweighted as ``p ** (1 / temperature)``,
    renormalised, and a single index is drawn from the resulting
    distribution using NumPy's global random state.

    Args:
        preds: 1-D array-like of non-negative probabilities (for example a
            softmax output).
        temperature: Positive scaling factor; lower values make the draw
            more deterministic, higher values make it more random.

    Returns:
        The sampled index into ``preds``.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # float64 keeps the renormalised vector within the tolerance that
    # np.random.multinomial enforces on the sum of the probabilities.
    preds = np.asarray(preds).astype('float64')

    # log(0) is -inf, which maps back to probability 0; the warning is noise.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature

    # Shift by the maximum so at least one exponent is exp(0) = 1. Without
    # this, low temperatures underflow every term and produce NaNs.
    logits = logits - np.max(logits)

    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)

    # One multinomial trial yields a one-hot row; argmax recovers its index.
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

## Preparing the next Character List
For every fixed-length chunk sampled from an original SMILES string, the character that immediately follows it is recorded.
The resulting next-character list contains 137405065 elements.

In [6]:
# Build (window, next character) training pairs.
#
# For each molecule, roughly half as many windows as it has characters are
# drawn at random positions. Every window is `charslen` characters long and
# is paired with the single character that follows it.
smile_next_chars = []
smile_subsets = []
charslen = 3

for smile in smiles:
    # Largest start offset that still leaves room for a full window plus
    # the character after it.
    last_start = len(smile) - charslen - 1
    if last_start < 0:
        # Too short: it would yield a truncated window and an empty-string
        # target, which has no entry in the vocabulary.
        continue

    for _ in range(len(smile) // 2):
        start_idx = random.randint(0, last_start)
        window_end = start_idx + charslen
        smile_subsets.append(smile[start_idx:window_end])
        smile_next_chars.append(smile[window_end:window_end + 1])

print(len(smile_next_chars))

137405065


## Preparing the Input
The samples are first shuffled randomly to increase the entropy,
and the list is then sliced to a maximum of one million entries (my system's constraint).

In [None]:
# ONE HOT ENCODING
print('Vectorization...')

# Upper bound on the number of training pairs kept in memory.
NUM_TRAIN_SAMPLES = 1000000

# Character -> column index in the one-hot encoding.
char_indices = {char: idx for idx, char in enumerate(chars)}

assert len(smile_subsets) == len(smile_next_chars), "windows and targets are misaligned"

# Draw random positions and index both lists with them, so each window stays
# paired with its own next character. Pairs whose target is empty (window
# reached the end of the molecule) are dropped because they cannot be encoded.
sample_count = min(NUM_TRAIN_SAMPLES, len(smile_subsets))
picked = [
    idx
    for idx in random.sample(range(len(smile_subsets)), sample_count)
    if smile_next_chars[idx]
]
train_samples = [smile_subsets[idx] for idx in picked]
train_next_chars = [smile_next_chars[idx] for idx in picked]

# The time axis is `maxlen` long; each window fills only its first
# len(window) steps and the remaining steps stay all-zero.
x = np.zeros((len(train_samples), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(train_samples), len(chars)), dtype=bool)

for i, (subset, next_char) in enumerate(zip(train_samples, train_next_chars)):
    for t, char in enumerate(subset):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_char]] = 1

## SETTING UP THE MODEL
An LSTM layer with 128 units followed by a Dense layer with a softmax activation

In [None]:
# Next-character model: one 128-unit LSTM over the one-hot sequence, then a
# softmax over the vocabulary.
model = keras.models.Sequential()
model.add(layers.LSTM(128, input_shape=(maxlen, len(chars))))
model.add(layers.Dense(len(chars), activation='softmax'))

# `learning_rate` is the current name of this argument; the older `lr`
# spelling is deprecated and rejected by recent Keras releases.
optimizer = keras.optimizers.RMSprop(learning_rate=0.01)

# Targets are one-hot vectors, hence categorical cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

## TRAINING THE GENERATOR
The generator is trained on the prepared subset for 60 epochs. After every epoch, new sequences are generated
with a sampling mechanism to further increase the entropy, using a series of sampling temperatures to compare
how each one affects the randomness of the output.

In [None]:
NUM_EPOCHS = 60          # total number of training epochs
GENERATED_LENGTH = 100   # characters generated per temperature

# Character -> column index in the one-hot encoding.
char_indices = {char: idx for idx, char in enumerate(chars)}

# The seed search below retries until it finds a molecule longer than the
# window, so make sure at least one exists.
if maxlen <= charslen:
    raise ValueError("no SMILES string is long enough to cut a seed window from")

for epoch in range(1, NUM_EPOCHS + 1):
    print('\nepoch -- ', epoch)

    # Train one epoch at a time so samples can be generated in between.
    model.fit(x, y, batch_size=128, epochs=1)

    # Pick a random molecule that can supply a full seed window.
    seed_smile = random.choice(smiles)
    while len(seed_smile) <= charslen:
        seed_smile = random.choice(smiles)
    start_index = random.randint(0, len(seed_smile) - charslen - 1)
    seed_text = seed_smile[start_index: start_index + charslen]

    print('--- Generating with seed: "' + seed_text + '"')

    for temperature in [0.5, 1.0, 1.2]:
        print('\n------ temperature:', temperature)

        # Restart from the same seed so the temperatures are comparable.
        generated_text = seed_text
        sys.stdout.write(generated_text)

        for _ in range(GENERATED_LENGTH):
            # One-hot encode the current window into a batch of size 1.
            sampled = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(generated_text):
                sampled[0, t, char_indices[char]] = 1

            preds = model.predict(sampled, verbose=0)[0]
            next_index = sample(preds, temperature)
            next_char = chars[next_index]

            # Slide the window: drop the oldest character, append the new one.
            generated_text = generated_text[1:] + next_char
            sys.stdout.write(next_char)

        # End the line; re-printing the final window would duplicate the
        # last characters of the generated sequence.
        sys.stdout.write('\n')

In [None]:
print('Saving model...')

# Create the output directory up front so a missing folder cannot make the
# save fail once training has already finished.
os.makedirs("models", exist_ok=True)

# serialize the architecture to JSON
model_json = model.to_json()

with open("models/lstm-generator.json", "w") as json_file:
    json_file.write(model_json)

# save the full model in a single HDF5 file
model.save("models/lstm-generator.h5")

# serialize only the weights to HDF5
model.save_weights("models/lstm-generator-weights.h5")
print('saved')