# 1
Some functions that might be useful

In [1]:
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import sklearn 
import numpy as np
import seaborn as sns

In [2]:
# Load the tokenised SMILES data set (a list with one token list per molecule).
# NOTE: unpickling can execute arbitrary code, so only load this file from a trusted source.
# The context manager closes the file handle, which the bare open() call leaked.
with open("ani_smiles.pkl", "rb") as smiles_file:
    smiles = pickle.load(smiles_file)
print(len(smiles))
type(smiles)

1770


list

In [3]:
# Sentinel tokens marking the start and the end of every SMILES token sequence
SOS_char, EOS_char = 'SOS', 'EOS'

In [4]:
# Wrap every token sequence in the SOS/EOS markers.
# New inner lists are built on purpose: `smiles.copy()` is a shallow copy, so appending to
# its inner lists also modified `smiles` and added another marker pair on every re-run.
smiles_copy = [[SOS_char, *smile_str, EOS_char] for smile_str in smiles]
smiles_copy[:2]

[['SOS',
  '[',
  'H',
  ']',
  'C',
  '(',
  '[',
  'H',
  ']',
  ')',
  '(',
  '[',
  'H',
  ']',
  ')',
  '[',
  'H',
  ']',
  'EOS'],
 ['SOS', '[', 'H', ']', 'N', '(', '[', 'H', ']', ')', '[', 'H', ']', 'EOS']]

In [5]:
# Vocabulary: the sorted array of distinct tokens found across all SMILES sequences
# (SOS/EOS included). The sequences are flattened in one pass; the previous
# `sum(list_of_lists, [])` re-copied the accumulated list for every sequence.
unique_chars = np.unique([token for entry in smiles_copy for token in entry])
unique_chars

array(['#', '(', ')', '1', '2', '=', 'C', 'EOS', 'H', 'N', 'O', 'SOS',
       '[', ']', 'c', 'n', 'o'], dtype='<U3')

In [6]:
# Vocabulary size; should match the expected number of distinct tokens
unique_chars.size

17

In [7]:
from sklearn.preprocessing import OneHotEncoder

# Fit the encoder on the vocabulary laid out as a single column (one row per token)
enc = OneHotEncoder()
enc.fit(unique_chars[:, np.newaxis])

In [8]:
# Inspect the categories the encoder learned during fit (one array for the single token column)
enc.categories_

[array(['#', '(', ')', '1', '2', '=', 'C', 'EOS', 'H', 'N', 'O', 'SOS',
        '[', ']', 'c', 'n', 'o'], dtype='<U3')]

In [9]:
# Sanity check: one-hot encode the first sequence including its SOS/EOS markers
# (one row per token, one column per vocabulary entry). `smiles_copy` is read explicitly
# rather than relying on `smiles` having been modified as a side effect.
enc.transform(np.array(smiles_copy[0]).reshape(-1,1))

<19x17 sparse matrix of type '<class 'numpy.float64'>'
	with 19 stored elements in Compressed Sparse Row format>

In [10]:
import torch
import torch.nn as nn
import numpy as np

def batches_gen(smiles, batchsize, encoder):
    '''Yield (features, targets) batches of one-hot encoded SMILES for next-token training.

    Every sequence is one-hot encoded, split into features (all tokens but the last) and
    targets (all tokens but the first), and zero-padded at the end to a common length.

       Arguments
       ---------
       smiles: list of token sequences (each a list of strings known to the encoder)
       batchsize: number of sequences per batch
       encoder: fitted one-hot encoder; transform(column) must return an object with toarray()

       Yields
       ------
       (X, y): two float tensors of shape (batchsize, longest_sequence - 1, nchars), where
       y is X shifted by one token. Only full batches are produced; a trailing partial
       batch is dropped so every batch has the same size.
    '''
    # One (seq_length, nchars) float tensor per sequence; seq_length varies per sequence
    arr = [torch.tensor(encoder.transform(np.array(s).reshape(-1, 1)).toarray(), dtype=torch.float)
           for s in smiles]

    # pad_sequence raises on an empty list, so an empty data set simply yields nothing
    if not arr:
        return

    # Features drop the last token, targets drop the first (shifted by one);
    # padding makes all sequences the same length
    X = nn.utils.rnn.pad_sequence([seq[:-1] for seq in arr], batch_first=True)
    y = nn.utils.rnn.pad_sequence([seq[1:] for seq in arr], batch_first=True)

    # Integer division drops the last, incomplete batch
    for i in range(len(arr) // batchsize):
        yield X[i * batchsize:(i + 1) * batchsize], y[i * batchsize:(i + 1) * batchsize]

    
   



In [11]:
# Index that separates the first 80% of the sequences from the remaining 20%
smiles_prop80 = int(0.8 * len(smiles_copy))
smiles_prop80

1416

In [12]:
# Train on the first 80% of the sequences and validate on the remaining 20%.
# The training generator previously covered the whole data set, so the validation
# sequences were also part of the training data.
train_set = batches_gen(smiles=smiles_copy[:smiles_prop80], batchsize=1, encoder=enc)
val_set = batches_gen(smiles=smiles_copy[smiles_prop80:], batchsize=1, encoder=enc)

## Defining LSTM model

In [13]:
class LSTM(nn.Module):
    """LSTM followed by a linear layer that maps each hidden state back to token scores.

    input_size: int
        Length of one input vector per time step (the one-hot vocabulary size); also the
        size of the output vector per time step.
    n_hidden: int
        Number of hidden units per LSTM layer.
    n_layers: int
        Number of stacked LSTM layers.

    The defaults (17, 32, 1) reproduce the previously hard-coded architecture.
    """

    def __init__(self, input_size=17, n_hidden=32, n_layers=1):
        super(LSTM, self).__init__()
        # Kept as attributes so init_state always agrees with the layers built below
        self.input_size = input_size
        self.n_layers = n_layers
        self.n_hidden = n_hidden

        self.lstm = nn.LSTM(
            input_size=input_size,   # size of one input vector
            hidden_size=n_hidden,    # rnn hidden units
            num_layers=n_layers,     # number of stacked rnn layers
            batch_first=True,)       # tensors are (batch, time_step, feature)
        self.out = nn.Linear(n_hidden, input_size)

    def forward(self, x, h_state):
        """Run the sequence through the LSTM and project every time step.

        x: tensor (batch, time_step, input_size)
        h_state: tuple (hidden, cell), each (n_layers, batch, n_hidden)

        @return
        (outs, h_state): outs is (batch, time_step, input_size); h_state is the updated
        (hidden, cell) tuple
        """
        # r_out (batch, time_step, n_hidden)
        r_out, h_state = self.lstm(x, h_state)
        outs = self.out(r_out)
        return outs, h_state

    def init_state(self, batchsize):
        """Return zero-filled (hidden, cell) states for a batch of `batchsize` sequences."""
        # new_zeros places the states on the same device and dtype as the model weights
        reference = self.out.weight
        shape = (self.n_layers, batchsize, self.n_hidden)
        return (reference.new_zeros(shape),  # hidden state
                reference.new_zeros(shape))  # cell state

In [14]:
# Learning rate constant and a freshly initialised network
LR = 0.02
lstm = LSTM()

In [15]:
from torch.optim import SGD, Adam
import torch
from torch import nn
import torch.nn.functional as F
import random
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
class Trainer():
    """Trains a recurrent next-token model on one-hot encoded SMILES batches."""

    def __init__(self, model, optimizer_type, learning_rate, epoch, batch_size, input_transform=lambda x: x.reshape(x.shape[0], -1)):
        """ The class for training the model
        model: nn.Module
            A pytorch model. train() calls it as model(x, state) and expects
            (prediction, state) back; it must also provide init_state(batchsize).
        optimizer_type: 'adam' or 'sgd'
        learning_rate: float
        epoch: int
            Number of passes over the training data
        batch_size: int
            Number of sequences per training batch
        input_transform: func
            Stored on the instance; train() does not apply it.

        Raises ValueError for an unknown optimizer_type (previously the trainer was
        created without an optimizer and failed later inside train()).
        """
        self.model = model
        if optimizer_type == "sgd":
            self.optimizer = SGD(model.parameters(), learning_rate, momentum=0.9)
        elif optimizer_type == "adam":
            self.optimizer = Adam(model.parameters(), learning_rate)
        else:
            raise ValueError(f"optimizer_type must be 'sgd' or 'adam', got {optimizer_type!r}")

        self.epoch = epoch
        self.batch_size = batch_size
        self.input_transform = input_transform

    def train(self, early_stop=False, l2=False, silent=False, smiles=None, encoder=None):
        """ train self.model with specified arguments
        early_stop: bool
            If True, the weights from the epoch with the lowest mean training loss are
            restored at the end. No validation data is used by this method, so the
            training loss is the only criterion available.
        l2: bool. Accepted for compatibility; currently has no effect
        silent: bool. Accepted for compatibility; currently has no effect
        smiles: list of token sequences to train on. Defaults to the notebook-level
            `smiles_copy`.
        encoder: fitted one-hot encoder. Defaults to the notebook-level `enc`.

        @return
        a dictionary with the trained model under "model" and the list of mean
        training losses (one float per epoch) under "losses"
        """
        # Fall back to the notebook globals so existing `trainer.train()` calls keep working
        if smiles is None:
            smiles = smiles_copy
        if encoder is None:
            encoder = enc

        loss_func = nn.MSELoss()
        # Encoding and padding do not change between epochs, so build the batches once
        batches = list(batches_gen(smiles=smiles, batchsize=self.batch_size, encoder=encoder))

        losses = []
        lowest_loss = np.inf
        best_weights = None
        for n_epoch in tqdm(range(self.epoch), leave=False):
            self.model.train()
            epoch_loss = 0.0
            for x, y in batches:
                # Every molecule is an independent sequence: start each batch from a zero
                # state (as sampling does) instead of carrying state over from the
                # previous, unrelated batch.
                state = self.model.init_state(x.shape[0])
                prediction, _ = self.model(x, state)    # rnn output
                loss = loss_func(prediction, y)         # calculate loss
                self.optimizer.zero_grad()              # clear gradients for this training step
                loss.backward()                         # backpropagation, compute gradients
                self.optimizer.step()                   # apply gradients
                epoch_loss += loss.item()

            mean_loss = epoch_loss / len(batches) if batches else float("nan")
            losses.append(mean_loss)

            if early_stop and mean_loss < lowest_loss:
                lowest_loss = mean_loss
                # state_dict() returns references to the live parameters, so the tensors
                # must be cloned for the snapshot to survive further optimizer steps
                best_weights = {name: tensor.detach().clone()
                                for name, tensor in self.model.state_dict().items()}

        if early_stop and best_weights is not None:
            self.model.load_state_dict(best_weights)

        return {"model": self.model, "losses": losses}

In [16]:
# Adam optimiser with the learning rate defined by LR (0.02), 17 epochs, batch size 12.
# Keyword arguments make the positional numbers self-describing.
lstm_trainer = Trainer(lstm, "adam", learning_rate=LR, epoch=17, batch_size=12)

In [17]:
# Run the training loop; the returned dict holds the trained network under the "model" key
trained_lstm = lstm_trainer.train()

                                                                                                                                                                                     

In [18]:
# Display the trained network (its repr lists the layers)
trained_lstm['model']

LSTM(
  (lstm): LSTM(17, 32, batch_first=True)
  (out): Linear(in_features=32, out_features=17, bias=True)
)

In [19]:
# Defining a method to generate the next character
def predict(net, inputs, h, top_k=None):
    ''' Given a onehot encoded character, sample the next character.
        Returns the sampled onehot encoded character and the updated hidden state.
    Arguments:
        net: the lstm model, called as net(inputs, h) -> (out, h)
        inputs: float tensor of shape (1, time_step, input_size), i.e. batchsize of 1.
            If several time steps are given, the output of the last one is used.
        h: hidden state (h,c)
        top_k: int or None. Sample only from the top k scoring characters;
            None samples from the whole vocabulary.
    Returns:
        (output, h): output is a float tensor of shape (1, 1, input_size) with a single 1
        at the sampled character; h is the hidden state returned by the model.
    '''
    # detach hidden state from history
    h = tuple(each.detach() for each in h)
    # get the output of the model; no gradients are needed for sampling
    with torch.no_grad():
        out, h = net(inputs, h)
    # scores of the last time step for the single sequence in the batch
    scores = out[0, -1, :]
    n_chars = scores.shape[-1]

    # get top characters
    if top_k is None:
        candidates = np.arange(n_chars)  # index to choose from
    else:
        scores, top_idx = scores.topk(min(top_k, n_chars))
        # reshape(-1) keeps a 1-D array even for top_k=1 (squeeze would make it 0-d)
        candidates = top_idx.numpy().reshape(-1)

    # The model outputs are unnormalised scores and can be negative, which
    # np.random.choice rejects as probabilities: clip them at zero before normalising.
    weights = np.clip(scores.numpy().astype(np.float64).reshape(-1), 0.0, None)
    total = weights.sum()
    if total > 0:
        probs = weights / total
    else:
        # no positive score at all: fall back to a uniform choice among the candidates
        probs = np.full(len(candidates), 1.0 / len(candidates))

    # select the likely next character with some element of randomness
    char = int(np.random.choice(candidates, p=probs))

    # return the onehot encoded value of the predicted char and the hidden state
    output = torch.zeros(1, 1, n_chars, dtype=torch.float)
    output[0, 0, char] = 1
    return output, h

# Declaring a method to generate new text
def sample(net, encoder, prime=['SOS'], top_k=None, max_len=1000):
    """Generate a SMILES string starting from prime.

    'SOS' (start of string) and 'EOS' (end of string) are the start and end tokens;
    change them here if the data uses different markers.

    Arguments:
        net: the lstm model; must provide eval() and init_state(batchsize)
        encoder: fitted one-hot encoder (transform, inverse_transform, categories_)
        prime: non-empty list of tokens fed to the model before generation starts.
            The default list is never modified.
        top_k: int or None, passed on to predict()
        max_len: int or None. Maximum number of generated tokens; generation stops there
            even if 'EOS' was never sampled. None disables the limit.

    Returns:
        The generated tokens joined into one string, without the final 'EOS'.

    Raises:
        ValueError: if prime is empty.
    """
    if len(prime) == 0:
        raise ValueError("prime must contain at least one token")

    net.eval()  # eval mode
    n_chars = len(encoder.categories_[0])

    def _encode(token):
        # one token -> float tensor of shape (1, 1, n_chars)
        onehot = encoder.transform(np.array([token]).reshape(-1, 1)).toarray()
        return torch.tensor(onehot, dtype=torch.float).reshape(1, 1, n_chars)

    # get initial hidden state with batchsize 1
    h = net.init_state(1)
    # First off, run through the prime characters; only the prediction made after the
    # last prime token starts the generated string
    for ch in prime:
        char, h = predict(net, _encode(ch), h, top_k=top_k)
    chars = [char]
    end = _encode('EOS')

    def _finished():
        return bool(torch.all(end.eq(chars[-1])))

    # Now pass in the previous character and get a new one, until EOS or the length limit
    while not _finished() and (max_len is None or len(chars) < max_len):
        char, h = predict(net, chars[-1], h, top_k=top_k)
        chars.append(char)

    onehots = np.array([c.detach().numpy() for c in chars]).reshape(-1, n_chars)
    tokens = encoder.inverse_transform(onehots).reshape(-1)
    # Strip the trailing EOS only when generation actually ended with it
    if _finished():
        tokens = tokens[:-1]
    return ''.join(tokens)

A website for checking whether a SMILES string is valid: https://chemwriter.com/smiles/ — it renders the structure when the string is valid.

## Testing out LSTM-Generated SMILES Strings

In [20]:
# Sample from the 6 highest-scoring tokens per step; the result is not a valid SMILES string
sample(trained_lstm['model'], enc, prime=['SOS'], top_k=6)

'[H]ON(=O)C([H])C([H])([H])=N==CC([H])([H])C([H])=#NC([H])([H])[H]'

In [21]:
# Second draw with top_k=6; the result is not a valid SMILES string
sample(trained_lstm['model'], enc, prime=['SOS'], top_k=6)

'[H](C(C([H])([H])[H])[H])C([H])([H])['

In [22]:
# Third draw with top_k=6; the result is not a valid SMILES string
sample(trained_lstm['model'], enc, prime=['SOS'], top_k=6)

'[H]C(=NOC([#N'

In [23]:
# Narrower sampling with top_k=3; the result is not a valid SMILES string
sample(trained_lstm['model'], enc, prime=['SOS'], top_k=3)

'[H]C=NN([H])OC([H])C(#C([H]C1[=NC([H])([HC([H])([H])[H])[H]'

In [24]:
# Sampling with top_k=4; the result is not a valid SMILES string
sample(trained_lstm['model'], enc, prime=['SOS'], top_k=4)

'[HC(=#CC([H])#CC([H]#C(([#CC([H])([H])[H=C1[H]'