# TIF360 Project

# Data Pre-processing

Main source: https://www.kaggle.com/code/rmonge/predicting-molecule-properties-based-on-its-smiles/notebook

### Import packages

UPDATED ON 15/05-2023  11:20

In [56]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import math
import atomInSmiles
from torch.utils.data import DataLoader


In [2]:
# Input is list of strings on the form ['a','b','c']
# where the string are tokens
# Dictionary is the dictionary containign all possible tokens
# and an index for them
# MaxTokenLength is the max amount of tokens any input creates
def create_Onehot_Matrix(input, dictionary,maxTokenLength):
    # Create a matrix of zeros
    onehot_Matrix = np.zeros((len(dictionary),maxTokenLength))
    
    # Change value in right place to one
    keyCount = 0
    for key in input:
        onehot_Matrix[dictionary[key],keyCount] = 1
        keyCount+=1

    # Return it
    return(onehot_Matrix)

In [3]:

# Input is list of strings on the form ['a','b','c']
# where the string are tokens
# Dictionary is the dictionary containign all possible tokens
# and an index for them
# MaxTokenLength is the max amount of tokens any input creates

# THIS VERSION RETURNS TORCH TENSOR

#Version taht returns tensor
def create_encoded_tensor(input, dictionary,maxTokenLength):
    # Create a matrix of zeros
    #encoded_vector = np.zeros((1,maxTokenLength))
    encoded_tensor = torch.zeros(maxTokenLength,dtype=torch.long)
    
    # Change value in right place to one
    keyCount = 0
    for key in input:
        encoded_tensor[keyCount] = dictionary[key]
        keyCount+=1
    
    return encoded_tensor.unsqueeze(0)

In [4]:


df = pd.read_csv("../data/smiles_and_targets.csv")
print(np.shape(df))


(132820, 21)


In [59]:

properties_names = ['A', 'B', 'C', 'mu', 'alfa', 'homo', 'lumo', 'gap', 'R²', 'zpve', 'U0', 'U', 'H', 'G', 'Cv']

x_smiles = df.smiles.values
targetTensor = torch.tensor(df.loc[:, properties_names].values, dtype=torch.float32)  # shape = (n_samples, n_properties)


In [6]:

testSmile = x_smiles[8]
print(testSmile)
tokens = atomInSmiles.encode(testSmile)
tokens = tokens.split()
print(tokens)
print(x_smiles.shape)



CC(C)C#N
['[CH3;!R;C]', '[CH;!R;CCC]', '(', '[CH3;!R;C]', ')', '[C;!R;CN]', '#', '[N;!R;C]']
(132820,)


In [7]:
# tokenize all smiles
#import atomInSmiles

tokenList = []
for smile in x_smiles:
    tokenList.append(atomInSmiles.encode(smile).split())
#tokenList is target

# find longest word (max tokens from one smile+1)
maxTokenLength = 0
for token in tokenList:
    if len(token)>maxTokenLength:
        maxTokenLength = len(token)
print('Longest word (max amount of tokens):', maxTokenLength)

Longest word (max amount of tokens): 22


In [8]:

# Give each token a index in a dictionary
tokenDict = {}
count = 1

dictList = []
for itokens in tokenList:
    for itoke in itokens:
        #print(itoke)
        if tokenDict.get(itoke) == None:
            tokenDict[itoke] = count
        
            # current = [itoke, count]
            # dictList.append(current)
            count+=1

In [28]:
encodedTensors = torch.tensor([],dtype=int)
for token in tokenList:
    encodedTensors = torch.cat((encodedTensors,create_encoded_tensor(token,tokenDict,maxTokenLength)),dim=0)

print(encodedTensors.shape)


torch.Size([132820, 22])


In [29]:
# From attention is all you need
class PositionalEncoding(nn.Module):
    "Implement the PE function."
    def __init__(self, d_model, max_len,dropout):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        
        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) *
                             -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
        
    def forward(self, x):
        # x = x + torch.tensor(self.pe[:, :x.size(1)], 
        #                  requires_grad=False)
        x = x + self.pe[:x.size(0), :].detach()
        return self.dropout(x)

In [31]:


# Define the size of the embedding space
embedding_dim = 64
vocab_size = len(tokenDict)

embedding_layer = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

# Try two inputs and concat them (this will be done for batches)
input_sentence = create_encoded_tensor(tokenList[100],tokenDict,maxTokenLength)
other_input = create_encoded_tensor(tokenList[150],tokenDict,maxTokenLength)

input_sentence = torch.cat((input_sentence,other_input),dim=0)
print(input_sentence)

# Pass the input tensor through the embedding layer
embedded_tensor = embedding_layer(input_sentence)

# Print the shape of the output tensor
print(embedded_tensor.shape)  # should be (1, 22, 64)
print(embedded_tensor)

posencoder = PositionalEncoding(embedding_dim,maxTokenLength,0.1)
encoded = posencoder.forward(embedded_tensor)
print(encoded.shape)

tensor([[ 6, 22,  8, 15,  9, 26, 18, 10, 17,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0],
        [17, 10, 78, 33, 34, 35,  8, 15,  9, 34, 33,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0]])
torch.Size([2, 22, 64])
tensor([[[-0.1868,  1.1513,  0.4625,  ..., -0.7419,  1.0518, -1.1054],
         [-1.8999,  1.1342, -0.4428,  ...,  0.0293, -0.2000, -0.0839],
         [ 0.8472, -0.4422,  0.9121,  ...,  2.9850,  1.7387, -1.0390],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[-0.3443, -0.3187,  0.1928,  ..., -1.1231, -1.4207,  0.1440],
         [ 0.9567,  0.9758, -1.7416,  ..., -0.8060, -0.8557, -1.2738],
         [ 0.0918, -1.8911, -0.3763,  ...,  0.2317,  1.3684,  1.3230],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0

In [32]:

class TransformerLayer(torch.nn.Module):
    def __init__(self, embedding_dim, hidden_channels):
        super().__init__()
        torch.manual_seed(12345)
        self.Attention = torch.nn.MultiheadAttention(embedding_dim,num_heads=4,dropout=0.15)
        self.Norm1 = torch.nn.LayerNorm(embedding_dim)
        self.Dense1 = torch.nn.Linear(embedding_dim,hidden_channels)
        self.Dense2 = torch.nn.Linear(hidden_channels,hidden_channels)
        self.Norm2 = torch.nn.LayerNorm(hidden_channels)
        

    def forward(self, x):
        addNormX = x
        #print(x.shape)
        x, _ = self.Attention(x,x,x)
        #print('attention output',x.shape)
        x = self.Norm1(x + addNormX)
        #print('norm + input',x.shape)
        addNormX = x
        x = self.Dense1(x)
        #print('first dense output',x.shape)
        x = self.Dense2(x)
        #print('second dense output',x.shape)
        x = self.Norm2(x + addNormX)
        #print(x.shape)

 
        return x

In [33]:
class TransformerNetwork(torch.nn.Module):
    def __init__(self,hidden_channels,output_dim, vocab_size, maxTokenLength):
        super().__init__()
        torch.manual_seed(12345)
        self.embedding_dim = 64
        # Embedd and add pos encoding to input
        self.EmbeddingLayer = torch.nn.Embedding(num_embeddings=vocab_size,embedding_dim = self.embedding_dim , max_norm=True)
        self.PositionalEncoding = PositionalEncoding(self.embedding_dim, maxTokenLength, dropout = 0.1)

        
        self.TransEnc1 = TransformerLayer(self.embedding_dim,hidden_channels)
        self.Pooling = torch.nn.AvgPool1d(kernel_size= 2)

        self.DenseOut1 = torch.nn.Linear(embedding_dim,hidden_channels)
        self.DenseOut2 = torch.nn.Linear(hidden_channels,output_dim)

    def forward(self,x):
        x = self.EmbeddingLayer(x)
        x = self.PositionalEncoding(x)
        x = self.TransEnc1(x)
        #x = self.Pooling(x)
        x = x[:,-1,:]
        x = self.DenseOut1(x)
        x = self.DenseOut2(x)

        return x

In [34]:
transformer = TransformerNetwork(64,15,len(tokenDict),maxTokenLength)
input_sentence
output = transformer.forward(input_sentence)
print(output, output.shape)

tensor([[-0.6938,  0.1116,  0.7285, -0.3562,  0.4006, -0.0077, -0.0471,  0.0307,
          0.2455, -0.1964, -0.0895, -0.2404,  0.6831, -0.6525, -0.3149],
        [-0.5684,  0.0187,  0.4665, -0.3207,  0.3235,  0.1496,  0.0632,  0.0263,
          0.0735, -0.2491,  0.2192, -0.1793,  0.4808, -0.6225, -0.1946]],
       grad_fn=<AddmmBackward0>) torch.Size([2, 15])


In [66]:
from torch.utils.data import Dataset, DataLoader
batch_size = 64
## Create iterable dataset class:

class datasetObject(Dataset):
    def __init__(self,data,targets):
        self.data = data
        self.targets = targets
    def __len__(self):
        return len(self.data)
    def __getitem__(self, index):
        sample = self.data[index]
        target = self.targets[index]
        return torch.tensor(sample), torch.tensor(target)


print(encodedTensors.shape)
data = DataLoader(datasetObject(encodedTensors,targetTensor),batch_size,shuffle=False)

torch.Size([132820, 22])


In [67]:
d_model = 64
hidden_channels = 64
d_target = 15
vocab_size = len(tokenDict)


model = TransformerNetwork(hidden_channels,d_target,vocab_size,maxTokenLength) 
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
criterion = torch.nn.MSELoss()

def train(data_in, targets):
      model.train()
      optimizer.zero_grad()  # Clear gradients.
      out = model(data_in)
      loss = criterion(out, targets)

      loss.backward()  # Derive gradients.
      optimizer.step()  # Update parameters based on gradients.

      return loss



testTensors = encodedTensors[:100]
testTargets = targetTensor[:100]

for epoch in range(1000):
      for input,targets in data:
            loss = train(input,targets)
      print(loss)

  return torch.tensor(sample), torch.tensor(target)


In [54]:
print(model(testTensors[0]))
print(testTargets[0])

tensor([[ 7.7383e+00,  3.2666e+00,  2.5561e+00,  2.3051e+00,  4.8976e+01,
         -2.1962e-01,  3.4287e-02,  2.7112e-01,  5.3636e+02,  9.4944e-02,
         -2.6862e+02, -2.6863e+02, -2.6862e+02, -2.6865e+02,  2.1880e+01]],
       grad_fn=<AddmmBackward0>)
tensor([ 0.0000e+00,  4.4260e+00,  4.4260e+00,  0.0000e+00,  3.8520e+01,
        -2.5990e-01, -2.1400e-02,  2.3860e-01,  2.7863e+02,  3.7354e-02,
        -1.5346e+02, -1.5346e+02, -1.5345e+02, -1.5348e+02,  1.5312e+01])
