# TIF360 Project

# Data Pre-processing

Main source: https://www.kaggle.com/code/rmonge/predicting-molecule-properties-based-on-its-smiles/notebook

### Import packages

UPDATED ON 15/05-2023  11:20

In [1]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import math
import atomInSmiles



In [2]:
# Input is list of strings on the form ['a','b','c']
# where the string are tokens
# Dictionary is the dictionary containign all possible tokens
# and an index for them
# MaxTokenLength is the max amount of tokens any input creates
def create_Onehot_Matrix(input, dictionary,maxTokenLength):
    # Create a matrix of zeros
    onehot_Matrix = np.zeros((len(dictionary),maxTokenLength))
    
    # Change value in right place to one
    keyCount = 0
    for key in input:
        onehot_Matrix[dictionary[key],keyCount] = 1
        keyCount+=1

    # Return it
    return(onehot_Matrix)

In [3]:

# Input is list of strings on the form ['a','b','c']
# where the string are tokens
# Dictionary is the dictionary containign all possible tokens
# and an index for them
# MaxTokenLength is the max amount of tokens any input creates

# THIS VERSION RETURNS TORCH TENSOR

#Version taht returns tensor
def create_encoded_tensor(input, dictionary,maxTokenLength):
    # Create a matrix of zeros
    #encoded_vector = np.zeros((1,maxTokenLength))
    encoded_tensor = torch.zeros(maxTokenLength,dtype=torch.long)
    
    # Change value in right place to one
    keyCount = 0
    for key in input:
        encoded_tensor[keyCount] = dictionary[key]
        keyCount+=1
    
    return encoded_tensor.unsqueeze(0)

In [4]:


df = pd.read_csv("../data/smiles_and_targets.csv")
print(np.shape(df))


(132820, 21)


In [5]:

properties_names = ['A', 'B', 'C', 'mu', 'alfa', 'homo', 'lumo', 'gap', 'R²', 'zpve', 'U0', 'U', 'H', 'G', 'Cv']

x_smiles = df.smiles.values
y = df.loc[:, properties_names].values  # shape = (n_samples, n_properties)


In [6]:

testSmile = x_smiles[8]
print(testSmile)
tokens = atomInSmiles.encode(testSmile)
tokens = tokens.split()
print(tokens)
print(x_smiles.shape)



CC(C)C#N
['[CH3;!R;C]', '[CH;!R;CCC]', '(', '[CH3;!R;C]', ')', '[C;!R;CN]', '#', '[N;!R;C]']
(132820,)


In [7]:
# tokenize all smiles
#import atomInSmiles

tokenList = []
for smile in x_smiles:
    tokenList.append(atomInSmiles.encode(smile).split())
#tokenList is target

# find longest word (max tokens from one smile+1)
maxTokenLength = 0
for token in tokenList:
    if len(token)>maxTokenLength:
        maxTokenLength = len(token)
print('Longest word (max amount of tokens):', maxTokenLength)

Longest word (max amount of tokens): 22


In [8]:

# Give each token a index in a dictionary
tokenDict = {}
count = 1

dictList = []
for itokens in tokenList:
    for itoke in itokens:
        #print(itoke)
        if tokenDict.get(itoke) == None:
            tokenDict[itoke] = count
        
            # current = [itoke, count]
            # dictList.append(current)
            count+=1

In [15]:
encodedTensors = []
for token in tokenList:
    encodedTensors.append(create_encoded_tensor(token,tokenDict,maxTokenLength))


In [10]:
# From attention is all you need
class PositionalEncoding(nn.Module):
    "Implement the PE function."
    def __init__(self, d_model, max_len,dropout):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        
        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) *
                             -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
        
    def forward(self, x):
        # x = x + torch.tensor(self.pe[:, :x.size(1)], 
        #                  requires_grad=False)
        x = x + self.pe[:x.size(0), :].detach()
        return self.dropout(x)

In [11]:


# Define the size of the embedding space
embedding_dim = 64
vocab_size = len(tokenDict)

embedding_layer = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

# Try two inputs and concat them (this will be done for batches)
input_sentence = create_encoded_tensor(tokenList[100],tokenDict,maxTokenLength)
other_input = create_encoded_tensor(tokenList[150],tokenDict,maxTokenLength)

input_sentence = torch.cat((input_sentence,other_input),dim=0)
print(input_sentence)

# Pass the input tensor through the embedding layer
embedded_tensor = embedding_layer(input_sentence)

# Print the shape of the output tensor
print(embedded_tensor.shape)  # should be (1, 22, 64)
print(embedded_tensor)

posencoder = PositionalEncoding(embedding_dim,maxTokenLength,0.1)
encoded = posencoder.forward(embedded_tensor)
print(encoded.shape)

tensor([[ 6, 22,  8, 15,  9, 26, 18, 10, 17,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0],
        [17, 10, 78, 33, 34, 35,  8, 15,  9, 34, 33,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0]])
torch.Size([2, 22, 64])
tensor([[[-2.1050, -0.2118, -0.5040,  ...,  1.0604, -0.1704,  1.3155],
         [-0.5433,  0.5343,  0.7047,  ..., -0.9396,  0.5494,  0.7721],
         [ 1.8270, -0.0359, -0.3390,  ...,  0.3046,  0.3862, -1.1825],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[ 0.1980,  0.5071, -0.7858,  ...,  0.3391,  0.1083, -0.6995],
         [ 0.3467, -1.1938, -0.4162,  ..., -0.6308,  0.9829, -1.8134],
         [-0.3443,  0.0583, -0.7377,  ..., -1.0526, -0.4647,  0.3889],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0

In [12]:

class TransformerLayer(torch.nn.Module):
    def __init__(self, embedding_dim, hidden_channels):
        super().__init__()
        torch.manual_seed(12345)
        self.Attention = torch.nn.MultiheadAttention(embedding_dim,num_heads=4,dropout=0.15)
        self.Norm1 = torch.nn.LayerNorm(embedding_dim)
        self.Dense1 = torch.nn.Linear(embedding_dim,hidden_channels)
        self.Dense2 = torch.nn.Linear(hidden_channels,hidden_channels)
        self.Norm2 = torch.nn.LayerNorm(hidden_channels)
        

    def forward(self, x):
        addNormX = x
        #print(x.shape)
        x, _ = self.Attention(x,x,x)
        #print('attention output',x.shape)
        x = self.Norm1(x + addNormX)
        #print('norm + input',x.shape)
        addNormX = x
        x = self.Dense1(x)
        #print('first dense output',x.shape)
        x = self.Dense2(x)
        #print('second dense output',x.shape)
        x = self.Norm2(x + addNormX)
        #print(x.shape)

 
        return x

In [13]:
class TransformerNetwork(torch.nn.Module):
    def __init__(self,hidden_channels,output_dim, vocab_size, maxTokenLength):
        super().__init__()
        torch.manual_seed(12345)
        self.embedding_dim = 64
        # Embedd and add pos encoding to input
        self.EmbeddingLayer = torch.nn.Embedding(num_embeddings=vocab_size,embedding_dim = self.embedding_dim , max_norm=True)
        self.PositionalEncoding = PositionalEncoding(self.embedding_dim, maxTokenLength, dropout = 0.1)

        
        self.TransEnc1 = TransformerLayer(self.embedding_dim,hidden_channels)
        self.Pooling = torch.nn.AvgPool1d(kernel_size= 2)

        self.DenseOut1 = torch.nn.Linear(embedding_dim,hidden_channels)
        self.DenseOut2 = torch.nn.Linear(hidden_channels,output_dim)

    def forward(self,x):
        print('\n\n')
        x = self.EmbeddingLayer(x)
        print(x.shape)
        x = self.PositionalEncoding(x)
        print(x.shape)
        x = self.TransEnc1(x)
        print(x.shape)
        print(x[0,21])
        #x = self.Pooling(x)
        x = x[:,-1,:]
        print('pooling dim',x.shape)
        x = self.DenseOut1(x)
        print(x.shape)
        x = self.DenseOut2(x)
        print(x.shape)


        return x

In [14]:
transformer = TransformerNetwork(64,15,len(tokenDict),maxTokenLength)
input_sentence
output = transformer.forward(input_sentence)
print(output, output.shape)




torch.Size([2, 22, 64])
torch.Size([2, 22, 64])
torch.Size([2, 22, 64])
tensor([ 7.1570e-01, -7.3668e-01, -4.9669e-01, -1.6925e+00, -2.2311e+00,
         9.0536e-01,  4.3680e-01, -1.6489e+00,  1.2685e+00,  1.6000e+00,
        -2.2714e+00,  2.6225e-01, -1.0952e+00, -1.6934e+00,  1.7096e-03,
        -1.9170e+00,  1.4335e+00, -7.1031e-01,  3.8876e-01, -7.4992e-01,
         7.2632e-01,  4.4680e-01,  2.8045e-01, -5.5665e-01,  1.4424e-01,
         5.6454e-01,  1.2163e-01,  1.0425e+00, -9.9575e-01,  9.9013e-01,
        -6.0947e-02,  7.3357e-01,  3.8932e-01,  1.2274e+00, -3.7945e-01,
         1.1557e+00, -1.1910e+00,  1.1991e+00, -6.6615e-01, -5.6490e-01,
        -3.6339e-01,  1.4515e+00, -4.5286e-01,  1.5222e+00, -1.0882e+00,
         1.7610e+00, -1.9938e-01, -5.6633e-01, -2.5335e-01,  6.0761e-01,
         1.4522e-01, -5.1201e-02, -6.9115e-01, -1.7498e-01, -7.5945e-01,
         8.5050e-01,  1.4900e-01,  1.1494e+00, -1.3176e+00,  1.4731e+00,
        -6.2288e-01,  9.3401e-01, -5.3023e-01,  6