# TIF360 Project

# Data Pre-processing

Main source: https://www.kaggle.com/code/rmonge/predicting-molecule-properties-based-on-its-smiles/notebook

### Import packages

UPDATED ON 15/05-2023  11:20

In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import math
import atomInSmiles



In [None]:
def create_Onehot_Matrix(input, dictionary,maxTokenLength):
    """Build a one-hot matrix for one tokenised input.

    Args:
        input: Sequence of token strings, e.g. ``['a', 'b', 'c']``.
        dictionary: Mapping from every possible token to its integer index.
        maxTokenLength: Number of columns, i.e. the largest number of tokens
            any input produces.

    Returns:
        A float numpy array with ``maxTokenLength`` columns. Column ``i`` has
        a single 1 in the row given by the index of the ``i``-th token;
        columns past the end of ``input`` stay all-zero. The row count is
        ``len(dictionary)``, or ``max index + 1`` when that is larger.

    Raises:
        KeyError: If a token is missing from ``dictionary``.
        IndexError: If ``input`` holds more than ``maxTokenLength`` tokens.
    """
    # Size the row axis from the largest index as well as the entry count:
    # with a 1-based dictionary the last token's index equals
    # len(dictionary), which would otherwise fall outside the matrix.
    largest_index = max(dictionary.values(), default=-1)
    row_count = max(len(dictionary), largest_index + 1)
    onehot_Matrix = np.zeros((row_count, maxTokenLength))

    # Mark the row of each token in the column matching its position.
    for column, key in enumerate(input):
        onehot_Matrix[dictionary[key], column] = 1

    return onehot_Matrix

In [None]:

def create_encoded_tensor(input, dictionary,maxTokenLength):
    """Encode a token list as a zero-padded row of token indices.

    Args:
        input: Sequence of token strings, e.g. ``['a', 'b', 'c']``.
        dictionary: Mapping from every possible token to its integer index.
        maxTokenLength: Length of the encoded row, i.e. the largest number
            of tokens any input produces.

    Returns:
        A ``torch.long`` tensor of shape ``(1, maxTokenLength)``. Position
        ``i`` holds the index of the ``i``-th token; positions past the end
        of ``input`` keep the value 0.
    """
    # Start from an all-zero row so unused trailing positions stay 0.
    padded_ids = torch.zeros(maxTokenLength, dtype=torch.long)

    # Write each token's index at the position where the token occurs.
    for slot, token in enumerate(input):
        padded_ids[slot] = dictionary[token]

    # Add a leading axis so several encoded rows can be concatenated.
    return padded_ids[None, :]

In [None]:


# Load the table of SMILES strings and target properties, then show its size.
df = pd.read_csv("../data/smiles_and_targets.csv")
print(df.shape)


(132820, 21)


In [None]:

# Columns of df that are used as regression targets.
properties_names = [
    'A', 'B', 'C', 'mu', 'alfa',
    'homo', 'lumo', 'gap', 'R²', 'zpve',
    'U0', 'U', 'H', 'G', 'Cv',
]

# Model inputs are the SMILES strings; targets are the property columns.
x_smiles = df["smiles"].values
y = df[properties_names].values  # shape = (n_samples, n_properties)


In [None]:

# Tokenise a single SMILES string to inspect what atomInSmiles produces.
testSmile = x_smiles[8]
print(testSmile)

# encode() returns one string; splitting on whitespace yields the token list.
tokens = atomInSmiles.encode(testSmile).split()
print(tokens)
print(x_smiles.shape)



CC(C)C#N
['[CH3;!R;C]', '[CH;!R;CCC]', '(', '[CH3;!R;C]', ')', '[C;!R;CN]', '#', '[N;!R;C]']
(132820,)


In [None]:
# Tokenise every SMILES string; tokenList[i] holds the tokens of x_smiles[i].
tokenList = [atomInSmiles.encode(smile).split() for smile in x_smiles]

# The longest token sequence determines the padded length of every input.
maxTokenLength = max(map(len, tokenList), default=0)
print('Longest word (max amount of tokens):', maxTokenLength)

Longest word (max amount of tokens): 22


In [None]:

# Assign every distinct token an integer index in order of first appearance.
# Indices start at 1, which leaves 0 unused by any token.
tokenDict = {}
count = 1

dictList = []
for sample_tokens in tokenList:
    for tok in sample_tokens:
        # Tokens that already have an index keep it.
        if tok in tokenDict:
            continue
        tokenDict[tok] = count
        count += 1

In [None]:
# Sanity check: encode the tokens of sample 100 as a zero-padded index tensor.
create_encoded_tensor(tokenList[100],tokenDict,maxTokenLength)

tensor([[ 6, 22,  8, 15,  9, 26, 18, 10, 17,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0]])

In [None]:
# From attention is all you need
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to embeddings, then dropout.

    Args:
        d_model: Size of the embedding (last) dimension.
        max_len: Largest sequence length the table is precomputed for.
        dropout: Dropout probability applied after adding the encodings.

    The table is stored as the buffer ``pe`` with shape
    ``(1, max_len, d_model)``, so it follows the module across devices and
    is saved in the state dict but is not a trainable parameter.
    """
    def __init__(self, d_model, max_len,dropout):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) *
                             -(math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For an odd d_model there is one fewer cosine column than sine
        # columns, so trim the cosine block to the number of odd slots.
        pe[:, 1::2] = torch.cos(angles)[:, :d_model // 2]
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe)`` for ``x`` of shape (batch, seq, d_model).

        ``seq`` may be any length up to ``max_len``.
        """
        # Slice the table along the sequence axis (dim 1). The leading axis
        # of size 1 broadcasts over the batch. Buffers never require grad,
        # so no detach is needed.
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

In [None]:


# Define the size of the embedding space
embedding_dim = 64
# Token indices run from 1 to len(tokenDict) and 0 is the padding index,
# so the embedding table needs len(tokenDict) + 1 rows. With only
# len(tokenDict) rows the highest-index token is out of range.
vocab_size = len(tokenDict) + 1

embedding_layer = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

# Try two inputs and concat them (this will be done for batches)
input_sentence = create_encoded_tensor(tokenList[100],tokenDict,maxTokenLength)
other_input = create_encoded_tensor(tokenList[150],tokenDict,maxTokenLength)

input_sentence = torch.cat((input_sentence,other_input),dim=0)
print(input_sentence)

# Pass the input tensor through the embedding layer
embedded_tensor = embedding_layer(input_sentence)

# Print the shape of the output tensor
print(embedded_tensor.shape)  # (2, maxTokenLength, embedding_dim)
print(embedded_tensor)

posencoder = PositionalEncoding(embedding_dim,maxTokenLength,0.1)
# Call the module itself rather than .forward so registered hooks also run.
encoded = posencoder(embedded_tensor)
print(encoded.shape)

tensor([[ 6, 22,  8, 15,  9, 26, 18, 10, 17,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0],
        [17, 10, 78, 33, 34, 35,  8, 15,  9, 34, 33,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0]])
torch.Size([2, 22, 64])
tensor([[[ 1.2193,  1.2010,  0.3540,  ...,  0.3075, -0.9805,  1.1671],
         [-0.0438, -0.8617,  0.5251,  ..., -0.4871,  0.4306, -0.9962],
         [-0.4599,  0.7623, -0.0772,  ..., -0.8726, -0.3948,  0.4671],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[ 0.0691,  0.9693, -0.6145,  ...,  0.2498, -0.5422, -2.0475],
         [-0.5789,  0.6443, -0.2303,  ...,  0.2763, -0.5292,  0.5862],
         [ 0.4219,  2.1597,  1.8189,  ...,  0.4619, -0.6049,  0.8817],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0

In [None]:

class TransformerLayer(torch.nn.Module):
    """One encoder block: self-attention and a two-layer dense block.

    Each sub-block is followed by a residual connection and layer norm.
    Input and output both have shape ``(batch, seq, embedding_dim)``.

    Args:
        embedding_dim: Size of the last dimension of the input. It must be
            divisible by the 4 attention heads.
        hidden_channels: Width of the intermediate dense layer.
    """
    def __init__(self, embedding_dim, hidden_channels):
        super().__init__()
        # NOTE: this reseeds the global torch RNG on every construction.
        torch.manual_seed(12345)
        # batch_first=True because inputs are (batch, seq, embed). With the
        # default (seq, batch, embed) layout, attention would run across the
        # samples of a batch instead of across the tokens of a sample.
        self.Attention = torch.nn.MultiheadAttention(embedding_dim,num_heads=4,dropout=0.15,batch_first=True)
        self.Norm1 = torch.nn.LayerNorm(embedding_dim)
        self.Dense1 = torch.nn.Linear(embedding_dim,hidden_channels)
        # Project back to embedding_dim so the residual add below is valid
        # for any hidden_channels, not only hidden_channels == embedding_dim.
        self.Dense2 = torch.nn.Linear(hidden_channels,embedding_dim)
        self.Norm2 = torch.nn.LayerNorm(embedding_dim)


    def forward(self, x, key_padding_mask=None):
        """Apply the block to ``x`` of shape (batch, seq, embedding_dim).

        Args:
            x: Input embeddings.
            key_padding_mask: Optional bool tensor of shape (batch, seq).
                True marks positions that attention should ignore.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Self-attention sub-block with residual connection.
        residual = x
        x, _ = self.Attention(x, x, x, key_padding_mask=key_padding_mask)
        x = self.Norm1(x + residual)

        # Dense sub-block with residual connection.
        # NOTE(review): there is no activation between Dense1 and Dense2, so
        # the two layers compose to a single linear map. Confirm whether a
        # nonlinearity was intended here.
        residual = x
        x = self.Dense1(x)
        x = self.Dense2(x)
        x = self.Norm2(x + residual)

        return x

In [None]:
class TransformerNetwork(torch.nn.Module):
    """Map a batch of token-index sequences to ``output_dim`` values each.

    Pipeline: embedding, positional encoding, one TransformerLayer, select
    the last sequence position, then two dense layers.

    Args:
        hidden_channels: Width passed to the transformer layer and used for
            the first output dense layer.
        output_dim: Number of values predicted per input sequence.
        vocab_size: Number of rows in the embedding table. It must be larger
            than the largest token index that will be fed in.
        maxTokenLength: Largest sequence length, used to size the positional
            encoding table.
    """
    def __init__(self,hidden_channels,output_dim, vocab_size, maxTokenLength):
        super().__init__()
        # NOTE: this reseeds the global torch RNG on every construction.
        torch.manual_seed(12345)
        self.embedding_dim = 64
        # Embed and add pos encoding to input.
        # max_norm is a float threshold; 1.0 is the value the previous
        # max_norm=True evaluated to.
        self.EmbeddingLayer = torch.nn.Embedding(num_embeddings=vocab_size,embedding_dim = self.embedding_dim , max_norm=1.0)
        self.PositionalEncoding = PositionalEncoding(self.embedding_dim, maxTokenLength, dropout = 0.1)


        self.TransEnc1 = TransformerLayer(self.embedding_dim,hidden_channels)
        # Currently unused in forward; kept so existing attribute access works.
        self.Pooling = torch.nn.AvgPool1d(kernel_size= 2)

        # Use the instance's embedding size rather than a notebook global of
        # the same name, so the class works without that global defined.
        self.DenseOut1 = torch.nn.Linear(self.embedding_dim,hidden_channels)
        self.DenseOut2 = torch.nn.Linear(hidden_channels,output_dim)

    def forward(self,x):
        """Return predictions of shape (batch, output_dim).

        Args:
            x: Integer tensor of token indices with shape (batch, seq).
        """
        x = self.EmbeddingLayer(x)
        x = self.PositionalEncoding(x)
        x = self.TransEnc1(x)
        # Keep only the last sequence position as the sequence summary.
        # NOTE(review): create_encoded_tensor zero-pads at the end, so for
        # sequences shorter than the maximum this position holds padding.
        # Confirm that this is the intended pooling.
        x = x[:,-1,:]
        x = self.DenseOut1(x)
        x = self.DenseOut2(x)

        return x

In [None]:
# Token indices run from 1 to len(tokenDict) and 0 is used for padding, so
# the embedding table needs len(tokenDict) + 1 rows.
# 64 hidden channels; 15 outputs, one per entry of properties_names.
transformer = TransformerNetwork(64,15,len(tokenDict) + 1,maxTokenLength)
# Call the module itself rather than .forward so registered hooks also run.
output = transformer(input_sentence)
print(output, output.shape)




torch.Size([2, 22, 64])
torch.Size([2, 22, 64])
torch.Size([2, 22, 64])
tensor([ 0.2317, -1.7196, -0.4611, -2.1485, -1.7559,  0.7880,  0.4311, -1.2885,
         0.9518,  1.5542, -2.2297,  0.1562, -0.4292, -1.5930, -0.1786, -2.3906,
         0.6507, -0.4795,  1.4866, -0.1383,  0.8093,  0.5200,  0.6291, -0.1061,
         0.7442,  0.7699,  0.1646,  1.3861, -0.1468, -0.0071, -0.6010,  0.2687,
         0.4238, -0.0924, -0.1894,  1.2532, -1.2466,  0.8433, -0.5003,  1.2910,
        -0.0162,  1.4309, -0.6904,  2.1680, -1.2105,  0.4798, -0.4218,  0.3475,
        -0.1546, -0.5598,  0.0577, -0.0787, -0.4059,  0.0695, -0.3042, -0.7817,
        -0.2326,  0.9268, -1.3745,  2.1353, -0.6652,  0.8512, -0.2480,  1.0262],
       grad_fn=<SelectBackward0>)
pooling dim torch.Size([2, 64])
torch.Size([2, 64])
torch.Size([2, 15])
tensor([[-0.3553, -0.0640,  0.4376, -0.3863,  0.4904,  0.0465,  0.2543, -0.1788,
          0.3031, -0.3016, -0.0719, -0.1588,  0.4374, -0.7130, -0.0026],
        [-0.3615,  0.012