# TIF360 Project

# Data Pre-processing

Main source: https://www.kaggle.com/code/rmonge/predicting-molecule-properties-based-on-its-smiles/notebook

### Import packages

UPDATED ON 15/05-2023  11:20

In [1]:
import os
import numpy as np
import pandas as pd
import torch



In [2]:
# Input is list of strings on the form ['a','b','c']
# where the string are tokens
# Dictionary is the dictionary containign all possible tokens
# and an index for them
# MaxTokenLength is the max amount of tokens any input creates
def create_Onehot_Matrix(input, dictionary,maxTokenLength):
    # Create a matrix of zeros
    onehot_Matrix = np.zeros((len(dictionary),maxTokenLength))
    
    # Change value in right place to one
    keyCount = 0
    for key in input:
        onehot_Matrix[dictionary[key],keyCount] = 1
        keyCount+=1

    # Return it
    return(onehot_Matrix)

In [3]:

# Input is list of strings on the form ['a','b','c']
# where the string are tokens
# Dictionary is the dictionary containign all possible tokens
# and an index for them
# MaxTokenLength is the max amount of tokens any input creates
def create_encoded_vector(input, dictionary,maxTokenLength):
    # Create a matrix of zeros
    encoded_vector = np.zeros((1,maxTokenLength))
    
    # Change value in right place to one
    keyCount = 0
    for key in input:
        encoded_vector[0,keyCount] = dictionary[key]
        keyCount+=1
    
    return encoded_vector


In [4]:
testDict = {}
testDict['a'] = 1
testDict['b'] = 2
testDict['c'] = 3

In [5]:
create_encoded_vector(['b','c'],testDict,5)

array([[2., 3., 0., 0., 0.]])

In [6]:


df = pd.read_csv("../data/smiles_and_targets.csv")
print(np.shape(df))


(132820, 21)


In [7]:

properties_names = ['A', 'B', 'C', 'mu', 'alfa', 'homo', 'lumo', 'gap', 'R²', 'zpve', 'U0', 'U', 'H', 'G', 'Cv']

x_smiles = df.smiles.values
y = df.loc[:, properties_names].values  # shape = (n_samples, n_properties)


### Transformer Network

#### Model for all targets at once

Transformer

In [8]:
import atomInSmiles


testSmile = x_smiles[8]
print(testSmile)
tokens = atomInSmiles.encode(testSmile)
tokens = tokens.split()
print(tokens)
print(x_smiles.shape)



CC(C)C#N
['[CH3;!R;C]', '[CH;!R;CCC]', '(', '[CH3;!R;C]', ')', '[C;!R;CN]', '#', '[N;!R;C]']
(132820,)


In [9]:
# tokenize all smiles
import atomInSmiles

tokenList = []
for smile in x_smiles:
    tokenList.append(atomInSmiles.encode(smile).split())
#tokenList is target

# find longest word (max tokens from one smile+1)
maxTokenLength = 0
for token in tokenList:
    if len(token)>maxTokenLength:
        maxTokenLength = len(token)
print('Longest word (max amount of tokens):', maxTokenLength)

Longest word (max amount of tokens): 22


In [10]:

# Give each token a index in a dictionary
tokenDict = {}
count = 1

dictList = []
for itokens in tokenList:
    for itoke in itokens:
        #print(itoke)
        if tokenDict.get(itoke) == None:
            tokenDict[itoke] = count
        
            # current = [itoke, count]
            # dictList.append(current)
            count+=1

In [11]:

encodedVectors = []
for token in tokenList:
    encodedVectors.append(create_encoded_vector(token,tokenDict,maxTokenLength))


In [25]:
import math

## Parameters
seq_len = 22
d_model = 128
n_heads = 4
device = 'cpu'

class PositionalEncoder(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # Make initial positional encoding matrix with 0
        pe_matrix= torch.zeros(seq_len, d_model) # (L, d_model)

        # Calculating position encoding values
        for pos in range(seq_len):
            for i in range(d_model):
                if i % 2 == 0:
                    pe_matrix[pos, i] = math.sin(pos / (10000 ** (2 * i / d_model)))
                elif i % 2 == 1:
                    pe_matrix[pos, i] = math.cos(pos / (10000 ** (2 * i / d_model)))

        pe_matrix = pe_matrix.unsqueeze(0) # (1, L, d_model)
        self.positional_encoding = pe_matrix.to(device=device).requires_grad_(False)

    def forward(self, x):
        x = x * math.sqrt(d_model) # (B, L, d_model)
        x = x + self.positional_encoding # (B, L, d_model)

        return x

class TransformerLayer(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        torch.manual_seed(12345)
        self.Attention = torch.nn.MultiheadAttention(d_model,num_heads=n_heads,dropout=0.15)
        self.Norm1 = torch.nn.LayerNorm(d_model)
        self.Dense1 = torch.nn.Linear(hidden_channels,hidden_channels)
        self.Norm2 = torch.nn.LayerNorm(hidden_channels)
        self.Dense2 = torch.nn.Linear(hidden_channels,hidden_channels)
        

    def forward(self, x):
        addNormX = x
        x = self.Attention(x,x,x)
        x = self.Norm1(x + addNormX)
        addNormX = x
        x = self.Dense1(x)
        x = self.Dense2(x)
        x = self.Norm2(x + addNormX)

       
 
        return x
    
class TransformerNetwork(torch.nn.Module):
    def __init__(self,hidden_channels,output_dim):
        super().__init__()
        torch.manual_seed(12345)
        #self.EmbeddingLayer = torch.nn.Embedding(num_embeddings=148,embedding_dim = 600 , max_norm=True)

        self.PositionalEncoding = PositionalEncoder()
        self.TransEnc1 = TransformerLayer(hidden_channels)
        
        self.Pooling = torch.nn.AvgPool1d(hidden_channels)

        self.DenseOut1 = torch.nn.Linear(hidden_channels,hidden_channels)
        self.DenseOut2 = torch.nn.Linear(hidden_channels,output_dim)

    def forward(self,x):
        
        x = self.PositionalEncoding(x)
        x = self.TransEnc1(x)
        x = self.Pooling(x)
        x = self.DenseOut1(x)
        x = self.DenseOut2(x)


        return x






In [26]:
## Train
hidden_channels = 64

print("model init start")

model = TransformerNetwork(hidden_channels=hidden_channels,output_dim=15)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005, weight_decay=5e-4)
criterion = torch.nn.MSELoss()
print("model init finished")

def train(data_in, target):
      model.train()
      optimizer.zero_grad()  # Clear gradients.
      out = model(data_in)
      
      loss = criterion(out, target)   

      loss.backward()  # Derive gradients.
      optimizer.step()  # Update parameters based on gradients.
      return loss

for epoch in range(10):
      loss = train(encodedVectors[10],target = y)
      print(loss)

model init start
model init finished


TypeError: Concatenation operation is not implemented for NumPy arrays, use np.concatenate() instead. Please do not rely on this error; it may not be given on all Python implementations.