# TIF360 Project

# Data Pre-processing

Main source: https://www.kaggle.com/code/rmonge/predicting-molecule-properties-based-on-its-smiles/notebook

### Import packages

UPDATED ON 15/05-2023  11:20

In [1]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import math
import atomInSmiles
from torch.utils.data import DataLoader

import sys
sys.path.append('../')
from code_graphs.utility_functions import get_num_parameters , get_data_split_indices, scale_targets


import random


In [2]:

# Input is list of strings on the form ['a','b','c']
# where the string are tokens
# Dictionary is the dictionary containign all possible tokens
# and an index for them
# MaxTokenLength is the max amount of tokens any input creates

# THIS VERSION RETURNS TORCH TENSOR

#Version taht returns tensor
def create_encoded_tensor(input, dictionary,maxTokenLength):
    # Create a matrix of zeros
    #encoded_vector = np.zeros((1,maxTokenLength))
    encoded_tensor = np.zeros(maxTokenLength,dtype=np.int32)
    
    # Change value in right place to one
    keyCount = 0
    for key in input:
        encoded_tensor[keyCount] = dictionary[key]
        keyCount+=1
    
    # encoded_tensor = np.expand_dims(encoded_tensor,axis=1)
    return encoded_tensor


In [3]:


df = pd.read_csv("../data/smiles_and_targets.csv")
print(np.shape(df))
mol_descriptor = np.load("../data/mol_descriptors.npy")
print(mol_descriptor.shape)

#print(mol_descriptor[0])

(132820, 21)
(132820, 179)


In [4]:

properties_names = ['A', 'B', 'C', 'mu', 'alfa', 'homo', 'lumo', 'gap', 'R²', 'zpve', 'U0', 'U', 'H', 'G', 'Cv']

x_smiles = df.smiles.values
targetTensor = df.loc[:, properties_names].values # shape = (n_samples, n_properties)


In [5]:
# tokenize all smiles
#import atomInSmiles

tokenList = []
for smile in x_smiles:
    tokenList.append(atomInSmiles.encode(smile).split())
#tokenList is target

# find longest word (max tokens from one smile+1)
maxTokenLength = 0
for token in tokenList:
    if len(token)>maxTokenLength:
        maxTokenLength = len(token)
print('Longest word (max amount of tokens):', maxTokenLength)

Longest word (max amount of tokens): 22


In [6]:

# Give each token a index in a dictionary
tokenDict = {}
count = 1

dictList = []
for itokens in tokenList:
    for itoke in itokens:
        #print(itoke)
        if tokenDict.get(itoke) == None:
            tokenDict[itoke] = count
        
            # current = [itoke, count]
            # dictList.append(current)
            count+=1

In [7]:
encodedTokens = []
for token in tokenList:
    encodedTokens.append(create_encoded_tensor(token,tokenDict,maxTokenLength))
    
encodedTokens = np.array(encodedTokens)


In [8]:
# From attention is all you need
class PositionalEncoding(nn.Module):
    "Implement the PE function."
    def __init__(self, d_model, max_len,dropout):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        
        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) *
                             -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
        
    def forward(self, x):
        # x = x + torch.tensor(self.pe[:, :x.size(1)], 
        #                  requires_grad=False)
        x = x + self.pe[:x.size(0), :].detach()
        return self.dropout(x)

In [13]:

class TransformerLayer(torch.nn.Module):
    def __init__(self, embedding_dim, hidden_channels):
        super().__init__()
        self.Attention = torch.nn.MultiheadAttention(embedding_dim,num_heads=8,dropout=0.15)
        self.Norm1 = torch.nn.LayerNorm(embedding_dim)
        self.Dense1 = torch.nn.Linear(embedding_dim,hidden_channels)
        self.relu = torch.nn.ReLU()
        self.Dense2 = torch.nn.Linear(hidden_channels,embedding_dim)
        

        self.Norm2 = torch.nn.LayerNorm(embedding_dim)
        

    def forward(self, x):
        addNormX = x
        #print(x.shape)
        x, _ = self.Attention(x,x,x)
        #print('attention output',x.shape)
        x = self.Norm1(x + addNormX)
        #print('norm + input',x.shape)
        addNormX = x
        x = self.Dense1(x)
        x = self.relu(x)
        #print('first dense output',x.shape)
        x = self.Dense2(x)
        #print('second dense output',x.shape)
        x = self.Norm2(x + addNormX)
        #print(x.shape)

 
        return x

In [10]:
class TransformerNetwork(torch.nn.Module):
    def __init__(self,hidden_channels,output_dim, vocab_size, embedding_dim,maxTokenLength, nDescriptors):
        super().__init__()
        # Embedd and add pos encoding to input
        self.EmbeddingLayer = torch.nn.Embedding(num_embeddings=vocab_size,embedding_dim = embedding_dim , max_norm=True)
        self.PositionalEncoding = PositionalEncoding(embedding_dim, maxTokenLength, dropout = 0.15)

        self.TransEnc1 = TransformerLayer(embedding_dim,hidden_channels)
        self.TransEnc2 = TransformerLayer(embedding_dim,hidden_channels)
        self.TransEnc3 = TransformerLayer(embedding_dim,hidden_channels)
        self.Pooling = torch.nn.AvgPool1d(kernel_size= 22)

        self.DenseOut1 = torch.nn.Linear(embedding_dim+nDescriptors,hidden_channels)
        self.DenseOut2 = torch.nn.Linear(hidden_channels,output_dim)
        self.relu = torch.nn.ReLU()


    def forward(self,x,descriptors):
        x = self.EmbeddingLayer(x)
        x = self.PositionalEncoding(x)
        x = self.TransEnc1(x)
        x = self.TransEnc2(x)
        x = self.TransEnc3(x)
        x = self.Pooling(x.permute((0,2,1))).permute((0,2,1))
        x = torch.squeeze(x,axis=1)
        x = torch.cat((x,descriptors),1)
        #x = x[:,-1,:]


        x = self.DenseOut1(x)
        x = self.relu(x)
        x = self.DenseOut2(x)
        
        return x

In [11]:
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler, StandardScaler
batch_size = 50
## Create iterable dataset class:

class datasetObject(Dataset):
    def __init__(self,data,targets,descriptors):
        self.data = data
        self.descriptors = descriptors
        self.targets = targets
    def __len__(self):
        return len(self.data)
    def __getitem__(self, index):
        sample = self.data[index]
        target = self.targets[index]
        descriptor = self.descriptors[index]
        return (sample), (target), (descriptor)

trainSplit, validationSplit, testSplit = get_data_split_indices(len(encodedTokens),0.1,0.1)

# Training
encodedTrainData = encodedTokens[trainSplit]
trainTargets = targetTensor[trainSplit]

# Validation
encodedValidationData = encodedTokens[validationSplit]
validationTargets = targetTensor[validationSplit]

# Test
encodedTestData = encodedTokens[testSplit]
testTargets = targetTensor[testSplit]

# Descriptors
trainDescriptors = mol_descriptor[trainSplit]
valDescriptors = mol_descriptor[validationSplit]
testDescriptors = mol_descriptor[testSplit]


trainTargets, validationTargets, testTargets, scalerTargets = scale_targets(trainTargets,validationTargets,testTargets)


# Normalize descriptors and fingerprints using minmax scaler
minmax_scaler = MinMaxScaler()
trainDescriptors = minmax_scaler.fit_transform(trainDescriptors)
valDescriptors = minmax_scaler.transform(valDescriptors)
testDescriptors = minmax_scaler.transform(testDescriptors)


# Make into tensors
#print(np.dtype(trainTargets[0,0]))
trainTargets = torch.tensor(trainTargets,dtype=torch.float)
validationTargets = torch.tensor(validationTargets,dtype=torch.float)
testTargets = torch.tensor(testTargets,dtype=torch.float)

# Decriptors to tensor
trainDescriptors = torch.from_numpy(trainDescriptors).float()
valDescriptors = torch.from_numpy(valDescriptors).float()
testDescriptors = torch.from_numpy(testDescriptors).float()

trainingData = DataLoader(datasetObject(encodedTrainData,trainTargets,trainDescriptors),batch_size)
testData = DataLoader(datasetObject(encodedTestData,testTargets,testDescriptors),batch_size)
validationData = DataLoader(datasetObject(encodedValidationData,validationTargets,valDescriptors),batch_size)

In [14]:
hidden_channels = 64
embedding_dim = 32
d_target = 15
nDescriptors = len(mol_descriptor[0])
vocab_size = len(tokenDict)+1
torch.manual_seed(12345)

model = TransformerNetwork(hidden_channels,d_target,vocab_size, embedding_dim, maxTokenLength,nDescriptors) 
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005, weight_decay=5e-4)
# decay learning rate
decayRate = 0.8
lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=decayRate)
criterion = torch.nn.MSELoss()

def train(data_in, targets, descriptors):
      model.train()
      optimizer.zero_grad()  # Clear gradients.
      out = model(data_in,descriptors)
      loss = criterion(out, targets)
      
      loss.backward()  # Derive gradients.
      optimizer.step()  # Update parameters based on gradients.
      return loss

def infer(data_in, targets, descriptors):
      model.eval()
      out = model(data_in, descriptors)
      loss = criterion(out, targets)
      return loss


for epoch in range(1000):
      loss = 0
      for input,targets,descriptor in trainingData:
            loss += train(input,targets,descriptor).detach()
      
      test = 0
      for input, targets, descriptor in testData:
            test += infer(input,targets,descriptor)

      val = 0
      for input,targets,descriptor in validationData:
            val += infer(input,targets,descriptor)
      
      lr_scheduler.step()

      print(f"#### Epoch: {epoch} of 1000 ####\n Training loss:\t{loss/len(trainingData)}\n Test loss:\t{test/len(testData)}\n Validation loss:\t{val/len(validationData)}")

#### Epoch: 0 of 1000 ####
 Training loss:	0.26369285583496094
 Test loss:	0.16000008583068848
 Validation loss:	0.16272114217281342
#### Epoch: 1 of 1000 ####
 Training loss:	0.14833083748817444
 Test loss:	0.14073042571544647
 Validation loss:	0.1433548778295517
#### Epoch: 2 of 1000 ####
 Training loss:	0.1372070461511612
 Test loss:	0.13319948315620422
 Validation loss:	0.13584037125110626
#### Epoch: 3 of 1000 ####
 Training loss:	0.13164262473583221
 Test loss:	0.12859566509723663
 Validation loss:	0.13133202493190765


KeyboardInterrupt: 