# TIF360 Project

Main source: https://www.kaggle.com/code/rmonge/predicting-molecule-properties-based-on-its-smiles/notebook

### Import packages

Updated on 15/05/2023, 11:20

In [1]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import math
import atomInSmiles
from torch.utils.data import DataLoader

import sys
sys.path.append('../')
from code_graphs.utility_functions import get_num_parameters , get_data_split_indices, scale_targets
from classes import create_encoded_vector, PositionalEncoding


import random


In [3]:
# Select the compute device once: the GPU when CUDA is usable, otherwise the CPU.
has_cuda = torch.cuda.is_available()
print("cuda available:", has_cuda)
device = torch.device('cuda') if has_cuda else torch.device('cpu')
print("device:", device.type)
if has_cuda:
    # Report the name of the first visible GPU.
    print(torch.cuda.get_device_name(0))

cuda available: True
device: cuda
NVIDIA GeForce GTX 970


In [2]:


# Load the table holding one SMILES string and its target properties per row.
df = pd.read_csv("../data/smiles_and_targets.csv")
print(df.shape)

# Columns used as regression targets, in the order the model will predict them.
properties_names = [
    'A', 'B', 'C', 'mu', 'alfa',
    'homo', 'lumo', 'gap', 'R²', 'zpve',
    'U0', 'U', 'H', 'G', 'Cv',
]

x_smiles = df["smiles"].values
# Despite the name this is a NumPy array, shape = (n_samples, n_properties).
targetTensor = df[properties_names].values



(132820, 21)


In [None]:
# Tokenize every SMILES string: atomInSmiles.encode returns a whitespace-separated
# token string, so each entry of tokenList is the token list of one molecule.
tokenList = [atomInSmiles.encode(smile).split() for smile in x_smiles]

# Number of tokens in the longest tokenized molecule (0 when there are no molecules).
maxTokenLength = max((len(token) for token in tokenList), default=0)
print('Longest word (max amount of tokens):', maxTokenLength)

# Give each distinct token an integer index, in order of first appearance.
# Indices start at 1, so index 0 is never assigned to a real token
# (presumably it is the padding value used by create_encoded_vector -- TODO confirm).
tokenDict = {}
count = 1

dictList = []  # unused; kept so that the module-level name still exists
for itokens in tokenList:
    for itoke in itokens:
        if itoke not in tokenDict:
            tokenDict[itoke] = count
            count += 1

In [2]:
# Turn every token list into its encoded vector (built by create_encoded_vector
# from the token dictionary and the maximum token length) and stack the
# results into a single NumPy array.
encodedTokens = np.array(
    [create_encoded_vector(tokens, tokenDict, maxTokenLength) for tokens in tokenList]
)


NameError: name 'tokenList' is not defined

In [None]:

class TransformerLayer(torch.nn.Module):
    """Single transformer encoder block.

    Multi-head self-attention followed by a two-layer feed-forward network;
    each sub-layer is wrapped in a residual connection and a LayerNorm.

    Args:
        embedding_dim: size of the last dimension of the input.
        hidden_channels: width of the hidden feed-forward layer.
        num_heads: number of attention heads (torch requires that it divides
            embedding_dim).
        dropout: dropout probability applied to the attention weights.
        batch_first: when True (default) the input is laid out as
            (batch, sequence, embedding); when False as
            (sequence, batch, embedding).
    """

    def __init__(self, embedding_dim, hidden_channels, num_heads, dropout=0.15, batch_first=True):
        super().__init__()
        # torch's MultiheadAttention defaults to batch_first=False, i.e. it
        # treats dim 0 as the sequence axis. Fed with (batch, sequence, emb)
        # tensors that default makes samples of a batch attend to each other
        # instead of tokens attending within one sample, so the layout is
        # passed explicitly.
        self.Attention = torch.nn.MultiheadAttention(
            embedding_dim,
            num_heads=num_heads,
            dropout=dropout,
            batch_first=batch_first,
        )
        self.Norm1 = torch.nn.LayerNorm(embedding_dim)
        self.Dense1 = torch.nn.Linear(embedding_dim, hidden_channels)
        self.relu = torch.nn.ReLU()
        self.Dense2 = torch.nn.Linear(hidden_channels, embedding_dim)
        self.Norm2 = torch.nn.LayerNorm(embedding_dim)

    def forward(self, x):
        """Apply the block to x and return a tensor of the same shape."""
        # Self-attention sub-layer with residual connection.
        # NOTE(review): no key padding mask is passed, so padded positions
        # (if any) take part in the attention.
        attended, _ = self.Attention(x, x, x)
        x = self.Norm1(attended + x)

        # Position-wise feed-forward sub-layer with residual connection.
        transformed = self.Dense2(self.relu(self.Dense1(x)))
        return self.Norm2(transformed + x)

In [None]:
class TransformerNetwork(torch.nn.Module):
    """Token embedding, three transformer layers, mean pooling and a dense head.

    Args:
        hidden_channels: width of the feed-forward layers inside each
            TransformerLayer and of the hidden layer of the output head.
        output_dim: number of values predicted per input sequence.
        vocab_size: number of rows of the embedding table.
        embedding_dim: size of each token embedding.
        num_heads: number of attention heads in every TransformerLayer.
        maxTokenLength: forwarded to PositionalEncoding
            (presumably the maximum sequence length -- TODO confirm).
    """

    def __init__(self, hidden_channels, output_dim, vocab_size, embedding_dim, num_heads, maxTokenLength):
        super().__init__()
        # Embed the token ids and add the positional encoding.
        # max_norm is a float; the previous `True` was coerced to the same 1.0.
        self.EmbeddingLayer = torch.nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embedding_dim, max_norm=1.0
        )
        self.PositionalEncoding = PositionalEncoding(embedding_dim, maxTokenLength, dropout = 0.15)

        self.TransEnc1 = TransformerLayer(embedding_dim, hidden_channels, num_heads)
        self.TransEnc2 = TransformerLayer(embedding_dim, hidden_channels, num_heads)
        self.TransEnc3 = TransformerLayer(embedding_dim, hidden_channels, num_heads)
        # Average over the whole sequence axis whatever its length. A fixed
        # kernel_size=22 only collapses the sequence to one vector when the
        # inputs are exactly 22 tokens long.
        self.Pooling = torch.nn.AdaptiveAvgPool1d(output_size=1)

        self.DenseOut1 = torch.nn.Linear(embedding_dim, hidden_channels)
        self.DenseOut2 = torch.nn.Linear(hidden_channels, output_dim)
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        """Map a batch of token-id sequences to a (batch, output_dim) tensor.

        The pooling below treats dim 0 as batch and dim 1 as sequence, so x is
        expected to be an integer tensor of shape (batch, sequence).
        """
        x = self.EmbeddingLayer(x)
        x = self.PositionalEncoding(x)
        x = self.TransEnc1(x)
        x = self.TransEnc2(x)
        x = self.TransEnc3(x)
        # (batch, seq, emb) -> (batch, emb, seq) -> pool -> (batch, emb, 1) -> (batch, emb)
        # NOTE(review): every position contributes to the mean, including any padding.
        x = self.Pooling(x.permute((0, 2, 1))).squeeze(dim=-1)
        x = self.DenseOut1(x)
        x = self.relu(x)
        x = self.DenseOut2(x)

        return x

In [None]:
from torch.utils.data import Dataset, DataLoader
batch_size = 64  # number of samples per mini-batch


class datasetObject(Dataset):
    """Map-style dataset that pairs each sample with the target at the same index."""

    def __init__(self, data, targets):
        self.data, self.targets = data, targets

    def __len__(self):
        # The dataset is as long as its sample container.
        return len(self.data)

    def __getitem__(self, index):
        # Return the (sample, target) pair stored at `index`.
        return self.data[index], self.targets[index]

# Split the sample indices into training / validation / test subsets.
# NOTE(review): the two 0.1 arguments are presumably the validation and test
# fractions -- confirm against get_data_split_indices.
trainSplit, validationSplit, testSplit = get_data_split_indices(len(encodedTokens), 0.1, 0.1)

# Training
encodedTrainData = torch.tensor(encodedTokens[trainSplit], dtype=torch.long, device=device)
trainTargets = targetTensor[trainSplit]

# Validation
encodedValidationData = torch.tensor(encodedTokens[validationSplit], dtype=torch.long, device=device)
validationTargets = targetTensor[validationSplit]

# Test
encodedTestData = torch.tensor(encodedTokens[testSplit], dtype=torch.long, device=device)
testTargets = targetTensor[testSplit]

# Scale the targets of all three subsets; scalerTargets is kept so the scaling
# can be reused later (presumably fitted on the training targets only -- TODO confirm).
trainTargets, validationTargets, testTargets, scalerTargets = scale_targets(trainTargets, validationTargets, testTargets)

trainTargets = torch.tensor(trainTargets, dtype=torch.float, device=device)
validationTargets = torch.tensor(validationTargets, dtype=torch.float, device=device)
testTargets = torch.tensor(testTargets, dtype=torch.float, device=device)

# Reshuffle the training samples every epoch so that the optimiser does not see
# the same batches in the same order each time; evaluation order is irrelevant,
# so the test and validation loaders stay unshuffled.
trainingData = DataLoader(datasetObject(encodedTrainData, trainTargets), batch_size=batch_size, shuffle=True)
testData = DataLoader(datasetObject(encodedTestData, testTargets), batch_size=batch_size, shuffle=False)
validationData = DataLoader(datasetObject(encodedValidationData, validationTargets), batch_size=batch_size, shuffle=False)

In [None]:
# Build the hyperparameter grid: one dictionary per combination of hidden
# width, embedding size and head count (heads vary fastest).
heads = [4, 8]
embeddings = [128, 256]
hiddenChannels = [128, 256, 512]
parameterList = [
    {'hidden_channels': channel, 'embedding_dim': embed, 'nHeads': ihead}
    for channel in hiddenChannels
    for embed in embeddings
    for ihead in heads
]

parameterList

In [None]:
from sklearn.metrics import r2_score
# Hyperparameters of the single-model run.
d_target = 15                    # number of values the network predicts per molecule
nHeads = 4
embedding_dim = 128
hidden_channels = 512
learningRate = 0.001
decayRate = 0.95                 # factor applied to the learning rate at every scheduler step
vocab_size = len(tokenDict) + 1  # one more than the number of distinct tokens

# Seed before the model is built so that its initial weights are repeatable.
torch.manual_seed(12345)

model = TransformerNetwork(
    hidden_channels=hidden_channels,
    output_dim=d_target,
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    num_heads=nHeads,
    maxTokenLength=maxTokenLength,
).to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=learningRate, weight_decay=5e-4)
# Exponentially decaying learning rate.
lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=decayRate)
criterion = torch.nn.MSELoss().to(device)

def train(data_in, targets):
    """Run one optimisation step of the global model on a single batch.

    Uses the module-level `model`, `optimizer` and `criterion`.
    Returns the batch loss tensor (still attached to the autograd graph).
    """
    model.train()
    optimizer.zero_grad()  # start from clean gradients
    batch_loss = criterion(model(data_in), targets)

    batch_loss.backward()  # back-propagate
    optimizer.step()  # apply the parameter update
    return batch_loss

def infer(data_in, targets):
    """Evaluate the global model on one batch without tracking gradients.

    Uses the module-level `model` and `criterion`.
    Returns a `(loss, r2)` pair: the criterion value as a tensor and the
    scikit-learn R2 score of the predictions against `targets`.
    """
    with torch.no_grad():
        model.eval()
        predictions = model(data_in)
        batch_loss = criterion(predictions, targets)
        # r2_score works on host-side NumPy arrays.
        y_true = targets.cpu().detach().numpy()
        y_pred = predictions.cpu().detach().numpy()
        score = r2_score(y_true, y_pred)

    return batch_loss, score

parameterTest = False  # True: sweep every entry of parameterList; False: train the single model built above
stopTolerance = 3      # consecutive epochs the loss gap may exceed minDiff before training stops
minDiff = 0.01         # largest accepted (validation loss - training loss)
maxEpochs = 100        # upper bound on the number of epochs per run
finalLoss = []         # one [train, test, validation] loss triple per finished run


def _evaluate(loader):
    """Return the per-batch mean of the (loss, R2) pairs infer() yields over `loader`."""
    loss_sum = 0
    r2_sum = 0.0
    for batch, batch_targets in loader:
        batch_loss, batch_r2 = infer(batch, batch_targets)
        loss_sum += batch_loss
        # float() accepts both NumPy scalars and plain Python floats.
        r2_sum += float(batch_r2)
    return float(loss_sum) / len(loader), r2_sum / len(loader)


def _run_training(print_every=1, show_r2=True):
    """Train the module-level model until early stopping or maxEpochs epochs.

    Each epoch trains on trainingData, evaluates on testData and
    validationData, and steps the learning-rate scheduler. Training stops
    once the validation loss has exceeded the training loss by more than
    minDiff for stopTolerance consecutive epochs.

    Args:
        print_every: print a progress report every this many epochs.
        show_r2: also print the test R2 in the progress report.

    Returns:
        (loss_history, r2_history, stopped_early) where loss_history holds one
        [train, test, validation] mean-loss triple per epoch and r2_history
        the matching mean test R2 values.
    """
    loss_history = []
    r2_history = []
    strikes = 0
    stopped_early = False
    for epoch_idx in range(maxEpochs):
        # Accumulate the detached loss tensors and convert to a float once per epoch.
        train_sum = 0
        for batch, batch_targets in trainingData:
            train_sum += train(batch, batch_targets).detach()
        train_loss = float(train_sum) / len(trainingData)

        test_loss, test_r2 = _evaluate(testData)
        val_loss, _ = _evaluate(validationData)
        lr_scheduler.step()

        loss_history.append([train_loss, test_loss, val_loss])
        r2_history.append(test_r2)

        if epoch_idx % print_every == 0:
            print(f"#### Epoch: {epoch_idx} of {maxEpochs} ####\n Training loss:\t{train_loss}\n Test loss:\t{test_loss}\n Validation loss:\t{val_loss}")
            if show_r2:
                print('R2: ', test_r2)

        # Early stopping on a persistent gap between validation and training loss.
        if val_loss - train_loss > minDiff:
            strikes += 1
            if strikes >= stopTolerance:
                stopped_early = True
                break
        else:
            strikes = 0
    return loss_history, r2_history, stopped_early


def _report_run(loss_history, r2_history, stopped_early, params=None):
    """Print the summary of the last completed epoch and record its losses in finalLoss.

    Returns the index of the reported epoch.
    """
    last = len(loss_history) - 1
    print('\n EARLY STOPPING' if stopped_early else '\n NO EARLY STOPPING')
    if params is not None:
        print('Hidden_channels = ', params['hidden_channels'], '\tembedding_dim = ', params['embedding_dim'], '\tnHeads = ', params['nHeads'])
    print('Training Loss: ', loss_history[last][0], '\tTesting loss: ', loss_history[last][1], '\tValidation Loss: ', loss_history[last][2], '\n\n')
    print('R2: ', r2_history[last])
    finalLoss.append(loss_history[last])
    return last


if not parameterTest:
    # Single run with the model, optimizer and scheduler created earlier.
    # The last completed epoch is reported whether or not early stopping fired.
    lossList, r2List, earlyStop = _run_training(print_every=1, show_r2=True)
    epoch = _report_run(lossList, r2List, earlyStop)
else:  # hyperparameter sweep over parameterList
    for c, currentParam in enumerate(parameterList, start=1):
        print("network counter:", c)
        hidden_channels = currentParam['hidden_channels']
        embedding_dim = currentParam['embedding_dim']
        nHeads = currentParam['nHeads']

        # train() and infer() read these module-level names, so a fresh model
        # and optimizer are rebound here for every configuration.
        model = TransformerNetwork(hidden_channels=hidden_channels, output_dim=d_target, embedding_dim=embedding_dim,
                                   vocab_size=vocab_size, num_heads=nHeads, maxTokenLength=maxTokenLength).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=learningRate, weight_decay=5e-4)
        # decay learning rate
        decayRate = 0.94
        lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=decayRate)
        criterion = torch.nn.MSELoss().to(device)

        lossList, r2List, earlyStop = _run_training(print_every=10, show_r2=False)
        epoch = _report_run(lossList, r2List, earlyStop, currentParam)
                  

In [None]:
# Show the recorded final losses, then the R2 history, one per line.
print(finalLoss, r2List, sep="\n")