# TIF360 Project

# Data Pre-processing

Main source: https://www.kaggle.com/code/rmonge/predicting-molecule-properties-based-on-its-smiles/notebook

### Import packages

UPDATED ON 15/05-2023  11:20

In [15]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import math
import atomInSmiles
from torch.utils.data import DataLoader

import sys
sys.path.append('../')
from code_graphs.utility_functions import get_num_parameters , get_data_split_indices, scale_targets


import random


In [16]:

# check if cuda is available
print('cuda available:', torch.cuda.is_available())
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('device:', "cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    print('cuda version:', torch.version.cuda)
    print('gpu:', torch.cuda.get_device_name(0))



cuda available: True
device: cuda
cuda version: 11.7
gpu: NVIDIA GeForce GTX 970


In [17]:

# Input is list of strings on the form ['a','b','c']
# where the string are tokens
# Dictionary is the dictionary containign all possible tokens
# and an index for them
# MaxTokenLength is the max amount of tokens any input creates

# THIS VERSION RETURNS TORCH TENSOR

#Version taht returns tensor
def create_encoded_tensor(input, dictionary,maxTokenLength):
    # Create a matrix of zeros
    #encoded_vector = np.zeros((1,maxTokenLength))
    encoded_tensor = np.zeros(maxTokenLength,dtype=np.int32)
    
    # Change value in right place to one
    keyCount = 0
    for key in input:
        encoded_tensor[keyCount] = dictionary[key]
        keyCount+=1
    
    # encoded_tensor = np.expand_dims(encoded_tensor,axis=1)
    return encoded_tensor


In [18]:


df = pd.read_csv("../data/smiles_and_targets.csv")
print(np.shape(df))
mol_fingerprint = np.load("../data/mol_morgan_fingerprints.npy")

#print(mol_descriptor[0])

(132820, 21)


In [19]:

properties_names = ['A', 'B', 'C', 'mu', 'alfa', 'homo', 'lumo', 'gap', 'R²', 'zpve', 'U0', 'U', 'H', 'G', 'Cv']

x_smiles = df.smiles.values
targetTensor = df.loc[:, properties_names].values # shape = (n_samples, n_properties)


In [20]:
# tokenize all smiles
#import atomInSmiles

tokenList = []
for smile in x_smiles:
    tokenList.append(atomInSmiles.encode(smile).split())
#tokenList is target

# find longest word (max tokens from one smile+1)
maxTokenLength = 0
for token in tokenList:
    if len(token)>maxTokenLength:
        maxTokenLength = len(token)
print('Longest word (max amount of tokens):', maxTokenLength)

Longest word (max amount of tokens): 22


In [21]:

# Give each token a index in a dictionary
tokenDict = {}
count = 1

dictList = []
for itokens in tokenList:
    for itoke in itokens:
        #print(itoke)
        if tokenDict.get(itoke) == None:
            tokenDict[itoke] = count
        
            # current = [itoke, count]
            # dictList.append(current)
            count+=1

In [22]:
encodedTokens = []
for token in tokenList:
    encodedTokens.append(create_encoded_tensor(token,tokenDict,maxTokenLength))
    
encodedTokens = np.array(encodedTokens)


In [23]:
# From attention is all you need
class PositionalEncoding(nn.Module):
    "Implement the PE function."
    def __init__(self, d_model, max_len,dropout):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        
        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) *
                             -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
        
    def forward(self, x):
        # x = x + torch.tensor(self.pe[:, :x.size(1)], 
        #                  requires_grad=False)
        x = x + self.pe[:x.size(0), :].detach()
        return self.dropout(x)

In [24]:

class TransformerLayer(torch.nn.Module):
    def __init__(self, embedding_dim, hidden_channels,num_heads):
        super().__init__()
        self.Attention = torch.nn.MultiheadAttention(embedding_dim,num_heads=num_heads,dropout=0.15)
        self.Norm1 = torch.nn.LayerNorm(embedding_dim)
        self.Dense1 = torch.nn.Linear(embedding_dim,hidden_channels)
        self.relu = torch.nn.ReLU()
        self.Dense2 = torch.nn.Linear(hidden_channels,embedding_dim)
        

        self.Norm2 = torch.nn.LayerNorm(embedding_dim)
        

    def forward(self, x):
        addNormX = x
        #print(x.shape)
        x, _ = self.Attention(x,x,x)
        #print('attention output',x.shape)
        x = self.Norm1(x + addNormX)
        #print('norm + input',x.shape)
        addNormX = x
        x = self.Dense1(x)
        x = self.relu(x)
        #print('first dense output',x.shape)
        x = self.Dense2(x)
        #print('second dense output',x.shape)
        x = self.Norm2(x + addNormX)
        #print(x.shape)

 
        return x

In [25]:
class TransformerNetwork(torch.nn.Module):
    def __init__(self,hidden_channels,output_dim, vocab_size, embedding_dim, num_heads,maxTokenLength, nFingerprints):
        super().__init__()
        # Embedd and add pos encoding to input
        self.EmbeddingLayer = torch.nn.Embedding(num_embeddings=vocab_size,embedding_dim = embedding_dim , max_norm=True)
        self.PositionalEncoding = PositionalEncoding(embedding_dim, maxTokenLength, dropout = 0.15)

        self.TransEnc1 = TransformerLayer(embedding_dim,hidden_channels,num_heads)
        self.TransEnc2 = TransformerLayer(embedding_dim,hidden_channels,num_heads)
        self.TransEnc3 = TransformerLayer(embedding_dim,hidden_channels,num_heads)
        self.Pooling = torch.nn.AvgPool1d(kernel_size= 22)

        self.DenseOut1 = torch.nn.Linear(embedding_dim+nFingerprints,hidden_channels)
        self.DenseOut2 = torch.nn.Linear(hidden_channels,hidden_channels)
        self.DenseOut3 = torch.nn.Linear(hidden_channels,output_dim)
        self.relu = torch.nn.ReLU()


    def forward(self,x,fingerprints):
        x = self.EmbeddingLayer(x)
        x = self.PositionalEncoding(x)
        x = self.TransEnc1(x)
        x = self.TransEnc2(x)
        x = self.TransEnc3(x)
        x = self.Pooling(x.permute((0,2,1))).permute((0,2,1))
        x = torch.squeeze(x,axis=1)
        x = torch.cat((x,fingerprints),1)
        #x = x[:,-1,:]


        x = self.DenseOut1(x)
        x = self.relu(x)
        x = self.DenseOut2(x)
        x = self.relu(x)
        x = self.DenseOut3(x)
        
        return x

In [26]:
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler, StandardScaler
batch_size = 64
## Create iterable dataset class:

class datasetObject(Dataset):
    def __init__(self,data,targets,fingerprints):
        self.data = data
        self.fingerprints = fingerprints
        self.targets = targets
    def __len__(self):
        return len(self.data)
    def __getitem__(self, index):
        sample = self.data[index]
        target = self.targets[index]
        descriptor = self.fingerprints[index]
        return (sample), (target), (descriptor)

trainSplit, validationSplit, testSplit = get_data_split_indices(len(encodedTokens),0.1,0.1)

# Training
encodedTrainData = torch.tensor(encodedTokens[trainSplit], dtype=torch.long, device=device)
trainTargets = targetTensor[trainSplit]

# Validation
encodedValidationData = torch.tensor(encodedTokens[validationSplit], dtype=torch.long, device=device)
validationTargets = targetTensor[validationSplit]

# Test
encodedTestData = torch.tensor(encodedTokens[testSplit], dtype=torch.long, device=device)
testTargets = targetTensor[testSplit]

# fingerprints
trainfingerprints = mol_fingerprint[trainSplit]
valfingerprints = mol_fingerprint[validationSplit]
testfingerprints = mol_fingerprint[testSplit]


trainTargets, validationTargets, testTargets, scalerTargets = scale_targets(trainTargets,validationTargets,testTargets)


# Normalize fingerprints and fingerprints using minmax scaler
minmax_scaler = MinMaxScaler()
trainfingerprints = minmax_scaler.fit_transform(trainfingerprints)
valfingerprints = minmax_scaler.transform(valfingerprints)
testfingerprints = minmax_scaler.transform(testfingerprints)


# Make into tensors
#print(np.dtype(trainTargets[0,0]))
trainTargets = torch.tensor(trainTargets,dtype=torch.float, device=device)
validationTargets = torch.tensor(validationTargets,dtype=torch.float,device=device)
testTargets = torch.tensor(testTargets,dtype=torch.float,device=device)

# Decriptors to tensor
trainfingerprints = torch.from_numpy(trainfingerprints).float().to(device)
valfingerprints = torch.from_numpy(valfingerprints).float().to(device)
testfingerprints = torch.from_numpy(testfingerprints).float().to(device)

trainingData = DataLoader(datasetObject(encodedTrainData,trainTargets,trainfingerprints),batch_size)
testData = DataLoader(datasetObject(encodedTestData,testTargets,testfingerprints),batch_size)
validationData = DataLoader(datasetObject(encodedValidationData,validationTargets,valfingerprints),batch_size)

In [33]:
from sklearn.metrics import r2_score
hidden_channels = 1024
embedding_dim = 128
d_target = 15
nHeads = 4
learningRate = 0.0005
nfingerprints = len(mol_fingerprint[0])
vocab_size = len(tokenDict)+1
torch.manual_seed(12345)

model = TransformerNetwork(hidden_channels,d_target,vocab_size, embedding_dim, nHeads,maxTokenLength,nfingerprints).to(device) 
optimizer = torch.optim.Adam(model.parameters(), lr=learningRate, weight_decay=5e-4)
# decay learning rate
decayRate = 0.95
lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=decayRate)
criterion = torch.nn.MSELoss().to(device)

def train(data_in, targets, fingerprints):
      model.train()
      optimizer.zero_grad()  # Clear gradients.
      out = model(data_in,fingerprints).to(device)
      loss = criterion(out, targets)
      
      loss.backward()  # Derive gradients.
      optimizer.step()  # Update parameters based on gradients.
      return loss

def infer(data_in, targets, fingerprints):
      with torch.no_grad():
            model.eval()
            out = model(data_in, fingerprints)
            loss = criterion(out, targets)
            r2 = r2_score(targets.cpu().detach().numpy(), out.cpu().detach().numpy())
      return loss, r2

def inferNew(data_in, targets, fingerprints):
    with torch.no_grad():
        model.eval()
        out = model(data_in, fingerprints)
        loss = criterion(out, targets)

        r2_scores = []
        for item in range(targets.shape[1]):
            r2 = r2_score(targets[:, item].cpu().detach().numpy(), out[:, item].cpu().detach().numpy())
            r2_scores.append(r2)

    return loss, r2_scores





earlyStop = False
stopTolerance = 3
minDiff = 0.005
counter = 0
epoch = 0
lossList = []
r2List = []

while not earlyStop and epoch < 250:
    loss = 0
    for input, targets, descriptor in trainingData:
        tempLoss = train(input, targets, descriptor).detach()
        loss += tempLoss
    
    test = 0
    r2 = [0]*d_target
    for input, targets, descriptor in testData:
        tempTest, tempR2 = inferNew(input, targets, descriptor)
        test += tempTest
        # Add r2 for each variable
        r2 = [x + y for x, y in zip(r2, tempR2)]
    # Average over batch
    r2 = [x / len(testData) for x in r2]
    r2List.append(r2)
    

    val = 0
    for input, targets, descriptor in validationData:
        tempVal, tempR2 = infer(input, targets, descriptor)
        val += tempVal

    lr_scheduler.step()

    lossList.append([loss.item() / len(trainingData), test.item() / len(testData), val.item() / len(validationData)])

    # Early stopping
    if (lossList[epoch][2] - lossList[epoch][0]) > minDiff:
        counter += 1
    else:
        counter = 0
        if counter >= stopTolerance:
            earlyStop = True

    # Save loss as a Python number (not tensor) in the list
    print(f"#### Epoch: {epoch} of 1000 ####\n Training loss:\t{lossList[epoch][0]}\n Test loss:\t{lossList[epoch][1]}\n Validation loss:\t{lossList[epoch][2]}")
    print('R2:', r2List[epoch])
    print('avg R2',np.mean(r2List[epoch]),'\n')
    epoch += 1



#### Epoch: 0 of 1000 ####
 Training loss:	0.24641389978857803
 Test loss:	0.14549935781038725
 Validation loss:	0.14565388972942644
R2: [0.7132781510476817, 0.8030907113926781, 0.8428063237873606, 0.5862828913070788, 0.8895643146505126, 0.7899987182988059, 0.9001681176331103, 0.8840549792410298, 0.8806741002052516, 0.9227933813083362, 0.8995281736940213, 0.8996363358948872, 0.8999545994852209, 0.8994694114401022, 0.902627515820122]
avg R2 0.8475951816804133
#### Epoch: 1 of 1000 ####
 Training loss:	0.12359488692217602
 Test loss:	0.1039059987434974
 Validation loss:	0.1061160289324247
R2: [0.778070402903916, 0.844885666040162, 0.8841990098707965, 0.6193431339384146, 0.9346099992874254, 0.8203122834420794, 0.9310366378503224, 0.9146385424877811, 0.9223077026292584, 0.952780134711118, 0.9550684069511396, 0.9551731534539812, 0.9551480017091354, 0.9550267484251985, 0.9433090247297776]
avg R2 0.8910605898953673
#### Epoch: 2 of 1000 ####
 Training loss:	0.09686775368283414
 Test loss:	0.0

KeyboardInterrupt: 