In [1]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import math
import atomInSmiles
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score

import sys
sys.path.append('../')
from code_graphs.utility_functions import get_num_parameters , get_data_split_indices, scale_targets
from classes import create_encoded_vector, PositionalEncoding 


import random


Will attempt to gather all the variant in this file then changing if descriptors are used etc by only changing bools instead of having four different files.

In [None]:
useRdkitDescriptors = True
useFingerprints = True

In [2]:

# check if cuda is available
print('cuda available:', torch.cuda.is_available())
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('device:', "cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    print('cuda version:', torch.version.cuda)
    print('gpu:', torch.cuda.get_device_name(0))



cuda available: True
device: cuda
cuda version: 11.7
gpu: NVIDIA GeForce GTX 970


In [3]:


df = pd.read_csv("../data/smiles_and_targets.csv")
print(np.shape(df))


mol_descriptor = np.load("../data/mol_descriptors.npy")
mol_fingerprint = np.load("../data/mol_morgan_fingerprints.npy")
print(mol_descriptor.shape)


properties_names = ['A', 'B', 'C', 'mu', 'alfa', 'homo', 'lumo', 'gap', 'R²', 'zpve', 'U0', 'U', 'H', 'G', 'Cv']

x_smiles = df.smiles.values
targetTensor = df.loc[:, properties_names].values # shape = (n_samples, n_properties)

(132820, 21)
(132820, 179)


In [4]:

tokenList = []
for smile in x_smiles:
    tokenList.append(atomInSmiles.encode(smile).split())
#tokenList is target

# find longest word (max tokens from one smile+1)
maxTokenLength = 0
for token in tokenList:
    if len(token)>maxTokenLength:
        maxTokenLength = len(token)
print('Longest word (max amount of tokens):', maxTokenLength)

# Give each token a index in a dictionary
tokenDict = {}
count = 1

dictList = []
for itokens in tokenList:
    for itoke in itokens:
        #print(itoke)
        if tokenDict.get(itoke) == None:
            tokenDict[itoke] = count
        
            # current = [itoke, count]
            # dictList.append(current)
            count+=1

Longest word (max amount of tokens): 22


In [5]:
encodedTokens = []
for token in tokenList:
    encodedTokens.append(create_encoded_vector(token,tokenDict,maxTokenLength))
    
encodedTokens = np.array(encodedTokens)


In [6]:

class TransformerLayer(torch.nn.Module):
    def __init__(self, embedding_dim, hidden_channels,num_heads):
        super().__init__()
        self.Attention = torch.nn.MultiheadAttention(embedding_dim,num_heads=num_heads,dropout=0.15)
        self.Norm1 = torch.nn.LayerNorm(embedding_dim)
        self.Dense1 = torch.nn.Linear(embedding_dim,hidden_channels)
        self.relu = torch.nn.ReLU()
        self.Dense2 = torch.nn.Linear(hidden_channels,embedding_dim)
        

        self.Norm2 = torch.nn.LayerNorm(embedding_dim)
        

    def forward(self, x):
        addNormX = x
        #print(x.shape)
        x, _ = self.Attention(x,x,x)
        #print('attention output',x.shape)
        x = self.Norm1(x + addNormX)
        #print('norm + input',x.shape)
        addNormX = x
        x = self.Dense1(x)
        x = self.relu(x)
        #print('first dense output',x.shape)
        x = self.Dense2(x)
        #print('second dense output',x.shape)
        x = self.Norm2(x + addNormX)
        #print(x.shape)

 
        return x

In [7]:
class TransformerNetwork(torch.nn.Module):
    def __init__(self,hidden_channels,output_dim, vocab_size, embedding_dim, num_heads,maxTokenLength, nDescriptors,nFingerprints):
        super().__init__()
        # Embedd and add pos encoding to input
        self.EmbeddingLayer = torch.nn.Embedding(num_embeddings=vocab_size,embedding_dim = embedding_dim , max_norm=True)
        self.PositionalEncoding = PositionalEncoding(embedding_dim, maxTokenLength, dropout = 0.15)

        self.TransEnc1 = TransformerLayer(embedding_dim,hidden_channels, num_heads)
        self.TransEnc2 = TransformerLayer(embedding_dim,hidden_channels, num_heads)
        self.TransEnc3 = TransformerLayer(embedding_dim,hidden_channels, num_heads)
        self.Pooling = torch.nn.AvgPool1d(kernel_size= 22)

        self.DenseOut1 = torch.nn.Linear(embedding_dim+nDescriptors+nFingerprints,hidden_channels)
        self.DenseOut2 = torch.nn.Linear(hidden_channels,output_dim)
        self.relu = torch.nn.ReLU()


    def forward(self,x,descriptors,fingerprints):
        x = self.EmbeddingLayer(x)
        x = self.PositionalEncoding(x)
        x = self.TransEnc1(x)
        x = self.TransEnc2(x)
        x = self.TransEnc3(x)
        x = self.Pooling(x.permute((0,2,1))).permute((0,2,1))
        x = torch.squeeze(x,axis=1)
        x = torch.cat((x,descriptors,fingerprints),1)
        #x = x[:,-1,:]


        x = self.DenseOut1(x)
        x = self.relu(x)
        x = self.DenseOut2(x)
        
        return x

In [8]:

batch_size = 64
## Create iterable dataset class:

class datasetObject(Dataset):
    def __init__(self,data,targets,descriptors,fingerprints):
        self.data = data
        self.descriptors = descriptors
        self.targets = targets
        self.fingerprints = fingerprints
    def __len__(self):
        return len(self.data)
    def __getitem__(self, index):
        sample = self.data[index]
        target = self.targets[index]
        descriptor = self.descriptors[index]
        fingerprint = self.fingerprints[index]
        return (sample), (target), (descriptor), (fingerprint)

trainSplit, validationSplit, testSplit = get_data_split_indices(len(encodedTokens),0.1,0.1)

# Training
encodedTrainData = torch.tensor(encodedTokens[trainSplit], dtype=torch.long, device=device)
trainTargets = targetTensor[trainSplit]

# Validation
encodedValidationData = torch.tensor(encodedTokens[validationSplit], dtype=torch.long, device=device)
validationTargets = targetTensor[validationSplit]

# Test
encodedTestData = torch.tensor(encodedTokens[testSplit], dtype=torch.long, device=device)
testTargets = targetTensor[testSplit]

# Descriptors
trainDescriptors = mol_descriptor[trainSplit]
valDescriptors = mol_descriptor[validationSplit]
testDescriptors = mol_descriptor[testSplit]

trainFingerprints = mol_fingerprint[trainSplit]
valFingerprints = mol_fingerprint[validationSplit]
testFingerprints = mol_fingerprint[testSplit]


trainTargets, validationTargets, testTargets, scalerTargets = scale_targets(trainTargets,validationTargets,testTargets)


# Normalize descriptors and fingerprints using minmax scaler
minmax_scaler = MinMaxScaler()
trainDescriptors = minmax_scaler.fit_transform(trainDescriptors)
valDescriptors = minmax_scaler.transform(valDescriptors)
testDescriptors = minmax_scaler.transform(testDescriptors)

trainFingerprints = minmax_scaler.fit_transform(trainFingerprints)
valFingerprints = minmax_scaler.transform(valFingerprints)
testFingerprints = minmax_scaler.transform(testFingerprints)


# Make into tensors
#print(np.dtype(trainTargets[0,0]))
trainTargets = torch.tensor(trainTargets,dtype=torch.float, device=device)
validationTargets = torch.tensor(validationTargets,dtype=torch.float, device=device)
testTargets = torch.tensor(testTargets,dtype=torch.float, device=device)

# Decriptors to tensor
trainDescriptors = torch.from_numpy(trainDescriptors).float().to(device)
valDescriptors = torch.from_numpy(valDescriptors).float().to(device)
testDescriptors = torch.from_numpy(testDescriptors).float().to(device)

trainFingerprints = torch.from_numpy(trainFingerprints).float().to(device)
valFingerprints = torch.from_numpy(valFingerprints).float().to(device)
testFingerprints = torch.from_numpy(testFingerprints).float().to(device)

trainingData = DataLoader(datasetObject(encodedTrainData,trainTargets,trainDescriptors,trainFingerprints),batch_size)
testData = DataLoader(datasetObject(encodedTestData,testTargets,testDescriptors,testFingerprints),batch_size)
validationData = DataLoader(datasetObject(encodedValidationData,validationTargets,valDescriptors,valFingerprints),batch_size)

In [9]:

hidden_channels = 1024
embedding_dim = 128
d_target = 15
nHeads = 8
learningRate = 0.0005
nDescriptors = len(mol_descriptor[0])
nFingerprints = len(mol_fingerprint[0])
vocab_size = len(tokenDict)+1
torch.manual_seed(12345)

model = TransformerNetwork(hidden_channels,d_target,vocab_size, embedding_dim, nHeads,maxTokenLength,nDescriptors,nFingerprints).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learningRate, weight_decay=5e-4)
# decay learning rate
decayRate = 0.95
lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=decayRate)
criterion = torch.nn.MSELoss().to(device)

def train(data_in, targets, descriptors,fingerprints):
      model.train()
      optimizer.zero_grad()  # Clear gradients.
      out = model(data_in,descriptors,fingerprints).to(device)
      loss = criterion(out, targets)
      
      loss.backward()  # Derive gradients.
      optimizer.step()  # Update parameters based on gradients.
      return loss

def infer(data_in, targets, descriptors, fingerprints):
      with torch.no_grad():
            model.eval()
            out = model(data_in, descriptors,fingerprints)
            loss = criterion(out, targets)
            r2 = r2_score(targets.cpu().numpy(), out.cpu().numpy())
      return loss, r2


earlyStop = False
stopTolerance = 3
minDiff = 0.01
counter = 0
epoch = 0
lossList = []
r2List = []
while not earlyStop and epoch < 1000:
      loss = 0
      for input,targets,descriptor,fingerprint in trainingData:
            loss += train(input,targets,descriptor,fingerprint).detach()
      
      test = 0
      r2 = 0
      for input, targets, descriptor, fingerprint in testData:
            tempTest, tempR2 = infer(input,targets,descriptor,fingerprint)
            test += tempTest
            r2 += tempR2
      r2List.append(r2.item()/len(testData))

      
      val = 0
      for input,targets,descriptor,fingerprint in validationData:
            tempVal,tempR2 = infer(input,targets, descriptor, fingerprint)
            val += tempVal
      
      lr_scheduler.step()

      # Save loss AS PYTHON NUMBER (not tensor) in list
      lossList.append([loss.item()/len(trainingData), test.item()/len(testData), val.item()/len(validationData)])

      # Early stopping
      if (lossList[epoch][2]-lossList[epoch][0]) > minDiff:
            counter += 1
      else:
            counter = 0
            if counter >= stopTolerance:
                  earlyStop = True

      print(f"#### Epoch: {epoch} of 1000 ####\n Training loss:\t{lossList[epoch][0]}\n Test loss:\t{lossList[epoch][1]}\n Validation loss:\t{lossList[epoch][2]}")
      print('R2: ',r2List[epoch])

      epoch+=1 

#### Epoch: 0 of 1000 ####
 Training loss:	0.15171954518409467
 Test loss:	0.11236921640542838
 Validation loss:	0.11531659273000863
R2:  0.8826854418819219


KeyboardInterrupt: 