In [1]:

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import atomInSmiles
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score

import sys
sys.path.append('../')
from code_graphs.utility_functions import get_num_parameters , get_data_split_indices, scale_targets
from classes import create_encoded_vector, PositionalEncoding, TransformerLayer, TransformerNetwork



Will attempt to gather all the variant in this file then changing if descriptors are used etc by only changing bools instead of having four different files.
Alot of redundancy this way bit the program is much shorter, we do all calcualtion as if we are using both descriptors and fingerprints. But do not concat them in the transformerencoder if their bool is set to False

In [2]:
useDescriptors = False
useFingerprints = False

In [3]:

# check if cuda is available
print('cuda available:', torch.cuda.is_available())
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('device:', "cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    print('cuda version:', torch.version.cuda)
    print('gpu:', torch.cuda.get_device_name(0))



cuda available: True
device: cuda
cuda version: 11.7
gpu: NVIDIA GeForce GTX 970


In [4]:


df = pd.read_csv("../data/smiles_and_targets.csv")
print(np.shape(df))

mol_descriptor = np.load("../data/mol_descriptors.npy")
mol_fingerprint = np.load("../data/mol_morgan_fingerprints.npy")
print(mol_descriptor.shape)


properties_names = ['A', 'B', 'C', 'mu', 'alfa', 'homo', 'lumo', 'gap', 'R²', 'zpve', 'U0', 'U', 'H', 'G', 'Cv']

x_smiles = df.smiles.values
y_targets = df.loc[:, properties_names].values # shape = (n_samples, n_properties)

(132820, 21)
(132820, 179)


In [5]:

tokenList = []
for smile in x_smiles:
    tokenList.append(atomInSmiles.encode(smile).split())
#tokenList is target

# find longest word (max tokens from one smile+1)
maxTokenLength = 0
for token in tokenList:
    if len(token)>maxTokenLength:
        maxTokenLength = len(token)
print('Longest word (max amount of tokens):', maxTokenLength)

# Give each token a index in a dictionary
tokenDict = {}
count = 1

dictList = []
for itokens in tokenList:
    for itoke in itokens:
        #print(itoke)
        if tokenDict.get(itoke) == None:
            tokenDict[itoke] = count
        
            # current = [itoke, count]
            # dictList.append(current)
            count+=1

Longest word (max amount of tokens): 22


In [6]:
encodedTokens = []
for token in tokenList:
    encodedTokens.append(create_encoded_vector(token,tokenDict,maxTokenLength))
    
encodedTokens = np.array(encodedTokens)


In [7]:
batch_size = 64

# Divide into splits
trainSplit, validationSplit, testSplit = get_data_split_indices(len(encodedTokens), 0.1, 0.1)

# Training
encodedTrainData = torch.tensor(encodedTokens[trainSplit], dtype=torch.long, device=device)
trainTargets = y_targets[trainSplit]

# Validation
encodedValidationData = torch.tensor(encodedTokens[validationSplit], dtype=torch.long, device=device)
validationTargets = y_targets[validationSplit]

# Test
encodedTestData = torch.tensor(encodedTokens[testSplit], dtype=torch.long, device=device)
testTargets = y_targets[testSplit]

trainTargets, validationTargets, testTargets, scalerTargets = scale_targets(trainTargets, validationTargets, testTargets)

minmax_scaler = MinMaxScaler()

# Descriptors
trainDescriptors = mol_descriptor[trainSplit]
valDescriptors = mol_descriptor[validationSplit]
testDescriptors = mol_descriptor[testSplit]
# Normalize
trainDescriptors = minmax_scaler.fit_transform(trainDescriptors)
valDescriptors = minmax_scaler.transform(valDescriptors)
testDescriptors = minmax_scaler.transform(testDescriptors)
# Convert descriptors and fingerprints to tensors
trainDescriptors = torch.from_numpy(trainDescriptors).float().to(device)
valDescriptors = torch.from_numpy(valDescriptors).float().to(device)
testDescriptors = torch.from_numpy(testDescriptors).float().to(device)

# Fingerprints
trainFingerprints = mol_fingerprint[trainSplit]
valFingerprints = mol_fingerprint[validationSplit]
testFingerprints = mol_fingerprint[testSplit]
# Normalize
trainFingerprints = minmax_scaler.fit_transform(trainFingerprints)
valFingerprints = minmax_scaler.transform(valFingerprints)
testFingerprints = minmax_scaler.transform(testFingerprints)
# Convert to tensor
trainFingerprints = torch.from_numpy(trainFingerprints).float().to(device)
valFingerprints = torch.from_numpy(valFingerprints).float().to(device)
testFingerprints = torch.from_numpy(testFingerprints).float().to(device)




# Convert targets to tensors
trainTargets = torch.tensor(trainTargets, dtype=torch.float, device=device)
validationTargets = torch.tensor(validationTargets, dtype=torch.float, device=device)
testTargets = torch.tensor(testTargets, dtype=torch.float, device=device)




In [8]:
class datasetObject(Dataset):
    def __init__(self,data,targets,descriptors,fingerprints):
        self.data = data
        self.descriptors = descriptors
        self.targets = targets
        self.fingerprints = fingerprints
    def __len__(self):
        return len(self.data)
    def __getitem__(self, index):
        sample = self.data[index]
        target = self.targets[index]
        descriptor = self.descriptors[index]
        fingerprint = self.fingerprints[index]
        return (sample), (target), (descriptor), (fingerprint)

In [9]:
trainingData = DataLoader(datasetObject(encodedTrainData, trainTargets, trainDescriptors, trainFingerprints), batch_size)
testData = DataLoader(datasetObject(encodedTestData, testTargets,  testDescriptors, testFingerprints), batch_size)
validationData = DataLoader(datasetObject(encodedValidationData, validationTargets, valDescriptors, valFingerprints), batch_size)

In [10]:

hidden_channels = 512
embedding_dim = 128
d_target = 15
nHeads = 4
learningRate = 0.0005
if useDescriptors:
      nDescriptors = len(mol_descriptor[0])
else:
      nDescriptors = 0

if useFingerprints:
      nFingerprints = len(mol_fingerprint[0])
else:
      nFingerprints = 0

vocab_size = len(tokenDict)+1
torch.manual_seed(12345)

model = TransformerNetwork(hidden_channels,d_target,vocab_size, embedding_dim, nHeads,maxTokenLength,
                           nDescriptors,nFingerprints,useDescriptors,useFingerprints).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learningRate, weight_decay=0)
# decay learning rate
decayRate = 1
lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=decayRate)
criterion = torch.nn.MSELoss().to(device)

def train(data_in, targets, descriptors, fingerprints):
      model.train()
      optimizer.zero_grad()  # Clear gradients.
      out = model(data_in,descriptors,fingerprints).to(device)
      loss = criterion(out, targets)
      
      loss.backward()  # Derive gradients.
      optimizer.step()  # Update parameters based on gradients.
      return loss

# This calculates r2 for each target separately 
def inferNew(data_in, targets, descriptors, fingerprints):
    with torch.no_grad():
            model.eval()
            out = model(data_in, descriptors, fingerprints).to(device)
            loss = criterion(out, targets)

            r2_scores = []
            for item in range(targets.shape[1]):
                  r2 = r2_score(targets[:, item].cpu().detach().numpy(), out[:, item].cpu().detach().numpy())
                  r2_scores.append(r2)

    return loss, r2_scores


earlyStop = False
stopTolerance = 3
minDiff = 0.01
counter = 0
epoch = 0
lossList = []
r2List = []
while not earlyStop and epoch < 100:
      loss = 0
      for input,targets,descriptor,fingerprint in trainingData:
            loss += train(input,targets,descriptor,fingerprint).detach()
      
      test = 0
      r2 = [0]*d_target
      for input, targets, descriptor, fingerprint in testData:
            tempTest, tempR2 = inferNew(input,targets,descriptor,fingerprint)
            test += tempTest
            # Add r2 for each variable
            r2 = [x + y for x, y in zip(r2, tempR2)]
      # Average over batch
      r2 = [x / len(testData) for x in r2]
      r2List.append(r2)

      val = 0
      for input,targets,descriptor,fingerprint in validationData:
            tempVal,tempR2 = inferNew(input,targets, descriptor, fingerprint)
            val += tempVal
      
      lr_scheduler.step()

      # Save loss AS PYTHON NUMBER (not tensor) in list
      lossList.append([loss.item()/len(trainingData), test.item()/len(testData), val.item()/len(validationData)])

      # Early stopping
      if (lossList[epoch][2]-lossList[epoch][0]) > minDiff:
            counter += 1
      else:
            counter = 0
            if counter >= stopTolerance:
                  earlyStop = True

      print(f"#### Epoch: {epoch} of 100 ####\n Training loss:\t{lossList[epoch][0]}\n Test loss:\t{lossList[epoch][1]}\n Validation loss:\t{lossList[epoch][2]}")
      print('R2:', r2List[epoch])
      print('avg R2',np.mean(r2List[epoch]),'\n')
      epoch+=1 

#### Epoch: 0 of 100 ####
 Training loss:	0.380817668255663
 Test loss:	0.1615614340855525
 Validation loss:	0.16400819558363694
R2: [0.5089178397889479, 0.7255251138258012, 0.8029658810922542, 0.447592706682468, 0.9353581707253813, 0.7283268139274488, 0.8721544657626086, 0.8503837864739473, 0.8659699591898343, 0.9542576647658447, 0.9637948905734687, 0.9650740479419557, 0.9636901503940941, 0.9630483260102161, 0.9304794362470131]
avg R2 0.8318359502267523 

#### Epoch: 1 of 100 ####
 Training loss:	0.15439043039302378
 Test loss:	0.1260609351671659
 Validation loss:	0.12879112133613
R2: [0.5821497308857393, 0.7704893510414424, 0.8489197311407194, 0.49354308892378557, 0.9618911554816818, 0.7784392558079687, 0.9062868586221637, 0.8824718262518764, 0.8955157425832148, 0.9840424322520285, 0.9913233727970368, 0.9910137344661815, 0.9914185634612964, 0.9908391059346737, 0.9564706815410694]
avg R2 0.8683209754127253 

#### Epoch: 2 of 100 ####
 Training loss:	0.12814113334458252
 Test loss:	0.1

KeyboardInterrupt: 

In [None]:
lastRow = r2List[-1]
print(lastRow)
print(np.mean(lastRow))

del(lastRow[3])
print(lastRow)
print(np.mean(lastRow))



[0.7374174639688904, 0.8468307442783691, 0.8926509429882702, 0.575811530605992, 0.9741914082581106, 0.8291913662528797, 0.9386550269223121, 0.9162573986517647, 0.9298462251219253, 0.9952611402526274, 0.9853929708954238, 0.9857257772646325, 0.9857025497125973, 0.98580312773329, 0.9780792388581795]
0.9037877941176842
[0.7374174639688904, 0.8468307442783691, 0.8926509429882702, 0.9741914082581106, 0.8291913662528797, 0.9386550269223121, 0.9162573986517647, 0.9298462251219253, 0.9952611402526274, 0.9853929708954238, 0.9857257772646325, 0.9857025497125973, 0.98580312773329, 0.9780792388581795]
0.9272146700828049
