In [1]:

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import atomInSmiles
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score

import sys
sys.path.append('../')
from code_graphs.utility_functions import get_num_parameters , get_data_split_indices, scale_targets
from classes import create_encoded_vector, PositionalEncoding, TransformerLayer, TransformerNetwork



Will attempt to gather all the variant in this file then changing if descriptors are used etc by only changing bools instead of having four different files.
Alot of redundancy this way bit the program is much shorter, we do all calcualtion as if we are using both descriptors and fingerprints. But do not concat them in the transformerencoder if their bool is set to False

In [36]:
useDescriptors = True

# rdkitDescriptor == True uses rdKit descriptors otherwise mordred is used
rdkitDescriptor = True

useFingerprints = True

In [3]:

# check if cuda is available
print('cuda available:', torch.cuda.is_available())
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('device:', "cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    print('cuda version:', torch.version.cuda)
    print('gpu:', torch.cuda.get_device_name(0))



cuda available: True
device: cuda
cuda version: 11.7
gpu: NVIDIA GeForce GTX 970


In [4]:


df = pd.read_csv("../data/smiles_and_targets.csv")
print(np.shape(df))

if rdkitDescriptor:
    mol_descriptor = np.load("../data/mol_descriptors.npy")
else :
    mol_descriptor = np.load("../data/Mordred_mol_descriptors.npy")



mol_fingerprint = np.load("../data/mol_morgan_fingerprints.npy")
print(mol_descriptor.shape)


properties_names = ['A', 'B', 'C', 'mu', 'alfa', 'homo', 'lumo', 'gap', 'R²', 'zpve', 'U0', 'U', 'H', 'G', 'Cv']

x_smiles = df.smiles.values
y_targets = df.loc[:, properties_names].values # shape = (n_samples, n_properties)

(132820, 21)
(132820, 179)


In [5]:

tokenList = []
for smile in x_smiles:
    tokenList.append(atomInSmiles.encode(smile).split())
#tokenList is target

# find longest word (max tokens from one smile+1)
maxTokenLength = 0
for token in tokenList:
    if len(token)>maxTokenLength:
        maxTokenLength = len(token)
print('Longest word (max amount of tokens):', maxTokenLength)

# Give each token a index in a dictionary
tokenDict = {}
count = 1

dictList = []
for itokens in tokenList:
    for itoke in itokens:
        #print(itoke)
        if tokenDict.get(itoke) == None:
            tokenDict[itoke] = count
        
            # current = [itoke, count]
            # dictList.append(current)
            count+=1

Longest word (max amount of tokens): 22


In [6]:
encodedTokens = []
for token in tokenList:
    encodedTokens.append(create_encoded_vector(token,tokenDict,maxTokenLength))
    
encodedTokens = np.array(encodedTokens)


In [7]:
batch_size = 64
torch.manual_seed(12345)

# Divide into splits
trainSplit, validationSplit, testSplit = get_data_split_indices(len(encodedTokens), 0.1, 0.1)

# Training
encodedTrainData = torch.tensor(encodedTokens[trainSplit], dtype=torch.long, device=device)
trainTargets = y_targets[trainSplit]

# Validation
encodedValidationData = torch.tensor(encodedTokens[validationSplit], dtype=torch.long, device=device)
validationTargets = y_targets[validationSplit]

# Test
encodedTestData = torch.tensor(encodedTokens[testSplit], dtype=torch.long, device=device)
testTargets = y_targets[testSplit]

trainTargets, validationTargets, testTargets, scalerTargets = scale_targets(trainTargets, validationTargets, testTargets)

minmax_scaler = MinMaxScaler()

# Descriptors
trainDescriptors = mol_descriptor[trainSplit]
valDescriptors = mol_descriptor[validationSplit]
testDescriptors = mol_descriptor[testSplit]
# Normalize
trainDescriptors = minmax_scaler.fit_transform(trainDescriptors)
valDescriptors = minmax_scaler.transform(valDescriptors)
testDescriptors = minmax_scaler.transform(testDescriptors)
# Convert descriptors and fingerprints to tensors
trainDescriptors = torch.from_numpy(trainDescriptors).float().to(device)
valDescriptors = torch.from_numpy(valDescriptors).float().to(device)
testDescriptors = torch.from_numpy(testDescriptors).float().to(device)

# Fingerprints
trainFingerprints = mol_fingerprint[trainSplit]
valFingerprints = mol_fingerprint[validationSplit]
testFingerprints = mol_fingerprint[testSplit]
# Normalize
trainFingerprints = minmax_scaler.fit_transform(trainFingerprints)
valFingerprints = minmax_scaler.transform(valFingerprints)
testFingerprints = minmax_scaler.transform(testFingerprints)
# Convert to tensor
trainFingerprints = torch.from_numpy(trainFingerprints).float().to(device)
valFingerprints = torch.from_numpy(valFingerprints).float().to(device)
testFingerprints = torch.from_numpy(testFingerprints).float().to(device)




# Convert targets to tensors
trainTargets = torch.tensor(trainTargets, dtype=torch.float, device=device)
validationTargets = torch.tensor(validationTargets, dtype=torch.float, device=device)
testTargets = torch.tensor(testTargets, dtype=torch.float, device=device)




In [8]:
class datasetObject(Dataset):
    def __init__(self,data,targets,descriptors,fingerprints):
        self.data = data
        self.descriptors = descriptors
        self.targets = targets
        self.fingerprints = fingerprints
    def __len__(self):
        return len(self.data)
    def __getitem__(self, index):
        sample = self.data[index]
        target = self.targets[index]
        descriptor = self.descriptors[index]
        fingerprint = self.fingerprints[index]
        return (sample), (target), (descriptor), (fingerprint)

In [9]:
trainingData = DataLoader(datasetObject(encodedTrainData, trainTargets, trainDescriptors, trainFingerprints), batch_size)
testData = DataLoader(datasetObject(encodedTestData, testTargets,  testDescriptors, testFingerprints), batch_size)
validationData = DataLoader(datasetObject(encodedValidationData, validationTargets, valDescriptors, valFingerprints), batch_size)

In [37]:

hidden_channels = 512
embedding_dim = 128
d_target = 15
nHeads = 8
learningRate = 0.0005
if useDescriptors:
      nDescriptors = len(mol_descriptor[0])
else:
      nDescriptors = 0

if useFingerprints:
      nFingerprints = len(mol_fingerprint[0])
else:
      nFingerprints = 0

vocab_size = len(tokenDict)+1

model = TransformerNetwork(hidden_channels,d_target,vocab_size, embedding_dim, nHeads,maxTokenLength,
                           nDescriptors,nFingerprints,useDescriptors,useFingerprints).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learningRate, weight_decay=0)
# decay learning rate
decayRate = 1
lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=decayRate)
criterion = torch.nn.MSELoss().to(device)

def train(data_in, targets, descriptors, fingerprints):
      model.train()
      optimizer.zero_grad()  # Clear gradients.
      out = model(data_in,descriptors,fingerprints).to(device)
      loss = criterion(out, targets)
      
      loss.backward()  # Derive gradients.
      optimizer.step()  # Update parameters based on gradients.
      return loss

# This calculates r2 for each target separately 
def inferNew(data_in, targets, descriptors, fingerprints):
    with torch.no_grad():
            model.eval()
            out = model(data_in, descriptors, fingerprints).to(device)
            loss = criterion(out, targets)

            r2_scores = []
            for item in range(targets.shape[1]):
                  r2 = r2_score(targets[:, item].cpu().detach().numpy(), out[:, item].cpu().detach().numpy())
                  r2_scores.append(r2)

    return loss, r2_scores


earlyStop = False
stopTolerance = 3
minDiff = 0.01
counter = 0
epoch = 0
bestValLoss = float('inf')
lossList = []
r2List = []
while not earlyStop and epoch < 100:
      loss = 0
      for input,targets,descriptor,fingerprint in trainingData:
            loss += train(input,targets,descriptor,fingerprint).detach()
      
      test = 0
      r2 = [0]*d_target
      for input, targets, descriptor, fingerprint in testData:
            tempTest, tempR2 = inferNew(input,targets,descriptor,fingerprint)
            test += tempTest
            # Add r2 for each variable
            r2 = [x + y for x, y in zip(r2, tempR2)]
      # Average over batch
      r2 = [x / len(testData) for x in r2]
      r2List.append(r2)

      val = 0
      for input,targets,descriptor,fingerprint in validationData:
            tempVal,tempR2 = inferNew(input,targets, descriptor, fingerprint)
            val += tempVal
      
      lr_scheduler.step()

      # Save loss AS PYTHON NUMBER (not tensor) in list
      lossList.append([loss.item()/len(trainingData), test.item()/len(testData), val.item()/len(validationData)])

      # Early stopping
      if val < bestValLoss - minDiff:
            bestValLoss = val
            counter = 0
      else:
            counter += 1
            if counter >= stopTolerance:
                  earlyStop = True

      print(f"#### Epoch: {epoch} of 100 ####\n Training loss:\t{lossList[epoch][0]}\n Test loss:\t{lossList[epoch][1]}\n Validation loss:\t{lossList[epoch][2]}")
      print('R2:', r2List[epoch])
      print('avg R2',np.mean(r2List[epoch]),'\n')
      epoch+=1 

#### Epoch: 0 of 100 ####
 Training loss:	0.15851375756674255
 Test loss:	0.0986159214606652
 Validation loss:	0.1016926306944627
R2: [0.705687904044878, 0.8037934739910815, 0.8744658790654019, 0.5896553865859613, 0.9705635572428257, 0.8148101596286601, 0.9386414046858408, 0.906278140161477, 0.9176495474868557, 0.9879258596738546, 0.9931336008236177, 0.992580987157414, 0.993072256023943, 0.9927320398715909, 0.9744462343793379]
avg R2 0.8970290953881827 

#### Epoch: 1 of 100 ####
 Training loss:	0.08648311424370203
 Test loss:	0.07685562280508187
 Validation loss:	0.07955734546367939
R2: [0.780004671568446, 0.8567497797289442, 0.9030264449462115, 0.6369269448804096, 0.9785274678384095, 0.8561513794585017, 0.954465747665105, 0.9308512026158368, 0.9398166464981998, 0.9893305211946823, 0.995341264228358, 0.9954516363261737, 0.995645749198824, 0.9955655792072272, 0.9809988700304686]
avg R2 0.9192569270257197 

#### Epoch: 2 of 100 ####
 Training loss:	0.06824287630433709
 Test loss:	0.0659

In [15]:
r2ResultsList = []
lossResultList = []

In [38]:
print(r2ResultsList[0][-1], '\n',r2ResultsList[1][-1],'\n',r2ResultsList[2][-1], '\n', r2List[-1])

[0.8646386301877237, 0.9283815679475169, 0.9450812473918203, 0.6910083686443527, 0.9874655047743294, 0.887775569127897, 0.9688351197757891, 0.9500867504338024, 0.9602256016829921, 0.997828370056202, 0.9988321801383298, 0.9988337075067906, 0.9988341061461555, 0.9988320998042559, 0.9887341839176582] 
 [0.86625421074764, 0.9316138929959028, 0.951153619443174, 0.6623161665708791, 0.9865322779509239, 0.9105202891101809, 0.969592211067584, 0.9567827183751209, 0.963917258810526, 0.998534796341051, 0.9993756071436712, 0.9993733523845045, 0.9993736456839012, 0.9993740816892989, 0.9911018717164113] 
 [0.8495609824870629, 0.904121631666355, 0.9326948743376868, 0.6828483713957751, 0.9766347721411918, 0.8780140030682374, 0.961413228732725, 0.9442646982895891, 0.9560375734224605, 0.9874243764658291, 0.989024773054134, 0.9890203187716863, 0.9890006606720448, 0.9890212987270223, 0.9763105790164521] 
 [0.8657691711963978, 0.9064977822145035, 0.940750552133279, 0.6707883639398146, 0.987226978825102, 0.9

In [39]:
## DO NOT RUN THIS
r2ResultsList.append(r2List)
lossResultList.append(lossList)

In [48]:
print(r2ResultsList[0][-1])
print(r2ResultsList[1][-1])
print(r2ResultsList[2][-1])
print(r2ResultsList[3][-1])
print(np.mean(r2ResultsList[3][-1]))
# First both false
# Second descriptor true, fingerprints false
# Third descriptor false, fingerprints true
# fourth all true

[0.8646386301877237, 0.9283815679475169, 0.9450812473918203, 0.6910083686443527, 0.9874655047743294, 0.887775569127897, 0.9688351197757891, 0.9500867504338024, 0.9602256016829921, 0.997828370056202, 0.9988321801383298, 0.9988337075067906, 0.9988341061461555, 0.9988320998042559, 0.9887341839176582]
[0.86625421074764, 0.9316138929959028, 0.951153619443174, 0.6623161665708791, 0.9865322779509239, 0.9105202891101809, 0.969592211067584, 0.9567827183751209, 0.963917258810526, 0.998534796341051, 0.9993756071436712, 0.9993733523845045, 0.9993736456839012, 0.9993740816892989, 0.9911018717164113]
[0.8495609824870629, 0.904121631666355, 0.9326948743376868, 0.6828483713957751, 0.9766347721411918, 0.8780140030682374, 0.961413228732725, 0.9442646982895891, 0.9560375734224605, 0.9874243764658291, 0.989024773054134, 0.9890203187716863, 0.9890006606720448, 0.9890212987270223, 0.9763105790164521]
[0.8657691711963978, 0.9064977822145035, 0.940750552133279, 0.6707883639398146, 0.987226978825102, 0.9001408

In [50]:


# Extract specific values from the three-dimensional list
values_to_save = [r2ResultsList[i][-1] for i in range(4)]

# Create a DataFrame from the extracted values
df = pd.DataFrame(values_to_save)

# Specify the file name
file_name = 'r2Results.csv'

# Write the DataFrame to a CSV file
df.to_csv(file_name, index=False)

print(f"The values have been saved to '{file_name}'.")



# Create a DataFrame from the extracted values
df = pd.DataFrame(lossResultList)

# Specify the file name
file_name = 'lossResults.csv'

# Write the DataFrame to a CSV file
df.to_csv(file_name, index=False)

print(f"The values have been saved to '{file_name}'.")


The values have been saved to 'r2Results.csv'.
The values have been saved to 'lossResults.csv'.
