# TIF360 Project

# Data Pre-processing

Main source: https://www.kaggle.com/code/rmonge/predicting-molecule-properties-based-on-its-smiles/notebook

### Import packages

UPDATED ON 15/05-2023  11:20

In [78]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import math
import atomInSmiles
from torch.utils.data import DataLoader
import random


In [50]:
# Input is list of strings on the form ['a','b','c']
# where the string are tokens
# Dictionary is the dictionary containign all possible tokens
# and an index for them
# MaxTokenLength is the max amount of tokens any input creates
def create_Onehot_Matrix(input, dictionary,maxTokenLength):
    # Create a matrix of zeros
    onehot_Matrix = np.zeros((len(dictionary),maxTokenLength))
    
    # Change value in right place to one
    keyCount = 0
    for key in input:
        onehot_Matrix[dictionary[key],keyCount] = 1
        keyCount+=1

    # Return it
    return(onehot_Matrix)

In [51]:

# Input is list of strings on the form ['a','b','c']
# where the string are tokens
# Dictionary is the dictionary containign all possible tokens
# and an index for them
# MaxTokenLength is the max amount of tokens any input creates

# THIS VERSION RETURNS TORCH TENSOR

#Version taht returns tensor
def create_encoded_tensor(input, dictionary,maxTokenLength):
    # Create a matrix of zeros
    #encoded_vector = np.zeros((1,maxTokenLength))
    encoded_tensor = torch.zeros(maxTokenLength,dtype=torch.long)
    
    # Change value in right place to one
    keyCount = 0
    for key in input:
        encoded_tensor[keyCount] = dictionary[key]
        keyCount+=1
    
    return encoded_tensor.unsqueeze(0)

In [96]:


df = pd.read_csv("../data/smiles_and_targets.csv")
print(np.shape(df))


(132820, 21)


In [97]:

properties_names = ['A', 'B', 'C', 'mu', 'alfa', 'homo', 'lumo', 'gap', 'R²', 'zpve', 'U0', 'U', 'H', 'G', 'Cv']

x_smiles = df.smiles.values
targetTensor = torch.tensor(df.loc[:, properties_names].values, dtype=torch.float32)  # shape = (n_samples, n_properties)


In [55]:
# tokenize all smiles
#import atomInSmiles

tokenList = []
for smile in x_smiles:
    tokenList.append(atomInSmiles.encode(smile).split())
#tokenList is target

# find longest word (max tokens from one smile+1)
maxTokenLength = 0
for token in tokenList:
    if len(token)>maxTokenLength:
        maxTokenLength = len(token)
print('Longest word (max amount of tokens):', maxTokenLength)

Longest word (max amount of tokens): 22


In [56]:

# Give each token a index in a dictionary
tokenDict = {}
count = 1

dictList = []
for itokens in tokenList:
    for itoke in itokens:
        #print(itoke)
        if tokenDict.get(itoke) == None:
            tokenDict[itoke] = count
        
            # current = [itoke, count]
            # dictList.append(current)
            count+=1

In [89]:


from torch_geometric.loader import DataLoader
from sklearn.preprocessing import MinMaxScaler, StandardScaler

def scale_and_split_data(dataset, input_scaler):
    # split the dataset into test and validation:
    num_samples = len(dataset)

    # Want to divide data randomly
    random.seed(42)
    random_indexes = np.array(random.sample(range(num_samples), num_samples)) # random.sample ensures no duplicates

    train_data = [dataset[index] for index in random_indexes[int(.2 * num_samples ) :]] # 80%
    test_data = [dataset[index] for index in random_indexes[: int(.2 * num_samples )]] # 20%
    print(np.shape(train_data))
    print(np.shape(test_data))

    input_scaler = StandardScaler()
    train_data = input_scaler.fit_transform(train_data)
    test_data = input_scaler.transform(test_data)

    # print("Example train data target", train_data_targets[0].reshape(1,-1).shape)
    train_data = torch.tensor(train_data, dtype=torch.float)
    test_data = torch.tensor(test_data, dtype=torch.float)


    return train_data, test_data, input_scaler

In [79]:
encodedTensors = torch.tensor([],dtype=int)
for token in tokenList:
    encodedTensors = torch.cat((encodedTensors,create_encoded_tensor(token,tokenDict,maxTokenLength)),dim=0)


torch.Size([132820, 22])
torch.Size([132820, 15])


In [98]:
print(encodedTensors.shape)
print(targetTensor[:,0].shape)
#trainTarget, testTarget = scale_and_split_data(targetTensor,input_scaler=StandardScaler())
for i in range(len(targetTensor[0])):
    print(i)
    print(targetTensor[:,i].mean())
    targetTensor[:,i] = (targetTensor[:,i]-targetTensor[:,i].mean())/targetTensor[:,i].std()
    print(targetTensor[:,i].mean())





torch.Size([132820, 22])
torch.Size([132820])
0
tensor(3.3658)
tensor(-1.8749e-07)
1
tensor(1.3958)
tensor(-9.5583e-08)
2
tensor(1.1171)
tensor(2.4999e-07)
3
tensor(2.6836)
tensor(7.3410e-08)
4
tensor(75.2409)
tensor(2.0955e-07)
5
tensor(-0.2400)
tensor(-9.9259e-08)
6
tensor(0.0113)
tensor(-1.1029e-08)
7
tensor(0.2513)
tensor(-8.4554e-08)
8
tensor(1188.4229)
tensor(-1.1764e-07)
9
tensor(0.1487)
tensor(3.6763e-08)
10
tensor(-411.9466)
tensor(1.1764e-07)
11
tensor(-411.9381)
tensor(-1.1764e-07)
12
tensor(-411.9372)
tensor(-1.6911e-07)
13
tensor(-411.9800)
tensor(1.6911e-07)
14
tensor(31.6310)
tensor(-3.0145e-07)


In [58]:
# From attention is all you need
class PositionalEncoding(nn.Module):
    "Implement the PE function."
    def __init__(self, d_model, max_len,dropout):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        
        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) *
                             -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
        
    def forward(self, x):
        # x = x + torch.tensor(self.pe[:, :x.size(1)], 
        #                  requires_grad=False)
        x = x + self.pe[:x.size(0), :].detach()
        return self.dropout(x)

In [66]:


# Define the size of the embedding space
embedding_dim = 64
vocab_size = len(tokenDict)+1

embedding_layer = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

# Try two inputs and concat them (this will be done for batches)
input_sentence = create_encoded_tensor(tokenList[100],tokenDict,maxTokenLength)
other_input = create_encoded_tensor(tokenList[150],tokenDict,maxTokenLength)

input_sentence = torch.cat((input_sentence,other_input),dim=0)
print(input_sentence)

# Pass the input tensor through the embedding layer
embedded_tensor = embedding_layer(input_sentence)

# Print the shape of the output tensor
print(embedded_tensor.shape)  # should be (1, 22, 64)
print(embedded_tensor)

posencoder = PositionalEncoding(embedding_dim,maxTokenLength,0.1)
encoded = posencoder.forward(embedded_tensor)
print(encoded.shape)

tensor([[ 6, 22,  8, 15,  9, 26, 18, 10, 17,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0],
        [17, 10, 78, 33, 34, 35,  8, 15,  9, 34, 33,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0]])
torch.Size([2, 22, 64])
tensor([[[-0.3604,  0.7489, -1.0202,  ...,  0.2771, -0.3119,  2.0057],
         [-0.9915,  0.6525,  2.0010,  ..., -1.6723, -1.6019,  0.5795],
         [ 0.6226,  0.1602,  1.1697,  ...,  0.1893, -0.6191,  0.2015],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[-1.9583, -0.6257,  0.2897,  ...,  0.1599,  0.2993, -1.1867],
         [-0.0793, -1.1045,  0.4194,  ..., -0.8032, -0.1406,  0.5390],
         [-0.3788,  1.4723,  0.3950,  ...,  0.4583,  0.6272,  1.5565],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0

In [67]:

class TransformerLayer(torch.nn.Module):
    def __init__(self, embedding_dim, hidden_channels):
        super().__init__()
        torch.manual_seed(12345)
        self.Attention = torch.nn.MultiheadAttention(embedding_dim,num_heads=4,dropout=0.15)
        self.Norm1 = torch.nn.LayerNorm(embedding_dim)
        self.Dense1 = torch.nn.Linear(embedding_dim,hidden_channels)
        self.Dense2 = torch.nn.Linear(hidden_channels,hidden_channels)
        self.Norm2 = torch.nn.LayerNorm(hidden_channels)
        

    def forward(self, x):
        addNormX = x
        #print(x.shape)
        x, _ = self.Attention(x,x,x)
        #print('attention output',x.shape)
        x = self.Norm1(x + addNormX)
        #print('norm + input',x.shape)
        addNormX = x
        x = self.Dense1(x)
        #print('first dense output',x.shape)
        x = self.Dense2(x)
        #print('second dense output',x.shape)
        x = self.Norm2(x + addNormX)
        #print(x.shape)

 
        return x

In [118]:
class TransformerNetwork(torch.nn.Module):
    def __init__(self,hidden_channels,output_dim, vocab_size, maxTokenLength):
        super().__init__()
        torch.manual_seed(12345)
        self.embedding_dim = 64
        # Embedd and add pos encoding to input
        self.EmbeddingLayer = torch.nn.Embedding(num_embeddings=vocab_size,embedding_dim = self.embedding_dim , max_norm=True)
        self.PositionalEncoding = PositionalEncoding(self.embedding_dim, maxTokenLength, dropout = 0.1)

        
        self.TransEnc1 = TransformerLayer(self.embedding_dim,hidden_channels)
        self.TransEnc2 = TransformerLayer(self.embedding_dim,hidden_channels)
        self.TransEnc3 = TransformerLayer(self.embedding_dim,hidden_channels)
        self.Pooling = torch.nn.AvgPool1d(kernel_size= 22)

        self.DenseOut1 = torch.nn.Linear(embedding_dim,hidden_channels)
        self.DenseOut2 = torch.nn.Linear(hidden_channels,output_dim)

    def forward(self,x):
        x = self.EmbeddingLayer(x)
        x = self.PositionalEncoding(x)
        x = self.TransEnc1(x)
        x = self.TransEnc2(x)
        x = self.TransEnc3(x)
        x = self.Pooling(x.permute((0,2,1))).permute((0,2,1))
        x = torch.squeeze(x,axis=1)
        #x = x[:,-1,:]
        x = self.DenseOut1(x)
        x = self.DenseOut2(x)

        return x

In [119]:
transformer = TransformerNetwork(64,15,len(tokenDict),maxTokenLength)
input_sentence
output = transformer.forward(input_sentence)
print(output, output.shape)

tensor([[-0.1613, -0.2594, -0.1061, -0.3813,  0.2653,  0.4150, -0.1055,  0.0313,
         -0.0556, -0.0638,  0.2629, -0.1072, -0.1429, -0.3202,  0.1680],
        [-0.1690, -0.2735, -0.1781, -0.3596,  0.2547,  0.3867, -0.1938,  0.0224,
         -0.0737, -0.0391,  0.2733, -0.1121, -0.1632, -0.2877,  0.2045]],
       grad_fn=<AddmmBackward0>) torch.Size([2, 15])


In [120]:
from torch.utils.data import Dataset, DataLoader
batch_size = 25
## Create iterable dataset class:

class datasetObject(Dataset):
    def __init__(self,data,targets):
        self.data = data
        self.targets = targets
    def __len__(self):
        return len(self.data)
    def __getitem__(self, index):
        sample = self.data[index]
        target = self.targets[index]
        return (sample), (target)


print(encodedTensors.shape)
data = DataLoader(datasetObject(encodedTensors,targetTensor),batch_size,shuffle=True)

torch.Size([132820, 22])


In [121]:
d_model = 64
hidden_channels = 64
d_target = 15
vocab_size = len(tokenDict)+1


model = TransformerNetwork(hidden_channels,d_target,vocab_size,maxTokenLength) 
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = torch.nn.MSELoss()

def train(data_in, targets):
      model.train()
      optimizer.zero_grad()  # Clear gradients.
      out = model(data_in)
      loss = criterion(out, targets)

      loss.backward()  # Derive gradients.
      optimizer.step()  # Update parameters based on gradients.

      return loss



testTensors = encodedTensors[:100]
testTargets = targetTensor[:100]

for epoch in range(1000):
      loss = []
      for input,targets in data:
            #print(input)
            loss.append(train(input,targets).detach())
      print(np.mean(loss))

0.2891061
0.19375692
0.17707199
0.1698929
0.16597769
0.16245265
0.16035533
0.1573339
0.15695736
0.15460584
0.1533905
0.15378942


KeyboardInterrupt: 

In [124]:
print(model(testTensors[0]))
print(testTargets[0],'\n')
print(model(testTensors[10]))
print(testTargets[10],'\n')
print(model(testTensors[20]))
print(testTargets[20],'\n')
print(model(testTensors[30]))
print(testTargets[30],'\n')

tensor([[ 6.4858,  6.5508,  7.1296, -1.1955, -6.6019, -1.4614,  0.4762,  1.1594,
         -6.1393, -3.0303,  8.7178,  8.7233,  8.7187,  8.7145, -5.6774]],
       grad_fn=<AddmmBackward0>)
tensor([-2.8462,  6.9271, 10.4219, -1.8652, -4.5817, -0.9100, -0.6975, -0.2690,
        -3.3279, -3.3636,  6.5917,  6.5916,  6.5916,  6.5919, -4.0631]) 

tensor([[ 5.3248,  5.5519,  6.3477, -1.0306, -6.1503, -1.6785,  0.4770,  1.2614,
         -5.4162, -3.0018,  7.3534,  7.3462,  7.3466,  7.3442, -4.7522]],
       grad_fn=<AddmmBackward0>)
tensor([ 4.3917,  5.7236,  5.6744, -0.9212, -3.8942, -1.2120,  0.4761,  1.0338,
        -2.7206, -1.8069,  4.6116,  4.6115,  4.6115,  4.6116, -2.5707]) 

tensor([[ 4.5141,  5.0823,  6.0408, -1.8171, -4.2379, -0.8721,  1.9609,  2.3628,
         -3.9469, -0.6333,  7.5464,  7.5443,  7.5426,  7.5410, -3.7141]],
       grad_fn=<AddmmBackward0>)
tensor([ 3.2974,  4.4188,  4.4971, -1.8223, -2.3683, -3.1334,  1.5729,  3.0102,
        -2.2773,  0.3286,  5.4653,  5.4653,  5.4