# TIF360 Project

# Data Pre-processing

Main source: https://www.kaggle.com/code/rmonge/predicting-molecule-properties-based-on-its-smiles/notebook

### Import packages

UPDATED ON 15/05-2023  11:20

In [4]:
import os
import rdkit
from rdkit import Chem  # To extract information of the molecules
from rdkit.Chem import Draw  # To draw the molecules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random

import torch
import torch_geometric
from torch_geometric.loader import DataLoader
import torch_geometric.utils as utils
import networkx as nx
from torch.nn import Linear
from torch_geometric.nn import global_mean_pool, GraphConv, GATConv, GCNConv
import torch.nn.functional as F

from sklearn.metrics import r2_score

In [95]:
# `input` is a list of token strings, e.g. ['a', 'b', 'c'].
# `dictionary` maps every possible token to its integer index.
# `maxTokenLength` is the largest number of tokens that any input produces.
def create_Onehot_Matrix(input, dictionary, maxTokenLength):
    """Build the one-hot matrix for a single tokenised sequence.

    Rows are selected by ``dictionary[token]`` and columns by the token's
    position in ``input``; columns past ``len(input)`` remain all-zero.

    Args:
        input: Iterable of tokens, each of which must be a key of
            ``dictionary``.
        dictionary: Mapping from token to row index.
        maxTokenLength: Number of columns in the returned matrix.

    Returns:
        A float ``numpy`` array of shape ``(len(dictionary), maxTokenLength)``
        containing a 1 at ``(dictionary[token], position)`` for every token.
    """
    encoded = np.zeros((len(dictionary), maxTokenLength))

    # One column per position: switch on the row belonging to that token.
    for column, token in enumerate(input):
        encoded[dictionary[token], column] = 1

    return encoded


In [5]:


# Load the table of SMILES strings and target properties, then report its
# (rows, columns) shape.
df = pd.read_csv("../data/smiles_and_targets.csv")
print(df.shape)


(132820, 21)


In [6]:

# Columns of `df` that are used as regression targets.
# NOTE(review): 'RÂ²' looks like a mis-decoded 'R²'; it has to match the CSV
# header exactly, so confirm against the file before changing it.
properties_names = [
    'A', 'B', 'C', 'mu', 'alfa', 'homo', 'lumo', 'gap',
    'RÂ²', 'zpve', 'U0', 'U', 'H', 'G', 'Cv',
]

# Model inputs (SMILES strings) and targets as plain arrays.
x_smiles = df['smiles'].values
y = df[properties_names].values  # shape = (n_samples, n_properties)


### Transformer Network

#### Model for all targets at once

Transformer

In [17]:
import atomInSmiles

# Tokenise one example molecule to see what the atom-in-SMILES tokens look like.
testSmile = x_smiles[8]
print(testSmile)

# `encode` returns one whitespace-separated string; split it into a token list.
tokens = atomInSmiles.encode(testSmile).split()
print(tokens)
print(x_smiles.shape)



NC1=CC(=CN=C1)C#C
['[CH;!R;C]', '#', '[C;!R;CC]', '[c;R;CCC]', '1', '[cH;R;CN]', '[n;R;CC]', '[cH;R;CN]', '[c;R;CCN]', '(', '[NH2;!R;C]', ')', '[cH;R;CC]', '1']
(132820,)


In [49]:
# Tokenise every SMILES string into a list of atom-in-SMILES tokens.
import atomInSmiles

# tokenList[i] holds the token sequence of x_smiles[i].
tokenList = [atomInSmiles.encode(smile).split() for smile in x_smiles]

# Length of the longest token sequence (0 when there are no sequences).
maxTokenLength = max(map(len, tokenList), default=0)
print('Longest word (max amount of tokens):', maxTokenLength)

Longest word (max amount of tokens: 22


In [72]:

# Give each distinct token an integer index, in order of first appearance.
tokenDict = {}

dictList = []  # not populated in this cell; kept so the name stays defined
for itokens in tokenList:
    for itoke in itokens:
        # Membership test instead of `.get(...) == None`.
        if itoke not in tokenDict:
            # The next free index equals the number of tokens stored so far.
            tokenDict[itoke] = len(tokenDict)

# Number of distinct tokens, i.e. the vocabulary size.
count = len(tokenDict)

In [92]:

# One-hot encode every token sequence and stack the per-molecule matrices
# into a single array with one leading entry per molecule.
inputData = np.asarray(
    [create_Onehot_Matrix(sequence, tokenDict, maxTokenLength) for sequence in tokenList]
)

['[CH3;!R;C]', '[C;!R;CCO]', '(', '=', '[O;!R;C]', ')', '[C;R;CCCO]', '1', '(', '[OH;!R;C]', ')', '[CH2;R;CC]', '[CH2;R;CO]', '[O;R;CC]', '[CH2;R;CO]', '1']


In [94]:
# Show the one-hot matrix of the first molecule and the shape of the whole
# encoded dataset.
print(inputData[0])
print(inputData.shape)

[[1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(132820, 149, 22)


In [1]:
class PositionalEncoding(torch.nn.Module):
    """Add fixed sinusoidal position information to a sequence, then apply dropout.

    The encoding table ``pe`` has shape ``(max_len, 1, d_model)`` and is
    sliced by the size of the input's first dimension, so the input is
    expected to be sequence-first: ``(seq_len, batch, d_model)``.
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        """Precompute the sinusoidal table.

        Args:
            d_model: Size of the feature dimension; may be even or odd.
            dropout: Dropout probability applied after adding the encoding.
            max_len: Number of positions to precompute.
        """
        super().__init__()
        self.dropout = torch.nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-np.log(10000.0) / d_model))
        # One angle per (position, frequency); shared by the sine and cosine slots.
        angles = position * div_term

        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(angles)
        # For odd d_model there is one cosine slot fewer than sine slots, so
        # drop the surplus frequency instead of failing with a shape mismatch.
        pe[:, 0, 1::2] = torch.cos(angles[:, : d_model // 2])
        # A buffer follows the module across devices but is not trained.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:seq_len])``.

        Args:
            x: Tensor whose first dimension is the sequence position and whose
                last dimension has size ``d_model``.
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)


class TransformerLayer(torch.nn.Module):
    """One Transformer encoder layer.

    Self-attention followed by a two-layer feed-forward block, each wrapped
    in a residual connection and a LayerNorm. Inputs and outputs are
    sequence-first, ``(seq_len, batch, data_features)``, which is the default
    layout of ``torch.nn.MultiheadAttention``.
    """

    def __init__(self, hidden_channels, data_features=None, num_heads=3, dropout=0.15):
        """Create the layer.

        Args:
            hidden_channels: Width of the inner feed-forward layer.
            data_features: Size of the model/feature dimension. Defaults to
                ``hidden_channels`` when omitted. Must be divisible by
                ``num_heads``.
            num_heads: Number of attention heads.
            dropout: Dropout probability inside the attention module.
        """
        super().__init__()
        torch.manual_seed(12345)
        if data_features is None:
            data_features = hidden_channels

        self.Attention = torch.nn.MultiheadAttention(
            data_features, num_heads=num_heads, dropout=dropout
        )
        self.Norm1 = torch.nn.LayerNorm(data_features)
        # The feed-forward block expands to hidden_channels and projects back,
        # so that the residual sum below has matching shapes.
        self.Dense1 = torch.nn.Linear(data_features, hidden_channels)
        self.Dense2 = torch.nn.Linear(hidden_channels, data_features)
        self.Norm2 = torch.nn.LayerNorm(data_features)

    def forward(self, x):
        """Apply attention and feed-forward sub-layers to ``x``.

        Args:
            x: Tensor of shape ``(seq_len, batch, data_features)``.

        Returns:
            Tensor of the same shape as ``x``.
        """
        # Self-attention: query, key and value are all x. The module returns
        # (output, weights); the weights are not needed here.
        attended, _ = self.Attention(x, x, x, need_weights=False)
        x = self.Norm1(x + attended)

        # Without a non-linearity the two linear layers would collapse into one.
        transformed = self.Dense2(torch.relu(self.Dense1(x)))
        x = self.Norm2(x + transformed)

        return x


class TransformerNetwork(torch.nn.Module):
    """Token-id sequence -> embedding -> positional encoding -> encoder layer
    -> mean over the sequence -> two dense layers.
    """

    def __init__(self, hidden_channels, output_dim, num_embeddings=149, embedding_dim=600):
        """Create the network.

        Args:
            hidden_channels: Width of the hidden dense layers.
            output_dim: Number of values predicted per input sequence.
            num_embeddings: Vocabulary size; every token id must be smaller
                than this. The default matches the 149 distinct tokens of the
                encoded dataset.
            embedding_dim: Size of the token embedding, used as the model
                dimension throughout. Must be divisible by the encoder
                layer's number of heads.
        """
        super().__init__()
        torch.manual_seed(12345)
        self.EmbeddingLayer = torch.nn.Embedding(
            num_embeddings=num_embeddings, embedding_dim=embedding_dim, max_norm=1.0
        )
        # The positional table must have the same width as the embedding.
        self.PositionalEncoding = PositionalEncoding(d_model=embedding_dim)
        self.TransEnc1 = TransformerLayer(hidden_channels, data_features=embedding_dim)

        # Averages over the whole sequence regardless of its length.
        self.Pooling = torch.nn.AdaptiveAvgPool1d(1)

        self.DenseOut1 = torch.nn.Linear(embedding_dim, hidden_channels)
        self.DenseOut2 = torch.nn.Linear(hidden_channels, output_dim)

    def forward(self, x):
        """Predict ``output_dim`` values per sequence.

        Args:
            x: Integer tensor of token ids with shape ``(batch, seq_len)``.
                One-hot matrices have to be converted to ids first.

        Returns:
            Tensor of shape ``(batch, output_dim)``.
        """
        x = self.EmbeddingLayer(x)             # (batch, seq_len, emb)
        x = x.transpose(0, 1)                  # (seq_len, batch, emb)
        x = self.PositionalEncoding(x)
        x = self.TransEnc1(x)
        # The pooling layer expects (batch, channels, length).
        # NOTE(review): padded positions, if any, are averaged in as well.
        x = self.Pooling(x.permute(1, 2, 0)).squeeze(-1)   # (batch, emb)
        x = torch.relu(self.DenseOut1(x))
        x = self.DenseOut2(x)

        return x





NameError: name 'torch' is not defined