# TIF360 Project

Main source: https://www.kaggle.com/code/rmonge/predicting-molecule-properties-based-on-its-smiles/notebook

### Import packages

In [1]:
import os
import rdkit
from rdkit import Chem  # To extract information of the molecules
from rdkit.Chem import Draw  # To draw the molecules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random

import torch
from torch_geometric.loader import DataLoader
import torch_geometric.utils as utils
import networkx as nx
from torch.nn import Linear
from torch_geometric.nn import global_mean_pool, GraphConv, GATConv, GCNConv
import torch.nn.functional as F

from sklearn.metrics import r2_score

### Load data

In [None]:
df = pd.read_csv("../data/smiles_and_targets.csv")

### Load descriptors

In [None]:
#mol_descriptor = np.load("../data/mol_descriptors.npy")

# Investigate Neural Networks

## Graph Neural Networks

#### Convert data to graphs

In [14]:
# import packages
from rdkit.Chem import GetAdjacencyMatrix
import torch
from torch_geometric.data import Data
from torch.utils.data import DataLoader

In [15]:
def one_hot_encoding(x, permitted_list):
    if x not in permitted_list:
        x = permitted_list[-1]  # If the atom is not in the list, get "Unknown"
        
    binary_encoding = [int(boolean) for boolean in list(map(lambda s: x==s, permitted_list))]
    
    return binary_encoding    

Atom featurisation\
Currently generates ca. 80 node features

In [16]:
def get_atom_features(atom, use_chirality = True, hydrogens_implicit = True):
    # list of permitted atoms
    permitted_atom_list = ['C','N','O','S','F','Si','P','Cl','Br','Mg','Na','Ca',
                           'Fe','As','Al','I', 'B','V','K','Tl','Yb','Sb','Sn','Ag','Pd','Co',
                           'Se','Ti','Zn', 'Li','Ge','Cu','Au','Ni','Cd','In','Mn','Zr','Cr','Pt',
                           'Hg','Pb','Unknown']
    atom_type_enc = one_hot_encoding(str(atom.GetSymbol()), permitted_atom_list)
    
    n_heavy_neighbors = one_hot_encoding(int(atom.GetDegree()), [0,1,2,3,4,"MoreThanFour"])
    
    formal_charge_enc = one_hot_encoding(int(atom.GetFormalCharge()), [-3, -2, -1, 0, 1, 2, 3, 'Extreme'])
    
    hybridisation_type_enc = one_hot_encoding(str(atom.GetHybridization()), ["S", "SP", "SP2", "SP3", "SP3D", "SP3D2", "OTHER"])
    
    is_in_ring_enc = one_hot_encoding(int(atom.IsInRing()), [0, 1])
    
    is_aromatic_enc = one_hot_encoding(int(atom.GetIsAromatic()), [0, 1])
    
    atomic_mass_scaled = [float(atom.GetMass() - 10.812)/116.092] # (?) replace 10.812 with mean the and 116.092 with std
    
    vdw_radius_scaled = [float((Chem.GetPeriodicTable().GetRvdw(atom.GetAtomicNum()) - 1.5)/0.6)] # (?) replace 1.5 with mean the and 0.6 with std
    
    covalent_radius_scaled = [float((Chem.GetPeriodicTable().GetRcovalent(atom.GetAtomicNum()) - 0.64)/0.76)] # (?) replace 0.64 with mean the and 0.76 with std
                              
    atom_feature_vector = atom_type_enc + n_heavy_neighbors + formal_charge_enc + hybridisation_type_enc + is_in_ring_enc + is_aromatic_enc + atomic_mass_scaled + vdw_radius_scaled + covalent_radius_scaled
    
    if use_chirality:
        chirality_type_enc = one_hot_encoding(str(atom.GetChiralTag()), ["CHI_UNSPECIFIED", "CHI_TETRAHEDRAL_CW", "CHI_TETRAHEDRAL_CCW", "CHI_OTHER"])
        atom_feature_vector += chirality_type_enc
        
    if hydrogens_implicit:
        n_hydrogens_enc = one_hot_encoding(int(atom.GetTotalNumHs()), [0, 1, 2, 3, 4, "MoreThanFour"])
        atom_feature_vector += n_hydrogens_enc
        
    return np.array(atom_feature_vector) 

Bond Featurisation\
Currently generates ca. 10 edge features

In [17]:
def get_bond_features(bond, use_stereochemistry=True):
    permitted_bond_types = [Chem.rdchem.BondType.SINGLE, Chem.rdchem.BondType.DOUBLE, 
                            Chem.rdchem.BondType.TRIPLE, Chem.rdchem.BondType.AROMATIC]
    
    bond_type_enc = one_hot_encoding(str(bond.GetBondType()), permitted_bond_types)
    
    bond_is_conjugated_enc = [int(bond.GetIsConjugated())]
    
    bond_is_in_ring_enc = [int(bond.IsInRing())]
    
    bond_feature_vector = bond_type_enc + bond_is_conjugated_enc + bond_is_in_ring_enc
    
    if use_stereochemistry:
        stereo_type_enc = one_hot_encoding(str(bond.GetStereo()), ["STEREOZ", "STEREOE", "STEREOANY", "STEREONONE"])
        bond_feature_vector += stereo_type_enc
        
    return np.array(bond_feature_vector)

Define function to generate dataset of labeled Pytorch Geometric Graphs

In [18]:
def create_graph_dataset_from_smiles(x_smiles, y):
    ## Inputs:
    # x_smiles = [smiles_1, smiles_2, ...], smiles representation of molecules
    # y = [y_1, y_2, ...] list of numerical labels for each smiles string, here chemical properties
    
    # Outputs:
    # dataset = [data_1, data_2, ...] list of torch_geometric.data.Data objects representing molecular graphs
    
    dataset = []
    
    for (smiles, y_val) in zip(x_smiles, y):
        # convert smiles to molecular object
        mol = Chem.MolFromSmiles(smiles)
        
        # get feature dimensions
        n_nodes = mol.GetNumAtoms()
        n_edges = 2*mol.GetNumBonds() # each bond is represented twice in the adjacency matrix
        n_node_features = len(get_atom_features(mol.GetAtomWithIdx(0)))
        if n_nodes > 1:
            n_edge_features = len(get_bond_features(mol.GetBondBetweenAtoms(0,1)))
        else:
            n_edge_features = 0  # for single atom molecules -> no edges
        
        # construct node feature matrix X 
        X = np.zeros((n_nodes, n_node_features))
        
        for atom in mol.GetAtoms():
            X[atom.GetIdx(), :] = get_atom_features(atom)
        
        X = torch.tensor(X, dtype=torch.float)
        
        # construct edge index array E, shape = (2, n_edges)
        (rows, cols) = np.nonzero(GetAdjacencyMatrix(mol))
        torch_rows = torch.tensor(rows.astype(np.int64)).to(torch.long)
        torch_cols = torch.tensor(cols.astype(np.int64)).to(torch.long)
        E = torch.stack([torch_rows, torch_cols], dim=0)
        
        # construct edge feature matrix EF
        EF = np.zeros((n_edges, n_edge_features))       # Note: generates zero matrix if n_edges = n_edge_features = 0
        for (k, (i,j)) in enumerate(zip(rows, cols)):
            EF[k] = get_bond_features(mol.GetBondBetweenAtoms(int(i),int(j)))
        EF = torch.tensor(EF, dtype=torch.float)
        
        # construct label/y tensor
        y_tensor = torch.tensor(np.array([y_val]), dtype=torch.float)
        
        # construct torch_geometric.data.Data object and append to dataset
        dataset.append(Data(x=X, edge_index=E, edge_attr=EF, y=y_tensor))
        
    return dataset
        

We use the above functions to create a dataset of molecular graphs from the smiles and labels corresponding to chemical properties

In [27]:
properties_names = ['A', 'B', 'C', 'mu', 'alfa', 'homo', 'lumo', 'gap', 'R²', 'zpve', 'U0', 'U', 'H', 'G', 'Cv']

x_smiles = df.smiles.values
y = df.loc[:, properties_names].values  # shape = (n_samples, n_properties)

dataset = create_graph_dataset_from_smiles(x_smiles, y[0:len(x_smiles), :])

In [28]:
print(len(dataset))

# Example entries
print(df.smiles.values[0])
print(dataset[0])
print(df.smiles.values[2])
print(dataset[2])
print(df.smiles.values[50])
print(dataset[50])


133794
C
Data(x=[1, 81], edge_index=[2, 0], edge_attr=[0, 0], y=[1, 15])
O
Data(x=[1, 81], edge_index=[2, 0], edge_attr=[0, 0], y=[1, 15])
N1C=CN=C1
Data(x=[5, 81], edge_index=[2, 10], edge_attr=[10, 10], y=[1, 15])


Information of the graph dataset

In [29]:
print(f'Number of graphs (molecules): {len(dataset)}')
graph = dataset[50]
print('=================================================================================')
print(f'Properties of graph {50}, molecule smiles: {df.smiles.values[50]}')
print(f'Number of nodes: {dataset[50].x.shape[0]}')
print(f'Number of edges: {dataset[50].edge_index.shape[1]}')
print(f'Number of node features: {dataset[50].x.shape[1]}')
print(f'Number of edge features: {dataset[50].edge_attr.shape[1]}')
print(f'Number of properties: {dataset[50].y.shape[1]}')

Number of graphs (molecules): 133794
Properties of graph 50, molecule smiles: N1C=CN=C1
Number of nodes: 5
Number of edges: 10
Number of node features: 81
Number of edge features: 10
Number of properties: 15


Filterout data with no edge features defined (Like ex: CH4) (These causes problems down the line)

In [30]:


indexes_to_delete = []
for item in range(0,len(dataset)):
    if dataset[item].edge_attr.shape[1] == 0:
        indexes_to_delete.append(item)
    else:
        pass

indexes_to_delete.sort()

print("Number of none edge feature molecules: ", len(indexes_to_delete))

print("Before: ", len(dataset))

for item in range(0,len(indexes_to_delete)):
    print("Molecule to delete: ", df.smiles.values[indexes_to_delete[item]])
    #del dataset[indexes_to_delete[item] - item] 
    dataset.pop((indexes_to_delete[item] - item)) # -item since all future data points will have its index reduced by 1 for each deleted previous data point
print("After: ", len(dataset))

Number of none edge feature molecules:  3
Before:  133794
Molecule to delete:  C
Molecule to delete:  N
Molecule to delete:  O
After:  133791


Split data into train and test set

In [36]:
from torch_geometric.loader import DataLoader
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

# split the dataset into test and validation:
num_samples = len(dataset)

# Want to divide data randomly
random.seed(42)
random_indexes = np.array(random.sample(range(num_samples), num_samples)) # random.sample ensures no duplicates

train_data = [dataset[index] for index in random_indexes[int(.2 * num_samples ) :]] # 80%
test_data = [dataset[index] for index in random_indexes[: int(.2 * num_samples )]] # 20%

print("Example train data target before scaling", train_data[0].y)
train_data_targets = [data.y for data in train_data]
train_data_targets = torch.concatenate(train_data_targets, axis=0)
test_data_targets = [data.y for data in test_data]
test_data_targets = torch.concatenate(test_data_targets, axis=0)

scaler = StandardScaler()
train_data_targets = scaler.fit_transform(train_data_targets)
test_data_targets = scaler.transform(test_data_targets)

# print("Example train data target", train_data_targets[0].reshape(1,-1).shape)
train_data_targets = torch.tensor(train_data_targets, dtype=torch.float)
test_data_targets = torch.tensor(test_data_targets, dtype=torch.float)

train_data = [Data(x=data.x, edge_index=data.edge_index, edge_attr=data.edge_attr, y=train_data_targets[index].reshape(1,-1)) for index, data in enumerate(train_data)]
test_data = [Data(x=data.x, edge_index=data.edge_index, edge_attr=data.edge_attr, y=test_data_targets[index].reshape(1,-1)) for index, data in enumerate(test_data)]
print("Example train data target after scaling:", train_data[0].y)

print("Total data size: ", len(dataset))
print("Train data size: ", len(train_data))
print("Test data size: ", len(test_data))

train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=64, shuffle=True)

Example train data target before scaling tensor([[ 2.9447e+00,  2.6734e+00,  1.6819e+00,  7.0850e-01,  7.0340e+01,
         -2.4000e-01,  5.5900e-02,  2.9590e-01,  8.0710e+02,  1.4671e-01,
         -2.7254e+02, -2.7254e+02, -2.7254e+02, -2.7258e+02,  2.9530e+01]])
Example train data target after scaling: tensor([[-2.2742e-01,  2.5325e+00,  1.4954e+00, -1.3102e+00, -5.9178e-01,
         -1.2331e-03,  9.5616e-01,  9.4529e-01, -1.3664e+00, -5.2366e-02,
          3.4719e+00,  3.4719e+00,  3.4719e+00,  3.4719e+00, -5.0999e-01]])
Total data size:  133791
Train data size:  107033
Test data size:  26758


### Main GNN

#### Model for all targets at once

GNN function

In [51]:
data_labels = dataset[50].y.shape[1]
data_features = dataset[50].x.shape[1]

class GNN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        torch.manual_seed(12345)
        self.conv1 = GATConv(data_features, hidden_channels)
        self.conv2 = GATConv(hidden_channels, hidden_channels)
        self.conv3 = GATConv(hidden_channels, hidden_channels)
        self.conv4 = GATConv(hidden_channels, hidden_channels)
        self.conv5 = GATConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, hidden_channels)
        self.lin3 = Linear(hidden_channels, data_labels)

    def forward(self, x, edge_index, edge_attr, batch): 
        x = self.conv1(x, edge_index, edge_attr)
        x = F.relu(x)
        x = self.conv2(x, edge_index, edge_attr)
        x = F.relu(x)
        x = self.conv3(x, edge_index, edge_attr)
        x = F.relu(x)
        x = self.conv4(x, edge_index, edge_attr)
        x = F.relu(x)
        x = self.conv5(x, edge_index, edge_attr)

        #Returns batch-wise graph-level-outputs by averaging node features across the node dimension, so that for a single graph G
        #its output is computed by
        x = global_mean_pool(x, batch) 
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.lin(x)
        x = F.dropout(x, p=0.2, training=self.training)
        x = F.relu(x)
        x = self.lin2(x)
        x = F.dropout(x, p=0.2, training=self.training)
        x = F.relu(x)
        x = self.lin3(x)
        
 
        return x
    

Train GNN

In [52]:
model = GNN(hidden_channels=128) 
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005, weight_decay=5e-4)
criterion = torch.nn.MSELoss()

def train(data_in):
      model.train()
      optimizer.zero_grad()  # Clear gradients.
      out = model(data_in.x, data_in.edge_index, data_in.edge_attr, data_in.batch)  # Perform a single forward pass.

      targets = data_in.y
      loss = criterion(out, targets) 

      loss.backward()  # Derive gradients.
      optimizer.step()  # Update parameters based on gradients.
      return loss

def test(data):
      all_test_r2 = []
      counter = -1    
      for data_in in data:
            counter += 1
            model.eval()
            out = model(data_in.x, data_in.edge_index, data_in.edge_attr, data_in.batch)

            # Caculate R2
            r2_score_var = []
            for item in range(0,data_in.y.shape[1]):
                  if item == 0:
                        r2_score_var = r2_score(data_in.y[:,item].detach().numpy(), out[:,item].detach().numpy())
                  else:
                        r2_score_var = np.vstack((r2_score_var,(r2_score(data_in.y[:,item].detach().numpy(), out[:,item].detach().numpy()))))

            if counter == 0:
                  all_test_r2 = r2_score_var
            else:
                  all_test_r2 = np.hstack((all_test_r2,r2_score_var))

      average_test_r2 = np.sum(all_test_r2,axis=1) / all_test_r2.shape[1]

      return average_test_r2

# Vectors to append accuracy to:
Train_r2 = []
Test_r2 = []

# Calculate accuracy before training 
Train_r2.append(test(train_loader))
Test_r2.append(test(test_loader))
print("Initial training R2: ", Train_r2[0])
print("Initial test R2: ", Test_r2[0])

print_r2_option = True
for epoch in range(1, 31):
      average_loss = []
      for data in train_loader:
            loss = train(data)
            average_loss.append(loss)
      print(f'Epoch: {epoch:03d}, Loss: {(sum(average_loss)/len(average_loss)):.5f}')

      if print_r2_option:

            temp_train_r2 = test(train_loader)
            Train_r2.append(temp_train_r2)

            temp_test_r2 = test(test_loader)
            Test_r2.append(temp_test_r2)

            print(f'Average Train R2: {temp_train_r2}')
            print(f'Average Test R2: {temp_test_r2:}')

Initial training R2:  [-0.02034011 -0.0172521  -0.01965901 -0.02143906 -0.02006519 -0.02162258
 -0.01593567 -0.0154436  -0.02292025 -0.01775128 -0.01767568 -0.02199647
 -0.01765895 -0.02587157 -0.0158652 ]
Initial test R2:  [-0.02112639 -0.0189632  -0.02089278 -0.01996619 -0.02216537 -0.02919588
 -0.01552276 -0.01521761 -0.02419008 -0.01688866 -0.02127208 -0.02363355
 -0.02153127 -0.03111836 -0.0177583 ]
Epoch: 001, Loss: 0.55871
Average Train R2: [0.1951949  0.40794601 0.49197687 0.27392301 0.61148099 0.24410877
 0.70684006 0.67683428 0.65217574 0.8434309  0.64427631 0.64414196
 0.64437309 0.64420164 0.64811555]
Average Test R2: [0.19599081 0.40957752 0.50026938 0.2730022  0.62078261 0.23422642
 0.70818947 0.6783536  0.6576467  0.84765732 0.64565344 0.64572373
 0.64576428 0.64557926 0.65612875]
Epoch: 002, Loss: 0.43956
Average Train R2: [0.18418352 0.46900893 0.53259671 0.33391267 0.63810736 0.59475037
 0.77239994 0.74607857 0.67022887 0.85018634 0.7083865  0.70832622
 0.70853644 0.7

KeyboardInterrupt: 

In [55]:
print("Final loss: ", (sum(average_loss)/len(average_loss)).detach().numpy())
print("Final training R2: ", Train_r2[-1])
print("Final test R2: ", Test_r2[-1])

Final loss:  0.27030855
Final training R2:  [0.4392632  0.62481976 0.6614388  0.43726388 0.76094412 0.75461767
 0.91618546 0.88057522 0.79409025 0.87409347 0.79064699 0.79065257
 0.79065219 0.79064096 0.81704829]
Final test R2:  [0.30344273 0.6294508  0.66600112 0.42917481 0.76529105 0.74790507
 0.91586517 0.87736503 0.79495613 0.87656154 0.79175724 0.79176281
 0.79176255 0.79175098 0.81996635]


#### Model for just one target

GNN function

In [None]:
data_labels = 1
data_features = dataset[50].x.shape[1]

class GNN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        torch.manual_seed(12345)
        self.conv1 = GATConv(data_features, hidden_channels)
        self.conv2 = GATConv(hidden_channels, hidden_channels)
        self.conv3 = GATConv(hidden_channels, hidden_channels)
        self.conv4 = GATConv(hidden_channels, hidden_channels)
        self.conv5 = GATConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, hidden_channels)
        self.lin3 = Linear(hidden_channels, data_labels)

    def forward(self, x, edge_index, edge_attr, batch): 
        x = self.conv1(x, edge_index, edge_attr)
        x = F.relu(x)
        x = self.conv2(x, edge_index, edge_attr)
        x = F.relu(x)
        x = self.conv3(x, edge_index, edge_attr)
        x = F.relu(x)
        x = self.conv4(x, edge_index, edge_attr)
        x = F.relu(x)
        x = self.conv5(x, edge_index, edge_attr)

        #Returns batch-wise graph-level-outputs by averaging node features across the node dimension, so that for a single graph G
        #its output is computed by
        x = global_mean_pool(x, batch) 
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.lin(x)
        x = F.dropout(x, p=0.2, training=self.training)
        x = F.relu(x)
        x = self.lin2(x)
        x = F.dropout(x, p=0.2, training=self.training)
        x = F.relu(x)
        x = self.lin3(x)
        
 
        return x
    

Train GNN

In [92]:
model = GNN(hidden_channels=64) 
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=5e-4)
criterion = torch.nn.MSELoss()

def train(data_in):
      target = 6 # target index of interest
      model.train()
      optimizer.zero_grad()  # Clear gradients.
      out = model(data_in.x, data_in.edge_index, data_in.edge_attr, data_in.batch)[:,target].reshape(-1,1)
      targets = data_in.y[:,target].reshape(-1,1)
      
      #Alt 1
      loss = criterion(out, targets)   

      loss.backward()  # Derive gradients.
      optimizer.step()  # Update parameters based on gradients.
      return loss

def test(data):
      target = 6
      all_test_r2 = []
      counter = -1    
      for data_in in data:
            counter += 1
            model.eval()
            out = model(data_in.x, data_in.edge_index, data_in.edge_attr, data_in.batch)[:,target].reshape(-1,1)
            targets = data_in.y[:,target].reshape(-1,1)
            
            # Caculate R2
            r2_score_var = r2_score(targets.detach().numpy(), out.detach().numpy())

            all_test_r2 .append(r2_score_var)

      average_test_r2 = np.sum(all_test_r2) / len(all_test_r2)

      return average_test_r2

# Vectors to append accuracy to:
Train_r2 = []
Test_r2 = []

# Calculate accuracy before training 
Train_r2.append(test(train_loader))
Test_r2.append(test(test_loader))
print("Initial training R2: ", Train_r2[0])
print("Initial test R2: ", Test_r2[0])

print_r2_option = True
for epoch in range(1, 16):
      average_loss = []
      for data in train_loader:
            loss = train(data)
            average_loss.append(loss)
      print(f'Epoch: {epoch:03d}, Loss: {(sum(average_loss)/len(average_loss)):.5f}')

      if print_r2_option:

            temp_train_r2 = test(train_loader)
            Train_r2.append(temp_train_r2)

            temp_test_r2 = test(test_loader)
            Test_r2.append(temp_test_r2)

            print(f'Average Train R2: {temp_train_r2}')
            print(f'Average Test R2: {temp_test_r2:}')

print("Best training R2: ", np.max(Train_r2))
print("Best test R2: ", np.max(Test_r2))

Initial training R2:  -0.0186155677613906
Initial test R2:  -0.015978314784087073
Epoch: 001, Loss: 0.32682
Average Train R2: 0.8672998000584929
Average Test R2: 0.8681060938673704
Epoch: 002, Loss: 0.10999
Average Train R2: 0.928836275969301
Average Test R2: 0.9266400583640536
Epoch: 003, Loss: 0.08343
Average Train R2: 0.93881720928776
Average Test R2: 0.9378613671032885
Epoch: 004, Loss: 0.07667
Average Train R2: 0.9416217515269708
Average Test R2: 0.9410020391316778
Epoch: 005, Loss: 0.07255
Average Train R2: 0.9460813636748321
Average Test R2: 0.9455932715116921
Epoch: 006, Loss: 0.07021
Average Train R2: 0.9463242108634822
Average Test R2: 0.9453247671856768
Epoch: 007, Loss: 0.06855
Average Train R2: 0.915514749894574
Average Test R2: 0.9139418518749833
Epoch: 008, Loss: 0.06688
Average Train R2: 0.9492874816496003
Average Test R2: 0.9481633532384031
Epoch: 009, Loss: 0.06580
Average Train R2: 0.9502914239926807
Average Test R2: 0.9490051368253422
Epoch: 010, Loss: 0.06516
Avera

## Transformer