# TIF360 Project

Main source: https://www.kaggle.com/code/rmonge/predicting-molecule-properties-based-on-its-smiles/notebook

### Import packages

In [16]:
import os
import rdkit
from rdkit import Chem  # To extract information of the molecules
from rdkit.Chem import Draw  # To draw the molecules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random

import torch
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
import torch_geometric.utils as utils
import networkx as nx
from torch.nn import Linear
from torch_geometric.nn import global_mean_pool, GraphConv, GATConv, GCNConv
import torch.nn.functional as F

from sklearn.metrics import r2_score

### Load data

In [17]:
df = pd.read_csv("../data/smiles_and_targets.csv")
print(np.shape(df))

(132820, 21)


### Load descriptors

In [18]:
mol_descriptor = np.load("../data/mol_descriptors.npy")
print(mol_descriptor.shape)

(132820, 209)


# Investigate Neural Networks

## Graph Neural Networks

We use the above functions to create a dataset of molecular graphs from the smiles and labels corresponding to chemical properties

In [19]:
from graph_dataset_functions import create_graph_dataset_from_smiles

properties_names = ['A', 'B', 'C', 'mu', 'alfa', 'homo', 'lumo', 'gap', 'R²', 'zpve', 'U0', 'U', 'H', 'G', 'Cv']

x_smiles = df.smiles.values
y = df.loc[:, properties_names].values  # shape = (n_samples, n_properties)

dataset = create_graph_dataset_from_smiles(x_smiles, y[0:len(x_smiles), :])

Information of the graph dataset

In [20]:
print(f'Number of graphs (molecules): {len(dataset)}')
graph = dataset[50]
print('=================================================================================')
print(f'Properties of graph {50}, molecule smiles: {df.smiles.values[50]}')
print(f'Number of nodes: {dataset[50].x.shape[0]}')
print(f'Number of edges: {dataset[50].edge_index.shape[1]}')
print(f'Number of node features: {dataset[50].x.shape[1]}')
print(f'Number of edge features: {dataset[50].edge_attr.shape[1]}')
print(f'Number of target properties: {dataset[50].y.shape[1]}')

Number of graphs (molecules): 132820
Properties of graph 50, molecule smiles: CC1=CNC=C1
Number of nodes: 6
Number of edges: 12
Number of node features: 78
Number of edge features: 10
Number of target properties: 15


Filterout data with no edge features defined (Like ex: CH4) (These causes problems down the line)

In [21]:


indexes_to_delete = []
for item in range(0,len(dataset)):
    if dataset[item].edge_attr.shape[1] == 0:
        indexes_to_delete.append(item)
    else:
        pass

indexes_to_delete.sort()

print("Number of none edge feature molecules: ", len(indexes_to_delete))

print("Before: ", len(dataset))

for item in range(0,len(indexes_to_delete)):
    print("Molecule to delete: ", df.smiles.values[indexes_to_delete[item]])
    #del dataset[indexes_to_delete[item] - item] 
    dataset.pop((indexes_to_delete[item] - item)) # -item since all future data points will have its index reduced by 1 for each deleted previous data point
print("After: ", len(dataset))

Number of none edge feature molecules:  0
Before:  132820
After:  132820


Split data into train and test set

In [22]:
from torch_geometric.loader import DataLoader
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# split the dataset into test and validation:
num_samples = len(dataset)

# Want to divide data randomly
random.seed(42)
random_indexes = np.array(random.sample(range(num_samples), num_samples)) # random.sample ensures no duplicates

train_data = [dataset[index] for index in random_indexes[int(.2 * num_samples ) :]] # 80%
test_data = [dataset[index] for index in random_indexes[: int(.2 * num_samples )]] # 20%

print("Example train data target before scaling", train_data[0].y)
train_data_targets = [data.y for data in train_data]
train_data_targets = torch.concatenate(train_data_targets, axis=0)
test_data_targets = [data.y for data in test_data]
test_data_targets = torch.concatenate(test_data_targets, axis=0)

scaler = StandardScaler()
train_data_targets = scaler.fit_transform(train_data_targets)
test_data_targets = scaler.transform(test_data_targets)

# print("Example train data target", train_data_targets[0].reshape(1,-1).shape)
train_data_targets = torch.tensor(train_data_targets, dtype=torch.float)
test_data_targets = torch.tensor(test_data_targets, dtype=torch.float)

train_data = [Data(x=data.x, edge_index=data.edge_index, edge_attr=data.edge_attr, y=train_data_targets[index].reshape(1,-1)) for index, data in enumerate(train_data)]
test_data = [Data(x=data.x, edge_index=data.edge_index, edge_attr=data.edge_attr, y=test_data_targets[index].reshape(1,-1)) for index, data in enumerate(test_data)]
print("Example train data target after scaling:", train_data[0].y)

print("Total data size: ", len(dataset))
print("Train data size: ", len(train_data))
print("Test data size: ", len(test_data))

#train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
#test_loader = DataLoader(test_data, batch_size=64, shuffle=True)

Example train data target before scaling tensor([[ 2.9693e+00,  1.9777e+00,  1.8423e+00,  1.2093e+00,  8.4060e+01,
         -1.8970e-01, -7.0400e-02,  1.1930e-01,  8.7339e+02,  1.5988e-01,
         -3.4861e+02, -3.4860e+02, -3.4860e+02, -3.4864e+02,  2.9189e+01]])
Example train data target after scaling: tensor([[-0.3350,  1.3300,  2.2841, -1.0264,  1.0990,  2.2975, -1.7445, -2.7966,
         -1.1527,  0.3357,  1.6155,  1.6155,  1.6155,  1.6156, -0.6067]])
Total data size:  132820
Train data size:  106256
Test data size:  26564


Divide descriptors into train and test

In [23]:
train_data_descriptor = [mol_descriptor[index,:] for index in random_indexes[int(.2 * num_samples ) :]] # 80%
test_data_desriptors = [mol_descriptor[index,:] for index in random_indexes[: int(.2 * num_samples )]] # 20%

Normalize using MinMax

In [24]:
minmax_scaler = MinMaxScaler()

train_data_descriptor = minmax_scaler.fit_transform(train_data_descriptor)

test_data_desriptors = minmax_scaler.transform(test_data_desriptors)

Combine descriptors temporarely with in train and test data for graphs

In [25]:
#Testing stuff

temp = train_data[1000]
temp.descriptors = train_data_descriptor[1000,:]

print(temp)

Data(x=[9, 78], edge_index=[2, 20], edge_attr=[20, 10], y=[1, 15], descriptors=[209])


In [26]:

for index in range(0,len(train_data)):
    train_data[index].descriptors = torch.FloatTensor([train_data_descriptor[index,:]])

for index in range(0,len(test_data)):
    test_data[index].descriptors = torch.FloatTensor([test_data_desriptors[index,:]])

# Example
print(train_data[0])

Data(x=[9, 78], edge_index=[2, 24], edge_attr=[24, 10], y=[1, 15], descriptors=[1, 209])


Use dataloader

In [27]:
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=64, shuffle=True)

### Main GNN

#### Model for all targets at once

GNN function

In [28]:
data_labels = dataset[50].y.shape[1]
data_features = dataset[50].x.shape[1]
descriptors_features = train_data[0].descriptors.shape[1]

class GNN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        torch.manual_seed(12345)
        self.conv1 = GATConv(data_features, hidden_channels)
        self.conv2 = GATConv(hidden_channels, hidden_channels)
        self.conv3 = GATConv(hidden_channels, hidden_channels)
        self.conv4 = GATConv(hidden_channels, hidden_channels)
        self.conv5 = GATConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels+descriptors_features, hidden_channels)
        self.lin2 = Linear(hidden_channels, hidden_channels)
        self.lin3 = Linear(hidden_channels, data_labels)

    def forward(self, x, edge_index, edge_attr, batch, descriptors): 
        x = self.conv1(x, edge_index, edge_attr)
        x = F.relu(x)
        x = self.conv2(x, edge_index, edge_attr)
        x = F.relu(x)
        x = self.conv3(x, edge_index, edge_attr)
        x = F.relu(x)
        x = self.conv4(x, edge_index, edge_attr)
        x = F.relu(x)
        x = self.conv5(x, edge_index, edge_attr)

        #Returns batch-wise graph-level-outputs by averaging node features across the node dimension, so that for a single graph G
        #its output is computed by
        x = global_mean_pool(x, batch) 
        x2 = descriptors
        x = torch.cat((x,x2),1)
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.lin(x)
        x = F.dropout(x, p=0.2, training=self.training)
        x = F.relu(x)
        x = self.lin2(x)
        x = F.dropout(x, p=0.2, training=self.training)
        x = F.relu(x)
        x = self.lin3(x)
        
 
        return x
    

Train GNN

In [29]:
model = GNN(hidden_channels=128) 
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005, weight_decay=5e-4)
criterion = torch.nn.MSELoss()

def train(data_in):
      model.train()
      optimizer.zero_grad()  # Clear gradients.
      out = model(data_in.x, data_in.edge_index, data_in.edge_attr, data_in.batch, data_in.descriptors)  # Perform a single forward pass.

      #Alt 1
      loss = criterion(out, data_in.y) 

      #Alt 2
      #loss = 0
      #for item in range(0,len(data_in.y[0,:])):
      #      loss += criterion(out[:,item], data_in.y[:,item]) 

      loss.backward()  # Derive gradients.
      optimizer.step()  # Update parameters based on gradients.
      return loss

def test(data):
      all_test_r2 = []
      counter = -1    
      for data_in in data:
            counter += 1
            model.eval()
            
            out = model(data_in.x, data_in.edge_index, data_in.edge_attr, data_in.batch, data_in.descriptors)

            # Caculate R2
            r2_score_var = []
            for item in range(0,data_in.y.shape[1]):
                  if item == 0:
                        r2_score_var = r2_score(data_in.y[:,item].detach().numpy(), out[:,item].detach().numpy())
                  else:
                        r2_score_var = np.vstack((r2_score_var,(r2_score(data_in.y[:,item].detach().numpy(), out[:,item].detach().numpy()))))

            if counter == 0:
                  all_test_r2 = r2_score_var
            else:
                  all_test_r2 = np.hstack((all_test_r2,r2_score_var))

      average_test_r2 = np.sum(all_test_r2,axis=1) / all_test_r2.shape[1]

      return average_test_r2

# Vectors to append accuracy to:
Train_r2 = []
Test_r2 = []

# Calculate accuracy before training 
Train_r2.append(test(train_loader))
Test_r2.append(test(test_loader))
print("Initial training R2: ", Train_r2[0])
print("Initial test R2: ", Test_r2[0])

print_r2_option = True
for epoch in range(1, 31):
      average_loss = []
      for data in train_loader:
            loss = train(data)
            average_loss.append(loss)
      print(f'Epoch: {epoch:03d}, Loss: {(sum(average_loss)/len(average_loss)):.4f}')

      if print_r2_option:

            temp_train_r2 = test(train_loader)
            Train_r2.append(temp_train_r2)

            temp_test_r2 = test(test_loader)
            Test_r2.append(temp_test_r2)

            print(f'Average Train R2: {temp_train_r2}')
            print(f'Average Test R2: {temp_test_r2:}')

Initial training R2:  [-0.01656842 -0.01856694 -0.02078074 -0.02631144 -0.01680059 -0.01626799
 -0.01946096 -0.01639104 -0.01762852 -0.02013742 -0.01840109 -0.01622823
 -0.01620264 -0.01961573 -0.01696051]
Initial test R2:  [-0.02003836 -0.01771115 -0.01908236 -0.03261724 -0.01766767 -0.01763321
 -0.02162603 -0.01781858 -0.01635392 -0.02124352 -0.02738023 -0.02476954
 -0.02309721 -0.02511931 -0.01688011]
Epoch: 001, Loss: 0.5558
Average Train R2: [0.23975458 0.4625822  0.54413198 0.29158475 0.62838469 0.20289357
 0.68515701 0.64966306 0.67821161 0.82797149 0.64678218 0.64713621
 0.64733299 0.64753281 0.66935429]
Average Test R2: [0.25058496 0.46738294 0.55038305 0.27794029 0.63067236 0.19428937
 0.68942889 0.64846276 0.68615158 0.82936262 0.66033863 0.66064882
 0.66091589 0.66136673 0.67021336]


#### Model for just one target

GNN function

GATConv: https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.nn.conv.GATConv.html#torch_geometric.nn.conv.GATConv

GENConv: https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.nn.conv.GENConv.html#torch_geometric.nn.conv.GENConv

In [None]:
data_labels = 1
data_features = dataset[50].x.shape[1]
descriptors_features = train_data[50].descriptors.shape[1]

class GNN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        torch.manual_seed(12345)
        self.conv1 = GATConv(data_features, hidden_channels)
        self.conv2 = GATConv(hidden_channels, hidden_channels)
        self.conv3 = GATConv(hidden_channels, hidden_channels)
        self.conv4 = GATConv(hidden_channels, hidden_channels)
        self.conv5 = GATConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels+descriptors_features, hidden_channels)
        self.lin2 = Linear(hidden_channels, hidden_channels)
        self.lin3 = Linear(hidden_channels, data_labels)

    def forward(self, x, edge_index, edge_attr, batch, descriptors): 
        x = self.conv1(x, edge_index, edge_attr)
        x = F.relu(x)
        x = self.conv2(x, edge_index, edge_attr)
        x = F.relu(x)
        x = self.conv3(x, edge_index, edge_attr)
        x = F.relu(x)
        x = self.conv4(x, edge_index, edge_attr)
        x = F.relu(x)
        x = self.conv5(x, edge_index, edge_attr)

        #Returns batch-wise graph-level-outputs by averaging node features across the node dimension, so that for a single graph G
        #its output is computed by
        x = global_mean_pool(x, batch) 
        x2 = descriptors
        x = torch.cat((x,x2),1)
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.lin(x)
        x = F.dropout(x, p=0.2, training=self.training)
        x = F.relu(x)
        x = self.lin2(x)
        x = F.dropout(x, p=0.2, training=self.training)
        x = F.relu(x)
        x = self.lin3(x)
        
 
        return x
    

Train GNN

In [None]:
def train(data_in, target):
      model.train()
      optimizer.zero_grad()  # Clear gradients.
      out = model(data_in.x, data_in.edge_index, data_in.edge_attr, data_in.batch, data_in.descriptors)
      targets = data_in.y[:,target].reshape(-1,1)
      
      #Alt 1
      loss = criterion(out, targets)   

      loss.backward()  # Derive gradients.
      optimizer.step()  # Update parameters based on gradients.
      return loss

def test(data, target):
      all_test_r2 = []
      counter = -1    
      for data_in in data:
            counter += 1
            model.eval()
            out = model(data_in.x, data_in.edge_index, data_in.edge_attr, data_in.batch, data_in.descriptors)
            targets = data_in.y[:,target].reshape(-1,1)
            
            # Caculate R2
            r2_score_var = r2_score(targets.detach().numpy(), out.detach().numpy())

            all_test_r2 .append(r2_score_var)

      average_test_r2 = np.sum(all_test_r2) / len(all_test_r2)

      return average_test_r2

num_targets = dataset[50].y.shape[1]
for target_index in range(num_targets):
      print("Target index: ", target_index)

      model = GNN(hidden_channels=64) 
      optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=5e-4)
      criterion = torch.nn.MSELoss()

      # Vectors to append accuracy to:
      Train_r2 = []
      Test_r2 = []

      # Calculate accuracy before training 
      Train_r2.append(test(train_loader, target_index))
      Test_r2.append(test(test_loader, target_index))
      print("Initial training R2: ", Train_r2[0])
      print("Initial test R2: ", Test_r2[0])

      print_r2_option = True
      for epoch in range(1, 21):
            average_loss = []
            for data in train_loader:
                  loss = train(data, target_index)
                  average_loss.append(loss)
            print(f'Epoch: {epoch:03d}, Loss: {(sum(average_loss)/len(average_loss)):.5f}')

            if print_r2_option:
                  temp_train_r2 = test(train_loader, target_index)
                  Train_r2.append(temp_train_r2)

                  temp_test_r2 = test(test_loader, target_index)
                  Test_r2.append(temp_test_r2)

                  # print(f'Average Train R2: {temp_train_r2}')
                  # print(f'Average Test R2: {temp_test_r2:}')

      print(f"Best training R2 for target {target_index}: {np.max(Train_r2)}")
      print(f"Best test R2 for target {target_index}: {np.max(Test_r2)}")

Target index:  0
Initial training R2:  -0.018678295383913163
Initial test R2:  -0.03308373704488183
Epoch: 001, Loss: 0.76507
Epoch: 002, Loss: 0.63190
Epoch: 003, Loss: 0.56252
Epoch: 004, Loss: 0.51782
Epoch: 005, Loss: 0.48434
Epoch: 006, Loss: 0.46161
Epoch: 007, Loss: 0.44506
Epoch: 008, Loss: 0.43061
Epoch: 009, Loss: 0.41689
Epoch: 010, Loss: 0.40696
Epoch: 011, Loss: 0.39623
Epoch: 012, Loss: 0.38796
Epoch: 013, Loss: 0.37597
Epoch: 014, Loss: 0.37060
Epoch: 015, Loss: 0.36610
Epoch: 016, Loss: 0.35919
Epoch: 017, Loss: 0.35353
Epoch: 018, Loss: 0.34897
Epoch: 019, Loss: 0.34382
Epoch: 020, Loss: 0.34009
Best training R2 for target 0: 0.6581509149316066
Best test R2 for target 0: 0.6554282682601079
Target index:  1
Initial training R2:  -0.01632305111746684
Initial test R2:  -0.017224460553526586
Epoch: 001, Loss: 0.63536
Epoch: 002, Loss: 0.51249
Epoch: 003, Loss: 0.45557
Epoch: 004, Loss: 0.41914
Epoch: 005, Loss: 0.40053
Epoch: 006, Loss: 0.38774
Epoch: 007, Loss: 0.37415
Ep

## Transformer