# TIF360 Project

Main source: https://www.kaggle.com/code/rmonge/predicting-molecule-properties-based-on-its-smiles/notebook

### Import packages

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import time as time

import torch
import torch_geometric
from torch_geometric.loader import DataLoader
import torch_geometric.utils as utils
import networkx as nx
from torch.nn import Linear
from torch_geometric.nn import global_mean_pool, GATConv, GraphNorm, BatchNorm
import torch.nn.functional as F

from sklearn.metrics import r2_score

In [2]:
print("cuda available:", torch.cuda.is_available())
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("device:", "cuda" if torch.cuda.is_available() else "cpu")

cuda available: True
device: cuda


### Load data

In [3]:
df = pd.read_csv("../data/smiles_and_targets.csv")
print(np.shape(df))

(132820, 21)


### Load descriptors

In [4]:
mol_descriptor = np.load("../data/mol_descriptors.npy")
print(mol_descriptor.shape)

(132820, 179)


### Load Morgan fingerprints

In [5]:
mol_fingerprints = np.load("../data/mol_morgan_fingerprints.npy")
print(mol_fingerprints.shape)

(132820, 2048)


# Investigate Neural Networks

## Graph Neural Networks

#### Convert data to graphs

In [6]:
# import packages
import torch
from torch_geometric.data import Data
from torch.utils.data import DataLoader

In [7]:
from graph_dataset_functions import create_graph_dataset_from_smiles

properties_names = ['A', 'B', 'C', 'mu', 'alfa', 'homo', 'lumo', 'gap', 'R²', 'zpve', 'U0', 'U', 'H', 'G', 'Cv']

x_smiles = df.smiles.values
y = df.loc[:, properties_names].values  # shape = (n_samples, n_properties)

dataset = create_graph_dataset_from_smiles(x_smiles, y[0:len(x_smiles), :])

Information of the graph dataset

In [8]:
print(f'Number of graphs (molecules): {len(dataset)}')
graph = dataset[50]
print('=================================================================================')
print(f'Properties of graph {50}, molecule smiles: {df.smiles.values[50]}')
print(f'Number of nodes: {dataset[50].x.shape[0]}')
print(f'Number of edges: {dataset[50].edge_index.shape[1]}')
print(f'Number of node features: {dataset[50].x.shape[1]}')
print(f'Number of edge features: {dataset[50].edge_attr.shape[1]}')
print(f'Number of target properties: {dataset[50].y.shape[1]}')

Number of graphs (molecules): 132820
Properties of graph 50, molecule smiles: CC1=CNC=C1
Number of nodes: 6
Number of edges: 12
Number of node features: 78
Number of edge features: 10
Number of target properties: 15


Filterout data with no edge features defined (Like ex: CH4) (These causes problems down the line)

In [9]:
indexes_to_delete = []
for item in range(0,len(dataset)):
    if dataset[item].edge_attr.shape[1] == 0:
        indexes_to_delete.append(item)
    else:
        pass

indexes_to_delete.sort()

print("Number of none edge feature molecules: ", len(indexes_to_delete))

print("Before: ", len(dataset))

for item in range(0,len(indexes_to_delete)):
    print("Molecule to delete: ", df.smiles.values[indexes_to_delete[item]])
    #del dataset[indexes_to_delete[item] - item] 
    dataset.pop((indexes_to_delete[item] - item)) # -item since all future data points will have its index reduced by 1 for each deleted previous data point
print("After: ", len(dataset))

Number of none edge feature molecules:  0
Before:  132820
After:  132820


Create functions to load and pre-process data

In [10]:
from torch_geometric.loader import DataLoader
from sklearn.preprocessing import MinMaxScaler, StandardScaler

def scale_and_split_data(dataset, device, input_scaler):
    # split the dataset into test and validation:
    num_samples = len(dataset)

    # Want to divide data randomly
    random.seed(42)
    random_indexes = np.array(random.sample(range(num_samples), num_samples)) # random.sample ensures no duplicates

    train_data = [dataset[index] for index in random_indexes[int(.2 * num_samples ) :]] # 80%
    test_data = [dataset[index] for index in random_indexes[: int(.2 * num_samples )]] # 20%

    print("Example train data target before scaling", train_data[0].y)
    train_data_targets = [data.y for data in train_data]
    train_data_targets = torch.concatenate(train_data_targets, axis=0)
    test_data_targets = [data.y for data in test_data]
    test_data_targets = torch.concatenate(test_data_targets, axis=0)

    input_scaler = StandardScaler()
    train_data_targets = input_scaler.fit_transform(train_data_targets)
    test_data_targets = input_scaler.transform(test_data_targets)

    # print("Example train data target", train_data_targets[0].reshape(1,-1).shape)
    train_data_targets = torch.tensor(train_data_targets, dtype=torch.float)
    test_data_targets = torch.tensor(test_data_targets, dtype=torch.float)

    train_data = [Data(x=data.x.to(device), edge_index=data.edge_index.to(device), edge_attr=data.edge_attr.to(device), 
                       y=train_data_targets[index].reshape(1,-1).to(device)) for index, data in enumerate(train_data)]
    
    test_data = [Data(x=data.x.to(device), edge_index=data.edge_index.to(device), edge_attr=data.edge_attr.to(device), 
                      y=test_data_targets[index].reshape(1,-1).to(device)) for index, data in enumerate(test_data)]
    print("Example train data target after scaling:", train_data[0].y)
    
    # Divide descriptors and fingerprints into train and test
    train_data_descriptor = [mol_descriptor[index,:] for index in random_indexes[int(.2 * num_samples ) :]] # 80%
    test_data_desriptors = [mol_descriptor[index,:] for index in random_indexes[: int(.2 * num_samples )]] # 20%

    train_data_fingerprints = [mol_fingerprints[index,:] for index in random_indexes[int(.2 * num_samples ) :]] # 80%
    test_data_fingerprints = [mol_fingerprints[index,:] for index in random_indexes[: int(.2 * num_samples )]] # 20%
    
    # Normalize descriptors and fingerprints using minmax scaler
    minmax_scaler = MinMaxScaler()
    train_data_descriptor = minmax_scaler.fit_transform(train_data_descriptor)
    test_data_desriptors = minmax_scaler.transform(test_data_desriptors)

    minmax_scaler = MinMaxScaler()
    train_data_fingerprints = minmax_scaler.fit_transform(train_data_fingerprints)
    test_data_fingerprints = minmax_scaler.transform(test_data_fingerprints)
        
    for index in range(0,len(train_data)):
        train_data[index].descriptors = torch.FloatTensor([train_data_descriptor[index,:]]).to(device)
        train_data[index].fingerprints = torch.FloatTensor([train_data_fingerprints[index,:]]).to(device)

    for index in range(0,len(test_data)):
        test_data[index].descriptors = torch.FloatTensor([test_data_desriptors[index,:]]).to(device)
        test_data[index].fingerprints = torch.FloatTensor([test_data_fingerprints[index,:]]).to(device)
        
    return train_data, test_data, input_scaler

def create_data_loaders(train_data, test_data, batch_size): 
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)
    
    return train_loader, test_loader

Load data

In [11]:
print("...Loading data...")
train_data, test_data, scaler = scale_and_split_data(dataset, device, input_scaler=StandardScaler())
train_loader, test_loader = create_data_loaders(train_data, test_data, batch_size=64)
print("...Data loading done...")

...Loading data...
Example train data target before scaling tensor([[ 2.9693e+00,  1.9777e+00,  1.8423e+00,  1.2093e+00,  8.4060e+01,
         -1.8970e-01, -7.0400e-02,  1.1930e-01,  8.7339e+02,  1.5988e-01,
         -3.4861e+02, -3.4860e+02, -3.4860e+02, -3.4864e+02,  2.9189e+01]])
Example train data target after scaling: tensor([[-0.3350,  1.3300,  2.2841, -1.0264,  1.0990,  2.2975, -1.7445, -2.7966,
         -1.1527,  0.3357,  1.6155,  1.6155,  1.6155,  1.6156, -0.6067]],
       device='cuda:0')


  train_data[index].descriptors = torch.FloatTensor([train_data_descriptor[index,:]]).to(device)


...Data loading done...


### Main GNN

#### Model for all targets at once

GNN function

In [12]:
class GNN_all(torch.nn.Module):
    def __init__(self, hidden_channels, feature_dim, target_dim, descriptor_dim, fingerprint_dim):
        super().__init__()
        torch.manual_seed(12345)
        self.conv1 = GATConv(feature_dim, hidden_channels)
        self.conv1_norm = GraphNorm(hidden_channels)
        self.conv2 = GATConv(hidden_channels, hidden_channels)
        self.conv2_norm = GraphNorm(hidden_channels)
        self.conv3 = GATConv(hidden_channels, hidden_channels)
        self.conv3_norm = GraphNorm(hidden_channels)
        self.conv4 = GATConv(hidden_channels, hidden_channels)
        self.conv4_norm = GraphNorm(hidden_channels)
        self.conv5 = GATConv(hidden_channels, hidden_channels)
        self.conv5_norm = GraphNorm(hidden_channels)
        
        self.input_norm = BatchNorm(hidden_channels+descriptor_dim+fingerprint_dim)
        self.lin1 = Linear(hidden_channels+descriptor_dim+fingerprint_dim, 1024)
        self.lin2 = Linear(1024, 512)
        self.lin3 = Linear(512, target_dim)

    def forward(self, x, edge_index, edge_attr, batch, descriptors, fingerprints): 
        x = self.conv1(x, edge_index, edge_attr)
        x = F.leaky_relu(x)
        x = self.conv1_norm(x)
        x = self.conv2(x, edge_index, edge_attr)
        x = F.leaky_relu(x)
        x = self.conv2_norm(x)
        x = self.conv3(x, edge_index, edge_attr)
        x = F.leaky_relu(x)
        x = self.conv3_norm(x)
        x = self.conv4(x, edge_index, edge_attr)
        x = F.leaky_relu(x)
        x = self.conv4_norm(x)
        x = self.conv5(x, edge_index, edge_attr)
        x = self.conv5_norm(x)

        #Returns batch-wise graph-level-outputs by averaging node features across the node dimension, so that for a single graph G
        #its output is computed by
        x = global_mean_pool(x, batch) 
        x2 = descriptors
        x3 = fingerprints
        x = torch.cat((x,x2,x3),1)
        
        x = self.input_norm(x)
        x = self.lin1(x)
        x = F.leaky_relu(x)
        x = F.dropout(x, p=0.2, training=self.training)
        
        x = self.lin2(x)
        x = F.relu(x) # change to leaky relu
        x = F.dropout(x, p=0.2, training=self.training)
        
        x = self.lin3(x)
 
        return x

Train GNN

In [13]:
from utility_functions import get_num_parameters

feature_dim = train_data[0].x.shape[1]
target_dim = train_data[0].y.shape[1]
descriptor_dim = train_data[0].descriptors.shape[1]
fingerprint_dim = train_data[0].fingerprints.shape[1]

model = GNN_all(hidden_channels=128, feature_dim=feature_dim, target_dim=target_dim, 
                descriptor_dim=descriptor_dim, fingerprint_dim=fingerprint_dim).to(device) 
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005, weight_decay=5e-4)
criterion = torch.nn.MSELoss().to(device)

# decay learning rate
decayRate = 0.92
lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=decayRate)

train_params, tot_params = get_num_parameters(model)
print(f"Total number of parameters: {tot_params}")
print(f"Trainable parameters: {train_params}")

def train(data_in):
      model.train()
      optimizer.zero_grad()  # Clear gradients.
      out = model(data_in.x, data_in.edge_index, data_in.edge_attr, data_in.batch, 
                  data_in.descriptors, data_in.fingerprints).to(device)  # Perform a single forward pass.

      targets = data_in.y
      loss = criterion(out, targets) 

      loss.backward()  # Derive gradients.
      optimizer.step()  # Update parameters based on gradients.
      return loss

def test(data):
      all_r2 = []
      all_loss = []
      counter = -1    
      for data_in in data:
            counter += 1
            model.eval()
            out = model(data_in.x, data_in.edge_index, data_in.edge_attr, data_in.batch,
                        data_in.descriptors, data_in.fingerprints).cpu()
            targets = data_in.y.cpu()
            
            # Caculate R2
            r2_score_var = []
            for item in range(target_dim):
                  if item == 0:
                        r2_score_var = r2_score(targets[:,item].detach().numpy(), out[:,item].detach().numpy())
                  else:
                        new_score = r2_score(targets[:,item].detach().numpy(), out[:,item].detach().numpy())
                        r2_score_var = np.vstack((r2_score_var, new_score))

            if counter == 0:
                  all_r2 = r2_score_var
            else:
                  all_r2 = np.hstack((all_r2, r2_score_var))
                  
            loss = float(criterion(out, targets).detach().numpy())
            all_loss = np.hstack((all_loss, loss))

      average_test_r2 = np.mean(all_r2, axis=1)
      average_test_loss = np.mean(all_loss)
      
      return average_test_r2, average_test_loss

# Vectors to append accuracy to:
train_r2 = []
train_loss = []
test_r2 = []
test_loss = []

n_epochs = 20
print_every_N_epochs = False
N = 5 # print R2 every N epochs

epoch_times = []
train_times = []
test_times = []
print()
print("...Starting training...")
print("Device used:", device)
for epoch in np.arange(1, n_epochs+1):
      epoch_start = time.time()
      losses = []
      train_start = time.time()
      for data in train_loader:
            loss = train(data)
            losses.append(loss.cpu().detach().numpy())
      print(f'Epoch: {epoch:03d}, Loss: {np.mean(losses):.5f}')
      train_end = time.time()
      train_times.append(train_end - train_start)
      lr_scheduler.step()
      
      if print_every_N_epochs and (epoch % N == 0 or epoch == 1) and epoch != n_epochs:
            test_start = time.time()
            r2_temp_train, loss_temp_train = test(train_loader) 
            train_r2.append(r2_temp_train)
            train_loss.append(loss_temp_train)         
            r2_temp_test, loss_temp_test = test(test_loader)
            test_r2.append(r2_temp_train)
            test_loss.append(loss_temp_test)
            print(f'Average Train R2: {r2_temp_train}')
            print(f"Average Train Loss: {loss_temp_train}")
            print(f'Average Test R2: {r2_temp_test}')
            print(f"Average Test Loss: {loss_temp_test}")
            test_end = time.time()
            test_times.append(test_end - test_start)
            
      if epoch == n_epochs:         # calculate results of training
            print("...Training done...")
            print("...Calculating final results...")
            test_start = time.time()
            r2_temp_train, loss_temp_train = test(train_loader) 
            train_r2.append(r2_temp_train)
            train_loss.append(loss_temp_train)
            
            r2_temp_test, loss_temp_test = test(test_loader)
            test_r2.append(r2_temp_train)
            test_loss.append(loss_temp_test)

            print("====================================================")
            print("Final training R2:", train_r2[-1])
            print("Average final training R2: ", np.mean(train_r2[-1]))
            print("Final training loss:", train_loss[-1])

            print("Final test R2:", test_r2[-1])
            print("Average final test R2: ", np.mean(test_r2[-1]))
            print("Final test loss:", test_loss[-1])
            
            test_end = time.time()
            test_times.append(test_end - test_start)
            
      epoch_end = time.time()
      epoch_times.append(epoch_end - epoch_start)

Total number of parameters: 3032181
Trainable parameters: 3032181

...Starting training...
Device used: cuda
Epoch: 001, Loss: 0.14661
Epoch: 002, Loss: 0.11227
Epoch: 003, Loss: 0.10861
Epoch: 004, Loss: 0.10533
Epoch: 005, Loss: 0.10293
Epoch: 006, Loss: 0.10156
Epoch: 007, Loss: 0.09962
Epoch: 008, Loss: 0.09865
Epoch: 009, Loss: 0.09750
Epoch: 010, Loss: 0.09657
Epoch: 011, Loss: 0.09634
Epoch: 012, Loss: 0.09519
Epoch: 013, Loss: 0.09514
Epoch: 014, Loss: 0.09410
Epoch: 015, Loss: 0.09467
Epoch: 016, Loss: 0.09438
Epoch: 017, Loss: 0.09355
Epoch: 018, Loss: 0.09341
Epoch: 019, Loss: 0.09309
Epoch: 020, Loss: 0.09359
...Training done...
...Calculating final results...
Final training R2: [0.82973255 0.90190185 0.92354569 0.68290143 0.97321167 0.87164293
 0.93894763 0.93619858 0.94467401 0.97897132 0.98328259 0.9832772
 0.98327696 0.98326793 0.9724261 ]
Average final training R2:  0.9258172294806469
Final training loss: 0.07037109563610483
Final test R2: [0.82973255 0.90190185 0.9235

In [None]:
print("Device used:", device)
print()
print(f"Total number of epochs: {len(epoch_times)}")
print(f"Total training time: {np.sum(epoch_times)/60:.2f} minutes")
print(f"Total time in training: {np.sum(train_times)/60:.2f} minutes")
print(f"Total time in testing: {np.sum(test_times)/60:.2f} minutes")
print()
print(f"Average epoch time: {np.mean(epoch_times):.1f} seconds")
print(f"Average time in training: {np.mean(train_times):.1f} seconds")
print(f"Average time in testing: {np.mean(test_times):.1f} seconds")

Device used: cuda

Total number of epochs: 30
Total training time: 126.315 minutes
Total time in training: 126.315 minutes
Total time in testing: 0.000 minutes

Average epoch time: 252.631 seconds
Average time in training: 252.630 seconds
Average time in testing: nan seconds


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [14]:
from utility_functions import get_num_parameters

feature_dim = train_data[0].x.shape[1]
target_dim = train_data[0].y.shape[1]
descriptor_dim = train_data[0].descriptors.shape[1]
fingerprint_dim = train_data[0].fingerprints.shape[1]

def train(data_in):
      model.train()
      optimizer.zero_grad()  # Clear gradients.
      out = model(data_in.x, data_in.edge_index, data_in.edge_attr, data_in.batch, 
                  data_in.descriptors, data_in.fingerprints).to(device)  # Perform a single forward pass.

      targets = data_in.y
      loss = criterion(out, targets) 

      loss.backward()  # Derive gradients.
      optimizer.step()  # Update parameters based on gradients.
      return loss

def test(data):
      all_r2 = []
      all_loss = []
      counter = -1    
      for data_in in data:
            counter += 1
            model.eval()
            out = model(data_in.x, data_in.edge_index, data_in.edge_attr, data_in.batch,
                        data_in.descriptors, data_in.fingerprints).cpu()
            targets = data_in.y.cpu()
            
            # Caculate R2
            r2_score_var = []
            for item in range(target_dim):
                  if item == 0:
                        r2_score_var = r2_score(targets[:,item].detach().numpy(), out[:,item].detach().numpy())
                  else:
                        new_score = r2_score(targets[:,item].detach().numpy(), out[:,item].detach().numpy())
                        r2_score_var = np.vstack((r2_score_var, new_score))

            if counter == 0:
                  all_r2 = r2_score_var
            else:
                  all_r2 = np.hstack((all_r2, r2_score_var))
                  
            loss = float(criterion(out, targets).detach().numpy())
            all_loss = np.hstack((all_loss, loss))

      average_test_r2 = np.mean(all_r2, axis=1)
      average_test_loss = np.mean(all_loss)
      
      return average_test_r2, average_test_loss

n_epochs = 20

print()
print("...Starting training...")
print("Device used:", device)

n_runs = 3
for run in range(n_runs):
      print("Run", run+1)
      
      model = GNN_all(hidden_channels=128, feature_dim=feature_dim, target_dim=target_dim, 
                descriptor_dim=descriptor_dim, fingerprint_dim=fingerprint_dim).to(device) 
      optimizer = torch.optim.Adam(model.parameters(), lr=0.0005, weight_decay=5e-4)
      criterion = torch.nn.MSELoss().to(device)

      # Decay for learning rate
      decayRate = 0.92
      lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=decayRate)
      
      train_r2 = []
      train_loss = []
      test_r2 = []
      test_loss = []

      for epoch in np.arange(1, n_epochs+1):     
            losses = []
            for data in train_loader:
                  loss = train(data)
                  losses.append(loss.cpu().detach().numpy())
            print(f'Epoch: {epoch:03d}, Loss: {np.mean(losses):.5f}')
            lr_scheduler.step() # Decay to learning rate
                  
            if epoch == n_epochs:         # calculate results of training
                  print("...Training done...")
                  print("...Calculating final results...")
                  r2_temp_train, loss_temp_train = test(train_loader) 
                  train_r2.append(r2_temp_train)
                  train_loss.append(loss_temp_train)
                  
                  r2_temp_test, loss_temp_test = test(test_loader)
                  test_r2.append(r2_temp_train)
                  test_loss.append(loss_temp_test)

                  # print("====================================================")
                  # print("Final training R2:", train_r2[-1])
                  # print("Average final training R2: ", np.mean(train_r2[-1]))
                  # print("Final training loss:", train_loss[-1])

                  # print("Final test R2:", test_r2[-1])
                  # print("Average final test R2: ", np.mean(test_r2[-1]))
                  # print("Final test loss:", test_loss[-1])
      train_r2_hist = np.array(train_r2) if run == 0 else np.vstack((train_r2_hist, np.array(train_r2)))
      train_loss_hist = np.array(train_loss) if run == 0 else np.vstack((train_loss_hist, np.array(train_loss)))
      test_r2_hist = np.array(test_r2) if run == 0 else np.vstack((test_r2_hist, np.array(test_r2)))
      test_loss_hist = np.array(test_loss) if run == 0 else np.vstack((test_loss_hist, np.array(test_loss)))                              
      


...Starting training...
Device used: cuda
Run 1
Epoch: 001, Loss: 0.13758
Epoch: 002, Loss: 0.10398
Epoch: 003, Loss: 0.09878
Epoch: 004, Loss: 0.09472
Epoch: 005, Loss: 0.09144
Epoch: 006, Loss: 0.08925
Epoch: 007, Loss: 0.08648
Epoch: 008, Loss: 0.08483
Epoch: 009, Loss: 0.08291
Epoch: 010, Loss: 0.08117
Epoch: 011, Loss: 0.07999
Epoch: 012, Loss: 0.07849
Epoch: 013, Loss: 0.07749
Epoch: 014, Loss: 0.07612
Epoch: 015, Loss: 0.07501
Epoch: 016, Loss: 0.07399
Epoch: 017, Loss: 0.07314
Epoch: 018, Loss: 0.07214
Epoch: 019, Loss: 0.07146
Epoch: 020, Loss: 0.07108
...Training done...
...Calculating final results...
Run 2
Epoch: 001, Loss: 0.13743
Epoch: 002, Loss: 0.10394
Epoch: 003, Loss: 0.09864
Epoch: 004, Loss: 0.09447
Epoch: 005, Loss: 0.09096
Epoch: 006, Loss: 0.08884
Epoch: 007, Loss: 0.08611
Epoch: 008, Loss: 0.08445
Epoch: 009, Loss: 0.08257
Epoch: 010, Loss: 0.08086
Epoch: 011, Loss: 0.07981
Epoch: 012, Loss: 0.07845
Epoch: 013, Loss: 0.07737
Epoch: 014, Loss: 0.07605
Epoch: 01

In [15]:
print("====================================================")
print("Average final training R2: ", np.mean(train_r2_hist, axis=0))
print("Average final training R2 (avg. over all runs and targets): ", np.mean(train_r2_hist))
print("Average final training loss:", np.mean(train_loss_hist))
print()
# print(np.mean(test_r2_hist, axis=1))
print("Average final test R2: ", np.mean(test_r2_hist, axis=0))
print("Average final test R2 (avg. over all runs and targets): ", np.mean(test_r2_hist))
print("Average final test loss:", np.mean(test_loss_hist))

Average final training R2:  [0.87623942 0.92914367 0.94355457 0.72920452 0.98563006 0.901616
 0.97292513 0.95734401 0.96390268 0.99528525 0.99700663 0.99701151
 0.99701086 0.99701823 0.9863776 ]
Average final training R2 (avg. over all runs and targets):  0.9486180108603899
Average final training loss: 0.048652512051827475

Average final test R2:  [0.87623942 0.92914367 0.94355457 0.72920452 0.98563006 0.901616
 0.97292513 0.95734401 0.96390268 0.99528525 0.99700663 0.99701151
 0.99701086 0.99701823 0.9863776 ]
Average final test R2 (avg. over all runs and targets):  0.9486180108603899
Average final test loss: 0.05276200343042803


#### Model for just one target

GNN function

GATConv: https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.nn.conv.GATConv.html#torch_geometric.nn.conv.GATConv

GENConv: https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.nn.conv.GENConv.html#torch_geometric.nn.conv.GENConv

In [None]:
class GNN(torch.nn.Module):
    def __init__(self, hidden_channels, feature_dim, target_dim, descriptor_dim, fingerprint_dim):
        super().__init__()
        torch.manual_seed(12345)
        self.conv1 = GATConv(feature_dim, hidden_channels)
        self.conv1_norm = GraphNorm(hidden_channels)
        self.conv2 = GATConv(hidden_channels, hidden_channels)
        self.conv2_norm = GraphNorm(hidden_channels)
        self.conv3 = GATConv(hidden_channels, hidden_channels)
        self.conv3_norm = GraphNorm(hidden_channels)
        self.conv4 = GATConv(hidden_channels, hidden_channels)
        self.conv4_norm = GraphNorm(hidden_channels)
        self.conv5 = GATConv(hidden_channels, hidden_channels)
        self.conv5_norm = GraphNorm(hidden_channels)
        
        self.lin1 = Linear(hidden_channels+descriptor_dim+fingerprint_dim, 512)
        self.lin1_norm = BatchNorm(512)
        self.lin2 = Linear(512, 256)
        self.lin2_norm = BatchNorm(256)
        self.lin3 = Linear(256, 128)
        self.lin3_norm = BatchNorm(128)
        self.lin4 = Linear(128, 64)
        self.lin4_norm = BatchNorm(64)
        self.lin5 = Linear(64, 32)
        self.lin5_norm = BatchNorm(32)
        self.lin6 = Linear(32, target_dim)
        

    def forward(self, x, edge_index, edge_attr, batch, descriptors, fingerprints): 
        x = self.conv1(x, edge_index, edge_attr)
        x = self.conv1_norm(x)
        x = F.relu(x)
        x = self.conv2(x, edge_index, edge_attr)
        x = self.conv2_norm(x)
        x = F.relu(x)
        x = self.conv3(x, edge_index, edge_attr)
        x = self.conv3_norm(x)
        x = F.relu(x)
        x = self.conv4(x, edge_index, edge_attr)
        x = self.conv4_norm(x)
        x = F.relu(x)
        x = self.conv5(x, edge_index, edge_attr)
        x = self.conv5_norm(x)

        #Returns batch-wise graph-level-outputs by averaging node features across the node dimension, so that for a single graph G
        #its output is computed by
        x = global_mean_pool(x, batch) 
        x2 = descriptors
        x3 = fingerprints
        x = torch.cat((x,x2,x3),1)
        
        x = self.lin1(x)
        x = self.lin1_norm(x)
        x = F.relu(x)
        x = F.dropout(x, p=0.2, training=self.training)
        
        x = self.lin2(x)
        x = self.lin2_norm(x)
        x = F.relu(x)
        x = F.dropout(x, p=0.2, training=self.training)
        
        x = self.lin3(x)
        x = self.lin3_norm(x)
        x = F.relu(x)
        x = F.dropout(x, p=0.2, training=self.training)
        
        x = self.lin4(x)
        x = self.lin4_norm(x)
        x = F.relu(x)
        x = F.dropout(x, p=0.2, training=self.training)
        
        x = self.lin5(x)
        x = self.lin5_norm(x)
        x = F.relu(x)
        x = F.dropout(x, p=0.2, training=self.training)
        
        x = self.lin6(x)
 
        return x

Train GNN

In [None]:
feature_dim = train_data[0].x.shape[1]
target_dim = 1
descriptor_dim = train_data[0].descriptors.shape[1]
fingerprint_dim = train_data[0].fingerprints.shape[1]

def train(data_in, target):
      model.train()
      optimizer.zero_grad()  # Clear gradients.
      out = model(data_in.x, data_in.edge_index, data_in.edge_attr, data_in.batch, 
                  data_in.descriptors, data_in.fingerprints).to(device)
      targets = data_in.y[:,target].reshape(-1,1)
      
      loss = criterion(out, targets)   

      loss.backward()  # Derive gradients.
      optimizer.step()  # Update parameters based on gradients.
      return loss

def test(data, target):
      all_r2 = []
      all_loss = []
      counter = -1    
      for data_in in data:
            counter += 1
            model.eval()
            out = model(data_in.x, data_in.edge_index, data_in.edge_attr, data_in.batch, 
                        data_in.descriptors, data_in.fingerprints).cpu()
            targets = data_in.y[:,target].cpu().reshape(-1,1)
            
            # Caculate R2
            r2_score_var = r2_score(targets.detach().numpy(), out.detach().numpy())
            all_r2.append(r2_score_var)
            
            loss = float(criterion(out, targets).detach().numpy())
            all_loss.append(loss)

      average_test_r2 = np.mean(all_r2)
      average_test_loss = np.mean(all_loss)

      return average_test_r2, average_test_loss

num_targets = train_data[0].y.shape[1]
start_time = time.time()
for target_index in range(num_targets):
      print("Target index:", target_index)

      model = GNN(hidden_channels=64, feature_dim=feature_dim, target_dim=target_dim,
                  descriptor_dim=descriptor_dim, fingerprint_dim=fingerprint_dim).to(device) 
      optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=5e-4)
      criterion = torch.nn.MSELoss().to(device)

      # Vectors to append accuracy to:
      train_r2 = []
      test_r2 = []
      train_loss = []
      test_loss = []

      # Calculate accuracy and loss before training 
      r2_temp, loss_temp = test(train_loader, target_index)
      train_r2.append(r2_temp)
      train_loss.append(loss_temp)
      r2_temp, loss_temp = test(test_loader, target_index)
      test_r2.append(r2_temp)
      test_loss.append(loss_temp)
      
      print("Initial training R2: ", train_r2[0])
      print("Initial test R2: ", test_r2[0])

      print_r2_option = True
      counter = 0
      for epoch in range(1, 21):
            counter += 1
            losses = []
            for data in train_loader:
                  loss = train(data, target_index)
                  losses.append(loss.cpu().detach().numpy())
            print(f'Epoch: {epoch:03d}, Loss: {np.mean(losses):.5f}')

            if print_r2_option & epoch == 20:
                  temp_train_r2, temp_train_loss = test(train_loader, target_index)
                  train_r2.append(temp_train_r2)
                  train_loss.append(temp_train_loss)

                  temp_test_r2, temp_test_loss = test(test_loader, target_index)
                  test_r2.append(temp_test_r2)
                  test_loss.append(temp_test_loss)

      print(f"Best training R2 for target {target_index}: {np.max(train_r2)}")
      print(f"Best test R2 for target {target_index}: {np.max(test_r2)}")
print("...Done...")
end_time = time.time()
print(f"Time taken: {(end_time - start_time)/60} minutes")
print(f"Average time per target: {(end_time - start_time)/(num_targets*60)} minutes")

...Loading data...
Example train data target before scaling tensor([[ 2.9693e+00,  1.9777e+00,  1.8423e+00,  1.2093e+00,  8.4060e+01,
         -1.8970e-01, -7.0400e-02,  1.1930e-01,  8.7339e+02,  1.5988e-01,
         -3.4861e+02, -3.4860e+02, -3.4860e+02, -3.4864e+02,  2.9189e+01]])
Example train data target after scaling: tensor([[-0.3350,  1.3300,  2.2841, -1.0264,  1.0990,  2.2975, -1.7445, -2.7966,
         -1.1527,  0.3357,  1.6155,  1.6155,  1.6155,  1.6156, -0.6067]],
       device='cuda:0')
...Data loading done...
Target index: 0


KeyboardInterrupt: 