# TIF360 Project

Main source: https://www.kaggle.com/code/rmonge/predicting-molecule-properties-based-on-its-smiles/notebook

### Import packages

In [8]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import time as time
import torch
import torch_geometric

from torch_geometric.data import Data, DataLoader
from torch.nn import Linear, LeakyReLU
from torch_geometric.nn import global_mean_pool, GATConv, BatchNorm, GraphNorm
import torch.nn.functional as F

from sklearn.metrics import r2_score

In [9]:
# check if cuda is available
print('cuda available:', torch.cuda.is_available())
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('device:', "cuda" if torch.cuda.is_available() else "cpu")

cuda available: True
device: cuda


### Load data

In [10]:
df = pd.read_csv("../data/smiles_and_targets.csv")
print(np.shape(df))

(132820, 21)


#### Convert data to graphs

In [11]:
from graph_dataset_functions import create_graph_dataset_from_smiles

properties_names = ['A', 'B', 'C', 'mu', 'alfa', 'homo', 'lumo', 'gap', 'R²', 'zpve', 'U0', 'U', 'H', 'G', 'Cv']

x_smiles = df.smiles.values
y = df.loc[:, properties_names].values  # shape = (n_samples, n_properties)

dataset = create_graph_dataset_from_smiles(x_smiles, y[0:len(x_smiles), :])

Information of the graph dataset

In [12]:
print(f'Number of graphs (molecules): {len(dataset)}')
graph = dataset[50]
print('=================================================================================')
print(f'Properties of graph {50}, molecule smiles: {df.smiles.values[50]}')
print(f'Number of nodes: {dataset[50].x.shape[0]}')
print(f'Number of edges: {dataset[50].edge_index.shape[1]}')
print(f'Number of node features: {dataset[50].x.shape[1]}')
print(f'Number of edge features: {dataset[50].edge_attr.shape[1]}')
print(f'Number of target properties: {dataset[50].y.shape[1]}')

Number of graphs (molecules): 132820
Properties of graph 50, molecule smiles: CC1=CNC=C1
Number of nodes: 6
Number of edges: 12
Number of node features: 78
Number of edge features: 10
Number of target properties: 15


Create functions to load and pre-process data

In [17]:
from utility_functions import get_data_split_indices
from sklearn.preprocessing import MinMaxScaler, StandardScaler

def scale_and_split_data(dataset, val_share, test_share):
    # split the dataset into test and validation:
    num_samples = len(dataset)

    num_samples = len(dataset)

    train_indices, val_indices, test_indices = get_data_split_indices(num_samples, val_share=val_share, test_share=test_share)

    train_data = [dataset[i] for i in train_indices]
    val_data = [dataset[i] for i in val_indices]
    test_data = [dataset[i] for i in test_indices]

    # scale the targets
    train_data_targets = torch.concatenate([data.y for data in train_data], axis=0)
    val_data_targets = torch.concatenate([data.y for data in val_data], axis=0)
    test_data_targets = torch.concatenate([data.y for data in test_data], axis=0)    
    
    scaler_targets = StandardScaler()
    train_data_targets = scaler_targets.fit_transform(train_data_targets)
    val_data_targets = scaler_targets.transform(val_data_targets)
    test_data_targets = scaler_targets.transform(test_data_targets)

    train_data_targets = torch.tensor(train_data_targets, dtype=torch.float)
    val_data_targets = torch.tensor(val_data_targets, dtype=torch.float)
    test_data_targets = torch.tensor(test_data_targets, dtype=torch.float)

    train_data = [Data(x=data.x.to(device), edge_index=data.edge_index.to(device), edge_attr=data.edge_attr.to(device), 
                       y=train_data_targets[index].reshape(1,-1).to(device)) for index, data in enumerate(train_data)]
    
    val_data = [Data(x=data.x.to(device), edge_index=data.edge_index.to(device), edge_attr=data.edge_attr.to(device),
                        y=val_data_targets[index].reshape(1,-1).to(device)) for index, data in enumerate(val_data)]
    
    test_data = [Data(x=data.x.to(device), edge_index=data.edge_index.to(device), edge_attr=data.edge_attr.to(device), 
                      y=test_data_targets[index].reshape(1,-1).to(device)) for index, data in enumerate(test_data)]

    return train_data, val_data, test_data, scaler_targets

def create_data_loaders(train_data, val_data, test_data, batch_size): 
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)
    
    return train_loader, val_loader, test_loader

Load data

In [18]:
print("...Loading data...")
train_data, val_data, test_data, scaler_targets = scale_and_split_data(dataset, 0.15, 0.1)
train_loader, val_loader, test_loader = create_data_loaders(train_data, val_data, test_data, batch_size=64)
print("...Data loading done...")

...Loading data...
...Data loading done...




### Main GNN

#### Model for all targets at once

GNN function

In [9]:
class GNN(torch.nn.Module):
    def __init__(self, hidden_channels, feature_dim, target_dim):
        super().__init__()
        torch.manual_seed(12345)
        self.conv1 = GATConv(feature_dim, hidden_channels)
        self.conv1_norm = GraphNorm(hidden_channels)
        self.conv2 = GATConv(hidden_channels, hidden_channels)
        self.conv2_norm = GraphNorm(hidden_channels)
        self.conv3 = GATConv(hidden_channels, hidden_channels)
        self.conv3_norm = GraphNorm(hidden_channels)
        self.conv4 = GATConv(hidden_channels, hidden_channels)
        self.conv4_norm = GraphNorm(hidden_channels)
        self.conv5 = GATConv(hidden_channels, hidden_channels)
        self.conv5_norm = GraphNorm(hidden_channels)
        
        self.lin1 = Linear(hidden_channels, hidden_channels)
        self.lin1_norm = BatchNorm(hidden_channels)
        self.lin2 = Linear(hidden_channels, 64)
        self.lin2_norm = BatchNorm(64)
        self.lin3 = Linear(64, target_dim)

    def forward(self, x, edge_index, edge_attr, batch): 
        x = self.conv1(x, edge_index, edge_attr)
        x = self.conv1_norm(x)
        x = F.relu(x)
        x = self.conv2(x, edge_index, edge_attr)
        x = self.conv2_norm(x)
        x = F.relu(x)
        x = self.conv3(x, edge_index, edge_attr)
        x = self.conv3_norm(x)
        x = F.relu(x)
        x = self.conv4(x, edge_index, edge_attr)
        x = self.conv4_norm(x)
        x = F.relu(x)
        x = self.conv5(x, edge_index, edge_attr)
        x = self.conv5_norm(x)

        #Returns batch-wise graph-level-outputs by averaging node features across the node dimension, so that for a single graph G
        #its output is computed by
        x = global_mean_pool(x, batch) 
        
        x = self.lin1(x)
        x = self.lin1_norm(x)
        x = F.relu(x)
        x = F.dropout(x, p=0.2, training=self.training)
        
        x = self.lin2(x)
        x = self.lin2_norm(x)
        x = F.relu(x)
        x = F.dropout(x, p=0.2, training=self.training)
        
        x = self.lin3(x)
        
        return x

Train GNN

In [10]:
from utility_functions import get_num_parameters

feature_dim = train_data[0].x.shape[1]
target_dim = train_data[0].y.shape[1]

model = GNN(hidden_channels=128, feature_dim=feature_dim, target_dim=target_dim).to(device) 
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005, weight_decay=5e-4)
criterion = torch.nn.MSELoss().to(device)

train_params, tot_params = get_num_parameters(model)
print(f"Total number of parameters: {tot_params}")
print(f"Trainable parameters: {train_params}")

def train(data_in):
      model.train()
      optimizer.zero_grad()  # Clear gradients.
      out = model(data_in.x, data_in.edge_index, data_in.edge_attr, data_in.batch).to(device)  # Perform a single forward pass.

      targets = data_in.y
      loss = criterion(out, targets) 

      loss.backward()  # Derive gradients.
      optimizer.step()  # Update parameters based on gradients.
      return loss

def test(data):
      all_r2 = []
      all_loss = []
      counter = -1    
      for data_in in data:
            counter += 1
            model.eval()
            out = model(data_in.x, data_in.edge_index, data_in.edge_attr, data_in.batch).cpu()
            targets = data_in.y.cpu()
            
            # Caculate R2
            r2_score_var = []
            for item in range(target_dim):
                  if item == 0:
                        r2_score_var = r2_score(targets[:,item].detach().numpy(), out[:,item].detach().numpy())
                  else:
                        new_score = r2_score(targets[:,item].detach().numpy(), out[:,item].detach().numpy())
                        r2_score_var = np.vstack((r2_score_var, new_score))

            if counter == 0:
                  all_r2 = r2_score_var
            else:
                  all_r2 = np.hstack((all_r2, r2_score_var))
                  
            loss = float(criterion(out, targets).detach().numpy())
            all_loss = np.hstack((all_loss, loss))

      average_test_r2 = np.mean(all_r2, axis=1)
      average_test_loss = np.mean(all_loss)
      
      return average_test_r2, average_test_loss

# Vectors to append accuracy to:
train_r2 = []
train_loss = []
test_r2 = []
test_loss = []

n_epochs = 20
print_every_N_epochs = False
N = 5 # print R2 every N epochs

epoch_times = []
train_times = []
test_times = []
print()
print("...Starting training...")
print("Device used:", device)
for epoch in np.arange(1, n_epochs+1):
      epoch_start = time.time()
      losses = []
      train_start = time.time()
      for data in train_loader:
            loss = train(data)
            losses.append(loss.cpu().detach().numpy())
      print(f'Epoch: {epoch:03d}, Loss: {np.mean(losses):.5f}')
      train_end = time.time()
      train_times.append(train_end - train_start)
      
      if print_every_N_epochs and (epoch % N == 0 or epoch == 1) and epoch != n_epochs:
            test_start = time.time()
            r2_temp_train, loss_temp_train = test(train_loader) 
            train_r2.append(r2_temp_train)
            train_loss.append(loss_temp_train)         
            r2_temp_test, loss_temp_test = test(test_loader)
            test_r2.append(r2_temp_test)
            test_loss.append(loss_temp_test)
            print(f'Average Train R2: {r2_temp_train}')
            print(f"Average Train Loss: {loss_temp_train}")
            print(f'Average Test R2: {r2_temp_test}')
            print(f"Average Test Loss: {loss_temp_test}")
            test_end = time.time()
            test_times.append(test_end - test_start)
            
      if epoch == n_epochs:         # calculate results of training
            print("...Training done...")
            print("...Calculating final results...")
            test_start = time.time()
            r2_temp_train, loss_temp_train = test(train_loader) 
            train_r2.append(r2_temp_train)
            train_loss.append(loss_temp_train)
            
            r2_temp_test, loss_temp_test = test(test_loader)
            test_r2.append(r2_temp_test)
            test_loss.append(loss_temp_test)

            print("====================================================")
            print("Final training R2:", train_r2[-1])
            print("Average final training R2: ", np.mean(train_r2[-1]))
            print("Final training loss:", train_loss[-1])

            print("Final test R2:", test_r2[-1])
            print("Average final test R2: ", np.mean(test_r2[-1]))
            print("Final test loss:", test_loss[-1])
            
            test_end = time.time()
            test_times.append(test_end - test_start)
            
      epoch_end = time.time()
      epoch_times.append(epoch_end - epoch_start)

Total number of parameters: 105487
Trainable parameters: 105487

...Starting training...
Device used: cuda
Epoch: 001, Loss: 0.38294
Epoch: 002, Loss: 0.29988
Epoch: 003, Loss: 0.28588
Epoch: 004, Loss: 0.27569
Epoch: 005, Loss: 0.26755
Epoch: 006, Loss: 0.25881
Epoch: 007, Loss: 0.25169
Epoch: 008, Loss: 0.24731
Epoch: 009, Loss: 0.24064
Epoch: 010, Loss: 0.23663
Epoch: 011, Loss: 0.23311
Epoch: 012, Loss: 0.22927
Epoch: 013, Loss: 0.22817
Epoch: 014, Loss: 0.22526
Epoch: 015, Loss: 0.22204
Epoch: 016, Loss: 0.22067
Epoch: 017, Loss: 0.21921
Epoch: 018, Loss: 0.21847
Epoch: 019, Loss: 0.21612
Epoch: 020, Loss: 0.21385
...Training done...
...Calculating final results...
Final training R2: [0.67902045 0.77122223 0.79913799 0.61001594 0.8609593  0.8066497
 0.93838602 0.90960948 0.87663879 0.92936654 0.88129252 0.88129467
 0.88129471 0.8812902  0.88267263]
Average final training R2:  0.8392567444445234
Final training loss: 0.15582229232461534
Final test R2: [0.67902045 0.77122223 0.799137

In [11]:
print("Device used:", device)
print()
print(f"Total number of epochs: {len(epoch_times)}")
print(f"Total training time: {np.sum(epoch_times)/60:.2f} minutes")
print(f"Total time in training: {np.sum(train_times)/60:.2f} minutes")
print(f"Total time in testing: {np.sum(test_times)/60:.2f} minutes")
print()
print(f"Average epoch time: {np.mean(epoch_times):.1f} seconds")
print(f"Average time in training: {np.mean(train_times):.1f} seconds")
print(f"Average time in testing: {np.mean(test_times):.1f} seconds")

Device used: cuda

Total number of epochs: 20
Total training time: 15.83 minutes
Total time in training: 12.21 minutes
Total time in testing: 3.61 minutes

Average epoch time: 47.5 seconds
Average time in training: 36.6 seconds
Average time in testing: 216.8 seconds


#### Model for just one target

GNN function

In [12]:
class GNN(torch.nn.Module):
    def __init__(self, hidden_channels, feature_dim, target_dim):
        super().__init__()
        torch.manual_seed(12345)
        self.conv1 = GATConv(feature_dim, hidden_channels)
        self.conv1_norm = GraphNorm(hidden_channels)
        self.conv2 = GATConv(hidden_channels, hidden_channels)
        self.conv2_norm = GraphNorm(hidden_channels)
        self.conv3 = GATConv(hidden_channels, hidden_channels)
        self.conv3_norm = GraphNorm(hidden_channels)
        self.conv4 = GATConv(hidden_channels, hidden_channels)
        self.conv4_norm = GraphNorm(hidden_channels)
        self.conv5 = GATConv(hidden_channels, hidden_channels)
        self.conv5_norm = GraphNorm(hidden_channels)
        
        self.lin1 = Linear(hidden_channels, hidden_channels)
        self.lin1_norm = BatchNorm(hidden_channels)
        self.lin2 = Linear(hidden_channels, 32)
        self.lin2_norm = BatchNorm(32)
        self.lin3 = Linear(32, target_dim)

    def forward(self, x, edge_index, edge_attr, batch): 
        x = self.conv1(x, edge_index, edge_attr)
        x = self.conv1_norm(x)
        x = F.relu(x)
        x = self.conv2(x, edge_index, edge_attr)
        x = self.conv2_norm(x)
        x = F.relu(x)
        x = self.conv3(x, edge_index, edge_attr)
        x = self.conv3_norm(x)
        x = F.relu(x)
        x = self.conv4(x, edge_index, edge_attr)
        x = self.conv4_norm(x)
        x = F.relu(x)
        x = self.conv5(x, edge_index, edge_attr)
        x = self.conv5_norm(x)

        #Returns batch-wise graph-level-outputs by averaging node features across the node dimension, so that for a single graph G
        #its output is computed by
        x = global_mean_pool(x, batch) 
        
        x = self.lin1(x)
        x = self.lin1_norm(x)
        x = F.relu(x)
        x = F.dropout(x, p=0.2, training=self.training)
        
        x = self.lin2(x)
        x = self.lin2_norm(x)
        x = F.relu(x)
        x = F.dropout(x, p=0.2, training=self.training)
        
        x = self.lin3(x)
        
        return x

Train GNN

In [13]:
feature_dim = train_data[0].x.shape[1]
target_dim = 1

def train(data_in, target):
      model.train()
      optimizer.zero_grad()  # Clear gradients.
      out = model(data_in.x, data_in.edge_index, data_in.edge_attr, data_in.batch).to(device)
      targets = data_in.y[:,target].reshape(-1,1)
      
      loss = criterion(out, targets)   

      loss.backward()  # Derive gradients.
      optimizer.step()  # Update parameters based on gradients.
      return loss

def test(data, target):
      all_r2 = []
      all_loss = []
      counter = -1    
      for data_in in data:
            counter += 1
            model.eval()
            out = model(data_in.x, data_in.edge_index, data_in.edge_attr, data_in.batch).cpu()
            targets = data_in.y[:,target].cpu().reshape(-1,1)
            
            # Caculate R2
            r2_score_var = r2_score(targets.detach().numpy(), out.detach().numpy())
            all_r2.append(r2_score_var)
            
            loss = float(criterion(out, targets).detach().numpy())
            all_loss.append(loss)

      average_test_r2 = np.mean(all_r2)
      average_test_loss = np.mean(all_loss)

      return average_test_r2, average_test_loss

num_targets = train_data[0].y.shape[1]
start_time = time.time()
for target_index in range(num_targets):
      print("Target index:", target_index)

      model = GNN(hidden_channels=64, feature_dim=feature_dim, target_dim=target_dim).to(device) 
      optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=5e-4)
      criterion = torch.nn.MSELoss().to(device)

      # Vectors to append accuracy to:
      train_r2 = []
      test_r2 = []
      train_loss = []
      test_loss = []

      # Calculate accuracy and loss before training 
      r2_temp, loss_temp = test(train_loader, target_index)
      train_r2.append(r2_temp)
      train_loss.append(loss_temp)
      r2_temp, loss_temp = test(test_loader, target_index)
      test_r2.append(r2_temp)
      test_loss.append(loss_temp)
      
      print("Initial training R2: ", train_r2[0])
      print("Initial test R2: ", test_r2[0])

      print_r2_option = True
      counter = 0
      for epoch in range(1, 21):
            counter += 1
            losses = []
            for data in train_loader:
                  loss = train(data, target_index)
                  losses.append(loss.cpu().detach().numpy())
            print(f'Epoch: {epoch:03d}, Loss: {np.mean(losses):.5f}')

            if print_r2_option & epoch == 20:
                  temp_train_r2, temp_train_loss = test(train_loader, target_index)
                  train_r2.append(temp_train_r2)
                  train_loss.append(temp_train_loss)

                  temp_test_r2, temp_test_loss = test(test_loader, target_index)
                  test_r2.append(temp_test_r2)
                  test_loss.append(temp_test_loss)

      print(f"Best training R2 for target {target_index}: {np.max(train_r2)}")
      print(f"Best test R2 for target {target_index}: {np.max(test_r2)}")
print("...Done...")
end_time = time.time()
print(f"Time taken: {(end_time - start_time)/60} minutes")
print(f"Average time per target: {(end_time - start_time)/(num_targets*60)} minutes")

Target index: 0
Initial training R2:  -0.006616069299762293
Initial test R2:  -0.006068681250565049
Epoch: 001, Loss: 0.68748


KeyboardInterrupt: 