# TIF360 Project

Main source: https://www.kaggle.com/code/rmonge/predicting-molecule-properties-based-on-its-smiles/notebook

### Import packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time as time

import torch
from torch.utils.data import random_split
from torch.nn import Linear, BatchNorm1d as BatchNorm
from torch_geometric.nn import global_mean_pool, GATConv, BatchNorm, GraphNorm
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
import torch.nn.functional as F

from sklearn.metrics import r2_score
import utility_functions as uf
import GNN_structures as GNNs

In [2]:
print("cuda available:", torch.cuda.is_available())
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("device:", "cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))

cuda available: True
device: cuda
NVIDIA GeForce RTX 3080


### Load data

Load smiles data and targets

In [3]:
df = pd.read_csv("../data/smiles_and_targets.csv")
print(np.shape(df))

(132820, 21)


Create the graph dataset

In [4]:
from graph_dataset_functions import create_graph_dataset_from_smiles

properties_names = ['A', 'B', 'C', 'mu', 'alfa', 'homo', 'lumo', 'gap', 'R²', 'zpve', 'U0', 'U', 'H', 'G', 'Cv']

x_smiles = df.smiles.values
y = df.loc[:, properties_names].values  # shape = (n_samples, n_properties)

dataset = create_graph_dataset_from_smiles(x_smiles, y[0:len(x_smiles), :])

Information of the graph dataset

In [5]:
print(f'Number of graphs (molecules): {len(dataset)}')
graph = dataset[50]
print('=================================================================================')
print(f'Properties of graph {50}, molecule smiles: {df.smiles.values[50]}')
print(f'Number of nodes: {dataset[50].x.shape[0]}')
print(f'Number of edges: {dataset[50].edge_index.shape[1]}')
print(f'Number of node features: {dataset[50].x.shape[1]}')
print(f'Number of edge features: {dataset[50].edge_attr.shape[1]}')
print(f'Number of target properties: {dataset[50].y.shape[1]}')

Number of graphs (molecules): 132820
Properties of graph 50, molecule smiles: CC1=CNC=C1
Number of nodes: 6
Number of edges: 12
Number of node features: 78
Number of edge features: 10
Number of target properties: 15


Choose mode: Which features to include

In [11]:
# singles
# mode = "rdkit_descriptors"
# mode = "morgan_fingerprints"
mode = "mordred_descriptors"

# combinations
# mode = "rdkit+morgan"
# mode = "mordred+morgan"

# excessive
# mode = "rdkit+mordred"
# mode = "rdkit+mordred+morgan"

Load the features

In [12]:
features = uf.load_molecular_features(mode)
print("features:", np.shape(features))

features: (132820, 985)


Create functions to load the data

In [13]:
def create_scale_and_split_dataset(features:np.ndarray, targets:np.ndarray, val_share, test_share):
    
    num_samples = features.shape[0]
    print("num_samples:", num_samples)
    train_indices, val_indices, test_indices = uf.get_data_split_indices(num_samples, val_share, test_share)
    
    train_data = [dataset[i] for i in train_indices]
    val_data = [dataset[i] for i in val_indices]
    test_data = [dataset[i] for i in test_indices]
        
    X_train, y_train = features[train_indices], targets[train_indices]
    X_val, y_val = features[val_indices], targets[val_indices]
    X_test, y_test = features[test_indices], targets[test_indices]
    
    X_train, X_val, X_test, scaler_features = uf.scale_features(X_train, X_val, X_test)
    y_train, y_val, y_test, scaler_targets = uf.scale_targets(y_train, y_val, y_test)
    
    scalers = {"features": scaler_features, "targets": scaler_targets}
    
    y_train = torch.tensor(y_train, dtype=torch.float, device=device)
    y_val = torch.tensor(y_val, dtype=torch.float, device=device)
    y_test = torch.tensor(y_test, dtype=torch.float, device=device)
    
    train_data = [Data(x=data.x.to(device), edge_index=data.edge_index.to(device), edge_attr=data.edge_attr.to(device), 
                       y=y_train[index].reshape(1,-1).to(device)) for index, data in enumerate(train_data)]
    
    val_data = [Data(x=data.x.to(device), edge_index=data.edge_index.to(device), edge_attr=data.edge_attr.to(device),
                        y=y_val[index].reshape(1,-1).to(device)) for index, data in enumerate(val_data)]
    
    test_data = [Data(x=data.x.to(device), edge_index=data.edge_index.to(device), edge_attr=data.edge_attr.to(device), 
                      y=y_test[index].reshape(1,-1).to(device)) for index, data in enumerate(test_data)]
        
    X_train = torch.tensor(X_train, dtype=torch.float, device=device)
    X_val = torch.tensor(X_val, dtype=torch.float, device=device)
    X_test = torch.tensor(X_test, dtype=torch.float, device=device)
      
    for index in range(len(train_data)):
        train_data[index].features = X_train[index,:].unsqueeze(0) # extra dimension for batch
    for index in range(len(val_data)):
        val_data[index].features = X_val[index,:].unsqueeze(0)
    for index in range(len(test_data)):
        test_data[index].features = X_test[index,:].unsqueeze(0)

    return train_data, val_data, test_data, scalers

def create_data_loaders(train_data, val_data, test_data, batch_size):
    
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)
    
    return train_loader, val_loader, test_loader  

Load data

In [14]:
print("...Loading data...")
properties_names = ['A', 'B', 'C', 'mu', 'alfa', 'homo', 'lumo', 'gap', 'R²', 'zpve', 'U0', 'U', 'H', 'G', 'Cv']
x_smiles = df.smiles.values
targets = df.loc[:, properties_names].values

train_data, val_data, test_data, scalers = create_scale_and_split_dataset(features, targets, 0.15, 0.2)
train_loader, val_loader, test_loader = create_data_loaders(train_data, val_data, test_data, 64)
print("...Data loading done...")

...Loading data...
num_samples: 132820
...Data loading done...


### Graph Neural Network

#### Model for all targets at once

Define model

Train GNN

In [15]:
def train(model, batch):
      targets = batch.y
      
      model.train()
      optimizer.zero_grad()  # Clear gradients.
      out = model(batch.x, batch.edge_index, batch.edge_attr, 
                        batch.batch, batch.features).to(device)  # Perform a single forward pass.

      loss = criterion(out, targets) 

      loss.backward()  # Derive gradients.
      optimizer.step()  # Update parameters based on gradients.
      
      return loss

def test(model, data):
      for i, batch in enumerate(data):
            targets = batch.y.cpu()
            
            model.eval()
            out = model(batch.x, batch.edge_index, batch.edge_attr, 
                        batch.batch, batch.features).cpu()
            
            # Caculate R2 for each target
            for target_idx in range(target_dim):
                  if target_idx != 0:
                        r2_score_var = np.vstack((r2_score_var, r2_score(targets[:,target_idx].detach().numpy(), 
                                                          out[:,target_idx].detach().numpy())))
                  else:
                        r2_score_var = np.array([r2_score(targets[:,target_idx].detach().numpy(),
                                                          out[:,target_idx].detach().numpy())])        
            all_r2 = np.hstack((all_r2, r2_score_var)) if i != 0 else r2_score_var
                  
            loss = float(criterion(out, targets).detach().numpy())
            all_loss = np.hstack((all_loss, loss)) if i != 0 else np.array(loss)

      average_test_r2 = np.mean(all_r2, axis=1)
      average_test_loss = np.mean(all_loss)
      
      return average_test_r2, average_test_loss

def early_stopping(val_losses, patience): # returns True if there is no improvement in val_loss
      if len(val_losses) < patience:
            return False
      else:
            best_loss = np.min(val_losses)
            current_loss = val_losses[-1]
            
            if current_loss > best_loss:
                  return True
            else:
                  return False

node_feature_dim = train_data[0].x.shape[1]
feature_dim = train_data[0].features.shape[1]
target_dim = train_data[0].y.shape[1]

num_layers, hidden_channels = 5, 512

model_class = GNNs.define_GNN_structure_with_global_features(num_layers, hidden_channels, node_feature_dim, 
                                                             feature_dim, target_dim)
model = model_class().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)

# Decay for learning rate
decay_rate = 0.94
lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=decay_rate)

criterion = torch.nn.MSELoss().to(device)

train_params, _ = uf.get_num_parameters(model)
print(f"Trainable parameters: {train_params:,}")
      
# Vectors to append accuracy to:
train_r2 = []
train_loss = []
test_r2 = []
test_loss = []
val_r2 = []
val_loss = []

n_epochs = 100
print_every_N_epochs = False
N = 10 # print R2 every N epochs

epoch_times = []
train_times = []
print()
print("...Starting training...")
print("Device used:", device)

val_losses_epoch = [] # for early stopping
patience = 5 # how many epochs to wait for the val loss to improve
best_val_loss = np.inf  
epochs_without_improvement = 0

for epoch in np.arange(1, n_epochs+1):
      epoch_start = time.time()
      losses = []
      train_start = time.time()
      for batch in train_loader:
            loss = train(model, batch)
            losses.append(loss.cpu().detach().numpy())  
      # Compute validation loss
      model.eval()
      val_losses = []
      for batch in val_loader:
            targets = batch.y
            out = model(batch.x, batch.edge_index, batch.edge_attr, 
                        batch.batch, batch.features)
            val_losses.append(criterion(out, targets).cpu().detach().numpy())
      
      val_loss_epoch = np.mean(val_losses)
      val_losses_epoch.append(val_loss_epoch)
      print(f"Epoch: {epoch:02d} | Train Loss: {np.mean(losses):.5f} | Validation Loss: {val_loss_epoch:.5f}")
      train_end = time.time()
      train_times.append(train_end - train_start)
      lr_scheduler.step() # Decay to learning rate
      
      # check for early stopping
      if early_stopping(val_losses_epoch, patience) and epoch > patience:
            epochs_without_improvement += 1
            if epochs_without_improvement >= patience:
                  print(f"Early stopping at epoch {epoch} after {patience} epochs without improvement.")
                  break
      else:
            epochs_without_improvement = 0
            best_val_loss = val_loss_epoch
            # save the model
            torch.save(model.state_dict(), 'best_graphs+features_model.pt')
                   
      if print_every_N_epochs and (epoch % N == 0 or epoch == 1) and epoch != n_epochs:
            test_start = time.time()
                
            r2_temp_val, loss_temp_val = test(model, val_loader)
            val_r2.append(r2_temp_val)
            val_loss.append(loss_temp_val)
            
            print(f'Validation R2: {r2_temp_val}')
            print(f"Validation loss: {loss_temp_val}")
            
      epoch_end = time.time()
      epoch_times.append(epoch_end - epoch_start)
      
            
print("...Training done...")
print("...Calculating final results...")
model.load_state_dict(torch.load('best_graphs+features_model.pt'))

r2_temp_train, loss_temp_train = test(model, train_loader) 
train_r2.append(r2_temp_train)
train_loss.append(loss_temp_train)

r2_temp_val, loss_temp_val = test(model, val_loader)
val_r2.append(r2_temp_val)
val_loss.append(loss_temp_val)

r2_temp_test, loss_temp_test = test(model, test_loader)
test_r2.append(r2_temp_test)
test_loss.append(loss_temp_test)

print("====================================================")
print("Final training R2:", train_r2[-1])
print("Average final training R2: ", np.mean(train_r2[-1]))
print("Final training loss:", train_loss[-1])

print("Final validation R2:", val_r2[-1])
print("Average validation R2: ", np.mean(val_r2[-1]))
print("Final validation loss:", val_loss[-1])

print("Final test R2:", test_r2[-1])
print("Average final test R2: ", np.mean(test_r2[-1]))
print("Final test loss:", test_loss[-1])          

Trainable parameters: 2,144,193

...Starting training...
Device used: cuda
Epoch: 01 | Train Loss: 0.13400 | Validation Loss: 0.08864
Epoch: 02 | Train Loss: 0.11036 | Validation Loss: 0.08509
Epoch: 03 | Train Loss: 0.10290 | Validation Loss: 0.07705
Epoch: 04 | Train Loss: 0.09938 | Validation Loss: 0.07622
Epoch: 05 | Train Loss: 0.09597 | Validation Loss: 0.07355
Epoch: 06 | Train Loss: 0.09452 | Validation Loss: 0.07130
Epoch: 07 | Train Loss: 0.09208 | Validation Loss: 0.08102
Epoch: 08 | Train Loss: 0.09015 | Validation Loss: 0.07589
Epoch: 09 | Train Loss: 0.08835 | Validation Loss: 0.06753
Epoch: 10 | Train Loss: 0.08694 | Validation Loss: 0.06902
Epoch: 11 | Train Loss: 0.08585 | Validation Loss: 0.06623
Epoch: 12 | Train Loss: 0.08487 | Validation Loss: 0.06600
Epoch: 13 | Train Loss: 0.08394 | Validation Loss: 0.06437
Epoch: 14 | Train Loss: 0.08334 | Validation Loss: 0.06486
Epoch: 15 | Train Loss: 0.08232 | Validation Loss: 0.06278
Epoch: 16 | Train Loss: 0.08176 | Valida

In [16]:
print("Device used:", device)
print()
print(f"Total number of epochs: {len(epoch_times)}")
print(f"Total training time: {np.sum(epoch_times)/60:.2f} minutes")
print(f"Total time in training: {np.sum(train_times)/60:.2f} minutes")
print()
print(f"Average epoch time: {np.mean(epoch_times):.1f} seconds")
print(f"Average time in training: {np.mean(train_times):.1f} seconds")


Device used: cuda

Total number of epochs: 55
Total training time: 24.07 minutes
Total time in training: 24.55 minutes

Average epoch time: 26.3 seconds
Average time in training: 26.3 seconds


#### Model for just one target

Train DNN

In [17]:
feature_dim = train_data[:][0].shape[1]
target_dim = 1

def train(batch, target_idx):
      features = batch[:][0]
      targets = batch[:][1][:,target_idx].unsqueeze(-1)
      
      model.train()
      optimizer.zero_grad()  # Clear gradients.
      out = model(features).to(device)  # Perform a single forward pass.

      loss = criterion(out, targets) 

      loss.backward()  # Derive gradients.
      optimizer.step()  # Update parameters based on gradients.
      
      return loss

def test(data, target_idx):
      for i, batch in enumerate(data):
            features = batch[:][0]
            targets = batch[:][1][:,target_idx].cpu().unsqueeze(-1)
            
            model.eval()
            out = model(features).cpu()
            
            # Caculate R2    
            r2_score_var = np.array([r2_score(targets.detach().numpy(), out.detach().numpy())])        
            all_r2 = np.hstack((all_r2, r2_score_var)) if i != 0 else r2_score_var
                  
            loss = float(criterion(out, targets).detach().numpy())
            all_loss = np.hstack((all_loss, loss)) if i != 0 else np.array(loss)

      average_test_r2 = np.mean(all_r2)
      average_test_loss = np.mean(all_loss)
      
      return average_test_r2, average_test_loss

num_targets = train_data[:][1].shape[1]
start_time = time.time()
for target_index in range(num_targets):
      print("Target index:", target_index)

      model = DNN(hidden_channels=1024, feature_dim=feature_dim, target_dim=target_dim).to(device) 
      optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=5e-4)
      criterion = torch.nn.MSELoss().to(device)
      
      # Decay for learning rate
      decayRate = 0.9
      lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=decayRate)

      # Vectors to append accuracy to:
      train_r2 = []
      train_loss = []
      test_r2 = []
      test_loss = []
      val_r2 = []
      val_loss = []

      n_epochs = 10

      print("Device used:", device)

      for epoch in np.arange(1, n_epochs+1):
            losses = []
            for data in train_loader:
                  loss = train(data, target_index)
                  losses.append(loss.cpu().detach().numpy())  
            # Compute validation loss
            model.eval()
            val_losses = []
            for batch in val_loader:
                  features = batch[:][0]
                  targets = batch[:][1][:, target_index].unsqueeze(-1)
                  out = model(features)
                  val_losses.append(criterion(out, targets).cpu().detach().numpy())
            print(f"Epoch: {epoch:02d} | Train Loss: {np.mean(losses):.5f} | Validation Loss: {np.mean(val_losses):.5f}")
            
            lr_scheduler.step() # Decay to learning rate
                  
            if epoch == n_epochs:         # calculate results of training: test on all data
                  test_start = time.time()
                  r2_temp_train, loss_temp_train = test(train_loader, target_index) 
                  train_r2.append(r2_temp_train)
                  train_loss.append(loss_temp_train)
                  
                  r2_temp_val, loss_temp_val = test(val_loader, target_index)
                  val_r2.append(r2_temp_val)
                  val_loss.append(loss_temp_val)
                  
                  r2_temp_test, loss_temp_test = test(test_loader, target_index)
                  test_r2.append(r2_temp_test)
                  test_loss.append(loss_temp_test)

                  print("====================================================")
                  print("Final training R2:", train_r2[-1])
                  print("Average final training R2: ", np.mean(train_r2[-1]))
                  print("Final training loss:", train_loss[-1])
                  print()
                  print("Final validation R2:", val_r2[-1])
                  print("Average validation test R2: ", np.mean(val_r2[-1]))
                  print("Final validation loss:", val_loss[-1])
                  print()
                  print("Final test R2:", test_r2[-1])
                  print("Average final test R2: ", np.mean(test_r2[-1]))
                  print("Final test loss:", test_loss[-1])
                  print("====================================================")
                                    
print("...Done...")
end_time = time.time()
print(f"Time taken: {(end_time - start_time)/60} minutes")
print(f"Average time per target: {(end_time - start_time)/(num_targets*60)} minutes")

AttributeError: 'GlobalStorage' object has no attribute 'shape'