# TIF360 Project

Main source: https://www.kaggle.com/code/rmonge/predicting-molecule-properties-based-on-its-smiles/notebook

### Import packages

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import time as time

import torch
from torch.utils.data import TensorDataset, DataLoader, random_split
import networkx as nx
from torch.nn import Linear, LeakyReLU
from torch_geometric.nn import global_mean_pool, GATConv, BatchNorm, GraphNorm
import torch.nn.functional as F

from sklearn.metrics import r2_score

In [2]:
print("cuda available:", torch.cuda.is_available())
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("device:", "cuda" if torch.cuda.is_available() else "cpu")
print(torch.cuda.get_device_name(0))

cuda available: True
device: cuda
NVIDIA GeForce RTX 3080


### Load data

In [3]:
df = pd.read_csv("../data/smiles_and_targets.csv")
print(np.shape(df))

mol_descriptor = np.load("../data/Mordred_mol_descriptors.npy")
print(mol_descriptor.shape)

mol_fingerprints = np.load("../data/mol_morgan_fingerprints.npy")
print(mol_fingerprints.shape)

(132820, 21)
(132820, 985)
(132820, 2048)


In [4]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from utility_functions import get_data_split_indices

def create_scale_and_split_dataset(features:np.ndarray, targets:np.ndarray, val_share, test_share, batch_size):
    
    train_indices, val_indices, test_indices = get_data_split_indices(features.shape[0], val_share, test_share)
    
    X_train, y_train = features[train_indices], targets[train_indices]
    X_val, y_val = features[val_indices], targets[val_indices]
    X_test, y_test = features[test_indices], targets[test_indices]
    
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)
    
    scaler_targets = StandardScaler()
    y_train = scaler_targets.fit_transform(y_train)
    y_val = scaler_targets.transform(y_val)
    y_test = scaler_targets.transform(y_test)
    
    train_data = TensorDataset(torch.tensor(X_train, dtype=torch.float32, device=device),
                               torch.tensor(y_train, dtype=torch.float32, device=device))
    val_data = TensorDataset(torch.tensor(X_val, dtype=torch.float32, device=device),
                            torch.tensor(y_val, dtype=torch.float32, device=device))
    test_data = TensorDataset(torch.tensor(X_test, dtype=torch.float32, device=device),
                                torch.tensor(y_test, dtype=torch.float32, device=device))

    
    return train_data, val_data, test_data, scaler_targets

def create_data_loaders(train_data, val_data, test_data, batch_size):
    
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)
    
    return train_loader, val_loader, test_loader
    

Load data

In [5]:
print("...Loading data...")
properties_names = ['A', 'B', 'C', 'mu', 'alfa', 'homo', 'lumo', 'gap', 'R²', 'zpve', 'U0', 'U', 'H', 'G', 'Cv']
x_smiles = df.smiles.values
y = df.loc[:, properties_names].values

features = np.concatenate((mol_descriptor, mol_fingerprints), axis=1)

train_data, val_data, test_data, scaler_targets = create_scale_and_split_dataset(features, y, 0.15, 0.2, 64)
train_loader, val_loader, test_loader = create_data_loaders(train_data, val_data, test_data, 64)
print("...Data loading done...")

...Loading data...
...Data loading done...


### Dense network 

#### Model for all targets at once

Define model

In [6]:
class GNN(torch.nn.Module):
    def __init__(self, hidden_channels, feature_dim, target_dim):
        super().__init__()
        torch.manual_seed(12345)
        
        self.input_norm = BatchNorm(feature_dim)
        self.lin1 = Linear(feature_dim, hidden_channels)
        self.lin2 = Linear(hidden_channels, 512)
        self.lin3 = Linear(512, target_dim)

    def forward(self, x): 
                
        x = self.input_norm(x)
        x = self.lin1(x)
        x = F.leaky_relu(x)
        x = F.dropout(x, p=0.2, training=self.training)
        
        x = self.lin2(x)
        x = F.leaky_relu(x)

        x = self.lin3(x)
 
        return x

Train GNN

In [10]:
from utility_functions import get_num_parameters

feature_dim = train_data[:][0].shape[1]
target_dim = train_data[:][1].shape[1]

model = GNN(hidden_channels=1024, feature_dim=feature_dim, target_dim=target_dim).to(device) 
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)

# Decay for learning rate
decayRate = 0.9
lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=decayRate)

criterion = torch.nn.MSELoss().to(device)

train_params, tot_params = get_num_parameters(model)
print(f"Total number of parameters: {tot_params}")
print(f"Trainable parameters: {train_params}")

def train(batch):
      features = batch[:][0]
      targets = batch[:][1]
      
      model.train()
      optimizer.zero_grad()  # Clear gradients.
      out = model(features).to(device)  # Perform a single forward pass.

      loss = criterion(out, targets) 

      loss.backward()  # Derive gradients.
      optimizer.step()  # Update parameters based on gradients.
      
      return loss

def test(data):
      for i, batch in enumerate(data):
            features = batch[:][0]
            targets = batch[:][1].cpu()
            
            model.eval()
            out = model(features).cpu()
            
            # Caculate R2 for each target
            for target_idx in range(target_dim):
                  if target_idx != 0:
                        r2_score_var = np.vstack((r2_score_var, r2_score(targets[:,target_idx].detach().numpy(), 
                                                          out[:,target_idx].detach().numpy())))
                  else:
                        r2_score_var = np.array([r2_score(targets[:,target_idx].detach().numpy(),
                                                          out[:,target_idx].detach().numpy())])        
            all_r2 = np.hstack((all_r2, r2_score_var)) if i != 0 else r2_score_var
                  
            loss = float(criterion(out, targets).detach().numpy())
            all_loss = np.hstack((all_loss, loss)) if i != 0 else np.array(loss)

      average_test_r2 = np.mean(all_r2, axis=1)
      average_test_loss = np.mean(all_loss)
      
      return average_test_r2, average_test_loss

# Vectors to append accuracy to:
train_r2 = []
train_loss = []
test_r2 = []
test_loss = []
val_r2 = []
val_loss = []

n_epochs = 30
print_every_N_epochs = True
N = 10 # print R2 every N epochs

epoch_times = []
train_times = []
test_times = []
print()
print("...Starting training...")
print("Device used:", device)

for epoch in np.arange(1, n_epochs+1):
      epoch_start = time.time()
      losses = []
      train_start = time.time()
      for data in train_loader:
            loss = train(data)
            losses.append(loss.cpu().detach().numpy())  
      # Compute validation loss
      with torch.no_grad():
            val_losses = []
            for batch in val_loader:
                  features = batch[:][0]
                  targets = batch[:][1]
                  out = model(features)
                  val_losses.append(criterion(out, targets).cpu().detach().numpy())
      print(f"Epoch: {epoch:02d} | Train Loss: {np.mean(losses):.5f} | Validation Loss: {np.mean(val_losses):.5f}")
      train_end = time.time()
      train_times.append(train_end - train_start)
      
      lr_scheduler.step() # Decay to learning rate
      
      if print_every_N_epochs and (epoch % N == 0 or epoch == 1) and epoch != n_epochs:
            test_start = time.time()
                
            r2_temp_val, loss_temp_val = test(test_loader)
            val_r2.append(r2_temp_val)
            val_loss.append(loss_temp_val)
            
            print(f'Validation R2: {r2_temp_val}')
            print(f"Validation loss: {loss_temp_val}")
            test_end = time.time()
            test_times.append(test_end - test_start)
            
      if epoch == n_epochs:         # calculate results of training: test on all data
            print("...Training done...")
            print("...Calculating final results...")
            test_start = time.time()
            r2_temp_train, loss_temp_train = test(train_loader) 
            train_r2.append(r2_temp_train)
            train_loss.append(loss_temp_train)
            
            r2_temp_val, loss_temp_val = test(test_loader)
            val_r2.append(r2_temp_val)
            val_loss.append(loss_temp_val)
            
            r2_temp_test, loss_temp_test = test(test_loader)
            test_r2.append(r2_temp_test)
            test_loss.append(loss_temp_test)

            print("====================================================")
            print("Final training R2:", train_r2[-1])
            print("Average final training R2: ", np.mean(train_r2[-1]))
            print("Final training loss:", train_loss[-1])
            
            print("Final validation R2:", val_r2[-1])
            print("Average validation test R2: ", np.mean(val_r2[-1]))
            print("Final validation loss:", val_loss[-1])

            print("Final test R2:", test_r2[-1])
            print("Average final test R2: ", np.mean(test_r2[-1]))
            print("Final test loss:", test_loss[-1])
            
            test_end = time.time()
            test_times.append(test_end - test_start)
            
      epoch_end = time.time()
      epoch_times.append(epoch_end - epoch_start)

Total number of parameters: 3645377
Trainable parameters: 3645377

...Starting training...
Device used: cuda
Epoch: 01 | Train Loss: 0.12303 | Validation Loss: 0.10497
Validation R2: [0.77897328 0.89232372 0.91811492 0.59263023 0.9661426  0.80712823
 0.9359219  0.8943081  0.9425438  0.99059181 0.99307779 0.99358267
 0.99331053 0.99351325 0.97816684]
Validation loss: 0.08534403396949458
Epoch: 02 | Train Loss: 0.10173 | Validation Loss: 0.09752
Epoch: 03 | Train Loss: 0.09633 | Validation Loss: 0.08994
Epoch: 04 | Train Loss: 0.09243 | Validation Loss: 0.09116
Epoch: 05 | Train Loss: 0.08815 | Validation Loss: 0.08348
Epoch: 06 | Train Loss: 0.08546 | Validation Loss: 0.08370
Epoch: 07 | Train Loss: 0.08314 | Validation Loss: 0.07940
Epoch: 08 | Train Loss: 0.08058 | Validation Loss: 0.07881
Epoch: 09 | Train Loss: 0.07857 | Validation Loss: 0.07780
Epoch: 10 | Train Loss: 0.07707 | Validation Loss: 0.07531
Validation R2: [0.82941789 0.91282307 0.93027837 0.6646775  0.98004264 0.8828958

In [8]:
print("Device used:", device)
print()
print(f"Total number of epochs: {len(epoch_times)}")
print(f"Total training time: {np.sum(epoch_times)/60:.2f} minutes")
print(f"Total time in training: {np.sum(train_times)/60:.2f} minutes")
print(f"Total time in testing: {np.sum(test_times)/60:.2f} minutes")
print()
print(f"Average epoch time: {np.mean(epoch_times):.1f} seconds")
print(f"Average time in training: {np.mean(train_times):.1f} seconds")
print(f"Average time in testing: {np.mean(test_times):.1f} seconds")

Device used: cuda

Total number of epochs: 30
Total training time: 1.45 minutes
Total time in training: 1.30 minutes
Total time in testing: 0.15 minutes

Average epoch time: 2.9 seconds
Average time in training: 2.6 seconds
Average time in testing: 2.3 seconds


#### Model for just one target

Train GNN

In [27]:
feature_dim = train_data[:][0].shape[1]
target_dim = 1

def train(batch, target_idx):
      features = batch[:][0]
      targets = batch[:][1][:,target_idx].unsqueeze(-1)
      
      model.train()
      optimizer.zero_grad()  # Clear gradients.
      out = model(features).to(device)  # Perform a single forward pass.

      loss = criterion(out, targets) 

      loss.backward()  # Derive gradients.
      optimizer.step()  # Update parameters based on gradients.
      
      return loss

def test(data, target_idx):
      for i, batch in enumerate(data):
            features = batch[:][0]
            targets = batch[:][1][:,target_idx].cpu().unsqueeze(-1)
            
            model.eval()
            out = model(features).cpu()
            
            # Caculate R2    
            r2_score_var = np.array([r2_score(targets.detach().numpy(), out.detach().numpy())])        
            all_r2 = np.hstack((all_r2, r2_score_var)) if i != 0 else r2_score_var
                  
            loss = float(criterion(out, targets).detach().numpy())
            all_loss = np.hstack((all_loss, loss)) if i != 0 else np.array(loss)

      average_test_r2 = np.mean(all_r2)
      average_test_loss = np.mean(all_loss)
      
      return average_test_r2, average_test_loss

num_targets = train_data[:][1].shape[1]
start_time = time.time()
for target_index in range(num_targets):
      print("Target index:", target_index)

      model = GNN(hidden_channels=1024, feature_dim=feature_dim, target_dim=target_dim).to(device) 
      optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=5e-4)
      criterion = torch.nn.MSELoss().to(device)
      
      # Decay for learning rate
      decayRate = 0.9
      lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=decayRate)

      # Vectors to append accuracy to:
      train_r2 = []
      train_loss = []
      test_r2 = []
      test_loss = []
      val_r2 = []
      val_loss = []

      n_epochs = 10

      print("Device used:", device)

      for epoch in np.arange(1, n_epochs+1):
            losses = []
            for data in train_loader:
                  loss = train(data, target_index)
                  losses.append(loss.cpu().detach().numpy())  
            # Compute validation loss
            with torch.no_grad():
                  val_losses = []
                  for batch in val_loader:
                        features = batch[:][0]
                        targets = batch[:][1][:, target_index].unsqueeze(-1)
                        out = model(features)
                        val_losses.append(criterion(out, targets).cpu().detach().numpy())
            print(f"Epoch: {epoch:02d} | Train Loss: {np.mean(losses):.5f} | Validation Loss: {np.mean(val_losses):.5f}")
            
            lr_scheduler.step() # Decay to learning rate
                  
            if epoch == n_epochs:         # calculate results of training: test on all data
                  test_start = time.time()
                  r2_temp_train, loss_temp_train = test(train_loader, target_index) 
                  train_r2.append(r2_temp_train)
                  train_loss.append(loss_temp_train)
                  
                  r2_temp_val, loss_temp_val = test(test_loader, target_index)
                  val_r2.append(r2_temp_val)
                  val_loss.append(loss_temp_val)
                  
                  r2_temp_test, loss_temp_test = test(test_loader, target_index)
                  test_r2.append(r2_temp_test)
                  test_loss.append(loss_temp_test)

                  print("====================================================")
                  print("Final training R2:", train_r2[-1])
                  print("Average final training R2: ", np.mean(train_r2[-1]))
                  print("Final training loss:", train_loss[-1])
                  print()
                  print("Final validation R2:", val_r2[-1])
                  print("Average validation test R2: ", np.mean(val_r2[-1]))
                  print("Final validation loss:", val_loss[-1])
                  print()
                  print("Final test R2:", test_r2[-1])
                  print("Average final test R2: ", np.mean(test_r2[-1]))
                  print("Final test loss:", test_loss[-1])
                  print("====================================================")
                                    
print("...Done...")
end_time = time.time()
print(f"Time taken: {(end_time - start_time)/60} minutes")
print(f"Average time per target: {(end_time - start_time)/(num_targets*60)} minutes")

Target index: 0
Device used: cuda
Epoch: 01 | Train Loss: 0.20331 | Validation Loss: 0.16931
Epoch: 02 | Train Loss: 0.13334 | Validation Loss: 0.13731
Epoch: 03 | Train Loss: 0.11218 | Validation Loss: 0.12130
Epoch: 04 | Train Loss: 0.09657 | Validation Loss: 0.10917
Epoch: 05 | Train Loss: 0.08596 | Validation Loss: 0.10260
Epoch: 06 | Train Loss: 0.07844 | Validation Loss: 0.10326
Epoch: 07 | Train Loss: 0.07271 | Validation Loss: 0.10409
Epoch: 08 | Train Loss: 0.06706 | Validation Loss: 0.09729
Epoch: 09 | Train Loss: 0.06232 | Validation Loss: 0.09121
Epoch: 10 | Train Loss: 0.05829 | Validation Loss: 0.08852
Final training R2: 0.8930999648095416
Average final training R2:  0.8930999648095416
Final training loss: 0.10087261825601113

Final validation R2: 0.8405620029856536
Average validation test R2:  0.8405620029856536
Final validation loss: 0.1506244971936282

Final test R2: 0.8405620029856536
Average final test R2:  0.8405620029856536
Final test loss: 0.1506244971936282
Targe