# TIF360 Project

Main source: https://www.kaggle.com/code/rmonge/predicting-molecule-properties-based-on-its-smiles/notebook

### Import packages

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import time as time

import torch
from torch.utils.data import TensorDataset, DataLoader, random_split
import networkx as nx
from torch.nn import Linear, LeakyReLU
from torch_geometric.nn import global_mean_pool, GATConv, BatchNorm, GraphNorm
import torch.nn.functional as F

from sklearn.metrics import r2_score

In [2]:
print("cuda available:", torch.cuda.is_available())
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("device:", "cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))

cuda available: False
device: cpu


### Load data

In [3]:
df = pd.read_csv("../data/smiles_and_targets.csv")
print(np.shape(df))

mol_descriptor = np.load("../data/mol_descriptors.npy")
print(mol_descriptor.shape)

mol_fingerprints = np.load("../data/mol_morgan_fingerprints.npy")
print(mol_fingerprints.shape)

(132820, 21)
(132820, 179)
(132820, 2048)


In [4]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from utility_functions import get_data_split_indices, scale_targets

def create_scale_and_split_dataset(features:np.ndarray, targets:np.ndarray, val_share, test_share, batch_size):
    
    num_samples = features.shape[0]
    print("num_samples:", num_samples)
    train_indices, val_indices, test_indices = get_data_split_indices(num_samples, val_share, test_share)
    print("train_indices:", train_indices)
    print("val_indices:", val_indices)
    print("test_indices:", test_indices)
    
    X_train, y_train = features[train_indices], targets[train_indices]
    X_val, y_val = features[val_indices], targets[val_indices]
    X_test, y_test = features[test_indices], targets[test_indices]
    
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)
    
    y_train, y_val, y_test, scaler_targets = scale_targets(y_train, y_val, y_test)
    
    train_data = TensorDataset(torch.FloatTensor(X_train, device=device),
                               torch.FloatTensor(y_train, device=device))
    val_data = TensorDataset(torch.FloatTensor(X_val, device=device),   
                             torch.FloatTensor(y_val, device=device))
    test_data = TensorDataset(torch.FloatTensor(X_test, device=device),
                              torch.FloatTensor(y_test, device=device))

    return train_data, val_data, test_data, scaler_targets

def create_data_loaders(train_data, val_data, test_data, batch_size):
    
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)
    
    return train_loader, val_loader, test_loader  

Load data

In [5]:
print("...Loading data...")
properties_names = ['A', 'B', 'C', 'mu', 'alfa', 'homo', 'lumo', 'gap', 'R²', 'zpve', 'U0', 'U', 'H', 'G', 'Cv']
x_smiles = df.smiles.values
y = df.loc[:, properties_names].values

# features = np.concatenate((mol_descriptor), axis=1)
features = mol_descriptor

train_data, val_data, test_data, scaler_targets = create_scale_and_split_dataset(features, y, 0.15, 0.2, 64)
train_loader, val_loader, test_loader = create_data_loaders(train_data, val_data, test_data, 64)
print("...Data loading done...")

...Loading data...
num_samples: 132820
train_indices: [127438   5606   9214 ...  64809  71506  76175]
val_indices: [ 84700 105937  14560 ... 129099  46956  43265]
test_indices: [ 26727  53117 114907 ...  85412 130333  77285]
...Data loading done...


### Dense network 

#### Model for all targets at once

Define model

In [6]:
class DNN(torch.nn.Module):
    def __init__(self, hidden_channels, feature_dim, target_dim):
        super().__init__()
        torch.manual_seed(12345)
        
        self.input_norm = BatchNorm(feature_dim)
        self.lin1 = Linear(feature_dim, hidden_channels)
        self.lin2 = Linear(hidden_channels, 512)
        self.lin3 = Linear(512, target_dim)

    def forward(self, x): 
                
        x = self.input_norm(x)
        x = self.lin1(x)
        x = F.leaky_relu(x)
        x = F.dropout(x, p=0.2, training=self.training)
        
        x = self.lin2(x)
        x = F.leaky_relu(x)

        x = self.lin3(x)
 
        return x

Train DNN

In [7]:
from utility_functions import get_num_parameters

feature_dim = train_data[:][0].shape[1]
target_dim = train_data[:][1].shape[1]

model = DNN(hidden_channels=1024, feature_dim=feature_dim, target_dim=target_dim).to(device) 
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)

# Decay for learning rate
decayRate = 0.9
lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=decayRate)

criterion = torch.nn.MSELoss().to(device)

train_params, tot_params = get_num_parameters(model)
print(f"Total number of parameters: {tot_params}")
print(f"Trainable parameters: {train_params}")

def train(batch):
      features = batch[:][0]
      targets = batch[:][1]
      
      model.train()
      optimizer.zero_grad()  # Clear gradients.
      out = model(features).to(device)  # Perform a single forward pass.

      loss = criterion(out, targets) 

      loss.backward()  # Derive gradients.
      optimizer.step()  # Update parameters based on gradients.
      
      return loss

def test(data):
      for i, batch in enumerate(data):
            features = batch[:][0]
            targets = batch[:][1].cpu()
            
            model.eval()
            out = model(features).cpu()
            
            # Caculate R2 for each target
            for target_idx in range(target_dim):
                  if target_idx != 0:
                        r2_score_var = np.vstack((r2_score_var, r2_score(targets[:,target_idx].detach().numpy(), 
                                                          out[:,target_idx].detach().numpy())))
                  else:
                        r2_score_var = np.array([r2_score(targets[:,target_idx].detach().numpy(),
                                                          out[:,target_idx].detach().numpy())])        
            all_r2 = np.hstack((all_r2, r2_score_var)) if i != 0 else r2_score_var
                  
            loss = float(criterion(out, targets).detach().numpy())
            all_loss = np.hstack((all_loss, loss)) if i != 0 else np.array(loss)

      average_test_r2 = np.mean(all_r2, axis=1)
      average_test_loss = np.mean(all_loss)
      
      return average_test_r2, average_test_loss

# Vectors to append accuracy to:
train_r2 = []
train_loss = []
test_r2 = []
test_loss = []
val_r2 = []
val_loss = []

n_epochs = 30
print_every_N_epochs = True
N = 10 # print R2 every N epochs

epoch_times = []
train_times = []
test_times = []
print()
print("...Starting training...")
print("Device used:", device)

for epoch in np.arange(1, n_epochs+1):
      epoch_start = time.time()
      losses = []
      train_start = time.time()
      for batch in train_loader:
            loss = train(batch)
            losses.append(loss.cpu().detach().numpy())  
      # Compute validation loss
      model.eval()
      val_losses = []
      for batch in val_loader:
            features = batch[:][0]
            targets = batch[:][1]
            out = model(features)
            val_losses.append(criterion(out, targets).cpu().detach().numpy()) 

      print(f"Epoch: {epoch:02d} | Train Loss: {np.mean(losses):.5f} | Validation Loss: {np.mean(val_losses):.5f}")
      train_end = time.time()
      train_times.append(train_end - train_start)
      
      lr_scheduler.step() # Decay to learning rate
      
      if print_every_N_epochs and (epoch % N == 0 or epoch == 1) and epoch != n_epochs:
            test_start = time.time()
                
            r2_temp_val, loss_temp_val = test(val_loader)
            val_r2.append(r2_temp_val)
            val_loss.append(loss_temp_val)
            
            print(f'Validation R2: {r2_temp_val}')
            print(f"Validation loss: {loss_temp_val}")
            test_end = time.time()
            test_times.append(test_end - test_start)
            
      if epoch == n_epochs:         # calculate results of training: test on all data
            print("...Training done...")
            print("...Calculating final results...")
            test_start = time.time()
            r2_temp_train, loss_temp_train = test(train_loader) 
            train_r2.append(r2_temp_train)
            train_loss.append(loss_temp_train)
            
            r2_temp_val, loss_temp_val = test(val_loader)
            val_r2.append(r2_temp_val)
            val_loss.append(loss_temp_val)
            
            r2_temp_test, loss_temp_test = test(test_loader)
            test_r2.append(r2_temp_test)
            test_loss.append(loss_temp_test)

            print("====================================================")
            print("Final training R2:", train_r2[-1])
            print("Average final training R2: ", np.mean(train_r2[-1]))
            print("Final training loss:", train_loss[-1])
            
            print("Final validation R2:", val_r2[-1])
            print("Average validation R2: ", np.mean(val_r2[-1]))
            print("Final validation loss:", val_loss[-1])

            print("Final test R2:", test_r2[-1])
            print("Average final test R2: ", np.mean(test_r2[-1]))
            print("Final test loss:", test_loss[-1])
            
            test_end = time.time()
            test_times.append(test_end - test_start)
            
      epoch_end = time.time()
      epoch_times.append(epoch_end - epoch_start)

Total number of parameters: 717173
Trainable parameters: 717173

...Starting training...
Device used: cpu
Epoch: 01 | Train Loss: 0.15350 | Validation Loss: 0.11355
Validation R2: [0.59545072 0.78840536 0.87502583 0.5469624  0.95366012 0.8124005
 0.93087593 0.89631883 0.90801047 0.98002215 0.98621267 0.98523387
 0.9851991  0.98607401 0.97075313]
Validation loss: 0.11355060066741246
Epoch: 02 | Train Loss: 0.12704 | Validation Loss: 0.11098
Validation R2: [0.65804845 0.8042125  0.87799829 0.58014746 0.95171283 0.80101679
 0.91997258 0.90320567 0.91980022 0.95029874 0.98381823 0.98310717
 0.98257769 0.98254086 0.95773256]
Validation loss: 0.11097705612579982
Epoch: 03 | Train Loss: 0.12132 | Validation Loss: 0.10546
...Training done...
...Calculating final results...
Final training R2: [0.65523054 0.8111751  0.89372835 0.58501221 0.95490911 0.81181616
 0.92402429 0.91409194 0.90623216 0.96590456 0.99248892 0.99258645
 0.99252159 0.99257691 0.95650353]
Average final training R2:  0.889920

In [8]:
print("Device used:", device)
print()
print(f"Total number of epochs: {len(epoch_times)}")
print(f"Total training time: {np.sum(epoch_times)/60:.2f} minutes")
print(f"Total time in training: {np.sum(train_times)/60:.2f} minutes")
print(f"Total time in testing: {np.sum(test_times)/60:.2f} minutes")
print()
print(f"Average epoch time: {np.mean(epoch_times):.1f} seconds")
print(f"Average time in training: {np.mean(train_times):.1f} seconds")
print(f"Average time in testing: {np.mean(test_times):.1f} seconds")

Device used: cpu

Total number of epochs: 3
Total training time: 1.30 minutes
Total time in training: 0.93 minutes
Total time in testing: 0.37 minutes

Average epoch time: 25.9 seconds
Average time in training: 18.6 seconds
Average time in testing: 7.4 seconds


#### Model for just one target

Train DNN

In [9]:
feature_dim = train_data[:][0].shape[1]
target_dim = 1

def train(batch, target_idx):
      features = batch[:][0]
      targets = batch[:][1][:,target_idx].unsqueeze(-1)
      
      model.train()
      optimizer.zero_grad()  # Clear gradients.
      out = model(features).to(device)  # Perform a single forward pass.

      loss = criterion(out, targets) 

      loss.backward()  # Derive gradients.
      optimizer.step()  # Update parameters based on gradients.
      
      return loss

def test(data, target_idx):
      for i, batch in enumerate(data):
            features = batch[:][0]
            targets = batch[:][1][:,target_idx].cpu().unsqueeze(-1)
            
            model.eval()
            out = model(features).cpu()
            
            # Caculate R2    
            r2_score_var = np.array([r2_score(targets.detach().numpy(), out.detach().numpy())])        
            all_r2 = np.hstack((all_r2, r2_score_var)) if i != 0 else r2_score_var
                  
            loss = float(criterion(out, targets).detach().numpy())
            all_loss = np.hstack((all_loss, loss)) if i != 0 else np.array(loss)

      average_test_r2 = np.mean(all_r2)
      average_test_loss = np.mean(all_loss)
      
      return average_test_r2, average_test_loss

num_targets = train_data[:][1].shape[1]
start_time = time.time()
for target_index in range(num_targets):
      print("Target index:", target_index)

      model = DNN(hidden_channels=1024, feature_dim=feature_dim, target_dim=target_dim).to(device) 
      optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=5e-4)
      criterion = torch.nn.MSELoss().to(device)
      
      # Decay for learning rate
      decayRate = 0.9
      lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=decayRate)

      # Vectors to append accuracy to:
      train_r2 = []
      train_loss = []
      test_r2 = []
      test_loss = []
      val_r2 = []
      val_loss = []

      n_epochs = 10

      print("Device used:", device)

      for epoch in np.arange(1, n_epochs+1):
            losses = []
            for data in train_loader:
                  loss = train(data, target_index)
                  losses.append(loss.cpu().detach().numpy())  
            # Compute validation loss
            with torch.no_grad():
                  val_losses = []
                  for batch in val_loader:
                        features = batch[:][0]
                        targets = batch[:][1][:, target_index].unsqueeze(-1)
                        out = model(features)
                        val_losses.append(criterion(out, targets).cpu().detach().numpy())
            print(f"Epoch: {epoch:02d} | Train Loss: {np.mean(losses):.5f} | Validation Loss: {np.mean(val_losses):.5f}")
            
            lr_scheduler.step() # Decay to learning rate
                  
            if epoch == n_epochs:         # calculate results of training: test on all data
                  test_start = time.time()
                  r2_temp_train, loss_temp_train = test(train_loader, target_index) 
                  train_r2.append(r2_temp_train)
                  train_loss.append(loss_temp_train)
                  
                  r2_temp_val, loss_temp_val = test(test_loader, target_index)
                  val_r2.append(r2_temp_val)
                  val_loss.append(loss_temp_val)
                  
                  r2_temp_test, loss_temp_test = test(test_loader, target_index)
                  test_r2.append(r2_temp_test)
                  test_loss.append(loss_temp_test)

                  print("====================================================")
                  print("Final training R2:", train_r2[-1])
                  print("Average final training R2: ", np.mean(train_r2[-1]))
                  print("Final training loss:", train_loss[-1])
                  print()
                  print("Final validation R2:", val_r2[-1])
                  print("Average validation test R2: ", np.mean(val_r2[-1]))
                  print("Final validation loss:", val_loss[-1])
                  print()
                  print("Final test R2:", test_r2[-1])
                  print("Average final test R2: ", np.mean(test_r2[-1]))
                  print("Final test loss:", test_loss[-1])
                  print("====================================================")
                                    
print("...Done...")
end_time = time.time()
print(f"Time taken: {(end_time - start_time)/60} minutes")
print(f"Average time per target: {(end_time - start_time)/(num_targets*60)} minutes")

Target index: 0
Device used: cpu


KeyboardInterrupt: 