# Preprocessing

## adding necessary tools

In [1]:
# Install the Deep Graph Library (DGL), imported below as `dgl`.
!pip3 install dgl



In [2]:
# NOTE(review): nothing in the visible part of this notebook imports dglgo — confirm it is still needed.
!pip install dglgo



In [3]:
%matplotlib inline
import os #to create and dealing with directory

os.environ["DGLBACKEND"] = "pytorch"
import dgl #deep graph library
import numpy as np #numerical functions
import networkx as nx #to represent graph and dealing with graph
import torch #for neural network
import torch.nn as nn
import dgl.function as fn #to catch functions
import torch.nn.functional as F #neural network functions
import shutil #help us import dataset
from torch.utils.data import DataLoader #to load data
import cloudpickle
from dgl.nn import GraphConv
from sklearn import preprocessing

## loading dataset

This notebook uses the ESOL dataset.<br>
ESOL is a small dataset of water-solubility data for a set of chemical compounds.

In [4]:
# Root directory for everything this notebook reads and writes.
current_dir = "./"
# Directory that receives the checkpoint file written during training.
checkpoint_path = current_dir + "save_models/model_checkpoints/" + "checkpoint"
os.makedirs(checkpoint_path, exist_ok=True)

# Destination the checkpoint directory is copied to once training has finished.
best_model_path = current_dir + "save_models/best_model/"

# Scratch directory for the unpacked dataset; it is deleted first so that
# every run starts from a fresh extraction of the archive.
folder_data_temp = current_dir +"data_temp/"
shutil.rmtree(folder_data_temp, ignore_errors=True)

# The dataset archive is expected in `current_dir`.
path_save = current_dir + "graph_regression.zip"
shutil.unpack_archive(path_save, folder_data_temp)

Define a custom PyTorch dataset class for graph regression.

In [5]:
""" Regression Dataset """
class DGLDatasetReg(torch.utils.data.Dataset):
    """Regression dataset backed by a DGL graph file.

    The file ``address + ".bin"`` is read with ``dgl.load_graphs``. Its label
    dictionary must provide the keys ``"labels"``, ``"masks"`` and
    ``"globals"``; each tensor is reshaped to one row per graph.

    Parameters
    ----------
    address : str
        Path of the graph file without the ``.bin`` extension.
    transform : optional
        Stored on the instance; it is not applied anywhere in this class.
    train : bool
        When true, ``scaler_method`` fits a new ``StandardScaler`` on the labels.
    scaler : optional
        Already fitted scaler (any object with a ``transform`` method).
    scaler_regression : optional
        When truthy, ``__getitem__`` returns labels transformed by the scaler.
    """

    def __init__(self, address, transform=None, train=False, scaler=None, scaler_regression=None):
        self.train = train
        self.scaler = scaler
        self.data_set, train_labels_masks_globals = dgl.load_graphs(address+".bin")
        num_graphs = len(self.data_set)
        # One row per graph for every per-graph tensor.
        self.labels = train_labels_masks_globals["labels"].view(num_graphs,-1)
        self.masks = train_labels_masks_globals["masks"].view(num_graphs,-1)
        self.globals = train_labels_masks_globals["globals"].view(num_graphs,-1)
        self.transform = transform
        self.scaler_regression = scaler_regression

    def scaler_method(self):
        """Return the label scaler, fitting a new one first when ``train`` is set.

        For a non-training dataset the scaler passed to the constructor is
        returned unchanged (possibly ``None``).
        """
        if self.train:
            self.scaler = preprocessing.StandardScaler().fit(self.labels)
        return self.scaler

    def __len__(self):
        """Number of graphs in the dataset."""
        return len(self.data_set)

    def __getitem__(self, idx):
        """Return ``(graph, label, mask, globals)`` for sample ``idx``.

        Raises
        ------
        ValueError
            If scaled labels were requested but no scaler is available.
        """
        if self.scaler_regression:
            if self.scaler is None:
                raise ValueError(
                    "scaler_regression is set but no scaler is available; "
                    "pass a fitted scaler or call scaler_method() on a training dataset"
                )
            # Scale only the requested row(s) instead of the whole label matrix:
            # the scaler works element-wise per column, so the values are the
            # same while the cost per sample no longer grows with the dataset.
            rows = self.labels[idx]
            scaled = self.scaler.transform(rows.reshape(-1, self.labels.shape[1]))
            label = torch.tensor(scaled).reshape(rows.shape).float()
        else:
            label = self.labels[idx].float()
        return self.data_set[idx], label, self.masks[idx], self.globals[idx]

Load the train, validation and test splits from the pre-computed scaffold split files.

In [6]:
# Common file prefix of scaffold split number 0 inside the unpacked archive.
path_data_temp = folder_data_temp + "scaffold"+"_"+str(0)

# train=True is required for scaler_method() to fit a StandardScaler on the
# training labels; without it the method returns None and the validation and
# test sets would be handed an empty scaler.
train_set = DGLDatasetReg(address=path_data_temp+"_train", train=True)
scaler = train_set.scaler_method()
# Validation and test data reuse the scaler fitted on the training labels.
val_set = DGLDatasetReg(address=path_data_temp+"_val", scaler = scaler)
test_set = DGLDatasetReg(address=path_data_temp+"_test", scaler = scaler)

print(len(train_set), len(val_set), len(test_set))

902 112 114


In [7]:
def collate(batch):
    """Merge a list of dataset samples into one mini-batch.

    Parameters
    ----------
    batch : list of tuple
        Samples of the form ``(graph, label, mask, globals)``.

    Returns
    -------
    tuple
        The batched DGL graph followed by the stacked label, mask and
        global-feature tensors (stacked along a new leading dimension).
    """
    graph_list, label_list, mask_list, global_list = [], [], [], []
    for sample in batch:
        graph_list.append(sample[0])
        label_list.append(sample[1])
        mask_list.append(sample[2])
        global_list.append(sample[3])

    # All graphs of the batch become one large graph object.
    batched_graph = dgl.batch(graph_list)

    stacked_labels = torch.stack(label_list, dim=0)
    stacked_masks = torch.stack(mask_list, dim=0)
    stacked_globals = torch.stack(global_list, dim=0)

    return batched_graph, stacked_labels, stacked_masks, stacked_globals

def loader(batch_size = 128):
    """Create the data loaders for the train, validation and test sets.

    Parameters
    ----------
    batch_size : int
        Number of graphs per mini-batch, shared by all three loaders.

    Returns
    -------
    tuple
        ``(train_dataloader, val_dataloader, test_dataloader)``. Only the
        training loader shuffles its samples.
    """
    def _build(dataset, shuffle):
        # Everything except the dataset and the shuffle flag is shared.
        return DataLoader(dataset,
                          batch_size=batch_size,
                          collate_fn=collate,
                          drop_last=False,
                          shuffle=shuffle,
                          num_workers=1)

    train_dataloader = _build(train_set, True)
    val_dataloader = _build(val_set, False)
    test_dataloader = _build(test_set, False)
    return train_dataloader, val_dataloader, test_dataloader

In [8]:
# Build the three data loaders with 128 graphs per mini-batch.
train_dataloader, val_dataloader, test_dataloader = loader(batch_size=128)

# Model Defining

In [9]:
# ESOL has a single regression task
num_tasks = 1

# Size of global feature of each graph (passed to the model constructors)
global_size = 200

# Maximum number of epochs to train the model
num_epochs = 100

# Number of epochs without improvement on the validation set that are
# tolerated before training stops
patience = 10

# Configurations to instantiate the model: the node/edge feature sizes are the
# number of leading feature columns the models keep, hidden_size is the width
# of the hidden graph layers
config = {"node_feature_size":127, "edge_feature_size":12, "hidden_size":100}

For this project we'll use 8 models.

## model1

Uses graph convolution (`GraphConv`) layers<br>
3 layers<br>
no dropout

In [10]:
class model1(nn.Module):
    """Three-layer graph convolutional regressor without dropout.

    Parameters
    ----------
    config : dict
        May provide ``node_feature_size`` (default 127), ``edge_feature_size``
        (default 12) and ``hidden_size`` (default 100).
    global_size : int
        Accepted but not used by this model.
    num_tasks : int
        Number of values predicted per graph.
    """

    def __init__(self, config, global_size = 200, num_tasks = 1):
        super().__init__()
        self.config = config
        self.num_tasks = num_tasks

        # Feature widths, read from the configuration with fallbacks.
        self.node_feature_size = config.get('node_feature_size', 127)
        self.edge_feature_size = config.get('edge_feature_size', 12)
        self.hidden_size = config.get('hidden_size', 100)

        self.conv1 = GraphConv(self.node_feature_size, self.hidden_size, allow_zero_in_degree=True)
        self.conv2 = GraphConv(self.hidden_size, self.hidden_size, allow_zero_in_degree=True)
        self.conv3 = GraphConv(self.hidden_size, self.num_tasks, allow_zero_in_degree=True)

    def forward(self, mol_dgl_graph, globals):
        """Predict one row of ``num_tasks`` values per graph in the batch.

        The node data ``"v"`` and edge data ``"e"`` of the graph are truncated
        in place to the configured widths and the per-node output is stored
        under ``"h"``. The edge features and ``globals`` are not consumed by
        the convolution layers.
        """
        graph = mol_dgl_graph
        graph.ndata["v"] = graph.ndata["v"][:, :self.node_feature_size]
        graph.edata["e"] = graph.edata["e"][:, :self.edge_feature_size]

        hidden = graph.ndata["v"]
        for layer in (self.conv1, self.conv2):
            hidden = F.relu(layer(graph, hidden))

        # The last layer has no activation; its output is averaged per graph.
        graph.ndata["h"] = self.conv3(graph, hidden)
        return dgl.mean_nodes(graph, "h")

Function to Compute Score of the Model

In [11]:
import math

In [12]:
def compute_score(model, data_loader, scaler, val_size, num_tasks):
    """Return the RMSE of ``model`` over ``data_loader`` divided by ``num_tasks``.

    Parameters
    ----------
    model : nn.Module
        Called as ``model(graph, globals)``; it is switched to eval mode.
    data_loader : iterable
        Yields ``(graph, labels, masks, globals)`` batches.
    scaler :
        Accepted for interface compatibility; not used.
    val_size : int
        Number of samples the summed squared error is divided by.
    num_tasks : int
        Divisor applied to the final RMSE.

    Notes
    -----
    The masks are not applied: every label entry contributes to the error.
    """
    model.eval()
    # Sum (not mean) of squared errors, i.e. sum_i (y_i - y'_i)^2, so batches
    # of different sizes can be accumulated before normalising once.
    squared_error = nn.MSELoss(reduction='sum')
    total_squared_error = 0
    with torch.no_grad():
        for mol_dgl_graph, labels, masks, global_feats in data_loader:
            prediction = model(mol_dgl_graph, global_feats)
            total_squared_error += squared_error(prediction, labels).item()
    rmse = math.sqrt(total_squared_error / val_size)
    return rmse / num_tasks

loss function

In [13]:
def loss_func(output, label, mask, num_tasks):
    """Mean squared error over the entries selected by ``mask``.

    Parameters
    ----------
    output, label : torch.Tensor
        Predictions and targets of the same shape.
    mask : torch.Tensor
        Weight per entry; entries with weight 0 do not contribute.
    num_tasks : int
        Accepted for interface compatibility; not used.

    Returns
    -------
    torch.Tensor
        Scalar loss: the masked squared errors summed and divided by
        ``mask.sum()``. When the mask sums to zero the plain sum is returned,
        which avoids the NaN a 0/0 division would produce.
    """
    squared_error = nn.MSELoss(reduction='none')(output, label)
    masked_error = mask * squared_error
    mask_total = mask.sum()
    if mask_total.item() == 0:
        # Nothing is labelled in this batch; returning the sum keeps the
        # result attached to the autograd graph.
        return masked_error.sum()
    return masked_error.sum() / mask_total

training function

In [14]:
def train_epoch(train_dataloader, model, optimizer):
    """Run one optimisation pass over ``train_dataloader``.

    Every batch is pushed through ``model``, scored with ``loss_func`` and
    followed by one optimizer step.

    Returns
    -------
    float
        Mean of the per-batch training losses.
    """
    model.train()  # Prepare model for training
    batch_losses = []
    for mol_dgl_graph, labels, masks, global_feats in train_dataloader:
        prediction = model(mol_dgl_graph, global_feats)
        batch_loss = loss_func(prediction, labels, masks, num_tasks)
        # Drop the gradients of the previous step before back-propagating.
        optimizer.zero_grad(set_to_none=True)
        batch_loss.backward()
        optimizer.step()
        batch_losses.append(batch_loss.detach().item())
    return sum(batch_losses) / len(batch_losses)

In [15]:
def train_evaluate(model_class=None):
    """Train a model with early stopping and keep its best checkpoint.

    After each epoch the model is scored on the validation set. Whenever the
    score improves, the model and optimizer state are written to
    ``checkpoint.pth`` inside ``checkpoint_path``. Training stops after
    ``num_epochs`` epochs or once ``patience`` epochs in a row brought no
    improvement. The checkpoint directory is then copied to
    ``best_model_path``.

    Parameters
    ----------
    model_class : type, optional
        Model class to instantiate; defaults to ``model1``.

    Raises
    ------
    RuntimeError
        If no checkpoint was written during this run. Copying the checkpoint
        directory would otherwise promote a file left behind by an earlier run.
    """
    if model_class is None:
        model_class = model1

    model = model_class(config, global_size, num_tasks)
    optimizer = torch.optim.Adam(model.parameters(), lr = 0.0001)

    best_val = float('inf')
    patience_count = 1
    checkpoint_saved = False

    for epoch in range(1, num_epochs + 1):
        # Early stopping: leave the loop instead of idling through the
        # remaining epochs.
        if patience_count > patience:
            break

        model.train()
        loss_train = train_epoch(train_dataloader, model, optimizer)
        model.eval()
        score_val = compute_score(model, val_dataloader, scaler, len(val_set), num_tasks)
        if score_val < best_val:
            best_val = score_val
            print("Save checkpoint")
            path = os.path.join(checkpoint_path, 'checkpoint.pth')
            dict_checkpoint = {"score_val": score_val}
            dict_checkpoint.update({"model_state_dict": model.state_dict(), "optimizer_state": optimizer.state_dict()})
            with open(path, "wb") as outputfile:
                cloudpickle.dump(dict_checkpoint, outputfile)
            checkpoint_saved = True
            patience_count = 1
        else:
            print("Patience", patience_count)
            patience_count += 1

        print("Epoch: {}/{} | Training Loss: {:.3f} | Valid Score: {:.3f}".format(
        epoch, num_epochs, loss_train, score_val))

        print(" ")
        print("Epoch: {}/{} | Best Valid Score Until Now: {:.3f}".format(epoch, num_epochs, best_val), "\n")

    if not checkpoint_saved:
        raise RuntimeError(
            "The validation score never improved, so no checkpoint was saved in this run"
        )

    # best model save
    shutil.rmtree(best_model_path, ignore_errors=True)
    shutil.copytree(checkpoint_path, best_model_path)

    print("Final results:")
    print("Average Valid Score: {:.3f}".format(best_val), "\n")

Function to compute test set score of the final saved model

In [16]:
def test_evaluate(model_class=None):
    """Score the saved best model on the test set and print the result.

    The checkpoint ``checkpoint.pth`` is read from ``best_model_path`` and
    loaded into a fresh model instance. The elapsed time since the global
    ``start_time`` is printed as well, so ``start_time`` and the ``time``
    module must exist before this function is called.

    Parameters
    ----------
    model_class : type, optional
        Model class to instantiate; defaults to ``model1``. It must match the
        class whose weights were saved.
    """
    if model_class is None:
        model_class = model1

    final_model = model_class(config, global_size, num_tasks)
    path = os.path.join(best_model_path, 'checkpoint.pth')
    # NOTE: unpickling can execute arbitrary code; only load checkpoints that
    # were written by this notebook.
    with open(path, 'rb') as f:
        checkpoint = cloudpickle.load(f)
    final_model.load_state_dict(checkpoint["model_state_dict"])
    final_model.eval()
    # The summed squared error must be divided by the number of test samples;
    # dividing by the validation set size distorts the reported RMSE.
    test_score = compute_score(final_model, test_dataloader, scaler, len(test_set), num_tasks)

    print("Test Score: {:.3f}".format(test_score), "\n")
    print("Execution time: {:.3f} seconds".format(time.time() - start_time))

Train the model and evaluate its performance

In [17]:
import time
# Wall-clock reference that test_evaluate() reads when printing the execution time.
start_time = time.time()

# Train with early stopping, then score the saved best checkpoint on the test set.
train_evaluate()
test_evaluate()

Save checkpoint
Epoch: 1/100 | Training Loss: 13.033 | Valid Score: 4.394
 
Epoch: 1/100 | Best Valid Score Until Now: 4.394 

Save checkpoint
Epoch: 2/100 | Training Loss: 12.989 | Valid Score: 4.300
 
Epoch: 2/100 | Best Valid Score Until Now: 4.300 

Save checkpoint
Epoch: 3/100 | Training Loss: 11.709 | Valid Score: 4.210
 
Epoch: 3/100 | Best Valid Score Until Now: 4.210 

Save checkpoint
Epoch: 4/100 | Training Loss: 9.988 | Valid Score: 4.123
 
Epoch: 4/100 | Best Valid Score Until Now: 4.123 

Save checkpoint
Epoch: 5/100 | Training Loss: 12.350 | Valid Score: 4.042
 
Epoch: 5/100 | Best Valid Score Until Now: 4.042 

Save checkpoint
Epoch: 6/100 | Training Loss: 9.061 | Valid Score: 3.963
 
Epoch: 6/100 | Best Valid Score Until Now: 3.963 

Save checkpoint
Epoch: 7/100 | Training Loss: 9.181 | Valid Score: 3.885
 
Epoch: 7/100 | Best Valid Score Until Now: 3.885 

Save checkpoint
Epoch: 8/100 | Training Loss: 8.500 | Valid Score: 3.807
 
Epoch: 8/100 | Best Valid Score Until N

## model2

Uses GraphSAGE layers<br>
3 layers<br>
no dropout

In [18]:
#defining Graph sage class from scratch
class SAGEConv(nn.Module):
    """GraphSAGE-style layer with mean aggregation.

    The feature of every node is concatenated with the mean of the messages
    it receives and the result is projected by a single linear layer.

    Parameters
    ----------
    in_feat : int
        Input feature size.
    out_feat : int
        Output feature size.
    """

    def __init__(self, in_feat, out_feat):
        super().__init__()
        # The input width is doubled because the node's own feature and the
        # aggregated neighbour feature are concatenated before projection.
        self.linear = nn.Linear(2 * in_feat, out_feat)

    def forward(self, g, h):
        """Apply the layer.

        Parameters
        ----------
        g : Graph
            The input graph.
        h : Tensor
            The input node feature.

        Returns
        -------
        Tensor
            Node features of width ``out_feat``.
        """
        # local_scope keeps the temporary node data out of the caller's graph.
        with g.local_scope():
            g.ndata["h"] = h
            # Message passing: copy the sender feature, average per receiver.
            g.update_all(fn.copy_u("h", "m"), fn.mean("m", "h_N"))
            combined = torch.cat((h, g.ndata["h_N"]), dim=1)
            return self.linear(combined)

In [19]:
class model2(nn.Module):
    """Three-layer GraphSAGE regressor without dropout.

    Parameters
    ----------
    config : dict
        May provide ``node_feature_size`` (default 127), ``edge_feature_size``
        (default 12) and ``hidden_size`` (default 100).
    global_size : int
        Accepted but not used by this model.
    num_tasks : int
        Number of values predicted per graph.
    """

    def __init__(self, config, global_size = 200, num_tasks = 1):
        super().__init__()
        self.config = config
        self.num_tasks = num_tasks

        # Feature widths, read from the configuration with fallbacks.
        self.node_feature_size = config.get('node_feature_size', 127)
        self.edge_feature_size = config.get('edge_feature_size', 12)
        self.hidden_size = config.get('hidden_size', 100)

        self.conv1 = SAGEConv(self.node_feature_size, self.hidden_size)
        self.conv2 = SAGEConv(self.hidden_size, self.hidden_size)
        self.conv3 = SAGEConv(self.hidden_size, self.num_tasks)

    def forward(self, mol_dgl_graph, globals):
        """Predict one row of ``num_tasks`` values per graph in the batch.

        The node data ``"v"`` and edge data ``"e"`` of the graph are truncated
        in place to the configured widths and the per-node output is stored
        under ``"h"``. The edge features and ``globals`` are not consumed by
        the layers.
        """
        graph = mol_dgl_graph
        graph.ndata["v"] = graph.ndata["v"][:, :self.node_feature_size]
        graph.edata["e"] = graph.edata["e"][:, :self.edge_feature_size]

        hidden = graph.ndata["v"]
        for layer in (self.conv1, self.conv2):
            hidden = F.relu(layer(graph, hidden))

        # The last layer has no activation; its output is averaged per graph.
        graph.ndata["h"] = self.conv3(graph, hidden)
        return dgl.mean_nodes(graph, "h")

Function to Compute Score of the Model

In [20]:
def compute_score(model, data_loader, scaler, val_size, num_tasks):
    """Return the RMSE of ``model`` over ``data_loader`` divided by ``num_tasks``.

    NOTE(review): this cell redefines a function of the same name from an
    earlier cell.

    Parameters
    ----------
    model : nn.Module
        Called as ``model(graph, globals)``; it is switched to eval mode.
    data_loader : iterable
        Yields ``(graph, labels, masks, globals)`` batches.
    scaler :
        Accepted for interface compatibility; not used.
    val_size : int
        Number of samples the summed squared error is divided by.
    num_tasks : int
        Divisor applied to the final RMSE.

    Notes
    -----
    The masks are not applied: every label entry contributes to the error.
    """
    model.eval()
    # Sum (not mean) of squared errors, i.e. sum_i (y_i - y'_i)^2, so batches
    # of different sizes can be accumulated before normalising once.
    squared_error = nn.MSELoss(reduction='sum')
    total_squared_error = 0
    with torch.no_grad():
        for mol_dgl_graph, labels, masks, global_feats in data_loader:
            prediction = model(mol_dgl_graph, global_feats)
            total_squared_error += squared_error(prediction, labels).item()
    rmse = math.sqrt(total_squared_error / val_size)
    return rmse / num_tasks

loss function

In [21]:
def loss_func(output, label, mask, num_tasks):
    """Mean squared error over the entries selected by ``mask``.

    NOTE(review): this cell redefines a function of the same name from an
    earlier cell.

    Parameters
    ----------
    output, label : torch.Tensor
        Predictions and targets of the same shape.
    mask : torch.Tensor
        Weight per entry; entries with weight 0 do not contribute.
    num_tasks : int
        Accepted for interface compatibility; not used.

    Returns
    -------
    torch.Tensor
        Scalar loss: the masked squared errors summed and divided by
        ``mask.sum()``. When the mask sums to zero the plain sum is returned,
        which avoids the NaN a 0/0 division would produce.
    """
    squared_error = nn.MSELoss(reduction='none')(output, label)
    masked_error = mask * squared_error
    mask_total = mask.sum()
    if mask_total.item() == 0:
        # Nothing is labelled in this batch; returning the sum keeps the
        # result attached to the autograd graph.
        return masked_error.sum()
    return masked_error.sum() / mask_total

training function

In [22]:
def train_epoch(train_dataloader, model, optimizer):
    """Run one optimisation pass over ``train_dataloader``.

    Every batch is pushed through ``model``, scored with ``loss_func`` and
    followed by one optimizer step.

    Returns
    -------
    float
        Mean of the per-batch training losses.
    """
    model.train()  # Prepare model for training
    batch_losses = []
    for mol_dgl_graph, labels, masks, global_feats in train_dataloader:
        prediction = model(mol_dgl_graph, global_feats)
        batch_loss = loss_func(prediction, labels, masks, num_tasks)
        # Drop the gradients of the previous step before back-propagating.
        optimizer.zero_grad(set_to_none=True)
        batch_loss.backward()
        optimizer.step()
        batch_losses.append(batch_loss.detach().item())
    return sum(batch_losses) / len(batch_losses)

In [23]:
def train_evaluate(model_class=None):
    """Train a model with early stopping and keep its best checkpoint.

    After each epoch the model is scored on the validation set. Whenever the
    score improves, the model and optimizer state are written to
    ``checkpoint.pth`` inside ``checkpoint_path``. Training stops after
    ``num_epochs`` epochs or once ``patience`` epochs in a row brought no
    improvement. The checkpoint directory is then copied to
    ``best_model_path``.

    Parameters
    ----------
    model_class : type, optional
        Model class to instantiate; defaults to ``model2``.

    Raises
    ------
    RuntimeError
        If no checkpoint was written during this run. Copying the checkpoint
        directory would otherwise promote a file left behind by an earlier run.
    """
    if model_class is None:
        model_class = model2

    model = model_class(config, global_size, num_tasks)
    optimizer = torch.optim.Adam(model.parameters(), lr = 0.0001)

    best_val = float('inf')
    patience_count = 1
    checkpoint_saved = False

    for epoch in range(1, num_epochs + 1):
        # Early stopping: leave the loop instead of idling through the
        # remaining epochs.
        if patience_count > patience:
            break

        model.train()
        loss_train = train_epoch(train_dataloader, model, optimizer)
        model.eval()
        score_val = compute_score(model, val_dataloader, scaler, len(val_set), num_tasks)
        if score_val < best_val:
            best_val = score_val
            print("Save checkpoint")
            path = os.path.join(checkpoint_path, 'checkpoint.pth')
            dict_checkpoint = {"score_val": score_val}
            dict_checkpoint.update({"model_state_dict": model.state_dict(), "optimizer_state": optimizer.state_dict()})
            with open(path, "wb") as outputfile:
                cloudpickle.dump(dict_checkpoint, outputfile)
            checkpoint_saved = True
            patience_count = 1
        else:
            print("Patience", patience_count)
            patience_count += 1

        print("Epoch: {}/{} | Training Loss: {:.3f} | Valid Score: {:.3f}".format(
        epoch, num_epochs, loss_train, score_val))

        print(" ")
        print("Epoch: {}/{} | Best Valid Score Until Now: {:.3f}".format(epoch, num_epochs, best_val), "\n")

    if not checkpoint_saved:
        raise RuntimeError(
            "The validation score never improved, so no checkpoint was saved in this run"
        )

    # best model save
    shutil.rmtree(best_model_path, ignore_errors=True)
    shutil.copytree(checkpoint_path, best_model_path)

    print("Final results:")
    print("Average Valid Score: {:.3f}".format(best_val), "\n")

Function to compute test set score of the final saved model

In [24]:
def test_evaluate(model_class=None):
    """Score the saved best model on the test set and print the result.

    The checkpoint ``checkpoint.pth`` is read from ``best_model_path`` and
    loaded into a fresh model instance. The elapsed time since the global
    ``start_time`` is printed as well, so ``start_time`` and the ``time``
    module must exist before this function is called.

    Parameters
    ----------
    model_class : type, optional
        Model class to instantiate; defaults to ``model2``. It must match the
        class whose weights were saved.
    """
    if model_class is None:
        model_class = model2

    final_model = model_class(config, global_size, num_tasks)
    path = os.path.join(best_model_path, 'checkpoint.pth')
    # NOTE: unpickling can execute arbitrary code; only load checkpoints that
    # were written by this notebook.
    with open(path, 'rb') as f:
        checkpoint = cloudpickle.load(f)
    final_model.load_state_dict(checkpoint["model_state_dict"])
    final_model.eval()
    # The summed squared error must be divided by the number of test samples;
    # dividing by the validation set size distorts the reported RMSE.
    test_score = compute_score(final_model, test_dataloader, scaler, len(test_set), num_tasks)

    print("Test Score: {:.3f}".format(test_score), "\n")
    print("Execution time: {:.3f} seconds".format(time.time() - start_time))

Train the model and evaluate its performance

In [25]:
import time
# Wall-clock reference that test_evaluate() reads when printing the execution time.
start_time = time.time()

# Train with early stopping, then score the saved best checkpoint on the test set.
train_evaluate()
test_evaluate()

Save checkpoint
Epoch: 1/100 | Training Loss: 10.622 | Valid Score: 4.129
 
Epoch: 1/100 | Best Valid Score Until Now: 4.129 

Save checkpoint
Epoch: 2/100 | Training Loss: 9.491 | Valid Score: 4.076
 
Epoch: 2/100 | Best Valid Score Until Now: 4.076 

Save checkpoint
Epoch: 3/100 | Training Loss: 9.477 | Valid Score: 4.017
 
Epoch: 3/100 | Best Valid Score Until Now: 4.017 

Save checkpoint
Epoch: 4/100 | Training Loss: 9.247 | Valid Score: 3.948
 
Epoch: 4/100 | Best Valid Score Until Now: 3.948 

Save checkpoint
Epoch: 5/100 | Training Loss: 9.410 | Valid Score: 3.864
 
Epoch: 5/100 | Best Valid Score Until Now: 3.864 

Save checkpoint
Epoch: 6/100 | Training Loss: 8.242 | Valid Score: 3.763
 
Epoch: 6/100 | Best Valid Score Until Now: 3.763 

Save checkpoint
Epoch: 7/100 | Training Loss: 7.377 | Valid Score: 3.647
 
Epoch: 7/100 | Best Valid Score Until Now: 3.647 

Save checkpoint
Epoch: 8/100 | Training Loss: 7.169 | Valid Score: 3.518
 
Epoch: 8/100 | Best Valid Score Until Now:

## model3

Uses graph convolution (`GraphConv`) layers<br>
4 layers<br>
no dropout

In [26]:
class model3(nn.Module):
    """Four-layer graph convolutional regressor without dropout.

    Parameters
    ----------
    config : dict
        May provide ``node_feature_size`` (default 127), ``edge_feature_size``
        (default 12) and ``hidden_size`` (default 100).
    global_size : int
        Accepted but not used by this model.
    num_tasks : int
        Number of values predicted per graph.
    """

    def __init__(self, config, global_size = 200, num_tasks = 1):
        super().__init__()
        self.config = config
        self.num_tasks = num_tasks

        # Feature widths, read from the configuration with fallbacks.
        self.node_feature_size = config.get('node_feature_size', 127)
        self.edge_feature_size = config.get('edge_feature_size', 12)
        self.hidden_size = config.get('hidden_size', 100)

        self.conv1 = GraphConv(self.node_feature_size, self.hidden_size, allow_zero_in_degree=True)
        self.conv2 = GraphConv(self.hidden_size, self.hidden_size, allow_zero_in_degree=True)
        self.conv3 = GraphConv(self.hidden_size, self.hidden_size, allow_zero_in_degree=True)
        self.conv4 = GraphConv(self.hidden_size, self.num_tasks, allow_zero_in_degree=True)

    def forward(self, mol_dgl_graph, globals):
        """Predict one row of ``num_tasks`` values per graph in the batch.

        The node data ``"v"`` and edge data ``"e"`` of the graph are truncated
        in place to the configured widths and the per-node output is stored
        under ``"h"``. The edge features and ``globals`` are not consumed by
        the convolution layers.
        """
        graph = mol_dgl_graph
        graph.ndata["v"] = graph.ndata["v"][:, :self.node_feature_size]
        graph.edata["e"] = graph.edata["e"][:, :self.edge_feature_size]

        hidden = graph.ndata["v"]
        for layer in (self.conv1, self.conv2, self.conv3):
            hidden = F.relu(layer(graph, hidden))

        # The last layer has no activation; its output is averaged per graph.
        graph.ndata["h"] = self.conv4(graph, hidden)
        return dgl.mean_nodes(graph, "h")

Function to Compute Score of the Model

In [27]:
def compute_score(model, data_loader, scaler, val_size, num_tasks):
    """Return the RMSE of ``model`` over ``data_loader`` divided by ``num_tasks``.

    NOTE(review): this cell redefines a function of the same name from an
    earlier cell.

    Parameters
    ----------
    model : nn.Module
        Called as ``model(graph, globals)``; it is switched to eval mode.
    data_loader : iterable
        Yields ``(graph, labels, masks, globals)`` batches.
    scaler :
        Accepted for interface compatibility; not used.
    val_size : int
        Number of samples the summed squared error is divided by.
    num_tasks : int
        Divisor applied to the final RMSE.

    Notes
    -----
    The masks are not applied: every label entry contributes to the error.
    """
    model.eval()
    # Sum (not mean) of squared errors, i.e. sum_i (y_i - y'_i)^2, so batches
    # of different sizes can be accumulated before normalising once.
    squared_error = nn.MSELoss(reduction='sum')
    total_squared_error = 0
    with torch.no_grad():
        for mol_dgl_graph, labels, masks, global_feats in data_loader:
            prediction = model(mol_dgl_graph, global_feats)
            total_squared_error += squared_error(prediction, labels).item()
    rmse = math.sqrt(total_squared_error / val_size)
    return rmse / num_tasks

loss function

In [28]:
def loss_func(output, label, mask, num_tasks):
    """Mean squared error over the entries selected by ``mask``.

    NOTE(review): this cell redefines a function of the same name from an
    earlier cell.

    Parameters
    ----------
    output, label : torch.Tensor
        Predictions and targets of the same shape.
    mask : torch.Tensor
        Weight per entry; entries with weight 0 do not contribute.
    num_tasks : int
        Accepted for interface compatibility; not used.

    Returns
    -------
    torch.Tensor
        Scalar loss: the masked squared errors summed and divided by
        ``mask.sum()``. When the mask sums to zero the plain sum is returned,
        which avoids the NaN a 0/0 division would produce.
    """
    squared_error = nn.MSELoss(reduction='none')(output, label)
    masked_error = mask * squared_error
    mask_total = mask.sum()
    if mask_total.item() == 0:
        # Nothing is labelled in this batch; returning the sum keeps the
        # result attached to the autograd graph.
        return masked_error.sum()
    return masked_error.sum() / mask_total

training function

In [29]:
def train_epoch(train_dataloader, model, optimizer):
    """Run one optimisation pass over ``train_dataloader``.

    Every batch is pushed through ``model``, scored with ``loss_func`` and
    followed by one optimizer step.

    Returns
    -------
    float
        Mean of the per-batch training losses.
    """
    model.train()  # Prepare model for training
    batch_losses = []
    for mol_dgl_graph, labels, masks, global_feats in train_dataloader:
        prediction = model(mol_dgl_graph, global_feats)
        batch_loss = loss_func(prediction, labels, masks, num_tasks)
        # Drop the gradients of the previous step before back-propagating.
        optimizer.zero_grad(set_to_none=True)
        batch_loss.backward()
        optimizer.step()
        batch_losses.append(batch_loss.detach().item())
    return sum(batch_losses) / len(batch_losses)

In [30]:
def train_evaluate(model_class=None):
    """Train a model with early stopping and keep its best checkpoint.

    After each epoch the model is scored on the validation set. Whenever the
    score improves, the model and optimizer state are written to
    ``checkpoint.pth`` inside ``checkpoint_path``. Training stops after
    ``num_epochs`` epochs or once ``patience`` epochs in a row brought no
    improvement. The checkpoint directory is then copied to
    ``best_model_path``.

    Parameters
    ----------
    model_class : type, optional
        Model class to instantiate; defaults to ``model3``.

    Raises
    ------
    RuntimeError
        If no checkpoint was written during this run. Copying the checkpoint
        directory would otherwise promote a file left behind by an earlier run.
    """
    if model_class is None:
        model_class = model3

    model = model_class(config, global_size, num_tasks)
    optimizer = torch.optim.Adam(model.parameters(), lr = 0.0001)

    best_val = float('inf')
    patience_count = 1
    checkpoint_saved = False

    for epoch in range(1, num_epochs + 1):
        # Early stopping: leave the loop instead of idling through the
        # remaining epochs.
        if patience_count > patience:
            break

        model.train()
        loss_train = train_epoch(train_dataloader, model, optimizer)
        model.eval()
        score_val = compute_score(model, val_dataloader, scaler, len(val_set), num_tasks)
        if score_val < best_val:
            best_val = score_val
            print("Save checkpoint")
            path = os.path.join(checkpoint_path, 'checkpoint.pth')
            dict_checkpoint = {"score_val": score_val}
            dict_checkpoint.update({"model_state_dict": model.state_dict(), "optimizer_state": optimizer.state_dict()})
            with open(path, "wb") as outputfile:
                cloudpickle.dump(dict_checkpoint, outputfile)
            checkpoint_saved = True
            patience_count = 1
        else:
            print("Patience", patience_count)
            patience_count += 1

        print("Epoch: {}/{} | Training Loss: {:.3f} | Valid Score: {:.3f}".format(
        epoch, num_epochs, loss_train, score_val))

        print(" ")
        print("Epoch: {}/{} | Best Valid Score Until Now: {:.3f}".format(epoch, num_epochs, best_val), "\n")

    if not checkpoint_saved:
        raise RuntimeError(
            "The validation score never improved, so no checkpoint was saved in this run"
        )

    # best model save
    shutil.rmtree(best_model_path, ignore_errors=True)
    shutil.copytree(checkpoint_path, best_model_path)

    print("Final results:")
    print("Average Valid Score: {:.3f}".format(best_val), "\n")

Function to compute test set score of the final saved model

In [31]:
def test_evaluate(model_class=None):
    """Score the saved best model on the test set and print the result.

    The checkpoint ``checkpoint.pth`` is read from ``best_model_path`` and
    loaded into a fresh model instance. The elapsed time since the global
    ``start_time`` is printed as well, so ``start_time`` and the ``time``
    module must exist before this function is called.

    Parameters
    ----------
    model_class : type, optional
        Model class to instantiate; defaults to ``model3``. It must match the
        class whose weights were saved.
    """
    if model_class is None:
        model_class = model3

    final_model = model_class(config, global_size, num_tasks)
    path = os.path.join(best_model_path, 'checkpoint.pth')
    # NOTE: unpickling can execute arbitrary code; only load checkpoints that
    # were written by this notebook.
    with open(path, 'rb') as f:
        checkpoint = cloudpickle.load(f)
    final_model.load_state_dict(checkpoint["model_state_dict"])
    final_model.eval()
    # The summed squared error must be divided by the number of test samples;
    # dividing by the validation set size distorts the reported RMSE.
    test_score = compute_score(final_model, test_dataloader, scaler, len(test_set), num_tasks)

    print("Test Score: {:.3f}".format(test_score), "\n")
    print("Execution time: {:.3f} seconds".format(time.time() - start_time))

Train the model and evaluate its performance

In [32]:
import time
# Wall-clock reference that test_evaluate() reads when printing the execution time.
start_time = time.time()

# Train with early stopping, then score the saved best checkpoint on the test set.
train_evaluate()
test_evaluate()

Save checkpoint
Epoch: 1/100 | Training Loss: 10.142 | Valid Score: 4.125
 
Epoch: 1/100 | Best Valid Score Until Now: 4.125 

Save checkpoint
Epoch: 2/100 | Training Loss: 9.370 | Valid Score: 4.043
 
Epoch: 2/100 | Best Valid Score Until Now: 4.043 

Save checkpoint
Epoch: 3/100 | Training Loss: 9.737 | Valid Score: 3.962
 
Epoch: 3/100 | Best Valid Score Until Now: 3.962 

Save checkpoint
Epoch: 4/100 | Training Loss: 8.967 | Valid Score: 3.876
 
Epoch: 4/100 | Best Valid Score Until Now: 3.876 

Save checkpoint
Epoch: 5/100 | Training Loss: 8.759 | Valid Score: 3.788
 
Epoch: 5/100 | Best Valid Score Until Now: 3.788 

Save checkpoint
Epoch: 6/100 | Training Loss: 8.174 | Valid Score: 3.691
 
Epoch: 6/100 | Best Valid Score Until Now: 3.691 

Save checkpoint
Epoch: 7/100 | Training Loss: 7.985 | Valid Score: 3.584
 
Epoch: 7/100 | Best Valid Score Until Now: 3.584 

Save checkpoint
Epoch: 8/100 | Training Loss: 6.900 | Valid Score: 3.475
 
Epoch: 8/100 | Best Valid Score Until Now:

## model4

Uses graph convolution (`GraphConv`) layers<br>
4 layers<br>
dropout = 0.1

In [33]:
class model4(nn.Module):
    """Four-layer graph convolutional regressor with dropout probability 0.1.

    Parameters
    ----------
    config : dict
        May provide ``node_feature_size`` (default 127), ``edge_feature_size``
        (default 12) and ``hidden_size`` (default 100).
    global_size : int
        Accepted but not used by this model.
    num_tasks : int
        Number of values predicted per graph.
    """

    def __init__(self, config, global_size = 200, num_tasks = 1):
        super().__init__()
        self.config = config
        self.num_tasks = num_tasks

        # Feature widths, read from the configuration with fallbacks.
        self.node_feature_size = config.get('node_feature_size', 127)
        self.edge_feature_size = config.get('edge_feature_size', 12)
        self.hidden_size = config.get('hidden_size', 100)

        self.conv1 = GraphConv(self.node_feature_size, self.hidden_size, allow_zero_in_degree=True)
        self.conv2 = GraphConv(self.hidden_size, self.hidden_size, allow_zero_in_degree=True)
        self.conv3 = GraphConv(self.hidden_size, self.hidden_size, allow_zero_in_degree=True)
        self.conv4 = GraphConv(self.hidden_size, self.num_tasks, allow_zero_in_degree=True)

        self.dropout = nn.Dropout(p=0.1)

    def forward(self, mol_dgl_graph, globals):
        """Predict one row of ``num_tasks`` values per graph in the batch.

        Dropout follows the activations of the second and third layer only.
        The node data ``"v"`` and edge data ``"e"`` of the graph are truncated
        in place to the configured widths and the per-node output is stored
        under ``"h"``. The edge features and ``globals`` are not consumed by
        the convolution layers.
        """
        graph = mol_dgl_graph
        graph.ndata["v"] = graph.ndata["v"][:, :self.node_feature_size]
        graph.edata["e"] = graph.edata["e"][:, :self.edge_feature_size]

        hidden = F.relu(self.conv1(graph, graph.ndata["v"]))
        for layer in (self.conv2, self.conv3):
            hidden = self.dropout(F.relu(layer(graph, hidden)))

        # The last layer has no activation; its output is averaged per graph.
        graph.ndata["h"] = self.conv4(graph, hidden)
        return dgl.mean_nodes(graph, "h")

Function to Compute Score of the Model

In [34]:
def compute_score(model, data_loader, scaler, val_size, num_tasks):
    """Return the RMSE of ``model`` over ``data_loader`` divided by ``num_tasks``.

    NOTE(review): this cell redefines a function of the same name from an
    earlier cell.

    Parameters
    ----------
    model : nn.Module
        Called as ``model(graph, globals)``; it is switched to eval mode.
    data_loader : iterable
        Yields ``(graph, labels, masks, globals)`` batches.
    scaler :
        Accepted for interface compatibility; not used.
    val_size : int
        Number of samples the summed squared error is divided by.
    num_tasks : int
        Divisor applied to the final RMSE.

    Notes
    -----
    The masks are not applied: every label entry contributes to the error.
    """
    model.eval()
    # Sum (not mean) of squared errors, i.e. sum_i (y_i - y'_i)^2, so batches
    # of different sizes can be accumulated before normalising once.
    squared_error = nn.MSELoss(reduction='sum')
    total_squared_error = 0
    with torch.no_grad():
        for mol_dgl_graph, labels, masks, global_feats in data_loader:
            prediction = model(mol_dgl_graph, global_feats)
            total_squared_error += squared_error(prediction, labels).item()
    rmse = math.sqrt(total_squared_error / val_size)
    return rmse / num_tasks

loss function

In [35]:
def loss_func(output, label, mask, num_tasks):
    """Mean squared error over the entries selected by ``mask``.

    NOTE(review): this cell redefines a function of the same name from an
    earlier cell.

    Parameters
    ----------
    output, label : torch.Tensor
        Predictions and targets of the same shape.
    mask : torch.Tensor
        Weight per entry; entries with weight 0 do not contribute.
    num_tasks : int
        Accepted for interface compatibility; not used.

    Returns
    -------
    torch.Tensor
        Scalar loss: the masked squared errors summed and divided by
        ``mask.sum()``. When the mask sums to zero the plain sum is returned,
        which avoids the NaN a 0/0 division would produce.
    """
    squared_error = nn.MSELoss(reduction='none')(output, label)
    masked_error = mask * squared_error
    mask_total = mask.sum()
    if mask_total.item() == 0:
        # Nothing is labelled in this batch; returning the sum keeps the
        # result attached to the autograd graph.
        return masked_error.sum()
    return masked_error.sum() / mask_total

training function

In [36]:
def train_epoch(train_dataloader, model, optimizer):
    """Run one optimisation pass over ``train_dataloader``.

    Every batch is pushed through ``model``, scored with ``loss_func`` and
    followed by one optimizer step.

    Returns
    -------
    float
        Mean of the per-batch training losses.
    """
    model.train()  # Prepare model for training
    batch_losses = []
    for mol_dgl_graph, labels, masks, global_feats in train_dataloader:
        prediction = model(mol_dgl_graph, global_feats)
        batch_loss = loss_func(prediction, labels, masks, num_tasks)
        # Drop the gradients of the previous step before back-propagating.
        optimizer.zero_grad(set_to_none=True)
        batch_loss.backward()
        optimizer.step()
        batch_losses.append(batch_loss.detach().item())
    return sum(batch_losses) / len(batch_losses)

In [37]:
def train_evaluate(model_class=None):
    """Train a model with early stopping and keep its best checkpoint.

    After each epoch the model is scored on the validation set. Whenever the
    score improves, the model and optimizer state are written to
    ``checkpoint.pth`` inside ``checkpoint_path``. Training stops after
    ``num_epochs`` epochs or once ``patience`` epochs in a row brought no
    improvement. The checkpoint directory is then copied to
    ``best_model_path``.

    Parameters
    ----------
    model_class : type, optional
        Model class to instantiate; defaults to ``model4``.

    Raises
    ------
    RuntimeError
        If no checkpoint was written during this run. Copying the checkpoint
        directory would otherwise promote a file left behind by an earlier run.
    """
    if model_class is None:
        model_class = model4

    model = model_class(config, global_size, num_tasks)
    optimizer = torch.optim.Adam(model.parameters(), lr = 0.0001)

    best_val = float('inf')
    patience_count = 1
    checkpoint_saved = False

    for epoch in range(1, num_epochs + 1):
        # Early stopping: leave the loop instead of idling through the
        # remaining epochs.
        if patience_count > patience:
            break

        model.train()
        loss_train = train_epoch(train_dataloader, model, optimizer)
        model.eval()
        score_val = compute_score(model, val_dataloader, scaler, len(val_set), num_tasks)
        if score_val < best_val:
            best_val = score_val
            print("Save checkpoint")
            path = os.path.join(checkpoint_path, 'checkpoint.pth')
            dict_checkpoint = {"score_val": score_val}
            dict_checkpoint.update({"model_state_dict": model.state_dict(), "optimizer_state": optimizer.state_dict()})
            with open(path, "wb") as outputfile:
                cloudpickle.dump(dict_checkpoint, outputfile)
            checkpoint_saved = True
            patience_count = 1
        else:
            print("Patience", patience_count)
            patience_count += 1

        print("Epoch: {}/{} | Training Loss: {:.3f} | Valid Score: {:.3f}".format(
        epoch, num_epochs, loss_train, score_val))

        print(" ")
        print("Epoch: {}/{} | Best Valid Score Until Now: {:.3f}".format(epoch, num_epochs, best_val), "\n")

    if not checkpoint_saved:
        raise RuntimeError(
            "The validation score never improved, so no checkpoint was saved in this run"
        )

    # best model save
    shutil.rmtree(best_model_path, ignore_errors=True)
    shutil.copytree(checkpoint_path, best_model_path)

    print("Final results:")
    print("Average Valid Score: {:.3f}".format(best_val), "\n")

Function to compute test set score of the final saved model

In [38]:
def test_evaluate(model_class=None):
    """Score the saved best model on the test set and print the result.

    The checkpoint ``checkpoint.pth`` is read from ``best_model_path`` and
    loaded into a fresh model instance. The elapsed time since the global
    ``start_time`` is printed as well, so ``start_time`` and the ``time``
    module must exist before this function is called.

    Parameters
    ----------
    model_class : type, optional
        Model class to instantiate; defaults to ``model4``. It must match the
        class whose weights were saved.
    """
    if model_class is None:
        model_class = model4

    final_model = model_class(config, global_size, num_tasks)
    path = os.path.join(best_model_path, 'checkpoint.pth')
    # NOTE: unpickling can execute arbitrary code; only load checkpoints that
    # were written by this notebook.
    with open(path, 'rb') as f:
        checkpoint = cloudpickle.load(f)
    final_model.load_state_dict(checkpoint["model_state_dict"])
    final_model.eval()
    # The summed squared error must be divided by the number of test samples;
    # dividing by the validation set size distorts the reported RMSE.
    test_score = compute_score(final_model, test_dataloader, scaler, len(test_set), num_tasks)

    print("Test Score: {:.3f}".format(test_score), "\n")
    print("Execution time: {:.3f} seconds".format(time.time() - start_time))

Train the model and evaluate its performance

In [39]:
import time
# Wall-clock reference that test_evaluate() reads when printing the execution time.
start_time = time.time()

# Train with early stopping, then score the saved best checkpoint on the test set.
train_evaluate()
test_evaluate()

Save checkpoint
Epoch: 1/100 | Training Loss: 10.175 | Valid Score: 4.077
 
Epoch: 1/100 | Best Valid Score Until Now: 4.077 

Save checkpoint
Epoch: 2/100 | Training Loss: 9.419 | Valid Score: 4.014
 
Epoch: 2/100 | Best Valid Score Until Now: 4.014 

Save checkpoint
Epoch: 3/100 | Training Loss: 8.645 | Valid Score: 3.949
 
Epoch: 3/100 | Best Valid Score Until Now: 3.949 

Save checkpoint
Epoch: 4/100 | Training Loss: 9.077 | Valid Score: 3.881
 
Epoch: 4/100 | Best Valid Score Until Now: 3.881 

Save checkpoint
Epoch: 5/100 | Training Loss: 8.337 | Valid Score: 3.804
 
Epoch: 5/100 | Best Valid Score Until Now: 3.804 

Save checkpoint
Epoch: 6/100 | Training Loss: 9.771 | Valid Score: 3.721
 
Epoch: 6/100 | Best Valid Score Until Now: 3.721 

Save checkpoint
Epoch: 7/100 | Training Loss: 7.581 | Valid Score: 3.630
 
Epoch: 7/100 | Best Valid Score Until Now: 3.630 

Save checkpoint
Epoch: 8/100 | Training Loss: 8.713 | Valid Score: 3.531
 
Epoch: 8/100 | Best Valid Score Until Now:

## model5

Uses graph convolution (`GraphConv`) layers<br>
4 layers<br>
dropout = 0.2

In [40]:
class model5(nn.Module):
    """Graph-level regressor built from four ``GraphConv`` layers with dropout (p=0.2).

    Node features go through four graph-convolution layers (ReLU after the
    first three, dropout after the second and third); the per-node outputs of
    the last layer are then averaged over each graph with ``dgl.mean_nodes``.

    Args:
        config: dict-like settings; optional keys ``node_feature_size``
            (default 127), ``edge_feature_size`` (default 12) and
            ``hidden_size`` (default 100).
        global_size: accepted for interface compatibility; not used by this model.
        num_tasks: output width of the last layer.
    """

    def __init__(self, config, global_size = 200, num_tasks = 1):
        super().__init__()
        self.config = config
        self.num_tasks = num_tasks

        # Node feature size
        self.node_feature_size = self.config.get('node_feature_size', 127)

        # Edge feature size
        self.edge_feature_size = self.config.get('edge_feature_size', 12)

        # Hidden size
        self.hidden_size = self.config.get('hidden_size', 100)

        self.conv1 = GraphConv(self.node_feature_size, self.hidden_size, allow_zero_in_degree=True)
        self.conv2 = GraphConv(self.hidden_size, self.hidden_size, allow_zero_in_degree=True)
        self.conv3 = GraphConv(self.hidden_size, self.hidden_size, allow_zero_in_degree=True)
        self.conv4 = GraphConv(self.hidden_size, self.num_tasks, allow_zero_in_degree=True)

        self.dropout = nn.Dropout(p=0.2)

    def forward(self, mol_dgl_graph, globals):
        """Return the per-graph mean of the last layer's node outputs.

        Args:
            mol_dgl_graph: graph with node features in ``ndata["v"]`` and edge
                features in ``edata["e"]``.
            globals: accepted for interface compatibility; not used by this model.
        """
        # Work in a local scope so the truncated features and the temporary "h"
        # field do not leak into the graph object owned by the caller.
        with mol_dgl_graph.local_scope():
            mol_dgl_graph.ndata["v"] = mol_dgl_graph.ndata["v"][:, :self.node_feature_size]
            # Edge features are truncated as well, although the conv calls below
            # only pass node features explicitly.
            mol_dgl_graph.edata["e"] = mol_dgl_graph.edata["e"][:, :self.edge_feature_size]
            h = self.conv1(mol_dgl_graph, mol_dgl_graph.ndata["v"])
            h = F.relu(h)
            h = self.conv2(mol_dgl_graph, h)
            h = F.relu(h)
            h = self.dropout(h)
            h = self.conv3(mol_dgl_graph, h)
            h = F.relu(h)
            h = self.dropout(h)
            h = self.conv4(mol_dgl_graph, h)
            mol_dgl_graph.ndata["h"] = h
            return dgl.mean_nodes(mol_dgl_graph, "h")

Function to Compute Score of the Model

In [41]:
def compute_score(model, data_loader, scaler, val_size, num_tasks):
    """Return the root-mean-squared error of ``model`` over ``data_loader``.

    Args:
        model: module called as ``model(graph, globals)``; it is switched to
            evaluation mode and left in that mode.
        data_loader: iterable of ``(graph, labels, masks, globals)`` batches.
            The masks are not applied here.
        scaler: unused; kept so existing call sites keep working.
        val_size: value the summed squared error is divided by (callers pass
            the number of samples in the evaluated split).
        num_tasks: the square root is divided by this value.

    Returns:
        float: ``sqrt(sum_i (y_i - y'_i)^2 / val_size) / num_tasks``.
    """
    model.eval()
    # Summed (not averaged) squared error, i.e. sum_i (y_i - y'_i)^2, so that
    # batches of different sizes accumulate correctly.
    squared_error = nn.MSELoss(reduction='sum')
    total_squared_error = 0
    with torch.no_grad():
        for mol_dgl_graph, labels, _masks, global_feats in data_loader:
            prediction = model(mol_dgl_graph, global_feats)
            total_squared_error += squared_error(prediction, labels).item()
    return math.sqrt(total_squared_error / val_size) / num_tasks

loss function

In [42]:
def loss_func(output, label, mask, num_tasks):
    """Masked mean squared error.

    Args:
        output: model predictions.
        label: targets, broadcastable against ``output``.
        mask: weights multiplied element-wise into the squared errors; entries
            equal to 0 drop the corresponding target from the loss.
        num_tasks: unused; kept so existing call sites keep working.

    Returns:
        Scalar tensor: sum of masked squared errors divided by ``mask.sum()``.
        When the mask sums to zero the (zero) masked sum is returned as is,
        instead of 0/0 = NaN, so a fully masked batch cannot corrupt the weights.
    """
    criterion = nn.MSELoss(reduction='none')
    masked_loss = mask * criterion(output, label)
    valid = mask.sum()
    if valid == 0:
        # Nothing to average over; keep the result finite and differentiable.
        return masked_loss.sum()
    return masked_loss.sum() / valid

training function

In [43]:
def train_epoch(train_dataloader, model, optimizer):
    """Run one optimisation pass over ``train_dataloader``.

    Every batch is pushed through ``model``, scored with ``loss_func`` (using
    the module-level ``num_tasks``) and followed by one optimizer step.

    Returns:
        float: mean of the per-batch training losses.
    """
    model.train()  # training mode for the whole pass
    batch_losses = []
    for mol_dgl_graph, labels, masks, global_feats in train_dataloader:
        prediction = model(mol_dgl_graph, global_feats)
        batch_loss = loss_func(prediction, labels, masks, num_tasks)
        optimizer.zero_grad(set_to_none=True)
        batch_loss.backward()
        optimizer.step()
        batch_losses.append(batch_loss.detach().item())
    return sum(batch_losses) / len(batch_losses)

In [44]:
def train_evaluate():
    """Train a fresh ``model5`` with early stopping and keep its best checkpoint.

    Each epoch runs ``train_epoch`` and scores the model on ``val_dataloader``
    with ``compute_score`` (lower is better). Whenever the validation score
    improves, the model and optimizer state are written to
    ``checkpoint_path/checkpoint.pth`` and the patience counter is reset;
    otherwise the counter is incremented. Training ends after ``num_epochs``
    epochs or as soon as the counter exceeds ``patience``. Finally the
    checkpoint directory is copied to ``best_model_path``, replacing any
    previous content.

    Relies on module-level names: ``config``, ``global_size``, ``num_tasks``,
    ``num_epochs``, ``patience``, ``scaler``, ``val_set``, the data loaders and
    the checkpoint paths.
    """
    model = model5(config, global_size, num_tasks)
    optimizer = torch.optim.Adam(model.parameters(), lr = 0.0001)

    best_val = float('inf')
    patience_count = 1
    epoch = 1
    path = os.path.join(checkpoint_path, 'checkpoint.pth')

    # Leave the loop as soon as patience is exhausted instead of idling through
    # the remaining epoch numbers.
    while epoch <= num_epochs and patience_count <= patience:
        model.train()
        loss_train = train_epoch(train_dataloader, model, optimizer)
        model.eval()
        score_val = compute_score(model, val_dataloader, scaler, len(val_set), num_tasks)
        if score_val < best_val:
            best_val = score_val
            print("Save checkpoint")
            dict_checkpoint = {
                "score_val": score_val,
                "model_state_dict": model.state_dict(),
                "optimizer_state": optimizer.state_dict(),
            }
            with open(path, "wb") as outputfile:
                cloudpickle.dump(dict_checkpoint, outputfile)
            patience_count = 1
        else:
            print("Patience", patience_count)
            patience_count += 1

        print("Epoch: {}/{} | Training Loss: {:.3f} | Valid Score: {:.3f}".format(
            epoch, num_epochs, loss_train, score_val))

        print(" ")
        print("Epoch: {}/{} | Best Valid Score Until Now: {:.3f}".format(epoch, num_epochs, best_val), "\n")
        epoch += 1

    # best model save
    shutil.rmtree(best_model_path, ignore_errors=True)
    shutil.copytree(checkpoint_path, best_model_path)

    print("Final results:")
    # best_val is a single float, so it is printed directly.
    print("Average Valid Score: {:.3f}".format(best_val), "\n")

Function to compute test set score of the final saved model

In [45]:
def test_evaluate():
    """Score the saved best checkpoint on the test set and print the result.

    Builds a fresh ``model5``, restores the weights stored in
    ``best_model_path/checkpoint.pth``, evaluates it on ``test_dataloader``
    with ``compute_score`` and prints the score together with the time
    elapsed since the module-level ``start_time``.

    Relies on module-level names: ``config``, ``global_size``, ``num_tasks``,
    ``best_model_path``, ``test_dataloader``, ``scaler``, ``time`` and
    ``start_time``.
    """
    final_model = model5(config, global_size, num_tasks)
    path = os.path.join(best_model_path, 'checkpoint.pth')
    # NOTE: unpickling executes code stored in the file; only load checkpoints
    # written by this notebook.
    with open(path, 'rb') as f:
        checkpoint = cloudpickle.load(f)
    final_model.load_state_dict(checkpoint["model_state_dict"])
    final_model.eval()
    # Normalise by the number of test samples, not len(val_set): the score is a
    # per-sample average over the split that is actually being evaluated.
    test_size = len(test_dataloader.dataset)
    test_score = compute_score(final_model, test_dataloader, scaler, test_size, num_tasks)

    print("Test Score: {:.3f}".format(test_score), "\n")
    print("Execution time: {:.3f} seconds".format(time.time() - start_time))

Train the model and evaluate its performance

In [46]:
# `time` and `start_time` are read as globals by test_evaluate() to report the elapsed time.
import time
start_time = time.time()

# Train model5 with early stopping, then score the saved best checkpoint on the test set.
train_evaluate()
test_evaluate()

Save checkpoint
Epoch: 1/100 | Training Loss: 10.395 | Valid Score: 4.086
 
Epoch: 1/100 | Best Valid Score Until Now: 4.086 

Save checkpoint
Epoch: 2/100 | Training Loss: 9.660 | Valid Score: 4.023
 
Epoch: 2/100 | Best Valid Score Until Now: 4.023 

Save checkpoint
Epoch: 3/100 | Training Loss: 10.658 | Valid Score: 3.961
 
Epoch: 3/100 | Best Valid Score Until Now: 3.961 

Save checkpoint
Epoch: 4/100 | Training Loss: 9.234 | Valid Score: 3.899
 
Epoch: 4/100 | Best Valid Score Until Now: 3.899 

Save checkpoint
Epoch: 5/100 | Training Loss: 8.142 | Valid Score: 3.832
 
Epoch: 5/100 | Best Valid Score Until Now: 3.832 

Save checkpoint
Epoch: 6/100 | Training Loss: 8.313 | Valid Score: 3.760
 
Epoch: 6/100 | Best Valid Score Until Now: 3.760 

Save checkpoint
Epoch: 7/100 | Training Loss: 8.065 | Valid Score: 3.684
 
Epoch: 7/100 | Best Valid Score Until Now: 3.684 

Save checkpoint
Epoch: 8/100 | Training Loss: 7.192 | Valid Score: 3.603
 
Epoch: 8/100 | Best Valid Score Until Now

## model6

Uses GraphSAGE (`SAGEConv`) layers<br>
4 layers<br>
no dropout

In [47]:
class model6(nn.Module):
    """Graph-level regressor built from four ``SAGEConv`` layers, without dropout.

    Node features go through four ``SAGEConv`` layers (ReLU after the first
    three); the per-node outputs of the last layer are then averaged over each
    graph with ``dgl.mean_nodes``.

    Args:
        config: dict-like settings; optional keys ``node_feature_size``
            (default 127), ``edge_feature_size`` (default 12) and
            ``hidden_size`` (default 100).
        global_size: accepted for interface compatibility; not used by this model.
        num_tasks: output width of the last layer.
    """

    def __init__(self, config, global_size = 200, num_tasks = 1):
        super().__init__()
        self.config = config
        self.num_tasks = num_tasks

        # Node feature size
        self.node_feature_size = self.config.get('node_feature_size', 127)

        # Edge feature size
        self.edge_feature_size = self.config.get('edge_feature_size', 12)

        # Hidden size
        self.hidden_size = self.config.get('hidden_size', 100)

        # NOTE(review): SAGEConv is called with (in, out) only; confirm this is the
        # notebook's own SAGEConv definition, since dgl.nn.SAGEConv also requires
        # an aggregator type.
        self.conv1 = SAGEConv(self.node_feature_size, self.hidden_size)
        self.conv2 = SAGEConv(self.hidden_size, self.hidden_size)
        self.conv3 = SAGEConv(self.hidden_size, self.hidden_size)
        self.conv4 = SAGEConv(self.hidden_size, self.num_tasks)

    def forward(self, mol_dgl_graph, globals):
        """Return the per-graph mean of the last layer's node outputs.

        Args:
            mol_dgl_graph: graph with node features in ``ndata["v"]`` and edge
                features in ``edata["e"]``.
            globals: accepted for interface compatibility; not used by this model.
        """
        # Work in a local scope so the truncated features and the temporary "h"
        # field do not leak into the graph object owned by the caller.
        with mol_dgl_graph.local_scope():
            mol_dgl_graph.ndata["v"] = mol_dgl_graph.ndata["v"][:, :self.node_feature_size]
            # Edge features are truncated as well, although the conv calls below
            # only pass node features explicitly.
            mol_dgl_graph.edata["e"] = mol_dgl_graph.edata["e"][:, :self.edge_feature_size]
            h = self.conv1(mol_dgl_graph, mol_dgl_graph.ndata["v"])
            h = F.relu(h)
            h = self.conv2(mol_dgl_graph, h)
            h = F.relu(h)
            h = self.conv3(mol_dgl_graph, h)
            h = F.relu(h)
            h = self.conv4(mol_dgl_graph, h)
            mol_dgl_graph.ndata["h"] = h
            return dgl.mean_nodes(mol_dgl_graph, "h")

Function to Compute Score of the Model

In [48]:
def compute_score(model, data_loader, scaler, val_size, num_tasks):
    """Return the root-mean-squared error of ``model`` over ``data_loader``.

    Args:
        model: module called as ``model(graph, globals)``; it is switched to
            evaluation mode and left in that mode.
        data_loader: iterable of ``(graph, labels, masks, globals)`` batches.
            The masks are not applied here.
        scaler: unused; kept so existing call sites keep working.
        val_size: value the summed squared error is divided by (callers pass
            the number of samples in the evaluated split).
        num_tasks: the square root is divided by this value.

    Returns:
        float: ``sqrt(sum_i (y_i - y'_i)^2 / val_size) / num_tasks``.
    """
    model.eval()
    # Summed (not averaged) squared error, i.e. sum_i (y_i - y'_i)^2, so that
    # batches of different sizes accumulate correctly.
    squared_error = nn.MSELoss(reduction='sum')
    total_squared_error = 0
    with torch.no_grad():
        for mol_dgl_graph, labels, _masks, global_feats in data_loader:
            prediction = model(mol_dgl_graph, global_feats)
            total_squared_error += squared_error(prediction, labels).item()
    return math.sqrt(total_squared_error / val_size) / num_tasks

loss function

In [49]:
def loss_func(output, label, mask, num_tasks):
    """Masked mean squared error.

    Args:
        output: model predictions.
        label: targets, broadcastable against ``output``.
        mask: weights multiplied element-wise into the squared errors; entries
            equal to 0 drop the corresponding target from the loss.
        num_tasks: unused; kept so existing call sites keep working.

    Returns:
        Scalar tensor: sum of masked squared errors divided by ``mask.sum()``.
        When the mask sums to zero the (zero) masked sum is returned as is,
        instead of 0/0 = NaN, so a fully masked batch cannot corrupt the weights.
    """
    criterion = nn.MSELoss(reduction='none')
    masked_loss = mask * criterion(output, label)
    valid = mask.sum()
    if valid == 0:
        # Nothing to average over; keep the result finite and differentiable.
        return masked_loss.sum()
    return masked_loss.sum() / valid

training function

In [50]:
def train_epoch(train_dataloader, model, optimizer):
    """Run one optimisation pass over ``train_dataloader``.

    Every batch is pushed through ``model``, scored with ``loss_func`` (using
    the module-level ``num_tasks``) and followed by one optimizer step.

    Returns:
        float: mean of the per-batch training losses.
    """
    model.train()  # training mode for the whole pass
    batch_losses = []
    for mol_dgl_graph, labels, masks, global_feats in train_dataloader:
        prediction = model(mol_dgl_graph, global_feats)
        batch_loss = loss_func(prediction, labels, masks, num_tasks)
        optimizer.zero_grad(set_to_none=True)
        batch_loss.backward()
        optimizer.step()
        batch_losses.append(batch_loss.detach().item())
    return sum(batch_losses) / len(batch_losses)

In [51]:
def train_evaluate():
    """Train a fresh ``model6`` with early stopping and keep its best checkpoint.

    Each epoch runs ``train_epoch`` and scores the model on ``val_dataloader``
    with ``compute_score`` (lower is better). Whenever the validation score
    improves, the model and optimizer state are written to
    ``checkpoint_path/checkpoint.pth`` and the patience counter is reset;
    otherwise the counter is incremented. Training ends after ``num_epochs``
    epochs or as soon as the counter exceeds ``patience``. Finally the
    checkpoint directory is copied to ``best_model_path``, replacing any
    previous content.

    Relies on module-level names: ``config``, ``global_size``, ``num_tasks``,
    ``num_epochs``, ``patience``, ``scaler``, ``val_set``, the data loaders and
    the checkpoint paths.
    """
    model = model6(config, global_size, num_tasks)
    optimizer = torch.optim.Adam(model.parameters(), lr = 0.0001)

    best_val = float('inf')
    patience_count = 1
    epoch = 1
    path = os.path.join(checkpoint_path, 'checkpoint.pth')

    # Leave the loop as soon as patience is exhausted instead of idling through
    # the remaining epoch numbers.
    while epoch <= num_epochs and patience_count <= patience:
        model.train()
        loss_train = train_epoch(train_dataloader, model, optimizer)
        model.eval()
        score_val = compute_score(model, val_dataloader, scaler, len(val_set), num_tasks)
        if score_val < best_val:
            best_val = score_val
            print("Save checkpoint")
            dict_checkpoint = {
                "score_val": score_val,
                "model_state_dict": model.state_dict(),
                "optimizer_state": optimizer.state_dict(),
            }
            with open(path, "wb") as outputfile:
                cloudpickle.dump(dict_checkpoint, outputfile)
            patience_count = 1
        else:
            print("Patience", patience_count)
            patience_count += 1

        print("Epoch: {}/{} | Training Loss: {:.3f} | Valid Score: {:.3f}".format(
            epoch, num_epochs, loss_train, score_val))

        print(" ")
        print("Epoch: {}/{} | Best Valid Score Until Now: {:.3f}".format(epoch, num_epochs, best_val), "\n")
        epoch += 1

    # best model save
    shutil.rmtree(best_model_path, ignore_errors=True)
    shutil.copytree(checkpoint_path, best_model_path)

    print("Final results:")
    # best_val is a single float, so it is printed directly.
    print("Average Valid Score: {:.3f}".format(best_val), "\n")

Function to compute test set score of the final saved model

In [52]:
def test_evaluate():
    """Score the saved best checkpoint on the test set and print the result.

    Builds a fresh ``model6``, restores the weights stored in
    ``best_model_path/checkpoint.pth``, evaluates it on ``test_dataloader``
    with ``compute_score`` and prints the score together with the time
    elapsed since the module-level ``start_time``.

    Relies on module-level names: ``config``, ``global_size``, ``num_tasks``,
    ``best_model_path``, ``test_dataloader``, ``scaler``, ``time`` and
    ``start_time``.
    """
    final_model = model6(config, global_size, num_tasks)
    path = os.path.join(best_model_path, 'checkpoint.pth')
    # NOTE: unpickling executes code stored in the file; only load checkpoints
    # written by this notebook.
    with open(path, 'rb') as f:
        checkpoint = cloudpickle.load(f)
    final_model.load_state_dict(checkpoint["model_state_dict"])
    final_model.eval()
    # Normalise by the number of test samples, not len(val_set): the score is a
    # per-sample average over the split that is actually being evaluated.
    test_size = len(test_dataloader.dataset)
    test_score = compute_score(final_model, test_dataloader, scaler, test_size, num_tasks)

    print("Test Score: {:.3f}".format(test_score), "\n")
    print("Execution time: {:.3f} seconds".format(time.time() - start_time))

Train the model and evaluate its performance

In [53]:
# `time` and `start_time` are read as globals by test_evaluate() to report the elapsed time.
import time
start_time = time.time()

# Train model6 with early stopping, then score the saved best checkpoint on the test set.
train_evaluate()
test_evaluate()

Save checkpoint
Epoch: 1/100 | Training Loss: 10.606 | Valid Score: 4.185
 
Epoch: 1/100 | Best Valid Score Until Now: 4.185 

Save checkpoint
Epoch: 2/100 | Training Loss: 10.544 | Valid Score: 4.162
 
Epoch: 2/100 | Best Valid Score Until Now: 4.162 

Save checkpoint
Epoch: 3/100 | Training Loss: 11.403 | Valid Score: 4.133
 
Epoch: 3/100 | Best Valid Score Until Now: 4.133 

Save checkpoint
Epoch: 4/100 | Training Loss: 10.285 | Valid Score: 4.093
 
Epoch: 4/100 | Best Valid Score Until Now: 4.093 

Save checkpoint
Epoch: 5/100 | Training Loss: 10.992 | Valid Score: 4.034
 
Epoch: 5/100 | Best Valid Score Until Now: 4.034 

Save checkpoint
Epoch: 6/100 | Training Loss: 9.109 | Valid Score: 3.948
 
Epoch: 6/100 | Best Valid Score Until Now: 3.948 

Save checkpoint
Epoch: 7/100 | Training Loss: 9.141 | Valid Score: 3.826
 
Epoch: 7/100 | Best Valid Score Until Now: 3.826 

Save checkpoint
Epoch: 8/100 | Training Loss: 8.125 | Valid Score: 3.657
 
Epoch: 8/100 | Best Valid Score Until 

## model7

Uses GraphSAGE (`SAGEConv`) layers<br>
4 layers<br>
dropout = 0.1

In [54]:
class model7(nn.Module):
    """Graph-level regressor built from four ``SAGEConv`` layers with dropout (p=0.1).

    Node features go through four ``SAGEConv`` layers (ReLU after the first
    three, dropout after the second and third); the per-node outputs of the
    last layer are then averaged over each graph with ``dgl.mean_nodes``.

    Args:
        config: dict-like settings; optional keys ``node_feature_size``
            (default 127), ``edge_feature_size`` (default 12) and
            ``hidden_size`` (default 100).
        global_size: accepted for interface compatibility; not used by this model.
        num_tasks: output width of the last layer.
    """

    def __init__(self, config, global_size = 200, num_tasks = 1):
        super().__init__()
        self.config = config
        self.num_tasks = num_tasks

        # Node feature size
        self.node_feature_size = self.config.get('node_feature_size', 127)

        # Edge feature size
        self.edge_feature_size = self.config.get('edge_feature_size', 12)

        # Hidden size
        self.hidden_size = self.config.get('hidden_size', 100)

        # NOTE(review): SAGEConv is called with (in, out) only; confirm this is the
        # notebook's own SAGEConv definition, since dgl.nn.SAGEConv also requires
        # an aggregator type.
        self.conv1 = SAGEConv(self.node_feature_size, self.hidden_size)
        self.conv2 = SAGEConv(self.hidden_size, self.hidden_size)
        self.conv3 = SAGEConv(self.hidden_size, self.hidden_size)
        self.conv4 = SAGEConv(self.hidden_size, self.num_tasks)

        self.dropout = nn.Dropout(p=0.1)

    def forward(self, mol_dgl_graph, globals):
        """Return the per-graph mean of the last layer's node outputs.

        Args:
            mol_dgl_graph: graph with node features in ``ndata["v"]`` and edge
                features in ``edata["e"]``.
            globals: accepted for interface compatibility; not used by this model.
        """
        # Work in a local scope so the truncated features and the temporary "h"
        # field do not leak into the graph object owned by the caller.
        with mol_dgl_graph.local_scope():
            mol_dgl_graph.ndata["v"] = mol_dgl_graph.ndata["v"][:, :self.node_feature_size]
            # Edge features are truncated as well, although the conv calls below
            # only pass node features explicitly.
            mol_dgl_graph.edata["e"] = mol_dgl_graph.edata["e"][:, :self.edge_feature_size]
            h = self.conv1(mol_dgl_graph, mol_dgl_graph.ndata["v"])
            h = F.relu(h)
            h = self.conv2(mol_dgl_graph, h)
            h = F.relu(h)
            h = self.dropout(h)
            h = self.conv3(mol_dgl_graph, h)
            h = F.relu(h)
            h = self.dropout(h)
            h = self.conv4(mol_dgl_graph, h)
            mol_dgl_graph.ndata["h"] = h
            return dgl.mean_nodes(mol_dgl_graph, "h")

Function to Compute Score of the Model

In [55]:
def compute_score(model, data_loader, scaler, val_size, num_tasks):
    """Return the root-mean-squared error of ``model`` over ``data_loader``.

    Args:
        model: module called as ``model(graph, globals)``; it is switched to
            evaluation mode and left in that mode.
        data_loader: iterable of ``(graph, labels, masks, globals)`` batches.
            The masks are not applied here.
        scaler: unused; kept so existing call sites keep working.
        val_size: value the summed squared error is divided by (callers pass
            the number of samples in the evaluated split).
        num_tasks: the square root is divided by this value.

    Returns:
        float: ``sqrt(sum_i (y_i - y'_i)^2 / val_size) / num_tasks``.
    """
    model.eval()
    # Summed (not averaged) squared error, i.e. sum_i (y_i - y'_i)^2, so that
    # batches of different sizes accumulate correctly.
    squared_error = nn.MSELoss(reduction='sum')
    total_squared_error = 0
    with torch.no_grad():
        for mol_dgl_graph, labels, _masks, global_feats in data_loader:
            prediction = model(mol_dgl_graph, global_feats)
            total_squared_error += squared_error(prediction, labels).item()
    return math.sqrt(total_squared_error / val_size) / num_tasks

loss function

In [56]:
def loss_func(output, label, mask, num_tasks):
    """Masked mean squared error.

    Args:
        output: model predictions.
        label: targets, broadcastable against ``output``.
        mask: weights multiplied element-wise into the squared errors; entries
            equal to 0 drop the corresponding target from the loss.
        num_tasks: unused; kept so existing call sites keep working.

    Returns:
        Scalar tensor: sum of masked squared errors divided by ``mask.sum()``.
        When the mask sums to zero the (zero) masked sum is returned as is,
        instead of 0/0 = NaN, so a fully masked batch cannot corrupt the weights.
    """
    criterion = nn.MSELoss(reduction='none')
    masked_loss = mask * criterion(output, label)
    valid = mask.sum()
    if valid == 0:
        # Nothing to average over; keep the result finite and differentiable.
        return masked_loss.sum()
    return masked_loss.sum() / valid

training function

In [57]:
def train_epoch(train_dataloader, model, optimizer):
    """Run one optimisation pass over ``train_dataloader``.

    Every batch is pushed through ``model``, scored with ``loss_func`` (using
    the module-level ``num_tasks``) and followed by one optimizer step.

    Returns:
        float: mean of the per-batch training losses.
    """
    model.train()  # training mode for the whole pass
    batch_losses = []
    for mol_dgl_graph, labels, masks, global_feats in train_dataloader:
        prediction = model(mol_dgl_graph, global_feats)
        batch_loss = loss_func(prediction, labels, masks, num_tasks)
        optimizer.zero_grad(set_to_none=True)
        batch_loss.backward()
        optimizer.step()
        batch_losses.append(batch_loss.detach().item())
    return sum(batch_losses) / len(batch_losses)

In [58]:
def train_evaluate():
    """Train a fresh ``model7`` with early stopping and keep its best checkpoint.

    Each epoch runs ``train_epoch`` and scores the model on ``val_dataloader``
    with ``compute_score`` (lower is better). Whenever the validation score
    improves, the model and optimizer state are written to
    ``checkpoint_path/checkpoint.pth`` and the patience counter is reset;
    otherwise the counter is incremented. Training ends after ``num_epochs``
    epochs or as soon as the counter exceeds ``patience``. Finally the
    checkpoint directory is copied to ``best_model_path``, replacing any
    previous content.

    Relies on module-level names: ``config``, ``global_size``, ``num_tasks``,
    ``num_epochs``, ``patience``, ``scaler``, ``val_set``, the data loaders and
    the checkpoint paths.
    """
    model = model7(config, global_size, num_tasks)
    optimizer = torch.optim.Adam(model.parameters(), lr = 0.0001)

    best_val = float('inf')
    patience_count = 1
    epoch = 1
    path = os.path.join(checkpoint_path, 'checkpoint.pth')

    # Leave the loop as soon as patience is exhausted instead of idling through
    # the remaining epoch numbers.
    while epoch <= num_epochs and patience_count <= patience:
        model.train()
        loss_train = train_epoch(train_dataloader, model, optimizer)
        model.eval()
        score_val = compute_score(model, val_dataloader, scaler, len(val_set), num_tasks)
        if score_val < best_val:
            best_val = score_val
            print("Save checkpoint")
            dict_checkpoint = {
                "score_val": score_val,
                "model_state_dict": model.state_dict(),
                "optimizer_state": optimizer.state_dict(),
            }
            with open(path, "wb") as outputfile:
                cloudpickle.dump(dict_checkpoint, outputfile)
            patience_count = 1
        else:
            print("Patience", patience_count)
            patience_count += 1

        print("Epoch: {}/{} | Training Loss: {:.3f} | Valid Score: {:.3f}".format(
            epoch, num_epochs, loss_train, score_val))

        print(" ")
        print("Epoch: {}/{} | Best Valid Score Until Now: {:.3f}".format(epoch, num_epochs, best_val), "\n")
        epoch += 1

    # best model save
    shutil.rmtree(best_model_path, ignore_errors=True)
    shutil.copytree(checkpoint_path, best_model_path)

    print("Final results:")
    # best_val is a single float, so it is printed directly.
    print("Average Valid Score: {:.3f}".format(best_val), "\n")

Function to compute test set score of the final saved model

In [59]:
def test_evaluate():
    """Score the saved best checkpoint on the test set and print the result.

    Builds a fresh ``model7``, restores the weights stored in
    ``best_model_path/checkpoint.pth``, evaluates it on ``test_dataloader``
    with ``compute_score`` and prints the score together with the time
    elapsed since the module-level ``start_time``.

    Relies on module-level names: ``config``, ``global_size``, ``num_tasks``,
    ``best_model_path``, ``test_dataloader``, ``scaler``, ``time`` and
    ``start_time``.
    """
    final_model = model7(config, global_size, num_tasks)
    path = os.path.join(best_model_path, 'checkpoint.pth')
    # NOTE: unpickling executes code stored in the file; only load checkpoints
    # written by this notebook.
    with open(path, 'rb') as f:
        checkpoint = cloudpickle.load(f)
    final_model.load_state_dict(checkpoint["model_state_dict"])
    final_model.eval()
    # Normalise by the number of test samples, not len(val_set): the score is a
    # per-sample average over the split that is actually being evaluated.
    test_size = len(test_dataloader.dataset)
    test_score = compute_score(final_model, test_dataloader, scaler, test_size, num_tasks)

    print("Test Score: {:.3f}".format(test_score), "\n")
    print("Execution time: {:.3f} seconds".format(time.time() - start_time))

Train the model and evaluate its performance

In [60]:
# `time` and `start_time` are read as globals by test_evaluate() to report the elapsed time.
import time
start_time = time.time()

# Train model7 with early stopping, then score the saved best checkpoint on the test set.
train_evaluate()
test_evaluate()

Save checkpoint
Epoch: 1/100 | Training Loss: 11.323 | Valid Score: 4.162
 
Epoch: 1/100 | Best Valid Score Until Now: 4.162 

Save checkpoint
Epoch: 2/100 | Training Loss: 10.133 | Valid Score: 4.127
 
Epoch: 2/100 | Best Valid Score Until Now: 4.127 

Save checkpoint
Epoch: 3/100 | Training Loss: 11.501 | Valid Score: 4.083
 
Epoch: 3/100 | Best Valid Score Until Now: 4.083 

Save checkpoint
Epoch: 4/100 | Training Loss: 9.883 | Valid Score: 4.023
 
Epoch: 4/100 | Best Valid Score Until Now: 4.023 

Save checkpoint
Epoch: 5/100 | Training Loss: 9.681 | Valid Score: 3.941
 
Epoch: 5/100 | Best Valid Score Until Now: 3.941 

Save checkpoint
Epoch: 6/100 | Training Loss: 8.843 | Valid Score: 3.829
 
Epoch: 6/100 | Best Valid Score Until Now: 3.829 

Save checkpoint
Epoch: 7/100 | Training Loss: 8.030 | Valid Score: 3.680
 
Epoch: 7/100 | Best Valid Score Until Now: 3.680 

Save checkpoint
Epoch: 8/100 | Training Loss: 7.529 | Valid Score: 3.491
 
Epoch: 8/100 | Best Valid Score Until No

## model8

Uses GraphSAGE (`SAGEConv`) layers<br>
4 layers<br>
dropout = 0.2

In [61]:
class model8(nn.Module):
    """Graph-level regressor built from four ``SAGEConv`` layers with dropout (p=0.2).

    Node features go through four ``SAGEConv`` layers (ReLU after the first
    three, dropout after the second and third); the per-node outputs of the
    last layer are then averaged over each graph with ``dgl.mean_nodes``.

    Args:
        config: dict-like settings; optional keys ``node_feature_size``
            (default 127), ``edge_feature_size`` (default 12) and
            ``hidden_size`` (default 100).
        global_size: accepted for interface compatibility; not used by this model.
        num_tasks: output width of the last layer.
    """

    def __init__(self, config, global_size = 200, num_tasks = 1):
        super().__init__()
        self.config = config
        self.num_tasks = num_tasks

        # Node feature size
        self.node_feature_size = self.config.get('node_feature_size', 127)

        # Edge feature size
        self.edge_feature_size = self.config.get('edge_feature_size', 12)

        # Hidden size
        self.hidden_size = self.config.get('hidden_size', 100)

        # NOTE(review): SAGEConv is called with (in, out) only; confirm this is the
        # notebook's own SAGEConv definition, since dgl.nn.SAGEConv also requires
        # an aggregator type.
        self.conv1 = SAGEConv(self.node_feature_size, self.hidden_size)
        self.conv2 = SAGEConv(self.hidden_size, self.hidden_size)
        self.conv3 = SAGEConv(self.hidden_size, self.hidden_size)
        self.conv4 = SAGEConv(self.hidden_size, self.num_tasks)

        self.dropout = nn.Dropout(p=0.2)

    def forward(self, mol_dgl_graph, globals):
        """Return the per-graph mean of the last layer's node outputs.

        Args:
            mol_dgl_graph: graph with node features in ``ndata["v"]`` and edge
                features in ``edata["e"]``.
            globals: accepted for interface compatibility; not used by this model.
        """
        # Work in a local scope so the truncated features and the temporary "h"
        # field do not leak into the graph object owned by the caller.
        with mol_dgl_graph.local_scope():
            mol_dgl_graph.ndata["v"] = mol_dgl_graph.ndata["v"][:, :self.node_feature_size]
            # Edge features are truncated as well, although the conv calls below
            # only pass node features explicitly.
            mol_dgl_graph.edata["e"] = mol_dgl_graph.edata["e"][:, :self.edge_feature_size]
            h = self.conv1(mol_dgl_graph, mol_dgl_graph.ndata["v"])
            h = F.relu(h)
            h = self.conv2(mol_dgl_graph, h)
            h = F.relu(h)
            h = self.dropout(h)
            h = self.conv3(mol_dgl_graph, h)
            h = F.relu(h)
            h = self.dropout(h)
            h = self.conv4(mol_dgl_graph, h)
            mol_dgl_graph.ndata["h"] = h
            return dgl.mean_nodes(mol_dgl_graph, "h")

Function to Compute Score of the Model

In [62]:
def compute_score(model, data_loader, scaler, val_size, num_tasks):
    """Return the root-mean-squared error of ``model`` over ``data_loader``.

    Args:
        model: module called as ``model(graph, globals)``; it is switched to
            evaluation mode and left in that mode.
        data_loader: iterable of ``(graph, labels, masks, globals)`` batches.
            The masks are not applied here.
        scaler: unused; kept so existing call sites keep working.
        val_size: value the summed squared error is divided by (callers pass
            the number of samples in the evaluated split).
        num_tasks: the square root is divided by this value.

    Returns:
        float: ``sqrt(sum_i (y_i - y'_i)^2 / val_size) / num_tasks``.
    """
    model.eval()
    # Summed (not averaged) squared error, i.e. sum_i (y_i - y'_i)^2, so that
    # batches of different sizes accumulate correctly.
    squared_error = nn.MSELoss(reduction='sum')
    total_squared_error = 0
    with torch.no_grad():
        for mol_dgl_graph, labels, _masks, global_feats in data_loader:
            prediction = model(mol_dgl_graph, global_feats)
            total_squared_error += squared_error(prediction, labels).item()
    return math.sqrt(total_squared_error / val_size) / num_tasks

loss function

In [63]:
def loss_func(output, label, mask, num_tasks):
    """Masked mean squared error.

    Args:
        output: model predictions.
        label: targets, broadcastable against ``output``.
        mask: weights multiplied element-wise into the squared errors; entries
            equal to 0 drop the corresponding target from the loss.
        num_tasks: unused; kept so existing call sites keep working.

    Returns:
        Scalar tensor: sum of masked squared errors divided by ``mask.sum()``.
        When the mask sums to zero the (zero) masked sum is returned as is,
        instead of 0/0 = NaN, so a fully masked batch cannot corrupt the weights.
    """
    criterion = nn.MSELoss(reduction='none')
    masked_loss = mask * criterion(output, label)
    valid = mask.sum()
    if valid == 0:
        # Nothing to average over; keep the result finite and differentiable.
        return masked_loss.sum()
    return masked_loss.sum() / valid

training function

In [64]:
def train_epoch(train_dataloader, model, optimizer):
    """Run one optimisation pass over ``train_dataloader``.

    Every batch is pushed through ``model``, scored with ``loss_func`` (using
    the module-level ``num_tasks``) and followed by one optimizer step.

    Returns:
        float: mean of the per-batch training losses.
    """
    model.train()  # training mode for the whole pass
    batch_losses = []
    for mol_dgl_graph, labels, masks, global_feats in train_dataloader:
        prediction = model(mol_dgl_graph, global_feats)
        batch_loss = loss_func(prediction, labels, masks, num_tasks)
        optimizer.zero_grad(set_to_none=True)
        batch_loss.backward()
        optimizer.step()
        batch_losses.append(batch_loss.detach().item())
    return sum(batch_losses) / len(batch_losses)

In [65]:
def train_evaluate():
    """Train a fresh ``model8`` with early stopping and keep its best checkpoint.

    Each epoch runs ``train_epoch`` and scores the model on ``val_dataloader``
    with ``compute_score`` (lower is better). Whenever the validation score
    improves, the model and optimizer state are written to
    ``checkpoint_path/checkpoint.pth`` and the patience counter is reset;
    otherwise the counter is incremented. Training ends after ``num_epochs``
    epochs or as soon as the counter exceeds ``patience``. Finally the
    checkpoint directory is copied to ``best_model_path``, replacing any
    previous content.

    Relies on module-level names: ``config``, ``global_size``, ``num_tasks``,
    ``num_epochs``, ``patience``, ``scaler``, ``val_set``, the data loaders and
    the checkpoint paths.
    """
    model = model8(config, global_size, num_tasks)
    optimizer = torch.optim.Adam(model.parameters(), lr = 0.0001)

    best_val = float('inf')
    patience_count = 1
    epoch = 1
    path = os.path.join(checkpoint_path, 'checkpoint.pth')

    # Leave the loop as soon as patience is exhausted instead of idling through
    # the remaining epoch numbers.
    while epoch <= num_epochs and patience_count <= patience:
        model.train()
        loss_train = train_epoch(train_dataloader, model, optimizer)
        model.eval()
        score_val = compute_score(model, val_dataloader, scaler, len(val_set), num_tasks)
        if score_val < best_val:
            best_val = score_val
            print("Save checkpoint")
            dict_checkpoint = {
                "score_val": score_val,
                "model_state_dict": model.state_dict(),
                "optimizer_state": optimizer.state_dict(),
            }
            with open(path, "wb") as outputfile:
                cloudpickle.dump(dict_checkpoint, outputfile)
            patience_count = 1
        else:
            print("Patience", patience_count)
            patience_count += 1

        print("Epoch: {}/{} | Training Loss: {:.3f} | Valid Score: {:.3f}".format(
            epoch, num_epochs, loss_train, score_val))

        print(" ")
        print("Epoch: {}/{} | Best Valid Score Until Now: {:.3f}".format(epoch, num_epochs, best_val), "\n")
        epoch += 1

    # best model save
    shutil.rmtree(best_model_path, ignore_errors=True)
    shutil.copytree(checkpoint_path, best_model_path)

    print("Final results:")
    # best_val is a single float, so it is printed directly.
    print("Average Valid Score: {:.3f}".format(best_val), "\n")

Function to compute test set score of the final saved model

In [66]:
def test_evaluate():
    """Score the saved best checkpoint on the test set and print the result.

    Builds a fresh ``model8``, restores the weights stored in
    ``best_model_path/checkpoint.pth``, evaluates it on ``test_dataloader``
    with ``compute_score`` and prints the score together with the time
    elapsed since the module-level ``start_time``.

    Relies on module-level names: ``config``, ``global_size``, ``num_tasks``,
    ``best_model_path``, ``test_dataloader``, ``scaler``, ``time`` and
    ``start_time``.
    """
    final_model = model8(config, global_size, num_tasks)
    path = os.path.join(best_model_path, 'checkpoint.pth')
    # NOTE: unpickling executes code stored in the file; only load checkpoints
    # written by this notebook.
    with open(path, 'rb') as f:
        checkpoint = cloudpickle.load(f)
    final_model.load_state_dict(checkpoint["model_state_dict"])
    final_model.eval()
    # Normalise by the number of test samples, not len(val_set): the score is a
    # per-sample average over the split that is actually being evaluated.
    test_size = len(test_dataloader.dataset)
    test_score = compute_score(final_model, test_dataloader, scaler, test_size, num_tasks)

    print("Test Score: {:.3f}".format(test_score), "\n")
    print("Execution time: {:.3f} seconds".format(time.time() - start_time))

Train the model and evaluate its performance

In [67]:
# `time` and `start_time` are read as globals by test_evaluate() to report the elapsed time.
import time
start_time = time.time()

# Train model8 with early stopping, then score the saved best checkpoint on the test set.
train_evaluate()
test_evaluate()

Save checkpoint
Epoch: 1/100 | Training Loss: 10.205 | Valid Score: 4.199
 
Epoch: 1/100 | Best Valid Score Until Now: 4.199 

Save checkpoint
Epoch: 2/100 | Training Loss: 10.323 | Valid Score: 4.174
 
Epoch: 2/100 | Best Valid Score Until Now: 4.174 

Save checkpoint
Epoch: 3/100 | Training Loss: 10.152 | Valid Score: 4.146
 
Epoch: 3/100 | Best Valid Score Until Now: 4.146 

Save checkpoint
Epoch: 4/100 | Training Loss: 10.503 | Valid Score: 4.112
 
Epoch: 4/100 | Best Valid Score Until Now: 4.112 

Save checkpoint
Epoch: 5/100 | Training Loss: 10.584 | Valid Score: 4.066
 
Epoch: 5/100 | Best Valid Score Until Now: 4.066 

Save checkpoint
Epoch: 6/100 | Training Loss: 11.355 | Valid Score: 3.999
 
Epoch: 6/100 | Best Valid Score Until Now: 3.999 

Save checkpoint
Epoch: 7/100 | Training Loss: 8.939 | Valid Score: 3.900
 
Epoch: 7/100 | Best Valid Score Until Now: 3.900 

Save checkpoint
Epoch: 8/100 | Training Loss: 9.117 | Valid Score: 3.759
 
Epoch: 8/100 | Best Valid Score Until