In [1]:
pip install  dgl -f https://data.dgl.ai/wheels/repo.html

Looking in links: https://data.dgl.ai/wheels/repo.html
Collecting dgl
  Downloading dgl-1.1.1-cp310-cp310-manylinux1_x86_64.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m86.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dgl
Successfully installed dgl-1.1.1


In [2]:
pip install  dglgo -f https://data.dgl.ai/wheels-test/repo.html

Looking in links: https://data.dgl.ai/wheels-test/repo.html
Collecting dglgo
  Downloading dglgo-0.0.2-py3-none-any.whl (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.5/63.5 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Collecting isort>=5.10.1 (from dglgo)
  Downloading isort-5.12.0-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting autopep8>=1.6.0 (from dglgo)
  Downloading autopep8-2.0.2-py2.py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.2/45.2 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numpydoc>=1.1.0 (from dglgo)
  Downloading numpydoc-1.5.0-py3-none-any.whl (52 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.4/52.4 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Collecting ruamel.yaml>=0.17.20 (from dglgo)
  Downloading ruamel.yaml-0.17.32-py3-none-any.whl (

In [3]:
%matplotlib inline
import os
os.environ["DGLBACKEND"] = "pytorch"
import dgl
import numpy as np
import networkx as nx
import torch
import torch.nn as nn
import dgl.function as fn
import torch.nn.functional as F
import shutil
from torch.utils.data import DataLoader
import cloudpickle
from dgl.nn import GraphConv

In [4]:
# Mount Google Drive at /content/gdrive (Colab only); the paths used below live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [5]:
# Base directory on the mounted Google Drive. It holds the dataset archive and receives the saved models.
# It must be a directory (with a trailing slash), not the archive itself: every path below is built by
# plain string concatenation, so a file name here would be glued onto the names of the sub-directories.
current_dir = "/content/gdrive/MyDrive/"
# Path of the zip archive that contains the graph data.
path_save = current_dir + "graph_data.zip"
# Directory where model checkpoints are written during training.
checkpoint_path = current_dir + "save_models/model_checkpoints/" + "checkpoint"
# Create the checkpoint directory; exist_ok=True makes this a no-op when it already exists.
os.makedirs(checkpoint_path, exist_ok=True)
# Directory that receives a copy of the checkpoint directory once training has finished.
best_model_path = current_dir + "save_models/best_model/"
# Temporary directory the archive is extracted into. Keep the trailing slash: later cells append file names to it.
folder_data_temp = current_dir + "data_temp/"
# Start from a clean extraction directory; ignore_errors=True tolerates the directory not existing yet.
shutil.rmtree(folder_data_temp, ignore_errors=True)
# Extract the contents of the archive into the temporary data directory.
shutil.unpack_archive(path_save, folder_data_temp)

In [6]:
""" Classification Dataset """
class DGLDatasetClass(torch.utils.data.Dataset):
    """Dataset of DGL graphs with per-graph labels, masks and global features.

    The graphs and their companion tensors are read from ``<address>.bin``
    with ``dgl.load_graphs``.
    """

    def __init__(self, address):
        """Load the graphs and the "labels", "masks" and "globals" tensors.

        Parameters
        ----------
        address : str
            Path of the binary file without its ".bin" extension.
        """
        self.address = address + ".bin"
        graphs, tensors = dgl.load_graphs(self.address)
        self.list_graphs = graphs
        n_graphs = len(graphs)

        # One row per graph; -1 lets torch infer the width of each row.
        self.labels = tensors["labels"].view(n_graphs, -1)
        self.masks = tensors["masks"].view(n_graphs, -1)
        self.globals = tensors["globals"].view(n_graphs, -1)

    def __len__(self):
        """Return the number of graphs in the dataset."""
        return len(self.list_graphs)

    def __getitem__(self, idx):
        """Return the ``(graph, labels, masks, globals)`` tuple stored at ``idx``."""
        graph = self.list_graphs[idx]
        return graph, self.labels[idx], self.masks[idx], self.globals[idx]

In [7]:
# Build the common path prefix of the split files: the temporary data folder followed by "scaffold_0".
path_data_temp = folder_data_temp + "scaffold" + "_" + str(0)
# Load the training split. DGLDatasetClass appends ".bin", so this reads "<prefix>_train.bin".
train_set = DGLDatasetClass(address=path_data_temp + "_train")
# Similarly, load the validation and test splits from the files with the "_val" and "_test" suffixes.
val_set = DGLDatasetClass(address=path_data_temp + "_val")
test_set = DGLDatasetClass(address=path_data_temp + "_test")

print(len(train_set), len(val_set), len(test_set))

1631 203 205


In [8]:
def collate(batch):
    """Merge a list of dataset samples into a single mini-batch.

    Parameters
    ----------
    batch : list of tuple
        Samples of the form ``(graph, labels, masks, globals)``.

    Returns
    -------
    tuple
        The batched graph built by ``dgl.batch``, followed by the labels,
        masks and globals, each stacked along a new leading dimension.
    """
    batched_graph = dgl.batch([sample[0] for sample in batch])

    # Fields 1..3 of every sample are tensors; stack each field across the batch.
    stacked_labels, stacked_masks, stacked_globals = (
        torch.stack([sample[field] for sample in batch], 0)
        for field in (1, 2, 3)
    )

    return batched_graph, stacked_labels, stacked_masks, stacked_globals

#This function creates and returns three data loaders: train_dataloader, val_dataloader, and test_dataloader
def loader(batch_size=64):
    """Create the data loaders for the training, validation and test sets.

    Parameters
    ----------
    batch_size : int, optional
        Number of samples per mini-batch (default 64).

    Returns
    -------
    tuple
        ``(train_dataloader, val_dataloader, test_dataloader)``. Only the
        training loader shuffles its samples.
    """
    # Options shared by the three loaders.
    shared_options = dict(
        batch_size=batch_size,
        collate_fn=collate,
        drop_last=False,
        num_workers=1,
    )

    train_dataloader = DataLoader(train_set, shuffle=True, **shared_options)
    val_dataloader = DataLoader(val_set, shuffle=False, **shared_options)
    test_dataloader = DataLoader(test_set, shuffle=False, **shared_options)
    return train_dataloader, val_dataloader, test_dataloader

In [9]:
# Build the training, validation and test data loaders with a batch size of 64.
train_dataloader, val_dataloader, test_dataloader = loader(batch_size=64)

In [10]:
# The BBBP dataset has a single prediction task.
num_tasks = 1

# Length of the global feature vector of each graph.
global_size = 200

# Maximum number of training epochs.
num_epochs = 100

# Number of consecutive epochs without improvement on the validation set that are tolerated before training stops.
patience = 10

# Settings used to instantiate the model.
config = dict(
    node_feature_size=127,
    edge_feature_size=12,
    hidden_size=100,
)

In [24]:
#defines a GNN (Graph Neural Network) model as a subclass of nn.Module in PyTorch
class GNN(nn.Module):
    """Two-layer graph convolutional network that outputs one value per task for each graph.

    Parameters
    ----------
    config : dict
        Model settings. The keys "node_feature_size", "edge_feature_size"
        and "hidden_size" are read, falling back to 127, 12 and 100.
    global_size : int, optional
        Size of the per-graph global feature vector. Accepted for interface
        compatibility; this architecture does not use it.
    num_tasks : int, optional
        Number of outputs per graph.
    """

    def __init__(self, config, global_size = 200, num_tasks = 1):
        super().__init__()
        self.config = config
        self.num_tasks = num_tasks

        # Number of leading node-feature columns fed to the first layer.
        self.node_feature_size = self.config.get('node_feature_size', 127)

        # Stored for reference only: the GraphConv layers below never receive edge features.
        self.edge_feature_size = self.config.get('edge_feature_size', 12)

        # Width of the hidden node representation.
        self.hidden_size = self.config.get('hidden_size', 100)

        # allow_zero_in_degree=True turns off DGL's check for nodes without incoming edges.
        self.conv1 = GraphConv(self.node_feature_size, self.hidden_size, allow_zero_in_degree=True)
        self.conv2 = GraphConv(self.hidden_size, self.num_tasks, allow_zero_in_degree=True)

    def forward(self, mol_dgl_graph, globals):
        """Compute the graph-level output.

        Parameters
        ----------
        mol_dgl_graph : DGL graph
            (Batched) graph whose node features are stored in ``ndata["v"]``.
        globals : torch.Tensor
            Per-graph global features; accepted but not used by this model.

        Returns
        -------
        torch.Tensor
            Mean over the nodes of each graph of the second layer's output,
            as returned by ``dgl.mean_nodes``.
        """
        # Slice into a local tensor instead of writing back to the graph, so the
        # caller's features are left untouched.
        node_features = mol_dgl_graph.ndata["v"][:, :self.node_feature_size]

        hidden = F.relu(self.conv1(mol_dgl_graph, node_features))
        node_outputs = self.conv2(mol_dgl_graph, hidden)

        # dgl.mean_nodes reads a named node feature; local_scope() discards the
        # temporary "h" entry again when the block exits.
        with mol_dgl_graph.local_scope():
            mol_dgl_graph.ndata["h"] = node_outputs
            return dgl.mean_nodes(mol_dgl_graph, "h")

In [12]:
#It will be used as the evaluation metric for computing the model's performance.
from sklearn.metrics import roc_auc_score

def compute_score(model, data_loader, val_size, num_tasks):
    """Return the ROC AUC of ``model`` on ``data_loader``, averaged over the tasks.

    Parameters
    ----------
    model : callable
        Called as ``model(graph, globals)``; its output is passed through a
        sigmoid and must have one column per task. Must provide ``eval()``.
    data_loader : iterable
        Yields ``(graph, labels, masks, globals)`` batches.
    val_size : int
        Unused; kept so that existing callers keep working.
    num_tasks : int
        Number of task columns to score.

    Returns
    -------
    float
        Sum of the per-task ROC AUC scores divided by ``num_tasks``. A task
        for which the metric raises ``ValueError`` contributes 0.

    Raises
    ------
    ValueError
        If ``data_loader`` yields no batches.
    """
    model.eval()

    batch_predictions, batch_labels, batch_masks = [], [], []
    with torch.no_grad():
        for mol_dgl_graph, labels, masks, global_feats in data_loader:
            logits = model(mol_dgl_graph, global_feats)
            batch_predictions.append(torch.sigmoid(logits))
            batch_labels.append(labels)
            batch_masks.append(masks)

    if not batch_predictions:
        raise ValueError("data_loader yielded no batches")

    # Concatenate once at the end instead of re-allocating the running tensors for every batch.
    prediction_all = torch.cat(batch_predictions, 0)
    labels_all = torch.cat(batch_labels, 0)
    masks_all = torch.cat(batch_masks, 0)

    total = 0.0
    for task in range(num_tasks):
        # Only entries whose mask equals 1 take part in the score.
        selected = masks_all[:, task] == 1
        task_predictions = prediction_all[:, task][selected]
        task_labels = labels_all[:, task][selected]
        try:
            # float() accepts NumPy scalars as well as plain Python floats, whichever the metric returns.
            total += float(roc_auc_score(task_labels.int().cpu(), task_predictions.cpu()))
        except ValueError:
            # Best effort: a task the metric cannot score (e.g. only one class present) counts as 0.
            pass

    return total / num_tasks

In [13]:
def loss_func(output, label, mask, num_tasks):
    """Masked binary cross-entropy on logits, averaged over the valid entries.

    Parameters
    ----------
    output : torch.Tensor
        Raw model outputs (logits).
    label : torch.Tensor
        Target values, same shape as ``output``.
    mask : torch.Tensor
        Multiplied element-wise with the loss; entries equal to 0 are ignored.
    num_tasks : int
        Number of tasks; sets the width of the positive-class weight.

    Returns
    -------
    torch.Tensor
        Scalar loss: the sum of the masked element-wise losses divided by
        ``mask.sum()``, or zero when the mask selects nothing.
    """
    # Uniform positive-class weight, created with the dtype and device of ``output``.
    pos_weight = torch.ones((1, num_tasks), dtype=output.dtype, device=output.device)
    # reduction='none' keeps the element-wise losses so that the mask can be applied to them.
    criterion = torch.nn.BCEWithLogitsLoss(reduction='none', pos_weight=pos_weight)
    masked_loss = mask * criterion(output, label)

    valid_entries = mask.sum()
    if valid_entries == 0:
        # Nothing to average over: return a zero that is still attached to the
        # graph instead of the NaN that 0 / 0 would produce.
        return masked_loss.sum()
    return masked_loss.sum() / valid_entries

In [14]:
#Runs one training epoch over train_dataloader and returns the average training loss per batch.
def train_epoch(train_dataloader, model, optimizer):
    """Run one optimisation pass over ``train_dataloader``.

    Parameters
    ----------
    train_dataloader : iterable
        Yields ``(graph, labels, masks, globals)`` batches.
    model : torch.nn.Module
        Called as ``model(graph, globals)``; updated in place.
    optimizer : torch.optim.Optimizer
        Optimizer that updates the parameters of ``model``.

    Returns
    -------
    float
        Mean of the per-batch losses over the epoch.
    """
    model.train()  # switch the model to training mode

    batch_losses = []
    for mol_dgl_graph, labels, masks, global_feats in train_dataloader:
        prediction = model(mol_dgl_graph, global_feats)
        loss_train = loss_func(prediction, labels, masks, num_tasks)

        # Drop the gradients of the previous step, back-propagate, then update the parameters.
        optimizer.zero_grad(set_to_none=True)
        loss_train.backward()
        optimizer.step()

        batch_losses.append(loss_train.detach().item())

    return sum(batch_losses) / len(batch_losses)

In [23]:
def train_evaluate():
    """Train a fresh GNN with early stopping and keep the best checkpoint.

    The model is trained for at most ``num_epochs`` epochs. After each epoch
    it is scored on the validation set; whenever the score beats the best one
    seen so far, the model and optimizer states are written to
    ``checkpoint.pth`` inside ``checkpoint_path``. Training stops once
    ``patience`` consecutive epochs have brought no improvement. Finally the
    checkpoint directory is copied to ``best_model_path``.
    """
    model = GNN(config, global_size, num_tasks)
    optimizer = torch.optim.Adam(model.parameters(), lr = 0.0001)
    checkpoint_file = os.path.join(checkpoint_path, 'checkpoint.pth')

    best_val = 0        # best validation score seen so far
    patience_count = 1  # 1 + number of epochs since the last improvement

    for epoch in range(1, num_epochs + 1):
        # Early stopping: the patience budget is used up.
        if patience_count > patience:
            break

        model.train()
        loss_train = train_epoch(train_dataloader, model, optimizer)

        model.eval()
        score_val = compute_score(model, val_dataloader, len(val_set), num_tasks)

        if score_val > best_val:
            # New best score: save a checkpoint and reset the patience counter.
            best_val = score_val
            print("Save checkpoint")
            dict_checkpoint = {
                "score_val": score_val,
                "model_state_dict": model.state_dict(),
                "optimizer_state": optimizer.state_dict(),
            }
            with open(checkpoint_file, "wb") as outputfile:
                cloudpickle.dump(dict_checkpoint, outputfile)
            patience_count = 1
        else:
            print("Patience", patience_count)
            patience_count += 1

        print("Epoch: {}/{} | Training Loss: {:.3f} | Valid Score: {:.3f}".format(
            epoch, num_epochs, loss_train, score_val))

        print(" ")
        print("Epoch: {}/{} | Best Valid Score Until Now: {:.3f}".format(epoch, num_epochs, best_val), "\n")

    # Replace any previously saved best model with the current checkpoint directory.
    shutil.rmtree(best_model_path, ignore_errors=True)
    shutil.copytree(checkpoint_path, best_model_path)

    print("Final results:")
    print("Average Valid Score: {:.3f}".format(np.mean(best_val)), "\n")

In [16]:
#This function loads the best model checkpoint, initializes a new GNN model with the same configuration, evaluates its performance on the test dataset, and prints the test score as well as the execution time.
def test_evaluate():
    """Score the saved best model on the test set and print the result.

    A new GNN is built from the global configuration, the weights stored in
    ``checkpoint.pth`` under ``best_model_path`` are loaded into it, and its
    score on ``test_dataloader`` is printed together with the time elapsed
    since ``start_time``. ``time`` and ``start_time`` must already exist as
    globals when this function is called.
    """
    checkpoint_file = os.path.join(best_model_path, 'checkpoint.pth')

    final_model = GNN(config, global_size, num_tasks)
    with open(checkpoint_file, 'rb') as f:
        # NOTE: unpickling can execute arbitrary code; only load checkpoints written by this notebook.
        checkpoint = cloudpickle.load(f)
    saved_weights = checkpoint["model_state_dict"]
    final_model.load_state_dict(saved_weights)
    final_model.eval()

    test_score = compute_score(final_model, test_dataloader, len(test_set), num_tasks)
    print("Test Score: {:.3f}".format(test_score), "\n")

    elapsed = time.time() - start_time
    print("Execution time: {:.3f} seconds".format(elapsed))


In [25]:
# Record the start time (test_evaluate reads it to report the execution time), then train the model and evaluate the saved best checkpoint on the test set.
import time
start_time = time.time()

train_evaluate()
test_evaluate()

Save checkpoint
Epoch: 1/100 | Training Loss: 0.637 | Valid Score: 0.390
 
Epoch: 1/100 | Best Valid Score Until Now: 0.390 

Patience 1
Epoch: 2/100 | Training Loss: 0.616 | Valid Score: 0.361
 
Epoch: 2/100 | Best Valid Score Until Now: 0.390 

Patience 2
Epoch: 3/100 | Training Loss: 0.601 | Valid Score: 0.357
 
Epoch: 3/100 | Best Valid Score Until Now: 0.390 

Patience 3
Epoch: 4/100 | Training Loss: 0.591 | Valid Score: 0.366
 
Epoch: 4/100 | Best Valid Score Until Now: 0.390 

Patience 4
Epoch: 5/100 | Training Loss: 0.583 | Valid Score: 0.373
 
Epoch: 5/100 | Best Valid Score Until Now: 0.390 

Patience 5
Epoch: 6/100 | Training Loss: 0.580 | Valid Score: 0.387
 
Epoch: 6/100 | Best Valid Score Until Now: 0.390 

Save checkpoint
Epoch: 7/100 | Training Loss: 0.576 | Valid Score: 0.405
 
Epoch: 7/100 | Best Valid Score Until Now: 0.405 

Save checkpoint
Epoch: 8/100 | Training Loss: 0.571 | Valid Score: 0.432
 
Epoch: 8/100 | Best Valid Score Until Now: 0.432 

Save checkpoint
E