In [None]:
pip install  dgl -f https://data.dgl.ai/wheels/repo.htm

Looking in links: https://data.dgl.ai/wheels/repo.htm
Collecting dgl
  Downloading dgl-1.1.1-cp310-cp310-manylinux1_x86_64.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m48.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dgl
Successfully installed dgl-1.1.1


In [None]:
pip install  dglgo -f https://data.dgl.ai/wheels-test/repo.html

Looking in links: https://data.dgl.ai/wheels-test/repo.html
Collecting dglgo
  Downloading dglgo-0.0.2-py3-none-any.whl (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.5/63.5 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting isort>=5.10.1 (from dglgo)
  Downloading isort-5.12.0-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting autopep8>=1.6.0 (from dglgo)
  Downloading autopep8-2.0.2-py2.py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.2/45.2 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numpydoc>=1.1.0 (from dglgo)
  Downloading numpydoc-1.5.0-py3-none-any.whl (52 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.4/52.4 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Collecting ruamel.yaml>=0.17.20 (from dglgo)
  Downloading ruamel.yaml-0.17.32-py3-none-any.whl (1

In [None]:
%matplotlib inline
import os
os.environ["DGLBACKEND"] = "pytorch"
import dgl
import numpy as np
import networkx as nx
import torch
import torch.nn as nn
import dgl.function as fn
import torch.nn.functional as F
import shutil
from torch.utils.data import DataLoader
import cloudpickle
from dgl.nn import GraphConv
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing

In [None]:
# Mount Google Drive at /content/gdrive; the dataset archive and the
# checkpoint folders used below live under that mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Archive that holds the serialized graph splits on the mounted Drive.
path_save = "/content/gdrive/MyDrive/graph_data.zip"

# Folder that contains the archive. Every derived location is created next to
# the archive; the trailing slash is kept because the paths below are built by
# plain string concatenation. (Previously the zip *file* path was used as the
# prefix, which produced names such as ".../graph_data.zipsave_models/...".)
current_dir = os.path.dirname(path_save) + "/"

# Rolling checkpoint written during training.
checkpoint_path = current_dir + "save_models/model_checkpoints/" + "checkpoint"
os.makedirs(checkpoint_path, exist_ok=True)

# Final copy of the best checkpoint, read back for the test evaluation.
best_model_path = current_dir + "save_models/best_model/"

# Scratch folder for the extracted splits; wiped first so stale files from a
# previous run cannot be picked up.
folder_data_temp = current_dir + "data_temp/"
shutil.rmtree(folder_data_temp, ignore_errors=True)

shutil.unpack_archive(path_save, folder_data_temp)

In [None]:
class DGLDatasetReg(torch.utils.data.Dataset):
    """Regression dataset backed by a DGL ``.bin`` graph file.

    ``dgl.load_graphs(address + ".bin")`` supplies the list of graphs and a
    dictionary with the entries ``"labels"``, ``"masks"`` and ``"globals"``;
    each entry is reshaped to one row per graph.

    Parameters
    ----------
    address : str
        Path of the graph file without the ``.bin`` extension.
    transform : object, optional
        Stored on the instance; not used by any method of this class.
    train : bool
        Whether this is the training split. Only ``scaler_method`` reads it.
    scaler : object, optional
        Fitted scaler exposing ``transform``; required when
        ``scaler_regression`` is truthy.
    scaler_regression : bool, optional
        When truthy, ``__getitem__`` returns labels passed through ``scaler``.
    """

    def __init__(self, address, transform=None, train=True, scaler=None, scaler_regression=None):
        self.train = train
        self.scaler = scaler
        self.data_set, train_labels_masks_globals = dgl.load_graphs(address + ".bin")
        num_graphs = len(self.data_set)
        # One row per graph for every per-graph tensor.
        self.labels = train_labels_masks_globals["labels"].view(num_graphs, -1)
        self.masks = train_labels_masks_globals["masks"].view(num_graphs, -1)
        self.globals = train_labels_masks_globals["globals"].view(num_graphs, -1)
        self.transform = transform
        self.scaler_regression = scaler_regression

    def scaler_method(self):
        """Return a ``StandardScaler`` fitted on this split's labels.

        Only the training split yields a scaler; any other split returns
        ``None`` so that statistics never come from held-out data.
        """
        if not self.train:
            return None
        return preprocessing.StandardScaler().fit(self.labels)

    def __len__(self):
        """Number of graphs in the split."""
        return len(self.data_set)

    def __getitem__(self, idx):
        """Return ``(graph, labels, masks, globals)`` for ``idx``.

        Labels are returned as ``float32``; they are scaled first when
        ``scaler_regression`` is truthy.

        Raises
        ------
        ValueError
            If scaling is requested but no scaler was provided.
        """
        graph = self.data_set[idx]
        labels = self.labels[idx]
        if self.scaler_regression:
            if self.scaler is None:
                raise ValueError("scaler_regression is enabled but no scaler was provided")
            # Scale only the requested row(s). Transforming the whole label
            # matrix on every item fetch made one epoch quadratic in the
            # dataset size while producing the same values.
            rows = labels.reshape(-1, self.labels.shape[-1]).numpy()
            labels = torch.tensor(self.scaler.transform(rows)).reshape(labels.shape)
        return graph, labels.float(), self.masks[idx], self.globals[idx]

In [None]:
path_data_temp = folder_data_temp + "scaffold" + "_" + str(0)

# Fit the label scaler on the raw training labels only. The earlier version
# fitted a second scaler on labels that were already standardised, which gives
# a near-identity transform, and never switched scaling on in the datasets.
train_set = DGLDatasetReg(address=path_data_temp + "_train")
scaler = train_set.scaler_method()
train_set.scaler = scaler
train_set.scaler_regression = True

# Validation and test labels are scaled with the training statistics, so
# compute_score() can map predictions and labels back with the same scaler.
val_set = DGLDatasetReg(address=path_data_temp + "_val", train=False,
                        scaler=scaler, scaler_regression=True)
test_set = DGLDatasetReg(address=path_data_temp + "_test", train=False,
                         scaler=scaler, scaler_regression=True)

print(len(train_set), len(val_set), len(test_set))

513 64 65


In [None]:
def collate(batch):
    """Merge a list of ``(graph, labels, masks, globals)`` samples into a batch.

    The graphs are combined with ``dgl.batch``; the tensors found at positions
    1, 2 and 3 of every sample are each stacked along a new leading dimension.

    Returns
    -------
    tuple
        ``(batched_graph, labels, masks, globals)``.
    """
    def stack_field(position):
        # Gather the tensor at `position` from every sample and stack on dim 0.
        return torch.stack([sample[position] for sample in batch], 0)

    batched_graph = dgl.batch([sample[0] for sample in batch])
    label_batch = stack_field(1)
    mask_batch = stack_field(2)
    global_batch = stack_field(3)
    return batched_graph, label_batch, mask_batch, global_batch


def loader(batch_size=64):
    """Create the train, validation and test data loaders.

    Reads the module-level ``train_set``, ``val_set`` and ``test_set`` and
    batches them with ``collate``. Only the training loader shuffles; no
    loader drops the last incomplete batch, and each uses one worker.

    Returns
    -------
    tuple
        ``(train_dataloader, val_dataloader, test_dataloader)``.
    """
    def make(dataset, shuffle):
        # All three loaders share every option except the dataset and shuffling.
        return DataLoader(
            dataset,
            batch_size=batch_size,
            collate_fn=collate,
            drop_last=False,
            shuffle=shuffle,
            num_workers=1,
        )

    return make(train_set, True), make(val_set, False), make(test_set, False)

In [None]:
# Build the train/validation/test loaders with mini-batches of 16 samples.
train_dataloader, val_dataloader, test_dataloader = loader(batch_size=16)

In [None]:
# Number of regression targets. The FreeSolv dataset has a single task; other
# datasets may have more (e.g. tox21 has 12 tasks).
num_tasks = 1

# Size of the global feature vector attached to each graph.
global_size = 200

# Upper bound on the number of training epochs.
num_epochs = 100

# Number of consecutive epochs without a better validation score that are
# tolerated before training stops.
patience = 10

# Settings used to instantiate the model.
config = dict(
    node_feature_size=127,
    edge_feature_size=12,
    hidden_size=100,
)

In [None]:
class GNN(nn.Module):
    """Two-layer ``GraphConv`` network producing one prediction row per graph.

    Parameters
    ----------
    config : dict
        May contain ``node_feature_size``, ``edge_feature_size`` and
        ``hidden_size``; missing keys fall back to 127, 12 and 100.
    global_size : int
        Accepted for interface compatibility; not used by this model.
    num_tasks : int
        Output width of the second convolution, i.e. values predicted per graph.
    """

    def __init__(self, config, global_size = 200, num_tasks = 1):
        super().__init__()
        self.config = config
        self.num_tasks = num_tasks

        # Node feature size
        self.node_feature_size = self.config.get('node_feature_size', 127)

        # Edge feature size (stored for reference; GraphConv does not consume edge features here)
        self.edge_feature_size = self.config.get('edge_feature_size', 12)

        # Hidden size
        self.hidden_size = self.config.get('hidden_size', 100)

        self.conv1 = GraphConv(self.node_feature_size, self.hidden_size, allow_zero_in_degree = True)
        self.conv2 = GraphConv(self.hidden_size, self.num_tasks, allow_zero_in_degree = True)

    def forward(self, mol_dgl_graph, globals):
        """Return the node-averaged output of the second convolution.

        ``globals`` is accepted so every model shares one call signature; it is
        not used here. The node features are read from ``ndata["v"]`` and cut
        to the first ``node_feature_size`` columns.
        """
        # local_scope() discards feature assignments on exit, so the caller's
        # graph keeps its original "v"/"e" data and gains no "h" field.
        with mol_dgl_graph.local_scope():
            node_feats = mol_dgl_graph.ndata["v"][:, :self.node_feature_size]
            h = F.relu(self.conv1(mol_dgl_graph, node_feats))
            h = self.conv2(mol_dgl_graph, h)
            mol_dgl_graph.ndata["h"] = h
            # Graph-level readout: mean over the nodes of each graph.
            return dgl.mean_nodes(mol_dgl_graph, "h")

In [None]:
import math
from sklearn.metrics import mean_squared_error
def compute_score(model, data_loader, val_size, num_tasks):
    """Return the RMSE of ``model`` over ``data_loader`` in original label units.

    Predictions and labels are both mapped back with the module-level
    ``scaler`` (``inverse_transform``) before the error is accumulated.

    Parameters
    ----------
    model : callable
        Called as ``model(graph, globals)``; switched to eval mode first.
    data_loader : iterable
        Yields ``(graph, labels, masks, globals)`` batches.
    val_size : int
        Number of samples used to average the summed squared error.
    num_tasks : int
        The final RMSE is divided by this value.

    Returns
    -------
    float
        ``sqrt(sum_squared_error / val_size) / num_tasks``.
    """
    model.eval()
    # Summed (not averaged) squared error: sum_i (y_i - y'_i)^2, so batches of
    # different sizes can be accumulated and normalised once at the end.
    loss_sum = nn.MSELoss(reduction='sum')
    final_loss = 0
    with torch.no_grad():
        # NOTE(review): masks are not applied here, unlike in the training
        # loss - confirm that is intended for datasets with missing labels.
        for mol_dgl_graph, labels, masks, global_feats in data_loader:
            prediction = model(mol_dgl_graph, global_feats)
            prediction = torch.tensor(scaler.inverse_transform(prediction.detach().cpu().numpy()))
            labels = torch.tensor(scaler.inverse_transform(labels.cpu().numpy()))
            final_loss += loss_sum(prediction, labels).item()
    final_loss /= val_size
    final_loss = math.sqrt(final_loss)  # RMSE
    return final_loss / num_tasks

In [None]:
def loss_func(output, label, mask, num_tasks):
    """Masked mean squared error.

    Parameters
    ----------
    output, label : torch.Tensor
        Predictions and targets; compared element-wise.
    mask : torch.Tensor
        Multiplies the element-wise squared error, so entries with mask 0 do
        not contribute; the sum is divided by ``mask.sum()``.
    num_tasks : int
        Kept for interface compatibility; not used in the computation.

    Returns
    -------
    torch.Tensor
        Scalar loss. A batch whose mask sums to zero yields 0 rather than
        the NaN produced by dividing 0 by 0.
    """
    criterion = nn.MSELoss(reduction='none')
    masked_error = mask * criterion(output, label)
    valid_count = mask.sum()
    if valid_count == 0:
        # Nothing to learn from this batch: return a zero that is still
        # attached to the graph so backward() works and yields zero gradients.
        return masked_error.sum()
    return masked_error.sum() / valid_count

In [None]:
def train_epoch(train_dataloader, model, optimizer):
    """Run one optimisation pass over ``train_dataloader``.

    For every batch the model is evaluated, ``loss_func`` is computed with the
    module-level ``num_tasks``, and one optimizer step is taken.

    Returns
    -------
    float
        Mean of the per-batch loss values.
    """
    model.train()  # Prepare model for training
    batch_losses = []
    for mol_dgl_graph, labels, masks, global_feats in train_dataloader:
        prediction = model(mol_dgl_graph, global_feats)
        loss_train = loss_func(prediction, labels, masks, num_tasks)
        # Clear old gradients, back-propagate, then update the parameters.
        optimizer.zero_grad(set_to_none=True)
        loss_train.backward()
        optimizer.step()
        batch_losses.append(loss_train.detach().item())
    return sum(batch_losses) / len(batch_losses)

In [None]:
def train_evaluate(model_class=None):
    """Train a model with early stopping and keep the best checkpoint.

    Uses the module-level ``config``, ``global_size``, ``num_tasks``,
    ``num_epochs``, ``patience``, the data loaders, ``val_set``,
    ``checkpoint_path`` and ``best_model_path``.

    Parameters
    ----------
    model_class : callable, optional
        Called as ``model_class(config, global_size, num_tasks)`` to build the
        model. Defaults to ``GNN`` so existing ``train_evaluate()`` calls
        behave as before.

    Side effects
    ------------
    Writes ``checkpoint.pth`` into ``checkpoint_path`` whenever the validation
    score improves, then replaces ``best_model_path`` with a copy of that
    folder. Progress is printed every epoch.
    """
    if model_class is None:
        model_class = GNN
    model = model_class(config, global_size, num_tasks)
    optimizer = torch.optim.Adam(model.parameters(), lr = 0.0001)

    best_val = math.inf
    patience_count = 1

    for epoch in range(1, num_epochs + 1):
        if patience_count > patience:
            # Patience exhausted: stop instead of idling through the
            # remaining epochs without training.
            break

        # train_epoch() and compute_score() switch the model into train and
        # eval mode themselves.
        loss_train = train_epoch(train_dataloader, model, optimizer)
        score_val = compute_score(model, val_dataloader, len(val_set), num_tasks)

        if score_val < best_val:
            best_val = score_val
            print("Save checkpoint")
            path = os.path.join(checkpoint_path, 'checkpoint.pth')
            dict_checkpoint = {
                "score_val": score_val,
                "model_state_dict": model.state_dict(),
                "optimizer_state": optimizer.state_dict(),
            }
            with open(path, "wb") as outputfile:
                cloudpickle.dump(dict_checkpoint, outputfile)
            patience_count = 1
        else:
            print("Patience", patience_count)
            patience_count += 1

        print("Epoch: {}/{} | Training Loss: {:.3f} | Valid Score: {:.3f}".format(
            epoch, num_epochs, loss_train, score_val))

        print(" ")
        print("Epoch: {}/{} | Best Valid Score Until Now: {:.3f}".format(epoch, num_epochs, best_val), "\n")

    # best model save
    shutil.rmtree(best_model_path, ignore_errors=True)
    shutil.copytree(checkpoint_path, best_model_path)

    print("Final results:")
    print("Average Valid Score: {:.3f}".format(best_val), "\n")

In [None]:
def test_evaluate(model_class=None):
    """Load the best checkpoint and report the score on the test split.

    Uses the module-level ``config``, ``global_size``, ``num_tasks``,
    ``best_model_path``, ``test_dataloader``, ``test_set`` and ``start_time``
    (the latter must be set before this function is called).

    Parameters
    ----------
    model_class : callable, optional
        Called as ``model_class(config, global_size, num_tasks)``; must match
        the architecture that produced the checkpoint. Defaults to ``GNN`` so
        existing ``test_evaluate()`` calls behave as before.
    """
    # Imported here so the function does not depend on another cell having
    # imported `time` first.
    import time

    if model_class is None:
        model_class = GNN
    final_model = model_class(config, global_size, num_tasks)
    path = os.path.join(best_model_path, 'checkpoint.pth')
    # NOTE: unpickling can execute arbitrary code - only load checkpoints that
    # were written by this notebook.
    with open(path, 'rb') as f:
        checkpoint = cloudpickle.load(f)
    final_model.load_state_dict(checkpoint["model_state_dict"])
    final_model.eval()
    test_score = compute_score(final_model, test_dataloader, len(test_set), num_tasks)

    print("Test Score: {:.3f}".format(test_score), "\n")
    print("Execution time: {:.3f} seconds".format(time.time() - start_time))

In [None]:
import time
# Reference timestamp: test_evaluate() reads the module-level `start_time`
# to print the total execution time.
start_time = time.time()

# Train with early stopping, then score the saved best checkpoint on the test split.
train_evaluate()
test_evaluate()

Save checkpoint
Epoch: 1/100 | Training Loss: 25.764 | Valid Score: 6.905
 
Epoch: 1/100 | Best Valid Score Until Now: 6.905 

Save checkpoint
Epoch: 2/100 | Training Loss: 25.915 | Valid Score: 6.767
 
Epoch: 2/100 | Best Valid Score Until Now: 6.767 

Save checkpoint
Epoch: 3/100 | Training Loss: 23.253 | Valid Score: 6.627
 
Epoch: 3/100 | Best Valid Score Until Now: 6.627 

Save checkpoint
Epoch: 4/100 | Training Loss: 22.044 | Valid Score: 6.486
 
Epoch: 4/100 | Best Valid Score Until Now: 6.486 

Save checkpoint
Epoch: 5/100 | Training Loss: 20.959 | Valid Score: 6.336
 
Epoch: 5/100 | Best Valid Score Until Now: 6.336 

Save checkpoint
Epoch: 6/100 | Training Loss: 19.719 | Valid Score: 6.180
 
Epoch: 6/100 | Best Valid Score Until Now: 6.180 

Save checkpoint
Epoch: 7/100 | Training Loss: 18.540 | Valid Score: 6.008
 
Epoch: 7/100 | Best Valid Score Until Now: 6.008 

Save checkpoint
Epoch: 8/100 | Training Loss: 17.380 | Valid Score: 5.835
 
Epoch: 8/100 | Best Valid Score Unt

In [None]:
import dgl.function as fn
from dgl.nn import SAGEConv

class GraphSAGE(nn.Module):
    """Two-layer ``SAGEConv`` (mean aggregator) network with one output row per graph.

    Parameters
    ----------
    config : dict
        May contain ``node_feature_size``, ``edge_feature_size`` and
        ``hidden_size``; missing keys fall back to 127, 12 and 100.
    global_size : int
        Accepted for interface compatibility; not used by this model.
    num_tasks : int
        Output width of the second convolution, i.e. values predicted per graph.
    """

    def __init__(self, config, global_size = 200, num_tasks = 1):
        super().__init__()
        self.config = config
        self.num_tasks = num_tasks

        # Node feature size
        self.node_feature_size = self.config.get('node_feature_size', 127)

        # Edge feature size (stored for reference; not consumed by the layers below)
        self.edge_feature_size = self.config.get('edge_feature_size', 12)

        # Hidden size
        self.hidden_size = self.config.get('hidden_size', 100)

        self.conv1 = SAGEConv(self.node_feature_size, self.hidden_size, aggregator_type='mean')
        # The last layer must emit one value per task so the pooled output has
        # the same width as the labels (it previously emitted hidden_size values).
        self.conv2 = SAGEConv(self.hidden_size, self.num_tasks, aggregator_type='mean')

    def forward(self, mol_dgl_graph, globals):
        """Return the node-averaged output of the second convolution.

        ``globals`` is accepted so every model shares one call signature; it is
        not used here. The node features are read from ``ndata["v"]`` and cut
        to the first ``node_feature_size`` columns.
        """
        # local_scope() discards feature assignments on exit, so the caller's
        # graph keeps its original "v"/"e" data and gains no "h" field.
        with mol_dgl_graph.local_scope():
            node_feats = mol_dgl_graph.ndata["v"][:, :self.node_feature_size]
            h = F.relu(self.conv1(mol_dgl_graph, node_feats))
            h = self.conv2(mol_dgl_graph, h)
            mol_dgl_graph.ndata["h"] = h
            # Graph-level readout: mean over the nodes of each graph.
            return dgl.mean_nodes(mol_dgl_graph, "h")

In [None]:
import time
# Reference timestamp: test_evaluate() reads the module-level `start_time`
# to print the total execution time.
start_time = time.time()

# NOTE(review): train_evaluate() and test_evaluate() build their model with
# GNN(config, global_size, num_tasks), so this run trains and scores GNN again
# rather than the GraphSAGE class defined in the previous cell.
train_evaluate()
test_evaluate()

Save checkpoint
Epoch: 1/100 | Training Loss: 22.653 | Valid Score: 6.584
 
Epoch: 1/100 | Best Valid Score Until Now: 6.584 

Save checkpoint
Epoch: 2/100 | Training Loss: 21.591 | Valid Score: 6.443
 
Epoch: 2/100 | Best Valid Score Until Now: 6.443 

Save checkpoint
Epoch: 3/100 | Training Loss: 20.305 | Valid Score: 6.302
 
Epoch: 3/100 | Best Valid Score Until Now: 6.302 

Save checkpoint
Epoch: 4/100 | Training Loss: 19.228 | Valid Score: 6.156
 
Epoch: 4/100 | Best Valid Score Until Now: 6.156 

Save checkpoint
Epoch: 5/100 | Training Loss: 19.506 | Valid Score: 6.003
 
Epoch: 5/100 | Best Valid Score Until Now: 6.003 

Save checkpoint
Epoch: 6/100 | Training Loss: 17.179 | Valid Score: 5.836
 
Epoch: 6/100 | Best Valid Score Until Now: 5.836 

Save checkpoint
Epoch: 7/100 | Training Loss: 16.175 | Valid Score: 5.683
 
Epoch: 7/100 | Best Valid Score Until Now: 5.683 

Save checkpoint
Epoch: 8/100 | Training Loss: 15.553 | Valid Score: 5.526
 
Epoch: 8/100 | Best Valid Score Unt

In [None]:
import torch.nn as nn
import torch.nn.functional as F
import dgl

class GAT(nn.Module):
    """Two-layer graph attention network ending in a log-softmax.

    The first ``GATConv`` uses 8 attention heads whose outputs are flattened
    together; the second uses a single head and its head dimension is averaged
    away.

    Parameters
    ----------
    in_feats : int
        Input feature size of the first layer.
    hidden_feats : int
        Per-head output size of the first layer.
    out_feats : int
        Output size of the second layer.
    """

    def __init__(self, in_feats, hidden_feats, out_feats):
        super().__init__()
        gat_conv = dgl.nn.pytorch.conv.GATConv
        first_layer_heads = 8
        self.conv1 = gat_conv(in_feats, hidden_feats, num_heads=first_layer_heads)
        # The second layer consumes the concatenation of all first-layer heads.
        self.conv2 = gat_conv(hidden_feats * first_layer_heads, out_feats, num_heads=1)

    def forward(self, g, x):
        """Return log-probabilities over the last layer's outputs (softmax on dim 1)."""
        # flatten(1) merges every dimension after the first into one.
        hidden = F.elu(self.conv1(g, x).flatten(1))
        hidden = F.dropout(hidden, p=0.5, training=self.training)
        scores = self.conv2(g, hidden).mean(1)
        return F.log_softmax(scores, dim=1)

In [None]:
import time
# Reference timestamp: test_evaluate() reads the module-level `start_time`
# to print the total execution time.
start_time = time.time()

# NOTE(review): train_evaluate() and test_evaluate() build their model with
# GNN(config, global_size, num_tasks), so this run trains and scores GNN again
# rather than the GAT class defined in the previous cell.
train_evaluate()
test_evaluate()

Save checkpoint
Epoch: 1/100 | Training Loss: 24.548 | Valid Score: 6.799
 
Epoch: 1/100 | Best Valid Score Until Now: 6.799 

Save checkpoint
Epoch: 2/100 | Training Loss: 23.064 | Valid Score: 6.634
 
Epoch: 2/100 | Best Valid Score Until Now: 6.634 

Save checkpoint
Epoch: 3/100 | Training Loss: 21.647 | Valid Score: 6.465
 
Epoch: 3/100 | Best Valid Score Until Now: 6.465 

Save checkpoint
Epoch: 4/100 | Training Loss: 20.328 | Valid Score: 6.298
 
Epoch: 4/100 | Best Valid Score Until Now: 6.298 

Save checkpoint
Epoch: 5/100 | Training Loss: 19.083 | Valid Score: 6.124
 
Epoch: 5/100 | Best Valid Score Until Now: 6.124 

Save checkpoint
Epoch: 6/100 | Training Loss: 20.732 | Valid Score: 5.944
 
Epoch: 6/100 | Best Valid Score Until Now: 5.944 

Save checkpoint
Epoch: 7/100 | Training Loss: 16.708 | Valid Score: 5.757
 
Epoch: 7/100 | Best Valid Score Until Now: 5.757 

Save checkpoint
Epoch: 8/100 | Training Loss: 15.893 | Valid Score: 5.583
 
Epoch: 8/100 | Best Valid Score Unt