In [5]:
%matplotlib inline

# Lab 4: Graph Neural Networks (GNNs)

In [6]:
# Import packages
import dgl
import torch
import torch.nn.functional as F
import numpy as np
from dgl.dataloading import GraphDataLoader
from dgl.nn import GraphConv
from IPython.display import Latex
from sklearn.model_selection import train_test_split

# Part I: Node Classification

In [7]:
# Import datasat
dataset = dgl.data.AmazonCoBuyPhotoDataset()
print('Number of classes:', dataset.num_classes)

# A DGL Dataset object may contain one or multiple graphs. The Amazon
# dataset used in this lab only consists of one single graph.
graph = dataset[0]
graph = dgl.add_self_loop(graph)

print('Number of nodes:', graph.num_nodes())
print('Number of edges:', graph.num_edges())

Number of classes: 8
Number of nodes: 7650
Number of edges: 245813


A DGL graph can store node features in a
dictionary-like attribute called ``ndata``.
In the DGL Amazon co-buy dataset, the graph contains the following node features:

- ``label``: The ground truth node category.

-  ``feat``: The node features.

In [8]:
print('Node labels and features')
print(graph.ndata)

Node labels and features
{'label': tensor([6, 4, 3,  ..., 1, 2, 3]), 'feat': tensor([[0., 0., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 1.],
        ...,
        [0., 1., 0.,  ..., 0., 0., 0.],
        [1., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 1.]])}


## Exercise 1:

### 1.1: Retrieve key properties of the dataset

In [9]:
# Define key graph variables
X = graph.ndata['feat']
y = graph.ndata['label']
num_classes = dataset.num_classes
num_feat = X.shape[1]
N = graph.number_of_nodes()

print('Number of features: ', num_feat)

Number of features:  745


In [10]:
def split_dataset(N, train_ratio, seed=4):
    """ Creates train/val/test masks

    Args:
        N (int): dataset size
        train_ratio (float): proportion of the training set
        seed (int, optional): Fixes random. Defaults to 10

    Return: 
        [tensors]: returns boolean tensors for train/val/test set
        True indicates that a node belong to this set, False otherwise
    """

    train_size = int(train_ratio * N)
    val_size = int((N - train_size)/2)
    test_size = N - train_size - val_size

    # split dataset
    subsets = torch.utils.data.random_split(range(N), lengths = [train_size, val_size, test_size], generator=torch.Generator().manual_seed(seed))
    train_inds, val_inds, test_inds = [torch.Tensor(subset.indices) for subset in subsets]

    # create tensors of masks for each subset
    dataset_inds = torch.arange(N)
    train_mask = torch.isin(dataset_inds, train_inds)
    val_mask = torch.isin(dataset_inds, val_inds)
    test_mask = torch.isin(dataset_inds, test_inds)

    return train_mask, val_mask, test_mask

train_mask, val_mask, test_mask = split_dataset(N, train_ratio=0.8)


### 1.2 Implement a Graph Convolutional Network

$$H^{(l+1)} = f(H^{(l)}, A) = \sigma( \tilde{D}^{-\frac{1}{2}} \tilde{A} \tilde{D}^{-\frac{1}{2}}H^{(l)}W^{(l)})$$



<center><img src="./gcn_web.png"/></center>

In [20]:
class GNN_model(torch.nn.Module):
    """
    Define a Graph Convolution Network 
    """
    def __init__(self, num_layers, input_size, hidden_size, output_size, dropout):
        super(GNN_model, self).__init__()

        # Define GNN components
        self.convs = torch.nn.ModuleList() # holds GraphConv layers in a list
        self.convs.append(
            GraphConv(input_size, hidden_size, activation=F.relu)) # You can either define the activation at the layer level or call it inside the forward
        for i in range(num_layers-2):
            self.convs.append(
                GraphConv(hidden_size, hidden_size, activation=F.relu))
        self.convs.append(GraphConv(hidden_size, output_size))

        self.dropout = dropout

    def forward(self, graph, x):
        # Implement the forward function that takes the graph,
        # the features tensor x and returns the output tensor as shown in figure 1
        for conv in self.convs:
            x = conv(graph, x)
    
        output = F.log_softmax(x, dim=1) # Log_softmax is more stable numerically in comparison to softmax
        return output

### 1.3 Training

In [21]:
def train(model, graph, x, labels, num_epochs, optimizer, train_mask, val_mask, test_mask):
    """ Train the GNN model 

    Args:
        model: GNN model defined in pytorch
        graph (dgl.graph): dataset on which the task is performed
        x (tensor): node feature matrix 
        labels (tensor): node labels
        num_epochs (int): number of epochs
        optimizer: Adam optimizer
        train_mask (tensor): boolean mask for training nodes
        val_mask (tensor): boolean mask for validation set
    """
    
    # Train the model (pytorch specific)
    best_val_acc = 0
    best_test_acc = 0
    nll_loss = torch.nn.NLLLoss()

    model.train()
    for epoch in range(num_epochs):
        # Forward
        pred = model(graph, x)

        # Compute loss
        # Note that you should only compute the losses of the nodes in the training set.
        loss = nll_loss(pred[train_mask], labels[train_mask])

        # Compute accuracy on training/validation/test
        pred = torch.argmax(pred, dim=1)
        train_acc = (pred[train_mask] == labels[train_mask]).float().mean()
        val_acc = (pred[val_mask] == labels[val_mask]).float().mean()
        test_acc = (pred[test_mask] == labels[test_mask]).float().mean()

        # Save the best validation accuracy and the corresponding test accuracy.
        if best_val_acc < val_acc:
            best_val_acc = val_acc
            best_test_acc = test_acc

        # Backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if epoch % 10 == 0:
            print('Epoch {}: loss {:.3f}, train Acc: {:.3f}, val acc: {:.3f}, test acc: {:.3f}'.format(
                epoch, loss, train_acc, val_acc, test_acc))

In [22]:
# Instanciate model
num_layers=3
hidden_size=16
dropout=0.3
num_epochs=300
lr=0.01
weight_decay=0.005
train_ratio=0.8
seed=4

model = GNN_model(num_layers, num_feat, hidden_size, num_classes, dropout)

In [23]:
# Define an optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=weight_decay)

# Train model
train(model, graph, X, y, num_epochs, optimizer, train_mask, val_mask, test_mask)

Epoch 0: loss 2.059, train Acc: 0.221, val acc: 0.197, test acc: 0.200
Epoch 10: loss 1.529, train Acc: 0.464, val acc: 0.456, test acc: 0.444
Epoch 20: loss 1.072, train Acc: 0.708, val acc: 0.720, test acc: 0.697
Epoch 30: loss 0.687, train Acc: 0.817, val acc: 0.817, test acc: 0.807
Epoch 40: loss 0.500, train Acc: 0.880, val acc: 0.865, test acc: 0.869
Epoch 50: loss 0.394, train Acc: 0.908, val acc: 0.884, test acc: 0.905
Epoch 60: loss 0.352, train Acc: 0.908, val acc: 0.897, test acc: 0.908
Epoch 70: loss 0.325, train Acc: 0.916, val acc: 0.903, test acc: 0.907
Epoch 80: loss 0.312, train Acc: 0.919, val acc: 0.908, test acc: 0.908
Epoch 90: loss 0.298, train Acc: 0.926, val acc: 0.895, test acc: 0.912
Epoch 100: loss 0.303, train Acc: 0.921, val acc: 0.894, test acc: 0.918
Epoch 110: loss 0.303, train Acc: 0.918, val acc: 0.890, test acc: 0.906
Epoch 120: loss 0.284, train Acc: 0.927, val acc: 0.907, test acc: 0.916
Epoch 130: loss 0.276, train Acc: 0.930, val acc: 0.907, test 

# Part II: Graph Classification

## Exercise 3

### 3.1: Load dataset

In [24]:
dataset = dgl.data.TUDataset(name='ENZYMES')

# Add self loop to each graph
dataset.graph_lists = [dgl.add_self_loop(graph) for graph in dataset.graph_lists]

In [25]:
dataset[0]

(Graph(num_nodes=37, num_edges=205,
       ndata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64), 'node_attr': Scheme(shape=(18,), dtype=torch.int64), 'node_labels': Scheme(shape=(1,), dtype=torch.int64)}
       edata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64)}),
 tensor([5]))

In [26]:
print('Number of graph categories:', dataset.num_labels)
print('Dimension of nodes features', dataset[0][0].ndata['node_attr'].shape[1])

Number of graph categories: [6]
Dimension of nodes features 18


In [27]:
# Split dataset into train, validation and test sets
train_sampler, val_sampler, test_sampler = dgl.data.utils.split_dataset(
        dataset, frac_list=[0.6, 0.2, 0.2], shuffle=True)

In [28]:
# batch graphs with GraphDataLoader
train_dataloader = GraphDataLoader(
        train_sampler, batch_size=5, drop_last=False)
val_dataloader = GraphDataLoader(
    val_sampler, batch_size=5, drop_last=False)
test_dataloader = GraphDataLoader(
    test_sampler, batch_size=5, drop_last=False)

### 2.1: Create GNN model for graph classification

In [46]:
class BasicGraphModel(torch.nn.Module):

    def __init__(self, n_layers, input_size, hidden_size, output_size):
        super(BasicGraphModel, self).__init__()

        # Define GNN components
        self.convs = torch.nn.ModuleList()
        self.convs.append(GraphConv(input_size, hidden_size))
        for i in range(n_layers-1):
            self.convs.append(GraphConv(hidden_size, hidden_size))
        self.linear = torch.nn.Linear(hidden_size, output_size)

    def forward(self, g, x):
        # Message Passing -- Learn node representations via GCN
        for conv in self.convs[:-1]:
            x = conv(g, x)
            x = F.elu(x)
        x = self.convs[-1](g, x)
        # Readout -- average all node representations to get graph embedding
        g.ndata['h'] = x
        x = dgl.mean_nodes(g, 'h')
        # Apply linear layer to classify graph representation
        x = self.linear(x)
        return x

### 2.1 Training and evaluation

In [47]:
def train(model, loss_fcn, optimizer, train_dataloader, val_dataloader, num_epochs):
    model = model.double()
    model.train()

    for epoch in range(num_epochs):
        losses = []
        for batch, batched_graph in enumerate(train_dataloader):
            batched_graph, labels = batched_graph
            logits = model(batched_graph, batched_graph.ndata['node_attr'].double())
            loss = loss_fcn(logits, labels.T[0])
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
        loss_data = np.mean(losses)

        if epoch % 5 == 0:
            print("Epoch {} | Loss: {:.4f}".format(epoch, loss_data))
            test(model, loss_fcn, val_dataloader)

In [48]:
def test(model, loss_fcn, dataloader):
    scores = []
    for batch, batched_graph in enumerate(dataloader):
        batched_graph, labels = batched_graph
        scores.append(
            evaluate(model, batched_graph, labels, loss_fcn))
    mean_scores = np.mean(scores)
    print("Accuracy score: {:.4f}".format(mean_scores))

In [49]:
def evaluate(model, batched_graph, labels, loss_fcn):
    model = model.double()
    model.eval()
    with torch.no_grad():
        output = model(batched_graph, batched_graph.ndata['node_attr'].double())

    labels = labels.T[0]
    loss = loss_fcn(output, labels)
    predict = output.argmax(dim=1)
    score = (labels == predict).sum().item() / len(labels)

    return score

In [50]:
# Store features
n_features, n_classes = dataset[0][0].ndata['node_attr'].shape[1], \
    dataset.num_labels[0]
hidden_size = 64

# Define model, loss function and optimizer
model = BasicGraphModel(n_layers=3, input_size=n_features,
                        hidden_size=hidden_size, output_size=n_classes)
optimizer = torch.optim.Adam(model.parameters(), lr=0.002)
loss_fcn = torch.nn.CrossEntropyLoss()

# Train and test
train(model, loss_fcn, optimizer,
        train_dataloader, val_dataloader, num_epochs=150)
test(model, loss_fcn, test_dataloader)


Epoch 0 | Loss: 1.9696
Accuracy score: 0.2167
Epoch 5 | Loss: 1.6956
Accuracy score: 0.2500
Epoch 10 | Loss: 1.6642
Accuracy score: 0.2583
Epoch 15 | Loss: 1.6076
Accuracy score: 0.2750
Epoch 20 | Loss: 1.5372
Accuracy score: 0.2833
Epoch 25 | Loss: 1.4642
Accuracy score: 0.3000
Epoch 30 | Loss: 1.3809
Accuracy score: 0.3583
Epoch 35 | Loss: 1.2993
Accuracy score: 0.3500
Epoch 40 | Loss: 1.2195
Accuracy score: 0.3417
Epoch 45 | Loss: 1.1524
Accuracy score: 0.3250
Epoch 50 | Loss: 1.0951
Accuracy score: 0.3667
Epoch 55 | Loss: 1.0476
Accuracy score: 0.3500
Epoch 60 | Loss: 1.0443
Accuracy score: 0.3417
Epoch 65 | Loss: 0.9964
Accuracy score: 0.3583
Epoch 70 | Loss: 0.9833
Accuracy score: 0.3583
Epoch 75 | Loss: 0.9031
Accuracy score: 0.3833
Epoch 80 | Loss: 0.8359
Accuracy score: 0.4083
Epoch 85 | Loss: 0.7944
Accuracy score: 0.4167
Epoch 90 | Loss: 0.7931
Accuracy score: 0.4500
Epoch 95 | Loss: 0.7201
Accuracy score: 0.4083
Epoch 100 | Loss: 0.7284
Accuracy score: 0.4667
Epoch 105 | Lo