In [1]:
%matplotlib inline

# Lab 4: Graph Neural Networks (GNNs)

In [1]:
# Import packages
import dgl
import torch
import torch.nn.functional as F
import numpy as np
from dgl.nn import GraphConv
from sklearn.model_selection import train_test_split

# Part I: Node Classification

In [2]:
# Load the Amazon co-buy (Photo) dataset
dataset = dgl.data.AmazonCoBuyPhotoDataset()
print('Number of classes:', dataset.num_classes)

# A DGL Dataset object may hold one or several graphs; this one holds a
# single graph, so take the first entry and add self-loops to it in one step.
graph = dgl.add_self_loop(dataset[0])

print('Number of nodes:', graph.num_nodes())
print('Number of edges:', graph.num_edges())

Number of classes: 8
Number of nodes: 7650
Number of edges: 245813


A DGL graph can store node features in a
dictionary-like attribute called ``ndata``.
In the DGL Amazon co-buy dataset, the graph contains the following node features:

- ``label``: The ground truth node category.

-  ``feat``: The node features.

In [3]:
# Show the node-level tensors stored on the graph (header line, then the dict)
print('Node labels and features', graph.ndata, sep='\n')

Node labels and features
{'label': tensor([2, 2, 2,  ..., 3, 1, 1]), 'feat': tensor([[0., 0., 1.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 1.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 1., 1.],
        [1., 1., 0.,  ..., 1., 0., 1.]])}


## Exercise 1:

### 1.1: Retrieve key properties of the dataset

In [None]:
# Define key graph variables
# TODO(exercise): replace every None below with the value read from
# `graph` / `dataset`. Later cells consume these names as follows:
#   X, y  -> passed to train() as the `x` and `labels` arguments
#   N     -> passed to split_dataset() as the dataset size
X = None
y = None 
# presumably the number of node categories and the per-node feature
# dimension; both are expected to be needed when building the model
num_classes = None
num_feat = None
N = None

print('Number of features: ', num_feat)

###  Split into train/validation/test

In [None]:
def split_dataset(N, train_ratio, seed=4):
    """Create boolean train/validation/test masks from a seeded random split.

    ``int(train_ratio * N)`` indices go to the training set. The remaining
    indices are split in half between validation and test; when the remainder
    is odd the test set receives the extra index.

    Args:
        N (int): dataset size (number of indices to split)
        train_ratio (float): proportion of the training set, in [0, 1]
        seed (int, optional): seed of the generator driving the random split,
            which makes the split reproducible. Defaults to 4

    Return:
        [tensors]: boolean tensors of shape (N,) for the train/val/test sets.
        True indicates that a node belongs to this set, False otherwise

    Raises:
        ValueError: if ``train_ratio`` is outside [0, 1]
    """
    if not 0 <= train_ratio <= 1:
        raise ValueError(
            'train_ratio must be in [0, 1], got {}'.format(train_ratio))

    train_size = int(train_ratio * N)
    # Integer floor division keeps the arithmetic exact for any N.
    val_size = (N - train_size) // 2
    test_size = N - train_size - val_size

    # split dataset
    generator = torch.Generator().manual_seed(seed)
    subsets = torch.utils.data.random_split(
        range(N),
        lengths=[train_size, val_size, test_size],
        generator=generator,
    )

    # create one boolean mask per subset by indexing with integer (long)
    # indices; float index tensors would be inexact for very large N
    masks = []
    for subset in subsets:
        mask = torch.zeros(N, dtype=torch.bool)
        mask[torch.tensor(subset.indices, dtype=torch.long)] = True
        masks.append(mask)
    train_mask, val_mask, test_mask = masks

    return train_mask, val_mask, test_mask

# 80% of the nodes for training; the rest is halved into validation and test
# (default seed of split_dataset).
train_mask, val_mask, test_mask = split_dataset(N=N, train_ratio=0.8)


### 1.2 Implement a Graph Convolutional Network
<center><img src="./gcn_web.png" alt="Figure 1: Graph Convolutional Network"/><br/><em>Figure 1: Graph Convolutional Network</em></center>

In [5]:
class GNN_model(torch.nn.Module):
    """
    Graph Convolutional Network (exercise skeleton).

    As provided, the constructor only initialises the ``torch.nn.Module``
    base class and ``forward`` returns ``None``; both are to be completed.
    """

    def __init__(self, num_layers, input_size, hidden_size, output_size, dropout):
        super().__init__()
        # Fill in
        # Define GNN components

    def forward(self, g, x):
        # Fill in
        # Implement the forward function that takes the graph,
        # the features tensor x and returns the output tensor as shown in figure 1

        return None  # output

### 1.3 Training

In [None]:
def train(model, graph, x, labels, num_epochs, optimizer, train_mask, val_mask, test_mask):
    """ Train the GNN model for node classification (exercise skeleton)

    Runs ``num_epochs`` iterations. At each epoch the accuracy on the
    training, validation and test nodes is computed, and every 10th epoch
    the loss and the three accuracies are printed. The best validation
    accuracy and its matching test accuracy are tracked in local variables
    but are neither printed nor returned; the function returns nothing.

    The forward pass, the loss and the backward step are left to fill in.

    Args:
        model: GNN model defined in pytorch
        graph (dgl.graph): dataset on which the task is performed
        x (tensor): node feature matrix 
        labels (tensor): node labels
        num_epochs (int): number of epochs
        optimizer: Adam optimizer
        train_mask (tensor): boolean mask for training nodes
        val_mask (tensor): boolean mask for validation set
        test_mask (tensor): boolean mask for test set
    """
    
    # Train the model (pytorch specific)
    best_val_acc = 0
    best_test_acc = 0

    # NOTE(review): train mode is set once and never switched to eval, so the
    # validation/test accuracies below are computed in training mode.
    model.train()
    for epoch in range(num_epochs):
        # Forward pass -- compute predictions
        # TODO(exercise): the result must hold one score per class for each
        # node, since the argmax over dim=1 below turns it into class indices.
        pred = #

        # Compute loss
        # Note that you should only compute the losses of the nodes in the training set.
        # TODO(exercise): compute it from the raw scores, i.e. before `pred`
        # is rebound to class indices a few lines below.
        loss = #

        # Compute accuracy on training/validation/test
        pred = torch.argmax(pred, dim=1)
        train_acc = (pred[train_mask] == labels[train_mask]).float().mean()
        val_acc = (pred[val_mask] == labels[val_mask]).float().mean()
        test_acc = (pred[test_mask] == labels[test_mask]).float().mean()

        # Save the best validation accuracy and the corresponding test accuracy.
        if best_val_acc < val_acc:
            best_val_acc = val_acc
            best_test_acc = test_acc

        # Backward
        # Fill in the blank

        # Report progress every 10 epochs
        if epoch % 10 == 0:
            print('Epoch {}: loss {:.3f}, train Acc: {:.3f}, val acc: {:.3f}, test acc: {:.3f}'.format(
                epoch, loss, train_acc, val_acc, test_acc))

In [None]:
# Hyperparameters used to instantiate and train the model
# NOTE: no trailing commas here -- `lr=0.01,` would bind the one-element
# tuple (0.01,) instead of the float 0.01.
num_layers = 3
hidden_size = 16
dropout = 0
num_epochs = 400
lr = 0.01
weight_decay = 0.005
# Same values as the split_dataset() call earlier in the notebook
# (train_ratio=0.8, default seed 4).
train_ratio = 0.8
seed = 4

# Instantiate the model
# TODO(exercise): build the model from the hyperparameters defined above.
model = #

# Define an optimizer
# TODO(exercise): the train() docstring expects an Adam optimizer; `lr` and
# `weight_decay` above are presumably meant for it.
optimizer = #

# Train model
train(model, graph, X, y, num_epochs, optimizer, train_mask, val_mask, test_mask)

# Part II: Graph Classification

## Exercise 2

### 2.1: Load dataset

In [None]:
# TODO(exercise): load the graph-classification dataset.
# NOTE(review): this rebinds `dataset`, replacing the Part I dataset object.
dataset = #

# Add self loop to each graph
# Each dataset item is unpacked as a (graph, label) pair.
# NOTE(review): the assignment inside the loop only rebinds the local name
# `graph`; nothing here writes the result back into `dataset`, so confirm the
# self-loops are actually kept. The loop also overwrites the Part I `graph`
# variable, and the index `i` is unused.
for i, graph in enumerate(dataset):
    graph, label = graph
    graph = #

In [None]:
# Split dataset into train, validation and test sets
# TODO(exercise): the right-hand side must yield exactly three values, one
# per split, to be unpacked into the three names below.
train_sampler, val_sampler, test_sampler = #

In [None]:
# Batch graphs with GraphDataLoader
# TODO(exercise): build one loader per split.
# NOTE(review): GraphDataLoader is not imported in the imports cell at the top
# of the notebook; it needs an import (presumably from dgl.dataloading).
train_dataloader = #
val_dataloader = #
test_dataloader = #

### 2.2: Create GNN model for graph classification

In [8]:
class BasicGraphModel(torch.nn.Module):
    """
    GNN model for graph classification (exercise skeleton).

    As provided, the constructor only initialises the ``torch.nn.Module``
    base class and ``forward`` returns ``None``; both are to be completed.
    """

    def __init__(self, n_layers, input_size, hidden_size, output_size):
        super().__init__()

        # Define GNN components

    def forward(self, g, x):

        # Create GNN 

        return None  # output

### 2.3 Training and evaluation

In [None]:
def train(model, loss_fcn, optimizer, train_dataloader, val_dataloader):
    """Train the graph-classification model (exercise stub).

    NOTE: this definition reuses the name ``train`` from Part I, so running
    this cell replaces the node-classification ``train`` in the kernel.

    Args:
        model: model to train
        loss_fcn: loss function
        optimizer: optimizer updating the model
        train_dataloader: loader over the training graphs
        val_dataloader: loader over the validation graphs

    Raises:
        NotImplementedError: until the exercise body is filled in
    """
    # Train model
    # Fill in
    raise NotImplementedError('Exercise 2.3: implement train()')

In [None]:
def test(model, loss_fcn, dataloader):
    # Test predictions
    # Fill in 

In [None]:
def evaluate(model, subgraph, labels, loss_fcn):
    """Evaluate the model on one (sub)graph and its labels (exercise stub).

    Args:
        model: model to evaluate
        subgraph: graph (or batch of graphs) to run the model on
        labels: ground-truth labels for ``subgraph``
        loss_fcn: loss function

    Raises:
        NotImplementedError: until the exercise body is filled in
    """
    # Evaluate model
    # Fill in 
    raise NotImplementedError('Exercise 2.3: implement evaluate()')

In [None]:
# Input feature size and number of classes
# The feature size is the second dimension of the first graph's 'node_attr'
# node data; the class count is the first entry of `dataset.num_labels`.
n_features, n_classes = dataset[0][0].ndata['node_attr'].shape[1], \
    dataset.num_labels[0]

# Define model, loss function and optimizer
# TODO(exercise): fill in the three definitions below.
# NOTE(review): `model` and `optimizer` rebind the Part I objects of the same name.
model = # 
optimizer = # 
loss_fcn = # 

# Train and test
# These calls use the Part II train()/test() definitions; evaluate() is not
# called here and is presumably meant to be used inside them.
train(model, loss_fcn, optimizer, train_dataloader, val_dataloader)
test(model, loss_fcn, test_dataloader)
