In [None]:
## This is a compilation of the PyTorch Geometric tutorial notebooks.
## It compiles the following notebooks:
## 0. graph.ipynb, 1. mp_layer.ipynb, 2. gnn.ipynb, 3. dataloader.ipynb, 4. train_gcn.ipynb
## The original notebooks can be found at
## https://github.com/Junyoungpark/GNNAtoZ/tree/main/pyg_tutorial

## Tutorial 0 : Bring Your Own Graphs

### Mathematical definition of graphs

A graph is a structure amounting to a set of objects where some objects are "related." The objects are often referred to as 'nodes' (or vertices), and the related pairs of vertices are referred to as 'edges' (or lines). Mathematically speaking, there is no single unified framework for describing a graph. In this series of tutorials, we represent a graph as a tuple:

$$\mathcal{G}=\langle V,A \rangle$$

where $V$ is the set of vertices and $A$ is the adjacency matrix. The rows and columns of $A$ are indexed by the nodes: if the value of $A_{ij}$ is 1, then an edge exists between the two nodes $i$ and $j$. For undirected graphs, the adjacency matrix $A$ is symmetric.

We can extend this setup to directed graphs, where the edges have a direction. The source node is the node an edge starts from, and the destination node is the node the edge reaches. In directed graphs, the adjacency matrix $A$ can be asymmetric.

Note that the indexing of nodes (and edges) is arbitrary. This property becomes important in the computations of GNNs, where it is known as permutation invariance.

## Instantiating a graph in python

In python, we have a couple of ways to build graphs. We focus on `networkx` and `PyG` for this purpose.

`networkx` is one of the basic packages for handling graph-structured data in python. Hence almost every major graph-related package has an interface to convert its framework-specific graph to the `networkx` equivalent.

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

In [None]:
# Start from an empty undirected graph container ...
G = nx.Graph()
# ... and register three isolated nodes with the ids 1, 2 and 3.
G.add_nodes_from([1, 2, 3])
# No edges exist yet, so only the three labelled nodes are drawn.
nx.draw_networkx(G)

Let's add some edges to the graph `G`.

In [None]:
# Connect the nodes into the path 1 - 2 - 3 and redraw the graph.
G.add_edges_from([(1, 2), (2, 3)])
nx.draw_networkx(G)

## PyG as a computational framework over graphs

Even though `networkx` supports a unified way for handling graphs in python, it is not friendly with the computations. This section mathematically defines the attributed graphs where the nodes (or/and) edges have the attributes and handling such attributed graphs in `PyG`.

The attributed graph $\mathcal{G}$ is often defined as follows:
$$\mathcal{G} = \langle V, E \rangle$$

where the $V$ is the **set** of node-related features (node features), and $E$ is the **set** of edge-related features (edge features). Assume the adjacency matrix $A$ is defined implicitly.

We often assume all the node features for all nodes have the same feature dimensions for the simplicity of computations. Similarly, we make the same assumption on the edge features. 

In practice, we achieve such assumptions easily. In some applications, different nodes may be of different types. As a result, different nodes may have different input features. However, by appending some proper dimensional vectors in the beginning or end, we can treat them to be the same dimensional vectors.

Let's instantiate an attributed graph with `PyG`.

In [None]:
! pip install torch_geometric

import torch_geometric as pyg
from torch_geometric.data import Data
import torch

In [None]:
# Size of the toy graph: number of nodes, and the feature dimension of every
# node and every edge.
num_nodes, node_feat_dim, edge_feat_dim = 4, 9, 7

In [None]:
# Node features: random integers in {0, 1, 2}, one row per node
x = torch.randint(high=3, size=(num_nodes, node_feat_dim))  # [num_nodes, node_feat_dim]

# Directed edges 0->1, 0->2, 0->3, 1->3, 2->1, given as separate lists of
# source indices (u) and destination indices (v)
u = torch.tensor([0, 0, 0, 1, 2])
v = torch.tensor([1, 2, 3, 3, 1])
edge_index = torch.stack((u, v), dim=0)  # [2, num_edges]
num_edges = edge_index.shape[1]

# Edge features: one random row per edge
edge_attr = torch.randn(size=(num_edges, edge_feat_dim))  # [num_edges, edge_feat_dim]

# Bundle node features, connectivity and edge features into one graph object
g = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)
print(g)

## Querying attributes of graph

We've assigned the node features as `x` and the edge features as `edge_attr` to the graph `g`. We can access them as follows:

In [None]:
# Node features are stored under the attribute `x`
print(f"Node features \n size: {g.x.shape} \n values: \n {g.x}") # equivalently g['x']

# Edge features are stored under the attribute `edge_attr`
print(f"Edge features: \n size: {g.edge_attr.shape} \n values: \n {g.edge_attr}")


## Querying the structure of the graph

When it comes to implementing GNN, querying the statistics of graph, such as number of nodes, edges, and the degree of nodes, are often required. In this section, we will see how to query the graph structure with `PyG`.


In [None]:
# number of nodes in the graph
print(f"Number of Nodes: {g.num_nodes}")

# number of edges in the graph
print(f"Number of edges: {g.num_edges}")

# node feature dimension
print(f"Node feature dimension: {g.num_node_features}")

# edge feature dimension
print(f"Edge feature dimension: {g.num_edge_features}")

## Assigning arbitrary attributes to the graph

From time to time, we may want to store additional information in the graph, for example the prediction label of the graph or graph-level features. In this section, we will see how to assign arbitrary attributes to the graph with `PyG`.

In [None]:
# Any extra keyword passed to `Data` is stored as an attribute of the graph.
# Here a per-node label and a single graph-level label are attached.
g = Data(
    x=x,                               # node feature matrix
    edge_index=edge_index,
    edge_attr=edge_attr,
    y_node=torch.randn(num_nodes, 1),  # node label
    y_graph=torch.randn(1, 1),         # graph label
)
print(g)

In [None]:
# The custom attribute is read back under the keyword it was stored with
print(g.y_node)

## Device control of `Data` object

In pytorch, we can move the data to the GPU by calling `to` method. In `PyG`, we can do the same thing by calling `to` method. In this section, we will see how to move the `Data` object to the GPU.

In [None]:
# Move the graph to CPU
g = g.to('cpu')
print(g.x.device)

# Move the graph to the first CUDA device (GPU 0) -- but only when one is
# actually present, so the notebook still runs top to bottom on CPU-only machines.
if torch.cuda.is_available():
    g = g.to('cuda:0')
    print(g.x.device)
else:
    print("CUDA is not available; the graph stays on the CPU.")

A more detailed explanation of `Data` can be found in the [official documentation](https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.data.Data.html#torch_geometric.data.Data).

## Batching graph with `Batch`

Unlike tensor-based data, graph-based data has a variable size of the graph. 
Due to this structural difference, the conventional way of batching -- i.e., stacking the tensors along a new axis -- is not applicable to graph-based data. Instead, graphs are batched into one big (disconnected) graph. In this section, we will see how to batch graphs with the `Batch` object, as shown in the following:

$$
\mathbf{A}=\left[\begin{array}{ccc}
\mathbf{A}_1 & & \\
& \ddots & \\
& & \mathbf{A}_n
\end{array}\right], \quad \mathbf{X}=\left[\begin{array}{c}
\mathbf{X}_1 \\
\vdots \\
\mathbf{X}_n
\end{array}\right], \quad \mathbf{Y}=\left[\begin{array}{c}
\mathbf{Y}_1 \\
\vdots \\
\mathbf{Y}_n
\end{array}\right]
$$

In [None]:
def generate_random_graph(num_node: int, 
                          p_edges: float=0.1,
                          node_feat_dim: int=3,
                          edge_feat_dim: int=5):
    """Generate a random directed graph with random node and edge features.

    Args:
        num_node: Number of nodes of the graph.
        p_edges: Fraction of the ``num_node * num_node`` ordered node pairs
            (self-loops included) that are kept as edges.
        node_feat_dim: Dimension of the node features.
        edge_feat_dim: Dimension of the edge features.

    Returns:
        A ``Data`` object with ``x`` [num_node, node_feat_dim],
        ``edge_index`` [2, num_edges], ``edge_attr`` [num_edges, edge_feat_dim]
        and ``dummy_attr`` [num_node, 1].
    """
    x = torch.randn(size=(num_node, node_feat_dim))

    # Edges are directed and self-loops are allowed, hence there are
    # num_node * num_node candidate edges. The count is capped so that
    # `edge_attr` can never have more rows than there are edges.
    num_candidates = num_node * num_node
    num_edges = min(int(num_candidates * p_edges), num_candidates)

    # (1) Enumerate all ordered (source, destination) pairs exactly once:
    #     sources      0 0 ... 0 1 1 ... 1 ...   (each id repeated num_node times)
    #     destinations 0 1 ... n 0 1 ... n ...   (the id range tiled num_node times)
    #     Tiling both rows with `repeat` would only produce duplicated self-loops.
    node_ids = torch.arange(num_node)
    edge_index = torch.stack([node_ids.repeat_interleave(num_node),
                              node_ids.repeat(num_node)], dim=0)
    # (2) Shuffle the candidates and keep the first 'num_edges' of them
    edge_index = edge_index[:, torch.randperm(edge_index.shape[1])]
    edge_index = edge_index[:, :num_edges]

    edge_attr = torch.randn(size=(num_edges, edge_feat_dim))

    # Extra per-node attribute, used later to demonstrate `exclude_keys`
    dummy_attr = torch.randn(size=(num_node, 1))

    return Data(x=x, edge_index=edge_index, edge_attr=edge_attr,
                dummy_attr=dummy_attr)

In [None]:
# Node counts of the individual graphs; one random graph is generated per entry.
num_nodes = [7, 8, 5, 11]
num_gs = len(num_nodes)

gs = [generate_random_graph(num_node=num_node) for num_node in num_nodes]

# Graphs of different sizes cannot be stacked into one tensor -- compare their shapes.
for i, g in enumerate(gs):
    print(f"Graph {i+1}: {g}")

In [None]:
from torch_geometric.data import Batch

# Merge the list of graphs into one big, disconnected graph. Attributes listed
# in `exclude_keys` are left out of the batch.
excluded = ['dummy_attr']
batched_g = Batch.from_data_list(gs, exclude_keys=excluded)
print(type(batched_g))  # Batch is a subclass of Data, so it supports the same API
print(batched_g)

## Querying the batched graph statistics

In [None]:
# How many individual graphs were merged into the batch
print(f"Number of batched graphs: {batched_g.num_graphs}")
# Cumulative node counts, as stored in the `ptr` attribute
print(f"Number of cumulative nodes: {batched_g.ptr}")
# Batch indicator vector, as stored in the `batch` attribute
print(f"Batch indicators: {batched_g.batch}")

## Tutorial 1 : Implementing GNN layer using `MessagePassing` class

In general, a Graph Neural Network (GNN) layer can be categorized as a spectral-based or a spatial-based method. A spectral-based GNN layer is defined in the Fourier domain, while a spatial-based GNN layer is defined in the vertex domain. The majority of mainstream GNN models are spatial-based methods due to the limitations of spectral-based methods, including the difficulty of generalizing to unseen graphs and the high computational complexity.

In this tutorial, we will focus on implementing spatial-based GNN layers (i.e., message passing networks) using the `MessagePassing` class of `PyG`.

In [None]:
import torch
import torch.nn as nn
from torch_geometric.nn.conv import MessagePassing

## `MessagePassing` Base Class in PyG

The `MessagePassing` base class in PyG implements the message passing scheme as follows:

$$
x'_i=
\underbrace{
f_\theta \left(\mathbf{x}_i,
\underbrace{ 
\bigoplus_{j \in \mathcal{N}(i)} 
}_{\text{(2) aggregation}}
\underbrace{
g_\theta\left(\mathbf{x}_i, \mathbf{x}_j, \mathbf{e}_{ij}\right)
}_{\text{(1) message}}
\right)
}_{\text{(3) update}},
$$

where 
- $x_i \in \mathbb{R}^{d_n}$ and $x'_i \in \mathbb{R}^{d_n'}$ are the input and updated node features (embeddings), respectively.
- $d_n$ and $d'_n$ are the dimensions of node features before and after the update.
- $e_{ij} \in \mathbb{R}^{d_{e}}$ is the edge feature between nodes $i$ and $j$.
- $d_e$ is the dimension of edge features.
- $\bigoplus$ is a differentiable and permutation-invariant function (e.g., summation, mean, maximum).
- $g_\theta : \mathbb{R}^{d_n} \times \mathbb{R}^{d_n} \times \mathbb{R}^{d_{e}} \rightarrow \mathbb{R}^{d''}$ is a trainable edge function (e.g., MLP).
- $f_\theta : \mathbb{R}^{d_n} \times \mathbb{R}^{d''} \rightarrow \mathbb{R}^{d'_n}$ is a trainable mapping function (e.g., MLP).
- $\mathcal{N}(i)$ is the set of neighbors of node $i$.


This message passing implementation is flexible and can be used to implement various GNN variants, including GCN, GAT, GraphSAGE, Interaction Layers, etc. 

In PyG, `MessagePassing` triggers the (1) message, (2) message aggregation, and (3) node update functions in the order (1) -> (2) -> (3) when `propagate` is called. In the following tutorial, we will demonstrate how to implement the components of message passing (message, aggregation, and update) to create different graph convolution layers.

## Very First Message Passing Layer; NaiveGCN

We will use the `MessagePassing` base class to implement the NaiveGCN layer that performs the following message passing scheme:

$$
x'_i = \sigma \left(\sum_{j \in \mathcal{N}(i)} \left(W\mathbf{x}_j+b\right) \right),
$$
where $\sigma$ is a non-linear activation (e.g. ReLU, Tanh, SiLU, GELU, ...)

In [None]:
# Yes this is it! We are done with the implementation of NaiveGCN.

class NaiveGCNConv(MessagePassing):
    """Naive graph convolution: x'_i = act( sum_{j in N(i)} (W x_j + b) ).

    Args:
        dim: Dimension of the input and of the output node features.
        act: Name of an activation class in ``torch.nn`` (e.g. 'ReLU', 'Tanh').
    """

    def __init__(self, dim:int, act:'str'='ReLU'):
        # aggr='add' makes `propagate` combine the messages by summation.
        super().__init__(aggr='add')
        self.linear = nn.Linear(dim, dim)
        # Look the activation class up by name and instantiate it.
        activation_cls = getattr(nn, act)
        self.act = activation_cls()

    def forward(self, x, edge_index):
        projected = self.linear(x)  # W x + b for every node
        # No `message` override is needed: the messages are the projected features themselves.
        aggregated = self.propagate(x=projected, edge_index=edge_index)
        return self.act(aggregated)

In [None]:
# A small random graph whose node and edge features share the dimension `n_dim`
num_node, n_dim = 5, 16

g = generate_random_graph(num_node=num_node,
                          p_edges=0.5,
                          edge_feat_dim=n_dim,
                          node_feat_dim=n_dim)

In [None]:
# Apply the layer once and inspect the shape of the updated node features
conv = NaiveGCNConv(dim=n_dim)
gc_out = conv(x=g.x, edge_index=g.edge_index)
print(gc_out.shape)

## Can we have a more fine-grained control over messaging passing scheme?

As explained earlier, we can generally manipulate (1) message generation routine,
(2) message aggregation routine, and (3) node update routine to implement GNN layers.

In the following, we will implement such (sub) routines in `MessagePassing` class.
Generally, (1) is done by overriding the `message` method, (2) is done by specifying `aggr` in the `MessagePassing` class's constructor, and (3) is done by overriding the `update` method.

### `message` method of `MessagePassing` class

The NaiveGCN can be implemented without explicit message generation as the messages are the source node features. However, in many cases, we often employ more sophisticated message generation schemes that generates "message" from source, destination, and edge features (if applicable). For example, the following message generation scheme is used in 
[Edge Convolution](https://arxiv.org/abs/1801.07829):

$$
x'_i = \max_{j \in \mathcal{N}(i)} h_\theta\left(x_i, x_j-x_i\right),
$$

where $h_\theta$ is a learnable function.

In [None]:
class EdgeConv(MessagePassing):
    """Edge convolution: x'_i = max_{j in N(i)} h(x_i, x_j - x_i).

    Args:
        dim: Dimension of the input and of the output node features.
    """

    def __init__(self, dim:int):
        super().__init__(aggr='max')  # messages are aggregated with the maximum
        # h: MLP applied to the concatenation [x_i, x_j - x_i]
        layers = [nn.Linear(dim*2, dim), nn.ReLU(), nn.Linear(dim, dim)]
        self.h = nn.Sequential(*layers)

    def forward(self, x, edge_index):
        return self.propagate(edge_index=edge_index, x=x)

    def message(self, x_i, x_j):
        # Overriding this method specifies how the message of every edge is computed.
        #
        # PyG fills the `_i` / `_j` arguments from `x` for every edge: with the
        # default flow ('source_to_target'), x_i holds the features of the node
        # that receives the message (the central node i of the formula) and x_j
        # the features of the neighbour j that sends it.
        # Both have shape [E, dim], where E is the number of edges.
        edge_input = torch.cat((x_i, x_j - x_i), dim=-1)  # [E, 2*dim]
        return self.h(edge_input)  # [E, dim]

In [None]:
# Apply the edge convolution once and inspect the shape of the updated node features
conv = EdgeConv(dim=n_dim)
gc_out = conv(x=g.x, edge_index=g.edge_index)
print(gc_out.shape)

### `Update` method of `MessagePassing` class

The update method is used to update the node features using the aggregated messages. As a running example, we will consider a simplified Interaction Network layer, an iconic GNN model for learning the interaction between two objects (e.g., atoms, nodes, etc.) in a graph. The update function of the Interaction Network layer is defined as follows:
$$
\begin{align}
e'_{ij} &= f_\theta(x_i, x_j, e_{ij}), \\
x'_{i} &= g_\theta(x_i, \sum_{j \in \mathcal{N}(i)} e'_{ij}),
\end{align}
$$

where $f_\theta$ and $g_\theta$ are edge and node updater, respectively. The updaters are often implemented with learnable functions (e.g., MLP)

Unlike `NaiveGCNConv` or `EdgeConv`, the Interaction Network layer needs to update the edge features before it performs the node updates. To take the edge update into account, we additionally implement the `edge_update` method of the `MessagePassing` class. This `edge_update` method is called inside the `edge_updater` method that is already defined in the `MessagePassing` class.

In [None]:
class InteractionNetworkLayer(MessagePassing):
    """Simplified Interaction Network layer.

    The edges are updated first, e'_ij = f(x_i, x_j, e_ij)           (Eq. 1),
    then the nodes are updated from the summed updated edge features,
    x'_i = g(x_i, sum_j e'_ij)                                       (Eq. 2).

    Args:
        dim: Common dimension of the node and the edge features.
    """

    def __init__(self, dim:int):
        super().__init__(aggr='add')
        # Edge updater f. Its input concatenates two node features and one edge
        # feature, which assumes edge and node features have the same dimension.
        self.f = nn.Sequential(nn.Linear(dim*3, dim),
                               nn.ReLU(),
                               nn.Linear(dim, dim))
        # Node updater g. Its input concatenates a node feature and the aggregated message.
        self.g = nn.Sequential(nn.Linear(dim*2, dim),
                               nn.ReLU(),
                               nn.Linear(dim, dim))

    def forward(self, x, edge_index, edge_attr):
        # Eq (1): `edge_updater` dispatches to `edge_update` below.
        updated_ef = self.edge_updater(edge_index=edge_index, x=x, edge_attr=edge_attr)
        # Eq (2): the updated edge features serve as the messages.
        updated_nf = self.propagate(edge_index=edge_index,
                                    x=x, edge_attr=updated_ef)
        return updated_nf, updated_ef

    def edge_update(self, x, edge_index, edge_attr):
        # First row of edge_index: source node ids; second row: destination node ids.
        src_idx, dst_idx = edge_index
        edge_input = torch.cat([x[src_idx], x[dst_idx], edge_attr], dim=-1)
        return self.f(edge_input)  # Eq (1)

    # Eq (2) related
    def message(self, edge_attr):
        # The message of an edge is its (already updated) edge feature.
        return edge_attr

    def update(self, aggr_msg, x):
        # !!! The update function receives the aggregated messages as its first argument !!!
        # The remaining arguments are picked, by name, from the arguments passed to `propagate`.
        #
        # Suggested exercise: request an argument that is NOT passed to `propagate`,
        # e.g. change 'def update(self, aggr_msg, x)' to 'def update(self, aggr_msg, x, y)',
        # and observe what happens.
        node_input = torch.cat([x, aggr_msg], dim=-1)
        return self.g(node_input)

In [None]:
# The layer returns both the updated node features and the updated edge features
conv = InteractionNetworkLayer(dim=n_dim)
updated_nf, updated_ef = conv(x=g.x, edge_index=g.edge_index, edge_attr=g.edge_attr)
print(updated_nf.shape, updated_ef.shape)

## `aggr` argument of `MessagePassing` class's constructor

### Implementing `AttentiveInteractionLayer` with Advanced aggregation

So far, we've considered to use "simple" aggregation methods in aggregating messages. However, we can consider a more sophisticated aggregation methods that can be used to implement various GNN variants. Luckily, PyG provides a set of aggregation methods that can be used to implement various GNN variants. For example, we implement attentive aggregation with `aggr.AttentionalAggregation`. Formally, the following layer performs the following message passing scheme:

$$
\begin{align}
e'_{ij} &= f_\theta(x_i, x_j, e_{ij}), \\
w_{ij} &= \text{softmax}_j \left(\text{gate}_\theta(e'_{ij}) \right), \\
x'_{i} &= g_\theta(x_i, \sum_{j \in \mathcal{N}(i)} w_{ij} e'_{ij}),
\end{align}
$$

In [None]:
from torch_geometric.nn import aggr

class AttentiveINLayer(MessagePassing):
    """Interaction Network layer with attentional message aggregation.

    Same edge / node updaters as the plain Interaction Network layer, but the
    updated edge features are combined with `aggr.AttentionalAggregation`
    instead of a plain sum.

    Args:
        dim: Common dimension of the node and the edge features.
    """

    def __init__(self, dim:int):
        # Gate network handed to the aggregation: maps every message to one scalar score.
        gate_nn = nn.Sequential(nn.Linear(dim, dim),
                                nn.Tanh(),
                                nn.Linear(dim, 1))
        attentive_aggr = aggr.AttentionalAggregation(gate_nn=gate_nn)
        super().__init__(aggr=attentive_aggr)

        # Edge updater f; assumes edge and node features have the same dimension.
        self.f = nn.Sequential(nn.Linear(dim*3, dim),
                               nn.ReLU(),
                               nn.Linear(dim, dim))
        # Node updater g; consumes [node feature, aggregated message].
        self.g = nn.Sequential(nn.Linear(dim*2, dim),
                               nn.ReLU(),
                               nn.Linear(dim, dim))

    def forward(self, x, edge_index, edge_attr):
        # Eq (1): `edge_updater` dispatches to `edge_update` below.
        updated_ef = self.edge_updater(edge_index=edge_index, x=x, edge_attr=edge_attr)
        # Eqs (2)-(3): the updated edge features are the messages to aggregate.
        updated_nf = self.propagate(edge_index=edge_index, x=x, edge_attr=updated_ef)
        return updated_nf, updated_ef

    def edge_update(self, x, edge_index, edge_attr):
        # First row of edge_index: source node ids; second row: destination node ids.
        src_idx, dst_idx = edge_index
        edge_input = torch.cat([x[src_idx], x[dst_idx], edge_attr], dim=-1)
        return self.f(edge_input)  # Eq (1)

    def message(self, edge_attr):
        # The message of an edge is its (already updated) edge feature.
        return edge_attr

    def update(self, aggr_msg, x):
        # aggr_msg: aggregated messages, passed as the first argument
        node_input = torch.cat([x, aggr_msg], dim=-1)
        return self.g(node_input)

In [None]:
# Same interface as the plain Interaction Network layer: two inputs besides the
# edge index, two outputs.
conv = AttentiveINLayer(dim=n_dim)
updated_nf, updated_ef = conv(x=g.x, edge_index=g.edge_index, edge_attr=g.edge_attr)
print(updated_nf.shape, updated_ef.shape)

## Tutorial 2 : Build Graph Neural Networks with PyG

In this tutorial, we will learn how to build a graph neural network with PyG.
PyG offers various handy features when it comes to build GNNs, including
- An extended `Sequential` module that can be used to build GNN
- Pre-implemented and, also, optimized graph convolutional layers
- Graph Neural Network implementations

### A limitation of `torch.nn.Sequential`

In native PyTorch, `torch.nn.Sequential` is a handy module that allows us to build a neural network in a sequential manner. For example, we can build a simple MLP with `torch.nn.Sequential` as follows:

```python
mlp = nn.Sequential(
    nn.Linear(32, 32),
    nn.ReLU(),
    nn.Linear(32, 1),
)
```

`Sequential` class minimizes the boiler plate code for implementing `forward` methods. 
We can implement equivalent MLP without using `Sequential` as follows:

```python
import torch.nn as nn

class MLP(nn.Module):
    
    def __init__(self):
        super(MLP, self).__init__()
        self.layers = nn.ModuleList([nn.Linear(32, 32), 
                                     nn.ReLU(), 
                                     nn.Linear(32, 1)])
    
    def forward(self, x):        
        for layer in self.layers:
            x = layer(x)
        return x

mlp = MLP()
```



However, `torch.nn.Sequential` has the limitation that each layer should have only one input and one output.
This limitation becomes a problem when it comes to building a graph neural network. For instance, `InteractionNetworkLayer` takes
node and edge features (together with the edge index) as inputs and returns two outputs, the updated node and edge features. Hence
it is less trivial to build a graph neural network with `torch.nn.Sequential`.

### Using `torch_geometric.nn.Sequential` to build GNN

`torch_geometric.nn.Sequential` is an extended version of `torch.nn.Sequential` that allows us to build a graph neural network in a sequential manner. Let's see how we can build a graph neural network with `torch_geometric.nn.Sequential`.

In [None]:
from torch_geometric.nn import Sequential
from torch_geometric.data import Batch

# `help` prints the documentation itself and returns None, so it is called bare:
# wrapping it in print() would append a stray "None" line to the output.
help(InteractionNetworkLayer.forward)

In [None]:
dim = 5

# The first argument declares the inputs of the whole model. Every entry of the
# list is a (layer, "inputs -> outputs") pair that tells Sequential which
# variables to feed into the layer and under which names to store its outputs.
model = Sequential(
    "x, edge_index, edge_attr",  # input
    [
        (InteractionNetworkLayer(dim), "x, edge_index, edge_attr -> x, edge_attr")
        for _ in range(2)
    ],
)

In [None]:
# Batch three random graphs with 5, 10 and 15 nodes whose node and edge
# features both have dimension `dim`.
gs = Batch.from_data_list([
    generate_random_graph(num_node, node_feat_dim=dim, edge_feat_dim=dim)
    for num_node in (5, 10, 15)
])

In [None]:
# Run the stacked layers on the batched graph. The arguments are passed positionally,
# in the order declared by the input signature "x, edge_index, edge_attr".
unf, uef = model(gs.x, gs.edge_index, gs.edge_attr) # updated node feature (unf), updated edge feature (uef)

## Pre-implemented graph convolutional layers in PyG

PyG offers various pre-implemented graph convolutional layers. Let's see how we can use them.
In this tutorial, we will check two iconic graph convolutional layers, `GCNConv` and `SAGEConv`.
The exhaustive list of implemented graph convolutional layers can be found [here](https://pytorch-geometric.readthedocs.io/en/latest/modules/nn.html#convolutional-layers).

### Implementing `GCNConv` in PyG

`GCNConv` is a graph convolutional layer proposed in [Semi-Supervised Classification with Graph Convolutional Networks](https://arxiv.org/abs/1609.02907). In PyG, we can implement Graph Convolutional Network (GCN) in various way.
We will check two different ways to implement GCN in PyG.
- Using `GCNConv` layer with `Sequential` module
- Using 'models.GCN' class

In [None]:
from torch_geometric.nn import GCNConv

out_dim = 13

# --- A single GCN layer (GCNConv) mapping `dim` input features to `out_dim` ---
gcn_conv = GCNConv(in_channels=dim, out_channels=out_dim)
updated_x = gcn_conv(gs.x, gs.edge_index)

print(f'Input node feature size: {gs.x.shape}')
print(f'Output node feature size: {updated_x.shape} \n')

# --- A GCN built by stacking three GCNConv layers with Sequential ---
# The comprehension creates three independent layers; it is equivalent to
# writing the (layer, signature) pair out three times.
gcn = Sequential(
    "x, edge_index",
    [(GCNConv(dim, dim), "x, edge_index -> x") for _ in range(3)],
)

print(f'Model spec: \n {gcn} \n')

# GCN forward
gcn_out = gcn(gs.x, gs.edge_index)
print(f'Input node feature size: {gs.x.shape}')
print(f'GCN output node feature size: {gcn_out.shape}')

### Construct GCN using `torch_geometric.nn.models.GCN`

`PyG` provides pre-implemented famous GNN models with the enhanced features and code-level optimizations.
`torch_geometric.nn.models.GCN` is one of the pre-implemented GCN models in `PyG`. Using it, we can build a GCN by simply
calling the `models.GCN` class.

In [None]:
from torch_geometric.nn.models import GCN

# The pre-implemented model builds the whole stack of layers from a few numbers
gcn = GCN(in_channels=dim, hidden_channels=dim, num_layers=3, out_channels=dim)

gcn_out = gcn(gs.x, gs.edge_index)
print(f'Input node feature size: {gs.x.shape}')
print(f'GCN output node feature size: {gcn_out.shape}')

## Implementing Graph SAGE with PyG

Graph SAGE is a graph convolutional layer proposed in [Inductive Representation Learning on Large Graphs](https://arxiv.org/abs/1706.02216). 

In [None]:
from torch_geometric.nn import SAGEConv

# A single GraphSAGE convolution that uses mean aggregation
sage_conv = SAGEConv(in_channels=dim, 
                     out_channels=dim,
                     aggr='mean')
print(sage_conv)
sage_out = sage_conv(gs.x, gs.edge_index)
print(f'Input node feature size: {gs.x.shape}')
# The label used to read "Graph SAVE", a typo for the model name.
print(f'GraphSAGE output node feature size: {sage_out.shape}')

In [None]:
from torch_geometric.nn.models import GraphSAGE

# Extra keyword arguments forwarded to the model constructor
sage_conv_kwargs = dict(aggr='mean')

graph_sage = GraphSAGE(
    # '-1' lets the model infer the input dimension from the first forward pass.
    # This can be handy, BUT it is not recommended for readability and reproducibility.
    in_channels=-1,
    hidden_channels=dim,
    out_channels=13,
    num_layers=3,
    **sage_conv_kwargs,
)
graph_sage

In [None]:
# Forward pass through the pre-implemented GraphSAGE model
graph_sage_out = graph_sage(gs.x, gs.edge_index)
print(f'Input node feature size: {gs.x.shape}')
# The label used to read "Graph SAVE", a typo for the model name.
print(f'GraphSAGE output node feature size: {graph_sage_out.shape}')

## Graph Readout and Pooling

So far, we've learned how to build a graph neural network with PyG. The graph neural network $f_\theta$ generically takes a graph $\mathcal{G}=(X,E)$ and returns the updated graph $\mathcal{G}'=(X',E')$ as follows:
$$
\mathcal{G}' = f_\theta(\mathcal{G})
$$

However, for some tasks, we want to map the graph to a single vector (e.g., graph property prediction tasks, where the input is a graph and the output is a scalar value). In this case, we need to aggregate the node features into a single vector. This process is called graph pooling or graph readout. In this tutorial, we will learn how to implement graph pooling with PyG.

## A very simple pooling method; SumPooling

The simplest way to aggregate node features into a single vector is to sum up all the node features. This method is called SumPooling. Mathematically, SumPooling can be defined as follows:
$$
x_\mathcal{G} = \sum_{i \in \mathcal{N}} x_i
$$
where $\mathcal{N}$ is the set of nodes in the graph $\mathcal{G}$. We can also consider to pool edges features (if exists) in a similar fashion. 

Okay, why don't we implement the sum pooling as follows?

```python

import torch

num_graphs = 3
num_nodes = 5
hidden_dim = 12

h = torch.randn(num_graphs, num_nodes, hidden_dim)
aggr = h.sum(dim=1) # perform summation along the node dimension (dim=1)
print(aggr.shape) # torch.Size([3, 12])
```

Unfortunately, it is often impossible to batch node features into a single dense tensor like this, as the graphs in a batch often have different numbers of nodes. Therefore, we need a sum pooling routine that can handle a batch of graphs with different numbers of nodes. In PyG, we can perform sum pooling as follows:

In [None]:
from torch_geometric.nn.pool import global_add_pool, global_mean_pool, global_max_pool # Yes, you can do mean, max pooling in PyG!

# `batch` is a tensor that indicates which graph each node belongs to; the node
# features of every graph are summed into one row.
pooled = global_add_pool(gs.x, gs.batch)
pooled.shape

## Advanced: `pytorch_scatter` for more complex pooling routines

`pytorch_geometric` used to implement several key features with `pytorch_scatter` for 'pooling' (i.e., aggregating a set of vectors into a single vector). In this section, we will learn how to use `pytorch_scatter` for more complex pooling routines.

<div style="text-align: center;">
  <img src="./assets/add.svg" alt="Image description" style="width: 400px;">
  <p style="margin-top: 10px;">The behavior of torch_scatter.scatter </p>
</div>

Figure from [here](https://pytorch-scatter.readthedocs.io/en/latest/functions/add.html)

As you can see from the figure, the `scatter` operation aggregates 'src' (the input) into the output according to the 'index' vector. This design choice
allows storing the data in plain tensors while aggregating groups with different numbers of inputs in a single operation.

**Note: the same features are now implemented in `torch_geometric.utils.scatter`**

In [None]:
import torch

# `torch_scatter` is a separate, optional extension that `pip install torch_geometric`
# does not install, so a missing package must not abort the notebook.
try:
    from torch_scatter import scatter
except ImportError:
    pass
# PyG ships an equivalent implementation; this import is the one used below.
from torch_geometric.utils import scatter

dim1, dim2 = 11, 13
src = torch.randn(6, dim1, dim2)
index = torch.tensor([0, 1, 0, 1, 2, 1])

# Naive loopy implementation.
# The output needs one slot per possible group id, i.e. max(index) + 1 slots;
# sizing it by the number of distinct ids would break for non-contiguous ids.
out_naive = torch.zeros(int(index.max()) + 1, dim1, dim2)
for i in index.unique().tolist():
    out_naive[i] = src[index == i].sum(dim=0)

# scatter implementation
out = scatter(src, index, dim=0, reduce="sum") # reduce can be "sum", "mean", "max", "min"

assert torch.allclose(out_naive, out)

In [None]:
from torch_geometric.utils import segment
# A similar operation can be done also based on the segments
# For the further details, please refer to the documentation

src = torch.randn(10, 6, 64)
# Segment boundaries, written directly with a leading singleton dimension
# so that they broadcast over the first and the last dimension of `src`.
indptr = torch.tensor([[0, 2, 5, 6]])

out = segment(src, indptr, reduce="sum")

### Composite scatter operations; `Softmax`, `logsumexp`, and `scatter_std`

In [None]:
# `torch_scatter` is an optional extension that is not installed by
# `pip install torch_geometric`. When it is missing, fall back to PyG's own
# grouped softmax, which is called the same way for this example.
try:
    from torch_scatter.composite import scatter_softmax, scatter_std, scatter_logsumexp
except ImportError:
    from torch_geometric.utils import softmax as scatter_softmax
# or equivalently
from torch_geometric.utils import softmax

src = torch.randn(10, 1)
idx = torch.tensor([0,0,0,1,1,2,2,2,2,2])
# Softmax over the entries that share the same index: every group sums to one.
out = scatter_softmax(src, idx, dim=0)
print(out.shape)

print("Results of 'scatter_softmax'")
print(f'1st batch: {out[:3].view(-1)}, sum={out[:3].sum()}')
print(f'2nd batch: {out[3:5].view(-1)}, sum={out[3:5].sum()}')
print(f'3rd batch: {out[5:].view(-1)}, sum={out[5:].sum()}')

Alternative to the graph pooling, **Virtual node** is also often used to concentrate node/edge feature to a single vector. 
A primitive approach to virtual node is introducing additional node to the graph and attach the edges from the virtual node to all the nodes in the graph. 
It often works well, as compared to the naive graph pooling approaches. Please refer to the research papers for more details:
- [Graph Classification via Deep Learning with Virtual Nodes](https://arxiv.org/pdf/1708.04357.pdf),
- [On the Connection Between MPNN and Graph Transformer](https://arxiv.org/pdf/2301.11956.pdf)

## Tutorial 3 : Interfacing Graph `Data` with Pytorch Geometric `DataLoader`

In this tutorial, we will learn how to use Pytorch Geometric `DataLoader` to load graph data for mini-batch training. \
Q) Why do we want to use Pytorch Geometric `DataLoader`? \
A) It is inherited from Pytorch `DataLoader`, which means it is easy to use + supported by Pytorch community.

Let's start the tutorial by implementing graph dataset.

In [None]:
import warnings
# Install an 'ignore' filter without any message/category restriction.
# NOTE(review): presumably meant to keep the cell outputs clean; it would also hide
# deprecation warnings raised by later cells -- confirm this is intended.
warnings.filterwarnings(action='ignore') 

In [None]:
import torch
from torch.utils.data import Dataset
from torch_geometric.utils import erdos_renyi_graph
from torch_geometric.data import Data

In [None]:
def generate_er_graph(num_nodes:int, 
                      edge_prob:float,
                      feat_dim:int=16):
    """Create a random graph from `erdos_renyi_graph` with random node features.

    Args:
        num_nodes: Number of nodes.
        edge_prob: Edge probability handed to `erdos_renyi_graph`.
        feat_dim: Dimension of the node features.

    Returns:
        A ``Data`` object holding ``x`` [num_nodes, feat_dim], ``edge_index``,
        the graph-level target ``y`` [1, 1] (the sum of all entries of ``x``
        divided by ``num_nodes``) and an auxiliary ``dummy`` [num_nodes, 32].
    """
    connectivity = erdos_renyi_graph(num_nodes=num_nodes, edge_prob=edge_prob)
    node_feat = torch.randn(num_nodes, feat_dim)
    # Graph-level regression target derived from the node features
    target = torch.div(node_feat.sum(), num_nodes).view(1, 1)
    # Extra attribute, used later to demonstrate `exclude_keys`
    extra = torch.randn(num_nodes, 32)
    return Data(x=node_feat, edge_index=connectivity, y=target, dummy=extra)

In [None]:
class ERDataset(Dataset):
    """In-memory dataset of random graphs created with `generate_er_graph`.

    Args:
        num_graphs: Number of graphs in the dataset.
        min_num_nodes: Smallest possible number of nodes of a graph (inclusive).
        max_num_nodes: Upper bound on the number of nodes of a graph. It is
            exclusive, because `torch.randint` never returns its `high` value.
        edge_prob: Edge probability passed to `generate_er_graph`.
    """

    def __init__(self, 
                 num_graphs:int,
                 min_num_nodes: int = 32,
                 max_num_nodes: int = 64,
                 edge_prob: float = 0.3):

        # Draw one graph size per graph. `.tolist()` converts the sizes to plain
        # Python ints, which is what the graph generator's `num_nodes: int`
        # parameter expects (indexing the tensor would yield 0-dim tensors).
        num_nodes = torch.randint(min_num_nodes, max_num_nodes, (num_graphs,)).tolist()
        self.gs = [generate_er_graph(n, edge_prob) for n in num_nodes]

    def __getitem__(self, index):
        return self.gs[index]

    def __len__(self):
        return len(self.gs)

In [None]:
# Build a dataset of 128 random graphs and look at the first one
dataset = ERDataset(num_graphs=128)
print(dataset[0])

## PyG `DataLoader`

As mentioned earlier, PyG `DataLoader` is inherited from Pytorch `DataLoader`, meaning that we can
pass any arguments or keyword arguments that Pytorch `DataLoader` supports. Furthermore, PyG `DataLoader`
supports graph (mini) batching using a custom `collate_fn` -- basically, a well-engineered version of `Batch.from_data_list`.

In [None]:
# The loader lives in `torch_geometric.loader`; importing it from
# `torch_geometric.data` is deprecated.
from torch_geometric.loader import DataLoader

dataloader = DataLoader(dataset, 
                        # `follow_batch` lists existing attributes that get their own
                        # assignment vector; for 'x' the batch gains `x_batch`.
                        follow_batch=['x'],
                        exclude_keys=['dummy'], # attributes that are left out of the batch
                        batch_size=32, shuffle=True)
batched_g = next(iter(dataloader))

print(batched_g)
print(f'Number of graphs in batch: {batched_g.num_graphs}')

## Feed Batched Data to a GNN model

In [None]:
from torch_geometric.nn.models import GCN

# in_channels matches the node feature dimension of the generated graphs (16);
# a single output channel yields one prediction per node.
model = GCN(in_channels=16, hidden_channels=32, num_layers=3, out_channels=1)

pred = model(batched_g.x, batched_g.edge_index)
print(pred.shape) # [#. total nodes, output dim]

## Tutorial 4: Minimalistic re-implementation of Graph Convolutional Networks (GCN) in PyG

In this tutorial, we will reimplement Semi-Supervised Classification with Graph Convolutional Networks (GCN) introduced by [Kipf et al. (2017)](https://arxiv.org/abs/1609.02907) with PyTorch Geometric. The following codes are inspired by an open source implementation [here](https://github.com/ki-ljl/PyG-GCN/tree/main)

In [None]:
import torch
from torch_geometric.datasets import Planetoid, NELL
from torch_geometric.nn.models import GCN

from tqdm.auto import tqdm

In [None]:
def get_dataset(name: str):
    """Load one of the Planetoid citation datasets, stored under ``/tmp/<name>``.

    Args:
        name: Dataset name; one of 'Cora', 'CiteSeer' or 'PubMed'.

    Returns:
        The `Planetoid` dataset object.

    Raises:
        AssertionError: If `name` is not one of the supported names.
    """
    supported = ['Cora', 'CiteSeer', 'PubMed']
    assert name in supported
    return Planetoid(root=f'/tmp/{name}', name=f'{name}')

def train(model, data, 
          num_epochs:int=200,
          device:str='cpu',
          lr:float=0.001,
          weight_decay:float=5e-4):
    """Train `model` for node classification on the training nodes of `data`.

    The model is trained full-batch with Adam and cross-entropy loss, using
    only the nodes selected by `data.train_mask`. After training, the model
    is moved back to the CPU.

    Args:
        model: Module called as ``model(x, edge_index)`` that returns one row
            of class scores per node.
        data: Graph providing `x`, `edge_index`, `y` and `train_mask`.
        num_epochs: Number of optimization steps.
        device: Device on which the training runs.
        lr: Learning rate of the Adam optimizer.
        weight_decay: Weight decay (L2 penalty) of the Adam optimizer.
    """
    model = model.to(device)
    data = data.to(device)

    opt = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    loss_fn = torch.nn.CrossEntropyLoss()
    model.train() # Set model to 'train' mode

    pbar = tqdm(range(num_epochs), total=num_epochs, ascii=' =', leave=True)
    for epoch in pbar:
        opt.zero_grad()
        out = model(data.x, data.edge_index)
        # Only the labelled training nodes contribute to the loss
        loss = loss_fn(out[data.train_mask], data.y[data.train_mask])
        loss.backward()
        opt.step()

        # Progress bar
        pbar.set_description('Epoch {:03d} loss {:.4f}'.format(epoch, loss.item()))

    # Module.to() moves the parameters in place, so the caller's model ends up on the CPU
    model = model.to('cpu')

def test(model, data):
    model.eval()
    _, pred = model(data.x, data.edge_index).max(dim=1)
    correct = int(pred[data.test_mask].eq(data.y[data.test_mask]).sum().item())
    acc = correct / int(data.test_mask.sum())
    print('Accuracy: {:.4f}'.format(acc))
    


In [None]:
dataset_names = ['Cora', 'CiteSeer', 'PubMed']
# Train on the first GPU when one is available and fall back to the CPU
# otherwise, so that the notebook also runs on machines without CUDA.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

for dataset_name in dataset_names:
    dataset = get_dataset(dataset_name)

    model = GCN(in_channels=dataset.num_node_features, 
                hidden_channels=32, 
                num_layers=2,
                out_channels=dataset.num_classes,
                dropout=0.5,
                norm='batch', # 'batch', 'instance', 'layer', 'none'
                )

    # Each of these datasets holds a single graph, accessed as dataset[0]
    train(model, dataset[0], device=device)
    print(f'--- {dataset_name} ---')
    test(model, dataset[0])