### Sources:

- https://mlabonne.github.io/blog/graphsage/

In [2]:
import torch
torchversion = torch.__version__

# Install PyTorch Scatter, PyTorch Sparse, and PyTorch Geometric
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-{torchversion}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-{torchversion}.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m512.4/512.4 KB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.2/280.2 KB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for torch-geometric (pyproject.toml) ... [?25l[?25hdone


### GraphSAGE Model

In [3]:
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv

class GraphSAGE(torch.nn.Module):
    def __init__(self, dim_in, dim_h, dim_out):
        super().__init__()
        self.sage1 = SAGEConv(dim_in, dim_h)
        self.sage2 = SAGEConv(dim_h, dim_out)
        
    def forward(self, x, edge_index):
        h = self.sage1(x, edge_index)
        h = torch.relu(h)
        h = F.dropout(h, p=0.5, training=self.training)
        h = self.sage2(h, edge_index)
        # Return (embedding, prediction)
        return h, F.log_softmax(h, dim=1)

    def fit(self, epochs, node_loader):
        self.train()
        criterion = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(self.parameters(), lr=0.01, weight_decay=5e-4)
        accuracy = lambda y_pred, y: ((y_pred == y).sum() / len(y)).item()
        for epoch in range(epochs + 1):
            # Train on batches
            for batch in node_loader:
                optimizer.zero_grad()
                _, out = self(batch.x, batch.edge_index)
                
                loss = criterion(out, batch.y)
                acc = accuracy(out.argmax(dim=1), batch.y)

                loss.backward()
                optimizer.step()

            # Print metrics every 50 epochs
            if epoch % 50 == 0:
                print(f"Epoch: {epoch:>3} | Train loss: {loss:.3f} | Train acc: {acc*100:>6.2f}%")

    def predict(self, X, edge_index):
        self.eval()
        return model(X, edge_index)

### Dataset

In [4]:
from torch_geometric.datasets import Planetoid

dataset = Planetoid(root=".", name="Pubmed")
data = dataset[0]

# Print information about the dataset
print(f"Dataset: {dataset}")
print("-"*20)
print(f"Number of graphs   : {len(dataset)}")
print(f"Number of nodes    : {data.x.shape[0]}")
print(f"Number of edges    : {data.edge_index.shape[1]}")
print(f"Number of features : {data.x.shape[1]}")
print(f"Number of classes  : {dataset.num_classes}")

Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.test.index
Processing...


Dataset: Pubmed()
--------------------
Number of graphs   : 1
Number of nodes    : 19717
Number of edges    : 88648
Number of features : 500
Number of classes  : 3


Done!


### Neighbor loader

#### 1. PyTorch implementation


In [5]:
from torch_geometric.loader import NeighborLoader
node_loader = NeighborLoader(
    data,
    num_neighbors=[5, 10],
    batch_size=16,
    input_nodes=data.train_mask
)

for i, elem in enumerate(node_loader):
    print(f"Subgraph {i}:", elem)
print(len(node_loader))

Subgraph 0: Data(x=[408, 500], edge_index=[2, 455], y=[408], train_mask=[408], val_mask=[408], test_mask=[408], n_id=[408], e_id=[455], input_id=[16], batch_size=16)
Subgraph 1: Data(x=[265, 500], edge_index=[2, 307], y=[265], train_mask=[265], val_mask=[265], test_mask=[265], n_id=[265], e_id=[307], input_id=[16], batch_size=16)
Subgraph 2: Data(x=[301, 500], edge_index=[2, 342], y=[301], train_mask=[301], val_mask=[301], test_mask=[301], n_id=[301], e_id=[342], input_id=[16], batch_size=16)
Subgraph 3: Data(x=[193, 500], edge_index=[2, 226], y=[193], train_mask=[193], val_mask=[193], test_mask=[193], n_id=[193], e_id=[226], input_id=[12], batch_size=12)
4


#### 2. Custom implementation

- The input nodes are the `seed_nodes`
- Each iteration takes at most `batch_size` from `seed_nodes` and ends up getting neighbors according to the `num_neighbors` list
- The return type must be a `torch_geometric.data.Data` object with at least `x := [node_id, features]` and `y := [labels]` defined
- I assume that `seed_nodes` were shuffled and selected previously

In [20]:
from torch_geometric.data import Data
import math
import numpy as np

class MyNeighborLoader:
    def __init__(self, 
                 seed_nodes,
                 num_neighbors, 
                 batch_size
        ):
        self.seed_nodes = seed_nodes
        self.num_neighbors = num_neighbors
        self.batch_size = batch_size

    def __iter__(self):
        self.current_batch = 0
        return self
                
    def query_build_subgraph(self, lo, hi):
        indices = self.seed_nodes[lo:hi]
        x = data.x[indices]
        y = data.y[indices]
        # TODO: Query for neighbors
        # TODO: Get edges
        # TODO: Relabel nodes and edges according to the node sample
        return Data(x=x, y=y, edge_index=torch.tensor([[],[]], dtype=int))

    def __next__(self):
        lo = self.current_batch * self.batch_size
        if lo >= len(self.seed_nodes): 
            raise StopIteration
        self.current_batch += 1
        hi = self.current_batch * self.batch_size
        return self.query_build_subgraph(lo, hi)

    def __len__(self):
        return math.ceil(len(self.seed_nodes) / self.batch_size)

loader_args = {
    # An 1-D array with randomly picked node_ids
    "seed_nodes": np.random.randint(0, high=data.x.shape[0], size=64, dtype=int),
    # Neighborhood sizes at distance index
    "num_neighbors": [5, 10],
    # seed_nodes picked per iteration
    "batch_size": 16,
}

my_loader = MyNeighborLoader(**loader_args)
for i, elem in enumerate(my_loader):
    print(f"Subgraph {i}:", elem)
print(len(my_loader))

Subgraph 0: Data(x=[16, 500], edge_index=[2, 0], y=[16])
Subgraph 1: Data(x=[16, 500], edge_index=[2, 0], y=[16])
Subgraph 2: Data(x=[16, 500], edge_index=[2, 0], y=[16])
Subgraph 3: Data(x=[16, 500], edge_index=[2, 0], y=[16])
4


### Model instance

In [7]:
dim_in = data.x.shape[1]
dim_h = 64
dim_out = dataset.num_classes
model = GraphSAGE(dim_in, dim_h, dim_out)
print(model)

GraphSAGE(
  (sage1): SAGEConv(500, 64, aggr=mean)
  (sage2): SAGEConv(64, 3, aggr=mean)
)


### Torch NeighborLoader

In [8]:
model.fit(
    epochs=200,
    node_loader=node_loader
)

Epoch:   0 | Train loss: 1.117 | Train acc:  23.16%
Epoch:  50 | Train loss: 0.214 | Train acc:  91.37%
Epoch: 100 | Train loss: 0.188 | Train acc:  94.47%
Epoch: 150 | Train loss: 0.167 | Train acc:  95.36%
Epoch: 200 | Train loss: 0.146 | Train acc:  95.02%


In [12]:
emb, out = model.predict(data.x, data.edge_index)
y_pred = out.argmax(axis=1)
acc = ((y_pred == data.y).sum() / len(y_pred)).item()
print("Dataset accuracy:", acc)

0.8360805511474609


### My NeighborLoader

In [21]:
model.fit(
    epochs=200,
    node_loader=my_loader
)

Epoch:   0 | Train loss: 0.648 | Train acc:  81.25%
Epoch:  50 | Train loss: 0.025 | Train acc: 100.00%
Epoch: 100 | Train loss: 0.018 | Train acc: 100.00%
Epoch: 150 | Train loss: 0.012 | Train acc: 100.00%
Epoch: 200 | Train loss: 0.016 | Train acc: 100.00%


In [22]:
emb, out = model.predict(data.x, data.edge_index)
y_pred = out.argmax(axis=1)
acc = ((y_pred == data.y).sum() / len(y_pred)).item()
print("Dataset accuracy:", acc)

0.7153725028038025
