# GCN on **PubMed** (Planetoid) with PyTorch Geometric

- PubMed is larger than Cora (19,717 nodes, 88,648 edges, 500 features, 3 classes)
- Defaults below: hidden=16, lr=0.01, weight_decay=5e-4, dropout=0.5.


The PubMed dataset consists of 19,717 scientific publications from the PubMed database, each classified into one of three classes (diabetes types).

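The citation network is stored as 88,648 directed edge entries: every undirected citation link appears once in each direction, which gives 44,324 unique links (verified further down in this notebook).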

Each publication is represented by a TF-IDF-weighted word vector, where each entry indicates the importance of one dictionary word in that publication.

The dictionary consists of 500 unique words.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import numpy as np
import networkx as nx
import torch
from torch_geometric.datasets import TUDataset, Planetoid
from torch_geometric.transforms import NormalizeFeatures

sys.path.append("..")
from models.node_clf.gcn.basic_gcn import GCN
from utils.seed_everything import seed_everything

In [3]:
# Fix the random seeds before any data loading or model construction.
# NOTE(review): presumably seeds Python / NumPy / torch with a default value —
# confirm in utils/seed_everything.py.
seed_everything()

In [7]:
# PubMed citation graph from the Planetoid collection, stored under
# data/Planetoid, with the NormalizeFeatures transform attached.
dataset = Planetoid(
    root='data/Planetoid',
    name='PubMed',
    transform=NormalizeFeatures(),
)

# Dataset-level summary (the leading '' reproduces the blank separator line).
print(
    '',
    f'Dataset: {dataset}:',
    '====================',
    f'Number of graphs: {len(dataset)}',
    f'Number of features: {dataset.num_features}',
    f'Number of classes: {dataset.num_classes}',
    sep='\n',
)

# Keep the first graph as `data`; the cells below read it by that name.
data = dataset[0]
print(
    '',
    data,
    '=============================================================',
    sep='\n',
)

# Gather some statistics about the first graph.
print(
    f'Number of nodes: {data.num_nodes}',
    f'Number of edges: {data.num_edges}',
    f'Average node degree: {data.num_edges / data.num_nodes:.2f}',
    f'Has isolated nodes: {data.has_isolated_nodes()}',
    f'Has self-loops: {data.has_self_loops()}',
    f'Is undirected: {data.is_undirected()}',
    sep='\n',
)


Dataset: PubMed():
Number of graphs: 1
Number of features: 500
Number of classes: 3

Data(x=[19717, 500], edge_index=[2, 88648], y=[19717], train_mask=[19717], val_mask=[19717], test_mask=[19717])
Number of nodes: 19717
Number of edges: 88648
Average node degree: 4.50
Has isolated nodes: False
Has self-loops: False
Is undirected: True


In [8]:
# Class distribution: the distinct label ids in data.y and how many nodes carry
# each one (PubMed has 3 classes).
np.unique(data.y, return_counts=True)

(array([0, 1, 2]), array([4103, 7739, 7875]))

In [9]:
# Shape of the edge list: [2, number of edge entries].
data.edge_index.shape

torch.Size([2, 88648])

In [10]:
# Edge list, not a dense adjacency matrix: a [2, num_edges] tensor in which
# column k holds the pair of node indices joined by edge k.
data.edge_index

tensor([[ 1378,  1544,  6092,  ..., 12278,  4284, 16030],
        [    0,     0,     0,  ..., 19714, 19715, 19716]])

In [11]:
# Node labels (the prediction target): data.y holds one entry per node, so its
# length should equal data.num_nodes.
data.y.shape, data.num_nodes

(torch.Size([19717]), 19717)

In [12]:
# Node features: one 500-dimensional word vector per node (TF-IDF weighted
# according to the dataset description at the top of the notebook).
data.x.shape

torch.Size([19717, 500])

In [13]:
edge_index = data.edge_index
print(edge_index.shape)  # [2, 88648]

# Transpose so that each row is one (node, node) pair, then move to plain
# Python lists so the pairs can be canonicalised and hashed.
edges = edge_index.t().tolist()

# Ordering the two endpoints maps (u, v) and (v, u) onto the same tuple, so the
# set ends up with exactly one entry per undirected edge.
undirected_edges = {(min(u, v), max(u, v)) for u, v in edges}

print("Number of unique undirected edges:", len(undirected_edges))

torch.Size([2, 88648])
Number of unique undirected edges: 44324


In [14]:
# Re-seed torch's global RNG right before the shuffle.
# NOTE(review): this also resets the RNG state that later cells start from
# (presumably affecting weight init and dropout) — keep the statement order.
torch.manual_seed(12345)
# Shuffle the dataset and rebind `dataset` to the shuffled result.
dataset = dataset.shuffle()

# NOTE(review): this is a graph-level split (first 150 graphs for training, the
# rest for testing), yet the dataset summary above reports "Number of graphs: 1".
# With a single graph the first slice presumably contains that one graph and
# the second is empty — confirm. The training code below selects nodes through
# data.train_mask / data.test_mask and never reads these two variables; only
# the DataLoader cell uses them.
train_dataset = dataset[:150]
test_dataset = dataset[150:]

In [15]:
from torch_geometric.loader import DataLoader

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Preview the batches produced by the training loader.
# The loop variable is deliberately `batch`, not `data`: `data` is the graph
# object that train() / test() read as a notebook global, and iterating with
# that name would silently replace it with the last DataBatch of the loop.
for step, batch in enumerate(train_loader):
    print(f'Step {step + 1}:')
    print('=======')
    print(f'Number of graphs in the current batch: {batch.num_graphs}')
    print(batch)
    print()

Step 1:
Number of graphs in the current batch: 1
DataBatch(x=[19717, 500], edge_index=[2, 88648], y=[19717], train_mask=[19717], val_mask=[19717], test_mask=[19717], batch=[19717], ptr=[2])



In [None]:
# Number of target classes; used below as the model's output dimension.
dataset.num_classes

3

In [23]:
# Hyper-parameters follow the defaults listed at the top of the notebook:
# hidden=16, lr=0.01, weight_decay=5e-4, dropout=0.5.
num_hidden_features = 16
output_dim = dataset.num_classes  # output size equals the number of classes

model = GCN(
    dataset.num_node_features,
    num_hidden_features,
    output_dim,
    dropout_p=0.5,
)

# Loss criterion.
criterion = torch.nn.CrossEntropyLoss()

# Optimizer over all model parameters, with L2 regularisation via weight_decay.
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=0.01,
    weight_decay=5e-4,
)

def train():
    """Run one full-batch optimisation step and return the training loss.

    Reads the notebook globals ``model``, ``optimizer``, ``criterion`` and
    ``data``. The forward pass covers the whole graph, but the loss is computed
    only on the nodes selected by ``data.train_mask``.

    Returns:
        A 0-dim tensor holding the loss value, detached from the autograd
        graph so that keeping it around does not keep the graph alive.
    """
    model.train()
    optimizer.zero_grad()  # Clear gradients.
    # Take the edges from `data` itself rather than from a separate global
    # `edge_index`, so this function does not depend on an unrelated
    # exploratory cell having been executed first.
    out = model(data.x, data.edge_index)  # Perform a single forward pass.
    # Compute the loss solely based on the training nodes.
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()  # Derive gradients.
    optimizer.step()  # Update parameters based on gradients.
    return loss.detach()

def test():
    """Evaluate the model on the test nodes and return the accuracy.

    Reads the notebook globals ``model`` and ``data``. The model is switched
    to eval mode and the forward pass runs under ``torch.no_grad()`` because
    no gradients are needed for evaluation.

    Returns:
        Fraction (float) of nodes selected by ``data.test_mask`` whose
        predicted class equals the label in ``data.y``.
    """
    model.eval()
    with torch.no_grad():
        # Use the edges stored on `data` so evaluation does not rely on a
        # separate global `edge_index` defined in an exploratory cell.
        out = model(data.x, data.edge_index)
    pred = out.argmax(dim=1)  # Use the class with the highest score.
    # Check against ground-truth labels, restricted to the test nodes.
    test_correct = pred[data.test_mask] == data.y[data.test_mask]
    # Derive ratio of correct predictions.
    test_acc = int(test_correct.sum()) / int(data.test_mask.sum())
    return test_acc

# Full-batch training: one optimisation step per epoch, logging the loss each
# time. Epochs are numbered from 1 in the log output.
NUM_EPOCHS = 200
for epoch in range(1, NUM_EPOCHS + 1):
    loss = train()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

Epoch: 001, Loss: 1.0998
Epoch: 002, Loss: 1.0962
Epoch: 003, Loss: 1.0933
Epoch: 004, Loss: 1.0883
Epoch: 005, Loss: 1.0813
Epoch: 006, Loss: 1.0725
Epoch: 007, Loss: 1.0685
Epoch: 008, Loss: 1.0603
Epoch: 009, Loss: 1.0510
Epoch: 010, Loss: 1.0439
Epoch: 011, Loss: 1.0318
Epoch: 012, Loss: 1.0157
Epoch: 013, Loss: 1.0102
Epoch: 014, Loss: 0.9992
Epoch: 015, Loss: 0.9960
Epoch: 016, Loss: 0.9780
Epoch: 017, Loss: 0.9656
Epoch: 018, Loss: 0.9528
Epoch: 019, Loss: 0.9503
Epoch: 020, Loss: 0.9308
Epoch: 021, Loss: 0.9218
Epoch: 022, Loss: 0.9066
Epoch: 023, Loss: 0.8828
Epoch: 024, Loss: 0.8804
Epoch: 025, Loss: 0.8642
Epoch: 026, Loss: 0.8555
Epoch: 027, Loss: 0.8250
Epoch: 028, Loss: 0.8334
Epoch: 029, Loss: 0.7858
Epoch: 030, Loss: 0.7982
Epoch: 031, Loss: 0.7790
Epoch: 032, Loss: 0.7788
Epoch: 033, Loss: 0.7780
Epoch: 034, Loss: 0.7632
Epoch: 035, Loss: 0.7128
Epoch: 036, Loss: 0.7041
Epoch: 037, Loss: 0.6901
Epoch: 038, Loss: 0.7056
Epoch: 039, Loss: 0.7072
Epoch: 040, Loss: 0.6629


In [24]:
# Final evaluation: accuracy on the nodes selected by data.test_mask, measured
# once after training has finished.
test_acc = test()
print(f'Test Accuracy: {test_acc:.4f}')

Test Accuracy: 0.7890
