The Cora dataset consists of 2708 scientific publications classified into one of seven classes. 

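The original citation network consists of 5429 links; the Planetoid version loaded below stores 10556 directed edges, i.e. 5278 unique undirected edges.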

Each publication in the dataset is described by a 0/1-valued word vector indicating the absence/presence of the corresponding word from the dictionary. 

The dictionary consists of 1433 unique words.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import numpy as np
import networkx as nx
import torch
from torch_geometric.datasets import Planetoid
from torch_geometric.transforms import NormalizeFeatures

sys.path.append("..")
from models.node_clf.gcn.gcn import GCN
from utils.seed_everything import seed_everything

In [3]:
seed_everything()

# 1. Load Cora Dataset

In [4]:
# Load Cora through PyG's Planetoid dataset with the "public" split; the
# NormalizeFeatures transform is applied to the node features on access.
dataset = Planetoid(
    root='data/Planetoid',
    name='Cora',
    transform=NormalizeFeatures(),
    split="public",
)

# Dataset-level summary, emitted as one block (leading empty line included).
dataset_summary = [
    '',
    f'Dataset: {dataset}:',
    '====================',
    f'Number of graphs: {len(dataset)}',
    f'Number of features: {dataset.num_features}',
    f'Number of classes: {dataset.num_classes}',
]
print('\n'.join(dataset_summary))

data = dataset[0]  # The first (index 0) graph object of the dataset.

print()
print(data)
print('=============================================================')

# Statistics of that graph, printed as "<label>: <value>" lines.
graph_stats = [
    ('Number of nodes', data.num_nodes),
    ('Number of edges', data.num_edges),
    ('Average node degree', f'{data.num_edges / data.num_nodes:.2f}'),
    ('Has isolated nodes', data.has_isolated_nodes()),
    ('Has self-loops', data.has_self_loops()),
    ('Is undirected', data.is_undirected()),
]
for label, value in graph_stats:
    print(f'{label}: {value}')


Dataset: Cora():
Number of graphs: 1
Number of features: 1433
Number of classes: 7

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])
Number of nodes: 2708
Number of edges: 10556
Average node degree: 3.90
Has isolated nodes: False
Has self-loops: False
Is undirected: True


In [5]:
# Class distribution: the distinct labels in data.y and how many nodes carry each.
np.unique(data.y, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6]), array([351, 217, 418, 818, 426, 298, 180]))

In [6]:
# edge_index has shape [2, num_edges]: one column per directed edge.
data.edge_index.shape

torch.Size([2, 10556])

In [8]:
# adjacency matrix in COO format
data.edge_index

tensor([[ 633, 1862, 2582,  ...,  598, 1473, 2706],
        [   0,    0,    0,  ..., 2707, 2707, 2707]])

In [9]:
# node classes (target)
data.y.shape, data.num_nodes

(torch.Size([2708]), 2708)

In [10]:
# node features: each node has a bag-of-words vector of size 1433
data.x.shape

torch.Size([2708, 1433])

In [11]:
# target node classes
data.y.shape

torch.Size([2708])

In [12]:
# Module-level handle on the COO edge list of the graph.
edge_index = data.edge_index
print(edge_index.shape)  # [2, 10556]

# The graph is undirected, so each edge appears once per direction.
# Canonicalise every pair as (smaller id, larger id) and deduplicate to count
# the undirected edges.
edges = edge_index.t().tolist()  # one [source, target] pair per edge
undirected_edges = {(min(u, v), max(u, v)) for u, v in edges}

print("Number of unique undirected edges:", len(undirected_edges))

torch.Size([2, 10556])
Number of unique undirected edges: 5278


In [13]:
torch.manual_seed(12345)

# Cora is a single-graph dataset (len(dataset) == 1), so a graph-level split
# such as dataset[:150] / dataset[150:] puts the only graph into the training
# split and leaves the test split empty; shuffling a one-element dataset
# cannot change its order either. For node classification the train/val/test
# partition is carried by the node masks stored on the graph itself
# (train_mask, val_mask, test_mask), so both views expose the full graph.
train_dataset = dataset
test_dataset = dataset



In [14]:
from torch_geometric.loader import DataLoader

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Use a dedicated loop variable: naming it `data` would rebind the
# module-level `data` graph that train() and test() read, leaving them
# operating on whatever batch the loop happened to yield last.
for step, batch in enumerate(train_loader):
    print(f'Step {step + 1}:')
    print('=======')
    print(f'Number of graphs in the current batch: {batch.num_graphs}')
    print(batch)
    print()

Step 1:
Number of graphs in the current batch: 1
DataBatch(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708], batch=[2708], ptr=[2])



# 2. Train GCN with Adam

In [18]:
# Model, loss and optimizer are module-level objects read by train() and test().
num_hidden_features = 64  # Size passed to GCN as its second (hidden) argument.
model = GCN(dataset.num_node_features, num_hidden_features, dataset.num_classes)
criterion = torch.nn.CrossEntropyLoss()  # Loss over class scores — assumes GCN outputs unnormalised logits; TODO confirm.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)  # Adam with weight decay 5e-4.

def train():
    """Run one optimisation step of the model on the training nodes.

    Reads the module-level ``model``, ``optimizer``, ``criterion`` and ``data``.
    The forward pass covers every node of the graph; only the nodes selected
    by ``data.train_mask`` contribute to the loss.

    Returns:
        The loss of this step as a detached 0-dim tensor.
    """
    model.train()
    optimizer.zero_grad()  # Clear gradients left over from the previous step.
    # Take the edges from the same graph object as the features instead of
    # relying on a free-standing `edge_index` variable from an earlier cell.
    out = model(data.x, data.edge_index)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()  # Derive gradients.
    optimizer.step()  # Update parameters based on gradients.
    # Detach so the caller's handle does not keep the autograd graph alive.
    return loss.detach()

def test():
    """Evaluate the model on the test nodes.

    Reads the module-level ``model`` and ``data``.

    Returns:
        float: fraction of the nodes selected by ``data.test_mask`` whose
        predicted class equals the label in ``data.y``.
    """
    model.eval()
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        # Edges come from the same graph object as the features rather than
        # from a free-standing `edge_index` variable defined in another cell.
        out = model(data.x, data.edge_index)
    pred = out.argmax(dim=1)  # Highest-scoring class per node.
    test_correct = pred[data.test_mask] == data.y[data.test_mask]  # Check against ground-truth labels.
    return int(test_correct.sum()) / int(data.test_mask.sum())  # Ratio of correct predictions.

NUM_EPOCHS = 200  # Each epoch is a single call to train().

# Epochs are numbered from 1 so the log runs 001 ... 200.
for epoch in range(1, NUM_EPOCHS + 1):
    loss = train()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

Epoch: 001, Loss: 1.9485
Epoch: 002, Loss: 1.9427
Epoch: 003, Loss: 1.9352
Epoch: 004, Loss: 1.9241
Epoch: 005, Loss: 1.9080
Epoch: 006, Loss: 1.8851
Epoch: 007, Loss: 1.8535
Epoch: 008, Loss: 1.8117
Epoch: 009, Loss: 1.7585
Epoch: 010, Loss: 1.6926
Epoch: 011, Loss: 1.6134
Epoch: 012, Loss: 1.5213
Epoch: 013, Loss: 1.4172
Epoch: 014, Loss: 1.3035
Epoch: 015, Loss: 1.1832
Epoch: 016, Loss: 1.0601
Epoch: 017, Loss: 0.9372
Epoch: 018, Loss: 0.8176
Epoch: 019, Loss: 0.7041
Epoch: 020, Loss: 0.5988
Epoch: 021, Loss: 0.5037
Epoch: 022, Loss: 0.4190
Epoch: 023, Loss: 0.3451
Epoch: 024, Loss: 0.2819
Epoch: 025, Loss: 0.2289
Epoch: 026, Loss: 0.1855
Epoch: 027, Loss: 0.1506
Epoch: 028, Loss: 0.1227
Epoch: 029, Loss: 0.1009
Epoch: 030, Loss: 0.0838
Epoch: 031, Loss: 0.0706
Epoch: 032, Loss: 0.0603
Epoch: 033, Loss: 0.0525
Epoch: 034, Loss: 0.0465
Epoch: 035, Loss: 0.0420
Epoch: 036, Loss: 0.0387
Epoch: 037, Loss: 0.0362
Epoch: 038, Loss: 0.0345
Epoch: 039, Loss: 0.0333
Epoch: 040, Loss: 0.0327


In [19]:
# Accuracy on the nodes selected by data.test_mask, as computed by test().
test_acc = test()
print(f'Test Accuracy: {test_acc:.4f}')

Test Accuracy: 0.7840


# 3. Train GCN with Riemannian Adam

In [20]:
# NOTE(review): as recorded in this cell's output, these imports fail with
# "ModuleNotFoundError: No module named 'layers'" — presumably something in the
# hgcn package imports `layers` as a top-level module, so its own root would
# have to be on sys.path; confirm against the hgcn sources.
from geoopt.optim import RiemannianAdam
from hgcn.models.encoders import HGCN
from hgcn.models.decoders import model2decoder
from hgcn.utils.data_utils import load_citation_data

ModuleNotFoundError: No module named 'layers'

In [16]:
# Load Cora through the hgcn data utilities: adjacency, features, labels and
# the train/val/test index sets.
adj, features, labels, idx_train, idx_val, idx_test = load_citation_data(
    dataset_str="cora",
    use_feats=True,
    data_path="../data/cora/",
)

In [17]:
# Row and column indices of the non-zero adjacency entries, for comparison
# with the PyG `edge_index` tensor.
np.nonzero(adj.toarray())

(array([   0,    0,    0, ..., 2707, 2707, 2707], shape=(10556,)),
 array([ 633, 1862, 2582, ...,  598, 1473, 2706], shape=(10556,)))

In [18]:
# PyG edge list, shown for side-by-side comparison with the adjacency indices.
edge_index

tensor([[ 633, 1862, 2582,  ...,  598, 1473, 2706],
        [   0,    0,    0,  ..., 2707, 2707, 2707]])

In [None]:
# NOTE(review): this rebinds the module-level `model` that train() and test()
# read, so after this cell they would operate on the HGCN instance instead of
# the GCN trained earlier. HGCN is also constructed without arguments —
# TODO confirm its constructor signature.
model = HGCN()