The Cora dataset consists of 2708 scientific publications classified into one of seven classes. The citation network consists of 5429 links. Each publication in the dataset is described by a 0/1-valued word vector indicating the absence/presence of the corresponding word from the dictionary. The dictionary consists of 1433 unique words.

In [2]:
# Enable IPython's autoreload extension in mode 2, so edits to imported modules
# (e.g. the local `models` / `utils` packages) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [3]:
import sys
import numpy as np
import networkx as nx
import torch
from torch_geometric.datasets import TUDataset, Planetoid
from torch_geometric.transforms import NormalizeFeatures

sys.path.append("..")
from models.gcn.gcn_3layer import GCN
from utils.seed_everything import seed_everything

In [4]:
# Seed the random number generators for reproducibility.
# NOTE(review): called with its defaults; which libraries are seeded and with what
# value is decided in utils/seed_everything.py — confirm there.
seed_everything()

In [5]:
# Load the Cora citation graph with the "public" split, using NormalizeFeatures as the transform.
dataset = Planetoid(
    root='data/Planetoid',
    name='Cora',
    transform=NormalizeFeatures(),
    split="public",
)

# Dataset-level summary (a leading empty string reproduces the blank separator line).
print(
    '',
    f'Dataset: {dataset}:',
    '====================',
    f'Number of graphs: {len(dataset)}',
    f'Number of features: {dataset.num_features}',
    f'Number of classes: {dataset.num_classes}',
    sep='\n',
)

data = dataset[0]  # Get the first graph object.

# Gather some statistics about the first graph.
print(
    '',
    data,
    '=============================================================',
    f'Number of nodes: {data.num_nodes}',
    f'Number of edges: {data.num_edges}',
    f'Average node degree: {data.num_edges / data.num_nodes:.2f}',
    f'Has isolated nodes: {data.has_isolated_nodes()}',
    f'Has self-loops: {data.has_self_loops()}',
    f'Is undirected: {data.is_undirected()}',
    sep='\n',
)


Dataset: Cora():
Number of graphs: 1
Number of features: 1433
Number of classes: 7

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])
Number of nodes: 2708
Number of edges: 10556
Average node degree: 3.90
Has isolated nodes: False
Has self-loops: False
Is undirected: True


In [19]:
# Class balance: the distinct label ids in `data.y` and how many nodes carry each one.
np.unique(data.y, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6]), array([351, 217, 418, 818, 426, 298, 180]))

In [22]:
# Size of the edge list: two rows, one column per stored edge.
data.edge_index.shape

torch.Size([2, 10556])

In [23]:
# Peek at the raw edge list; each column holds the two endpoint node ids of one stored edge.
data.edge_index

tensor([[ 633, 1862, 2582,  ...,  598, 1473, 2706],
        [   0,    0,    0,  ..., 2707, 2707, 2707]])

In [6]:
# First row of the edge list: one endpoint id per stored edge
# (presumably the source node — confirm against the PyG edge_index convention).
data.edge_index[0]

tensor([ 633, 1862, 2582,  ...,  598, 1473, 2706])

In [7]:
edge_index = data.edge_index
print(edge_index.shape)  # [2, 10556]

# The graph is undirected, so an edge between u and v may appear as both (u, v)
# and (v, u). Canonicalise every pair to (smaller id, larger id) so that both
# directions collapse into a single set entry.
edges = edge_index.t().tolist()  # one [source, target] pair per stored edge
undirected_edges = {(min(u, v), max(u, v)) for u, v in edges}

print("Number of unique undirected edges:", len(undirected_edges))

torch.Size([2, 10556])
Number of unique undirected edges: 5278


In [26]:
torch.manual_seed(12345)

# A graph-level split only makes sense when the dataset holds more graphs than the
# training cut-off. Cora contains a single graph, so `dataset[150:]` would be empty
# and the test loader would never yield anything; its train/val/test split is per
# node, via the `train_mask` / `val_mask` / `test_mask` tensors stored on the graph.
NUM_TRAIN_GRAPHS = 150

if len(dataset) > NUM_TRAIN_GRAPHS:
    # Multi-graph dataset: shuffle, then split by graph index.
    dataset = dataset.shuffle()
    train_dataset = dataset[:NUM_TRAIN_GRAPHS]
    test_dataset = dataset[NUM_TRAIN_GRAPHS:]
else:
    # Too few graphs to hold any out: both splits see the full dataset and the
    # node masks decide which nodes count for training vs. evaluation.
    train_dataset = dataset
    test_dataset = dataset



In [27]:
from torch_geometric.loader import DataLoader

# Wrap both splits in mini-batch loaders; only the training loader is shuffled.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Inspect what the training loader yields. The loop variable is called `batch`
# rather than `data` so that the notebook-level `data` graph, which the training
# and evaluation code reads, is not silently replaced by the last DataBatch.
for step, batch in enumerate(train_loader):
    print(f'Step {step + 1}:')
    print('=======')
    print(f'Number of graphs in the current batch: {batch.num_graphs}')
    print(batch)
    print()

Step 1:
Number of graphs in the current batch: 1
DataBatch(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708], batch=[2708], ptr=[2])



In [None]:
from IPython.display import Javascript, display

# The snippet below calls `google.colab.output`, which only the Colab front-end
# provides. Elsewhere it just leaves a stray "<IPython.core.display.Javascript object>"
# in the cell output, so it is emitted only when the Colab runtime is loaded.
if "google.colab" in sys.modules:
    # Restrict height of output cell.
    display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))

model = GCN(hidden_channels=16)
criterion = torch.nn.CrossEntropyLoss()  # Define loss criterion.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)  # Define optimizer.

def train():
    """Run one full-batch optimisation step and return its training loss.

    Operates on the notebook-level `model`, `optimizer`, `criterion` and `data`.
    The loss is computed only on the nodes selected by `data.train_mask`.

    NOTE(review): the forward pass receives only `data.x`. If `GCN.forward` also
    expects `data.edge_index`, it has to be passed here — confirm against
    models/gcn/gcn_3layer.py.

    Returns:
        The scalar loss produced by `criterion` for this step.
    """
    model.train()
    optimizer.zero_grad()  # Drop gradients left over from the previous step.

    logits = model(data.x)  # Single forward pass over all nodes.
    train_nodes = data.train_mask
    loss = criterion(logits[train_nodes], data.y[train_nodes])  # Training nodes only.

    loss.backward()  # Back-propagate.
    optimizer.step()  # Apply the parameter update.
    return loss

def test():
    """Return the model's accuracy on the nodes selected by `data.test_mask`.

    Operates on the notebook-level `model` and `data`. The forward pass runs in
    eval mode and under `torch.no_grad()`, since evaluation needs no gradients
    and should not build an autograd graph.

    NOTE(review): the forward pass receives only `data.x`. If `GCN.forward` also
    expects `data.edge_index`, it has to be passed here — confirm against
    models/gcn/gcn_3layer.py.

    Returns:
        Fraction of test nodes whose predicted class equals the label, as a float.
    """
    model.eval()
    with torch.no_grad():  # Inference only: skip autograd bookkeeping.
        out = model(data.x)
    pred = out.argmax(dim=1)  # Use the class with the highest score.
    test_correct = pred[data.test_mask] == data.y[data.test_mask]  # Check against ground-truth labels.
    test_acc = int(test_correct.sum()) / int(data.test_mask.sum())  # Derive ratio of correct predictions.
    return test_acc

NUM_EPOCHS = 200

# One full-batch optimisation step per epoch, logging the training loss each time.
for epoch in range(1, NUM_EPOCHS + 1):
    loss = train()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

<IPython.core.display.Javascript object>

Epoch: 001, Train Acc: 0.6467, Test Acc: 0.7368
Epoch: 002, Train Acc: 0.6467, Test Acc: 0.7368
Epoch: 003, Train Acc: 0.6467, Test Acc: 0.7368
Epoch: 004, Train Acc: 0.6467, Test Acc: 0.7368
Epoch: 005, Train Acc: 0.6467, Test Acc: 0.7368
Epoch: 006, Train Acc: 0.6533, Test Acc: 0.7368
Epoch: 007, Train Acc: 0.7133, Test Acc: 0.7632
Epoch: 008, Train Acc: 0.6867, Test Acc: 0.7632
Epoch: 009, Train Acc: 0.7267, Test Acc: 0.7632
Epoch: 010, Train Acc: 0.7200, Test Acc: 0.7895
Epoch: 011, Train Acc: 0.7267, Test Acc: 0.7632
Epoch: 012, Train Acc: 0.7133, Test Acc: 0.7895
Epoch: 013, Train Acc: 0.7200, Test Acc: 0.7895
Epoch: 014, Train Acc: 0.7200, Test Acc: 0.7895
Epoch: 015, Train Acc: 0.7333, Test Acc: 0.7632
Epoch: 016, Train Acc: 0.7333, Test Acc: 0.7895
Epoch: 017, Train Acc: 0.7200, Test Acc: 0.7895
Epoch: 018, Train Acc: 0.7467, Test Acc: 0.7368
Epoch: 019, Train Acc: 0.7467, Test Acc: 0.7368
Epoch: 020, Train Acc: 0.7200, Test Acc: 0.8684
Epoch: 021, Train Acc: 0.7400, Test Acc: