In [2]:
from torch_geometric.data import HeteroData



In [3]:
# load data
from torch_geometric.datasets import OGB_MAG

# Load OGB_MAG from (or into) ./data. NOTE(review): preprocess='metapath2vec'
# presumably supplies precomputed embeddings as node features -- the summary
# printed in the next cell shows a 128-dim `x` on every node type; confirm in
# the PyG dataset docs.
dataset = OGB_MAG(root='./data', preprocess='metapath2vec')
# Take the first entry of the dataset; the next cell shows it is a HeteroData graph.
data = dataset[0]

In [4]:
# Display the graph summary: each node/edge type with the shapes of its tensors.
data

HeteroData(
  paper={
    x=[736389, 128],
    year=[736389],
    y=[736389],
    train_mask=[736389],
    val_mask=[736389],
    test_mask=[736389],
  },
  author={ x=[1134649, 128] },
  institution={ x=[8740, 128] },
  field_of_study={ x=[59965, 128] },
  (author, affiliated_with, institution)={ edge_index=[2, 1043998] },
  (author, writes, paper)={ edge_index=[2, 7145660] },
  (paper, cites, paper)={ edge_index=[2, 5416271] },
  (paper, has_topic, field_of_study)={ edge_index=[2, 7505078] }
)

In [5]:
# Indexing by a node-type name returns that type's storage; for 'paper' it
# holds x, year, y and the train/val/test masks (see the output below).
paper_node_data = data['paper']
paper_node_data

{'x': tensor([[-0.0954,  0.0408, -0.2109,  ...,  0.0616, -0.0277, -0.1338],
        [-0.1510, -0.1073, -0.2220,  ...,  0.3458, -0.0277, -0.2185],
        [-0.1148, -0.1760, -0.2606,  ...,  0.1731, -0.1564, -0.2780],
        ...,
        [ 0.0228, -0.0865,  0.0981,  ..., -0.0547, -0.2077, -0.2305],
        [-0.2891, -0.2029, -0.1525,  ...,  0.1042,  0.2041, -0.3528],
        [-0.0890, -0.0348, -0.2642,  ...,  0.2601, -0.0875, -0.5171]]), 'year': tensor([2015, 2012, 2012,  ..., 2016, 2017, 2014]), 'y': tensor([246, 131, 189,  ..., 266, 289,   1]), 'train_mask': tensor([True, True, True,  ..., True, True, True]), 'val_mask': tensor([False, False, False,  ..., False, False, False]), 'test_mask': tensor([False, False, False,  ..., False, False, False])}

In [6]:
# Indexing by a (source, relation, destination) triple returns that edge
# type's storage; here it contains only `edge_index`.
cites_edge_data = data['paper', 'cites', 'paper']
cites_edge_data

{'edge_index': tensor([[     0,      0,      0,  ..., 736388, 736388, 736388],
        [    88,  27449, 121051,  ..., 421711, 427339, 439864]])}

In [7]:
# Look up an edge store by its relation name alone. NOTE(review): this
# presumably works because 'has_topic' identifies exactly one edge type,
# (paper, has_topic, field_of_study) -- confirm against the PyG docs.
print(data['has_topic'])

{'edge_index': tensor([[     0,      0,      0,  ..., 736388, 736388, 736388],
        [   145,   2215,   3205,  ...,  21458,  22283,  31934]])}


In [8]:
# metadata() returns a pair: the node-type names and the
# (source, relation, destination) edge-type triples. Print one per line.
node_types, edge_types = data.metadata()
print(node_types, edge_types, sep='\n')

['paper', 'author', 'institution', 'field_of_study']
[('author', 'affiliated_with', 'institution'), ('author', 'writes', 'paper'), ('paper', 'cites', 'paper'), ('paper', 'has_topic', 'field_of_study')]


In [9]:
# Structural sanity checks, printed one per line in this order:
# isolated nodes, self-loops, undirectedness.
for graph_check in (data.has_isolated_nodes, data.has_self_loops, data.is_undirected):
    print(graph_check())

False
False
False


In [10]:
import torch_geometric.transforms as T
from torch_geometric.datasets import OGB_MAG
from torch_geometric.nn import SAGEConv, to_hetero
import torch


# Re-load the dataset, this time with the T.ToUndirected() transform (the
# earlier is_undirected() check printed False). NOTE(review): per the PyG docs
# this transform adds reverse edge types so messages can flow both ways --
# confirm. This rebinds `dataset` and `data`, so the outputs of the cells above
# describe the untransformed graph.
dataset = OGB_MAG(root='./data', preprocess='metapath2vec', transform=T.ToUndirected())
data = dataset[0]

class GNN(torch.nn.Module):
    """Two stacked SAGEConv layers with a ReLU in between.

    Args:
        hidden_channels: Output size of the first SAGEConv layer.
        out_channels: Output size of the second SAGEConv layer.
    """

    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        # Both layers are given (-1, -1) as their input size.
        # NOTE(review): presumably this defers the input sizes to the first
        # forward pass (lazy initialization) -- confirm in the PyG docs.
        lazy_in_channels = (-1, -1)
        self.conv1 = SAGEConv(lazy_in_channels, hidden_channels)
        self.conv2 = SAGEConv(lazy_in_channels, out_channels)

    def forward(self, x, edge_index):
        """Apply conv1 followed by ReLU, then conv2, and return conv2's output."""
        hidden = self.conv1(x, edge_index).relu()
        return self.conv2(hidden, edge_index)


# Build the GNN with one output channel per class, then convert it with
# to_hetero using the graph's (node_types, edge_types) metadata.
# NOTE(review): aggr='sum' presumably controls how results coming from
# different edge types are combined per destination node type -- confirm.
model = GNN(hidden_channels=64, out_channels=dataset.num_classes)
model = to_hetero(model, data.metadata(), aggr='sum')

In [18]:
# optimizer
from torch.optim import Adam
# F: functional API, used for the cross-entropy loss in train()
import torch.nn.functional as F

# The SAGEConv layers were created with (-1, -1) input sizes, i.e. their
# weights are lazily initialized. Run one gradient-free forward pass so every
# parameter is materialized before the optimizer captures model.parameters().
with torch.no_grad():
    model(data.x_dict, data.edge_index_dict)

optimizer = Adam(model.parameters(), lr=0.01)

# NOTE(review): `device` is selected here, but nothing in this cell moves the
# model or the data onto it, so the computation stays wherever they already
# live. Moving the full graph to a GPU may not fit in memory -- decide
# explicitly before adding `.to(device)` calls.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
def train():
    """Run one full-graph optimization step and return the loss as a float.

    The loss is the cross-entropy between the model's 'paper' outputs and the
    paper labels, restricted to the nodes selected by the paper train_mask.
    """
    model.train()
    optimizer.zero_grad()

    paper_store = data['paper']
    logits = model(data.x_dict, data.edge_index_dict)['paper']
    train_mask = paper_store.train_mask
    loss = F.cross_entropy(logits[train_mask], paper_store.y[train_mask])

    loss.backward()
    optimizer.step()
    return loss.item()

In [19]:
# train the model
# Number of full-graph optimization steps to run.
NUM_EPOCHS = 100

print('Training...')
for epoch in range(1, NUM_EPOCHS + 1):
    loss = train()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

Training...
Epoch: 001, Loss: 6.1408
Epoch: 002, Loss: 5.0984
Epoch: 003, Loss: 4.8924
Epoch: 004, Loss: 4.5672
Epoch: 005, Loss: 4.4128
Epoch: 006, Loss: 4.1781
Epoch: 007, Loss: 3.9986
Epoch: 008, Loss: 3.8098
Epoch: 009, Loss: 3.6439
Epoch: 010, Loss: 3.4645
Epoch: 011, Loss: 3.3149
Epoch: 012, Loss: 3.1838
Epoch: 013, Loss: 3.0677
Epoch: 014, Loss: 2.9811
Epoch: 015, Loss: 2.9024
Epoch: 016, Loss: 2.8423
Epoch: 017, Loss: 2.7892
Epoch: 018, Loss: 2.7378
Epoch: 019, Loss: 2.6955
Epoch: 020, Loss: 2.6659
Epoch: 021, Loss: 2.6290
Epoch: 022, Loss: 2.5993
Epoch: 023, Loss: 2.5729
Epoch: 024, Loss: 2.5459
Epoch: 025, Loss: 2.5206
Epoch: 026, Loss: 2.4973
Epoch: 027, Loss: 2.4754
Epoch: 028, Loss: 2.4554
Epoch: 029, Loss: 2.4370
Epoch: 030, Loss: 2.4192
Epoch: 031, Loss: 2.4029
Epoch: 032, Loss: 2.3866
Epoch: 033, Loss: 2.3716
Epoch: 034, Loss: 2.3570
Epoch: 035, Loss: 2.3416
Epoch: 036, Loss: 2.3282
Epoch: 037, Loss: 2.3155
Epoch: 038, Loss: 2.3023
Epoch: 039, Loss: 2.2900
Epoch: 040, L

In [20]:
# save the model
# Only the state_dict is written (to 'model.pt', relative to the working
# directory), not the model object itself, so loading it back requires
# rebuilding the same GNN / to_hetero model first.
torch.save(model.state_dict(), 'model.pt')

In [21]:
# predict
# Inference only: switch to eval mode and disable autograd so the full-graph
# forward pass does not record a computation graph. The features are passed
# directly instead of through a variable named `input`, which shadowed the
# Python builtin.
model.eval()
with torch.no_grad():
    out = model(data.x_dict, data.edge_index_dict)
# Predicted class index for every paper node.
print(out['paper'].argmax(dim=-1))

# evaluate
def test():
    """Return the model's accuracy on the paper nodes selected by test_mask.

    Runs a full-graph forward pass in eval mode with autograd disabled, takes
    the argmax over the 'paper' outputs, and returns the fraction of masked
    nodes whose prediction equals the label, as a Python float.
    """
    model.eval()
    # Evaluation needs no gradients; without no_grad() the forward pass would
    # record an autograd graph for the whole graph.
    with torch.no_grad():
        out = model(data.x_dict, data.edge_index_dict)
    mask = data['paper'].test_mask
    pred = out['paper'][mask].argmax(dim=-1)
    y = data['paper'].y[mask]
    acc = pred.eq(y).to(torch.float).mean().item()
    return acc

# Evaluate once and keep the result so it can be displayed in the next cell.
acc = test()

tensor([146, 291, 189,  ...,  83,   9,   1])


In [22]:
# Accuracy returned by test(): fraction of test_mask papers predicted correctly.
acc

0.4225899577140808