In [1]:
import os.path as osp
import numpy as np
import torch
from torch_geometric.datasets import AMiner
from torch_geometric.nn import MetaPath2Vec

  from .autonotebook import tqdm as notebook_tqdm


# MetaPath2Vec

[paper](https://ericdongyx.github.io/papers/KDD17-dong-chawla-swami-metapath2vec.pdf)  
[code](https://github.com/rusty1s/pytorch_geometric/blob/master/examples/metapath2vec.py)

In [3]:
# Load the AMiner dataset; its files live under ../data/AMiner
# (the "Processing..." output below is printed while the dataset is built).
path = osp.join('..', 'data', 'AMiner')
dataset = AMiner(path)
# Take the first element of the dataset: a HeteroData graph (printed below).
data = dataset[0]


Processing...
  data['author'].y = torch.from_numpy(df['y'].values) - 1
Done!


In [4]:
# Summarise the graph: node types with their label tensors and node counts,
# followed by the edge types with the shape of each edge_index.
print(data)

HeteroData(
  author={
    y=[246678],
    y_index=[246678],
    num_nodes=1693531,
  },
  venue={
    y=[134],
    y_index=[134],
    num_nodes=3883,
  },
  paper={ num_nodes=3194405 },
  (paper, written_by, author)={ edge_index=[2, 9323605] },
  (author, writes, paper)={ edge_index=[2, 9323605] },
  (paper, published_in, venue)={ edge_index=[2, 3194405] },
  (venue, publishes, paper)={ edge_index=[2, 3194405] }
)


In [16]:
# edge_index_dict is keyed by (source, relation, target) triples. Show the
# container type, the available edge types, and the paper -> author edges.
print(
    type(data.edge_index_dict),
    data.edge_index_dict.keys(),
    data.edge_index_dict[('paper', 'written_by', 'author')],
    sep='\n',
)

<class 'dict'>
dict_keys([('paper', 'written_by', 'author'), ('author', 'writes', 'paper'), ('paper', 'published_in', 'venue'), ('venue', 'publishes', 'paper')])
tensor([[      0,       1,       2,  ..., 3194404, 3194404, 3194404],
        [      0,       1,       2,  ...,    4393,   21681,  317436]])


In [9]:
# num_nodes_dict maps every node type to its node count.
print(type(data.num_nodes_dict), data.num_nodes_dict, sep='\n')

<class 'dict'>
{'author': 1693531, 'venue': 3883, 'paper': 3194405}


In [10]:
# y_dict is keyed by node type; show the `y` tensor stored for venues.
print(type(data.y_dict), data.y_dict["venue"], sep='\n')

<class 'dict'>
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5,
        5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7,
        7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7])


In [11]:
# y_index_dict is keyed by node type; the venue entry has the same length as
# the venue `y` tensor (presumably the node ids those labels belong to -
# TODO confirm).
print(type(data.y_index_dict), data.y_index_dict["venue"], sep='\n')

<class 'dict'>
tensor([1741, 2245,  111,  837, 2588, 2116, 2696, 3648, 3784,  313, 3414,  598,
        2995, 2716, 1423,  783, 1902, 3132, 1753, 2748, 2660, 3182,  775, 3339,
        1601, 3589,  156, 1145,  692, 3048,  925, 1587,  820, 1374, 3719,  819,
         492, 3830, 2777, 3001, 3693,  517, 1808, 2353, 3499, 1763, 2372, 1030,
         721, 2680, 3355, 1217, 3400, 1271, 1970, 1127,  407,  353, 1471, 1095,
         477, 3701,   65, 1009, 1899, 1442, 2073, 3143, 2466,  289, 1996, 1070,
        3871, 3695,  281, 3633,   50, 2642, 1925, 1285, 2587, 3814, 3582, 1873,
        1339, 3450,  271, 2966,  453, 2638, 1354, 3211,  391, 1588, 3875, 2216,
        2146, 3765, 2486,  661, 3367,  426,  750, 2158,  519,  230, 1677,  839,
        2945, 1313, 1037, 2879, 2225, 3523, 1247,  448,  227, 3385,  529, 2849,
        1584, 1229,  373, 2235, 1819, 1764, 3155, 2852, 2789, 3474, 1571, 2088,
         208,  462])


In [12]:
# Choose the device used for the model and the training batches:
# CUDA when it is available, otherwise the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# NOTE(review): this assignment unconditionally overrides the auto-detected
# device above, so everything that follows runs on the CPU. Remove it to use
# the GPU - TODO confirm the override is intentional.
device = "cpu"

In [None]:
# Define the model.
#
# The metapath lists the edge types to chain together:
# author -> paper -> venue -> paper -> author.
# Every triple must be a key of data.edge_index_dict; a misspelt relation only
# surfaces later, as a KeyError raised while the loader samples walks.
metapath = [
    ('author', 'writes', 'paper'),
    ('paper', 'published_in', 'venue'),
    ('venue', 'publishes', 'paper'),
    ('paper', 'written_by', 'author'),
]

model = MetaPath2Vec(
    data.edge_index_dict,
    embedding_dim=128,
    metapath=metapath,
    walk_length=5,
    context_size=3,
    walks_per_node=3,
    num_negative_samples=1,
    sparse=True,
)
model = model.to(device)


In [21]:
# Build a data loader from the model; each batch it yields is unpacked as a
# (pos_rw, neg_rw) pair by the code that iterates over it.
loader = model.loader(batch_size=128, shuffle=True, num_workers=3)

In [22]:
# Peek at the first ten batches and report the shapes of the positive and
# negative samples; stop as soon as the eleventh batch has been fetched.
for idx, (pos_rw, neg_rw) in enumerate(loader):
    if idx >= 10:
        break
    print(idx, pos_rw.shape, neg_rw.shape)

KeyError: Caught KeyError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/lukas/git_repos/PytorchGeometricTutorial/.venv/lib/python3.12/site-packages/torch/utils/data/_utils/worker.py", line 349, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
           ^^^^^^^^^^^^^^^^^^^^
  File "/home/lukas/git_repos/PytorchGeometricTutorial/.venv/lib/python3.12/site-packages/torch/utils/data/_utils/fetch.py", line 55, in fetch
    return self.collate_fn(data)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/home/lukas/git_repos/PytorchGeometricTutorial/.venv/lib/python3.12/site-packages/torch_geometric/nn/models/metapath2vec.py", line 201, in _sample
    return self._pos_sample(batch), self._neg_sample(batch)
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/lukas/git_repos/PytorchGeometricTutorial/.venv/lib/python3.12/site-packages/torch_geometric/nn/models/metapath2vec.py", line 160, in _pos_sample
    self.rowptr_dict[edge_type],
    ~~~~~~~~~~~~~~~~^^^^^^^^^^^
KeyError: ('venue', 'published', 'paper')


In [None]:
# Show the first row of pos_rw and of neg_rw, i.e. of the last batch fetched
# by the inspection loop that defined them.
print(pos_rw[0],neg_rw[0])

In [None]:
# Initialize the optimizer. SparseAdam is used because the model was built
# with sparse=True.
optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)

In [None]:
def train(epoch, log_steps=500, eval_steps=1000):
    """Run one pass over ``loader``, optimising the model and logging progress.

    Args:
        epoch: Epoch number; only used in the printed log lines.
        log_steps: Every ``log_steps`` batches, print the mean loss of the
            batches seen since the previous loss report.
        eval_steps: Every ``eval_steps`` batches, call ``test()`` and print
            the accuracy it returns.
    """
    model.train()

    total_loss = 0
    for i, (pos_rw, neg_rw) in enumerate(loader):
        optimizer.zero_grad()
        loss = model.loss(pos_rw.to(device), neg_rw.to(device))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        if (i + 1) % log_steps == 0:
            print((f'Epoch: {epoch}, Step: {i + 1:05d}/{len(loader)}, '
                   f'Loss: {total_loss / log_steps:.4f}'))
            # Start a fresh running sum for the next reporting window.
            total_loss = 0

        if (i + 1) % eval_steps == 0:
            acc = test()
            # test() puts the model into eval mode; switch back so the rest of
            # the epoch keeps running in training mode.
            model.train()
            print((f'Epoch: {epoch}, Step: {i + 1:05d}/{len(loader)}, '
                   f'Acc: {acc:.4f}'))

@torch.no_grad()
def test(train_ratio=0.1):
    """Score the author embeddings on the labelled authors.

    The embeddings of the authors listed in ``data.y_index_dict['author']``
    are shuffled and split; both parts, with their labels from
    ``data.y_dict['author']``, are handed to ``model.test``.

    Args:
        train_ratio: Fraction of the labelled authors passed to
            ``model.test`` as the training split; the remainder is the test
            split.

    Returns:
        The value returned by ``model.test`` (reported as accuracy by the
        callers).
    """
    model.eval()

    # The index tensor has to live on the same device as the model, otherwise
    # the embedding lookup fails when the model is on the GPU.
    z = model('author', batch=data.y_index_dict['author'].to(device))
    y = data.y_dict['author']

    perm = torch.randperm(z.size(0))
    num_train = int(z.size(0) * train_ratio)
    train_perm = perm[:num_train]
    test_perm = perm[num_train:]

    return model.test(z[train_perm], y[train_perm], z[test_perm],
                      y[test_perm], max_iter=150)


In [None]:
# Train for a single epoch (range(1, 2) yields only epoch 1), then print the
# accuracy returned by test() for the trained model.
for epoch in range(1, 2):
    train(epoch)
    acc = test()
    print(f'Epoch: {epoch}, Accuracy: {acc:.4f}')

# load the model

In [None]:
# Build a second, freshly constructed model with the same configuration as
# `model`; the saved weights are loaded into it afterwards.
loaded_model = MetaPath2Vec(
    data.edge_index_dict,
    embedding_dim=128,
    metapath=metapath,
    walk_length=5,
    context_size=3,
    walks_per_node=3,
    num_negative_samples=1,
    sparse=True,
)
loaded_model = loaded_model.to(device)

In [None]:
# First five values of embedding row 1 of the freshly constructed model, for
# comparison with the same values once a checkpoint has been loaded.
print(loaded_model.embedding.weight[1][:5])

In [None]:
# Load the saved weights into loaded_model.
# torch.load returns the object that was saved (here a state dict, which has
# no .detach()/.cpu() methods); map_location="cpu" places its tensors on the
# CPU while loading instead. Expects a checkpoint file named "mymodel" in the
# working directory, e.g. written by torch.save(model.state_dict(), "mymodel").
loaded_model.load_state_dict(torch.load("mymodel", map_location="cpu"))

In [None]:
# Read the checkpoint with all of its tensors mapped onto the CPU, then copy
# the weights into loaded_model.
file = torch.load('mymodel', map_location='cpu')
loaded_model.load_state_dict(file)

In [None]:
# Same five values as printed before the checkpoint was loaded; they should
# now show the loaded weights.
print(loaded_model.embedding.weight[1][:5])

In [None]:
# Look up the embeddings of the nodes listed in y_index_dict for venues and
# authors, and convert them to NumPy arrays.
z_venue = (
    loaded_model('venue', batch=data.y_index_dict['venue'])
    .detach()
    .numpy()
)
z_auth = (
    loaded_model('author', batch=data.y_index_dict['author'])
    .detach()
    .numpy()
)

In [None]:
# Keep only the first 100 rows of each embedding matrix.
z_venue, z_auth = z_venue[:100], z_auth[:100]

In [None]:
import umap

# Project the venue and author embeddings to 2-D.
# Only the embedding matrices are passed to UMAP: the HeteroData graph `data`
# is not a feature matrix, and no label array exists at notebook scope.
# NOTE(review): the two matrices are projected by independent UMAP fits, so
# the resulting coordinates do not share a common space - keep that in mind
# when the two point sets are drawn in the same figure.
z_venue_2d = umap.UMAP().fit_transform(z_venue)
z_auth_2d = umap.UMAP().fit_transform(z_auth)

In [None]:
import matplotlib.pyplot as plt

# Scatter the 2-D author (red) and venue (blue) projections in one square
# figure, with a legend and a title.
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(z_auth_2d[:, 0], z_auth_2d[:, 1], color="red", alpha=0.5, label="author")
ax.scatter(z_venue_2d[:, 0], z_venue_2d[:, 1], color="blue", alpha=0.5, label="venue")
ax.legend()
ax.set_title("2D embedding")
plt.show()