In [59]:
import torch
import pykeen
import pandas as pd
from pykeen import predict
from pykeen.pipeline import pipeline
from pykeen.triples import TriplesFactory
import numpy as np

# Input triples file for this notebook; it is read as tab-separated data
# when the TriplesFactory is built.
file_path = "C.1_query.tsv"

In [60]:
# Parse the tab-separated triples file into a PyKEEN triples factory.
tf = TriplesFactory.from_path(
    file_path,
    delimiter="\t",
)

# Seeded split into two parts using the requested ratios 0.8 / 0.2;
# the first part is used for training, the second for testing.
split_parts = tf.split([0.8, 0.2], random_state=2025)
training, testing = split_parts

INFO:pykeen.triples.splitting:done splitting triples to groups of sizes [11148, 3066]


## The Most Basic Model

In [61]:
# Train on the GPU when one is visible to PyTorch; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Baseline run: TransE with 128-dimensional embeddings, 20 training epochs and
# 5 negative samples per positive triple. The seed is fixed for repeatable runs.
resultTransE = pipeline(
    training=training,
    testing=testing,
    model="TransE",
    model_kwargs={"embedding_dim": 128},
    training_kwargs={"num_epochs": 20},
    negative_sampler_kwargs={"num_negs_per_pos": 5},
    random_seed=2025,
    device=device,
)

INFO:pykeen.pipeline.api:Using device: cuda
INFO:pykeen.nn.representation:Inferred unique=False for Embedding()
INFO:pykeen.nn.representation:Inferred unique=False for Embedding()


Training epochs on cuda:0:   0%|          | 0/20 [00:00<?, ?epoch/s]

Training batches on cuda:0:   0%|          | 0.00/48.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/48.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/48.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/48.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/48.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/48.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/48.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/48.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/48.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/48.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/48.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/48.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/48.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/48.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/48.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/48.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/48.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/48.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/48.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/48.0 [00:00<?, ?batch/s]

Evaluating on cuda:0:   0%|          | 0.00/3.07k [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 0.56s seconds


We pick the paper with ID "4ab60d22-67d1-4683-8648-f1f2601d2ce0" as the citing paper.

In [62]:
# Labels of the query paper and of the two relations used in the cells below.
paper = '4ab60d22-67d1-4683-8648-f1f2601d2ce0'
cites = 'cites'
written_by = 'written_by'

# Short aliases for the trained model and the training triples factory.
trained_model = resultTransE.model
train_factory = resultTransE.training

# Full embedding tables (indices=None asks for every row), detached from autograd.
entity_embeddings = trained_model.entity_representations[0](indices=None).detach()
relation_embeddings = trained_model.relation_representations[0](indices=None).detach()

# Citing paper: label -> integer id -> embedding row.
citing_paper_id = train_factory.entity_to_id[paper]
citing_paper_embedding = entity_embeddings[citing_paper_id]

# `cites` relation: label -> integer id -> embedding row.
cites_id = train_factory.relation_to_id[cites]
cites_embedding = relation_embeddings[cites_id]


In [63]:
# TransE models a true triple (h, r, t) as h + r ≈ t, so translating the citing
# paper by the `cites` relation gives the point where a cited paper is expected.
cited_paper_embedding = citing_paper_embedding + cites_embedding

# Euclidean distance from every entity embedding to that predicted point.
# NOTE(review): PyKEEN's TransE appears to score with the L1 norm unless
# `scoring_fct_norm` is set; confirm that Euclidean distance is intended here.
distances = torch.norm(entity_embeddings - cited_paper_embedding.unsqueeze(0), p=2, dim=1)
sorted_distances, sorted_indices = torch.sort(distances)

# Closest existing entity other than the citing paper itself. The mask replaces
# an element-by-element Python loop over the tensor.
# NOTE(review): the search runs over all entities, so the closest one is not
# guaranteed to be a paper; confirm whether candidates should be restricted.
candidate_indices = sorted_indices[sorted_indices != citing_paper_id]
top_cited_paper_id = candidate_indices[0].item() if candidate_indices.numel() > 0 else -1

if top_cited_paper_id != -1:
    top_cited_paper_label = resultTransE.training.entity_id_to_label[top_cited_paper_id]
    top_cited_paper_distance = distances[top_cited_paper_id].item()
    print('Exist the most likely cited paper')
    # Report the entity that was actually found; previously it was computed but never shown.
    print(f'Closest existing entity: {top_cited_paper_label} (distance {top_cited_paper_distance:.4f})')
    print(f'The most likely cited paper embedding vector: \n {cited_paper_embedding}\n')
else:
    # Only reachable when the graph contains no entity besides the citing paper.
    print('Not exist the most likely cited paper')

Exist the most likely cited paper
The most likely cited paper embedding vector: 
 tensor([-0.0153,  0.0196, -0.1119, -0.2186,  0.2171, -0.0996, -0.0183, -0.0590,
         0.0347, -0.3698,  0.0098,  0.2149,  0.0916, -0.4350, -0.0132, -0.0311,
         0.1624,  0.0806, -0.2477, -0.0835, -0.2188,  0.2290, -0.0873, -0.0931,
         0.1271, -0.0079,  0.0984, -0.1075, -0.1624,  0.0576,  0.1929,  0.3720,
         0.0327, -0.0503, -0.3467,  0.0171,  0.0278,  0.0174,  0.0968,  0.0798,
         0.0268,  0.0476,  0.2603,  0.0722,  0.0714,  0.1331,  0.0545, -0.0324,
        -0.0312, -0.0274, -0.0849, -0.1099, -0.0682,  0.0391, -0.0052, -0.0980,
        -0.0384, -0.2715, -0.1670, -0.0306, -0.4379, -0.0538,  0.3252, -0.0274,
        -0.1557, -0.0585, -0.2041, -0.0232,  0.3090, -0.0941, -0.1004, -0.0100,
        -0.0698,  0.0400, -0.2241,  0.0232, -0.1309,  0.1026,  0.1558,  0.0290,
        -0.2874,  0.0782, -0.1035,  0.0807, -0.0422,  0.1053, -0.0696, -0.3336,
        -0.0695, -0.0994, -0.0987, -0.

In [64]:
# Look up the learned vector of the `written_by` relation.
relation_ids = resultTransE.training.relation_to_id
written_by_id = relation_ids[written_by]
written_by_embedding = relation_embeddings[written_by_id]

# Translate the predicted cited-paper vector by `written_by` to obtain the
# point where the author of that paper is expected to lie.
author_embedding = torch.add(cited_paper_embedding, written_by_embedding)

print(f'The most likely author embedding vector: \n {author_embedding}')

The most likely author embedding vector: 
 tensor([-0.0596, -0.1272, -0.0987, -0.1236,  0.0834, -0.0604,  0.0774, -0.1767,
         0.0990, -0.4991,  0.1285,  0.0663,  0.1928, -0.2541, -0.1201,  0.0207,
         0.3781,  0.0579, -0.2186, -0.0316, -0.2454,  0.3289,  0.0854,  0.0028,
         0.0452,  0.1527,  0.1394, -0.1300, -0.0288,  0.0382,  0.1595,  0.4215,
         0.1607, -0.2039, -0.2075, -0.0007,  0.0467,  0.0329,  0.0525,  0.1158,
        -0.0060,  0.0012,  0.1678,  0.2500,  0.0433,  0.0531,  0.1357,  0.0033,
         0.0450, -0.1339,  0.1360,  0.0471, -0.0129,  0.0196, -0.1828,  0.0964,
         0.0726, -0.3165, -0.0575, -0.0146, -0.3383,  0.0318,  0.2626, -0.1863,
        -0.0621,  0.0766, -0.2621, -0.0150,  0.2354,  0.0452, -0.0377, -0.0680,
         0.0406, -0.1972, -0.2238, -0.1162,  0.0657,  0.0163, -0.0227,  0.1939,
        -0.2340, -0.0911, -0.0281,  0.0110, -0.0014,  0.0125,  0.0070, -0.1916,
        -0.0075, -0.0469, -0.0238, -0.1663, -0.1013,  0.0134, -0.0124, -0.059

In [65]:
# Euclidean distance from every entity embedding to the predicted author point.
distances = torch.norm(entity_embeddings - author_embedding.unsqueeze(0), p=2, dim=1)  # Euclidean Distance
sorted_distances, sorted_indices = torch.sort(distances)

# Restrict the candidates to author entities: every entity that occurs as the
# tail of a `written_by` triple (paper --written_by--> author) in the training
# graph. Without this filter the nearest entity could be a paper or any other
# node, and excluding only the citing paper does not guarantee an author.
# NOTE(review): authors that appear only in the testing split are not
# candidates; confirm whether the full factory should be used instead.
train_triples = resultTransE.training.mapped_triples  # columns: head id, relation id, tail id
author_ids = torch.unique(train_triples[train_triples[:, 1] == written_by_id, 2])

is_author = torch.zeros(entity_embeddings.shape[0], dtype=torch.bool, device=entity_embeddings.device)
is_author[author_ids.to(entity_embeddings.device)] = True

# Authors ordered from nearest to farthest; the first one is the prediction.
ranked_author_ids = sorted_indices[is_author[sorted_indices]]
top_author_id = ranked_author_ids[0].item() if ranked_author_ids.numel() > 0 else -1

if top_author_id != -1:
    top_author = resultTransE.training.entity_id_to_label[top_author_id]

    print(f'The most likely author is: {top_author}')
else:
    # Make the empty case visible instead of silently printing nothing.
    print('No author entity found in the training triples')

The most likely author is: AUTHOR_336
