In [5]:
import torch
import pykeen
import pandas as pd
from pykeen import predict
from pykeen.pipeline import pipeline
from pykeen.triples import TriplesFactory
import numpy as np

file_path = 'C.1_query.tsv'

In [6]:
tf = TriplesFactory.from_path(file_path, delimiter="\t")
training, testing = tf.split([0.8, 0.2], random_state=2025)

INFO:pykeen.triples.splitting:done splitting triples to groups of sizes [11422, 3318]


## The Most Basic Model

In [7]:
device = "cuda" if torch.cuda.is_available() else "cpu"

resultTransE = pipeline(
    training=training,
    testing=testing,
    model="TransE",
    model_kwargs=dict(
        embedding_dim=128,
    ),
    training_kwargs=dict(
        num_epochs=20
    ),
    negative_sampler_kwargs=dict(
        num_negs_per_pos=5,
    ),
    random_seed=2025,
    device = device
)

INFO:pykeen.pipeline.api:Using device: cuda
INFO:pykeen.nn.representation:Inferred unique=False for Embedding()
INFO:pykeen.nn.representation:Inferred unique=False for Embedding()


Training epochs on cuda:0:   0%|          | 0/20 [00:00<?, ?epoch/s]

Training batches on cuda:0:   0%|          | 0.00/52.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/52.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/52.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/52.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/52.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/52.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/52.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/52.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/52.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/52.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/52.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/52.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/52.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/52.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/52.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/52.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/52.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/52.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/52.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/52.0 [00:00<?, ?batch/s]

Evaluating on cuda:0:   0%|          | 0.00/3.32k [00:00<?, ?triple/s]

  return sum(
INFO:pykeen.evaluation.evaluator:Evaluation took 17.72s seconds


We choose the paper with id: "4ab60d22-67d1-4683-8648-f1f2601d2ce0"

In [8]:
paper = '4ab60d22-67d1-4683-8648-f1f2601d2ce0'

cites = 'cites'
written_by = 'written_by'

entity_embeddings = resultTransE.model.entity_representations[0](indices= None).detach()
relation_embeddings = resultTransE.model.relation_representations[0](indices=None).detach()

citing_paper_id = resultTransE.training.entity_to_id[paper]
cites_id = resultTransE.training.relation_to_id[cites]

citing_paper_embedding = entity_embeddings[citing_paper_id]
cites_embedding = relation_embeddings[cites_id]


In [13]:
# Compute the estimated embedding of the cited paper
cited_paper_embedding = citing_paper_embedding + cites_embedding

# Compute Euclidean distances to all entities
distances = torch.norm(entity_embeddings - cited_paper_embedding.unsqueeze(0), p=2, dim=1)
sorted_distances, sorted_indices = torch.sort(distances)

# Find the closest entity that is not the citing paper itself
top_cited_paper_id = -1
for index in sorted_indices:
    if index != citing_paper_id:
        top_cited_paper_id = index.item()
        break

# Output the result
if top_cited_paper_id != -1:
    print('A likely cited paper was found.')
    # print(resultTransE.training.entity_id_to_label[top_cited_paper_id])
    print(f'Most likely cited paper embedding vector:\n{cited_paper_embedding}\n')
else:
    print('No likely cited paper found.')

A likely cited paper was found.
Most likely cited paper embedding vector:
tensor([-0.3767, -0.0919, -0.0403, -0.0966,  0.0061,  0.1602,  0.3159,  0.0060,
        -0.0229, -0.1147, -0.0722,  0.0963, -0.1766, -0.2739,  0.1544, -0.4501,
        -0.0263,  0.1323, -0.1676,  0.0723,  0.1876, -0.3807, -0.2260, -0.2787,
        -0.0657, -0.0352,  0.0757, -0.3660,  0.0498, -0.1176,  0.0697,  0.0792,
         0.3866, -0.2878, -0.1642, -0.1327,  0.0896, -0.0776, -0.1176, -0.0690,
         0.0627, -0.1493,  0.0051,  0.0773, -0.0151,  0.0177,  0.0910,  0.0709,
        -0.0481, -0.0285, -0.0620,  0.2562, -0.4616,  0.0740, -0.0221,  0.1009,
         0.0700, -0.0685,  0.2536, -0.0914,  0.0361, -0.0306, -0.1291,  0.0658,
         0.2040,  0.0271,  0.3341, -0.0363,  0.1065,  0.0696,  0.0338,  0.0637,
        -0.0098, -0.0280,  0.1646, -0.0927,  0.2481,  0.1993, -0.0868, -0.0097,
         0.1960, -0.1219, -0.0144, -0.1320, -0.0831, -0.1968,  0.0216, -0.1987,
         0.1248, -0.0414, -0.0256,  0.1552, -0

In [10]:
written_by_id = resultTransE.training.relation_to_id[written_by]
written_by_embedding = relation_embeddings[written_by_id]

author_embedding = cited_paper_embedding + written_by_embedding

print(f'The most likely author embedding vector: \n {author_embedding}')

The most likely author embedding vector: 
 tensor([-2.4818e-01,  1.9305e-03, -2.9281e-02, -7.7749e-02, -7.0464e-02,
         8.7784e-02,  3.0137e-01,  4.1341e-02, -6.3045e-02,  5.2587e-02,
        -2.0990e-02, -3.8031e-02, -9.5406e-02, -1.4075e-01,  1.6771e-01,
        -5.6399e-01, -2.0654e-02,  1.1191e-01, -2.1349e-01,  2.2199e-01,
         3.1246e-01, -2.9146e-01, -1.5512e-01, -4.0889e-01,  9.1462e-02,
         9.7104e-02,  5.4135e-02, -1.9917e-01, -2.5941e-02, -5.8557e-02,
         1.9260e-01,  5.4811e-02,  5.4073e-01, -1.6118e-01, -1.9419e-01,
        -1.8914e-02,  1.7064e-01,  3.8842e-02,  5.8900e-02,  5.4219e-02,
        -7.2973e-03, -8.9927e-02,  5.3853e-02, -1.3237e-01, -6.3891e-02,
         6.8475e-02,  8.1229e-02, -3.5972e-02,  5.9759e-02,  2.5806e-02,
        -1.3173e-01,  3.5349e-01, -3.4844e-01, -6.0818e-02, -2.1220e-01,
         6.5024e-02,  1.1397e-02, -1.9281e-01,  2.8385e-01, -9.0903e-02,
        -3.4994e-02, -1.3689e-01,  2.5786e-02,  2.3222e-02,  3.6247e-01,
        

In [11]:
distances = torch.norm(entity_embeddings - author_embedding.unsqueeze(0), p=2, dim=1)  # Euclidean Distance
sorted_distances, sorted_indices = torch.sort(distances)

top_author_id = -1
for index in sorted_indices:
    if index != citing_paper_id:
        top_author_id = index.item()
        break
if top_author_id != -1:
    top_author = resultTransE.training.entity_id_to_label[top_author_id]

    print(f'The most likely author is: {top_author}')

The most likely author is: AUTHOR_336
