In [1]:
import torch
import pykeen
import pandas as pd
from pykeen import predict
from pykeen.pipeline import pipeline
from pykeen.triples import TriplesFactory
import numpy as np

# Relative path to the triples file; it is passed to TriplesFactory.from_path
# with a tab delimiter further down.
file_path = 'C.1_query.tsv'

In [2]:
# Build the triples factory from the TSV file, then split it 80/20 into
# training and testing factories. The fixed random_state makes the split
# repeatable between runs.
tf = TriplesFactory.from_path(file_path, delimiter="\t")
splits = tf.split([0.8, 0.2], random_state=2025)
training, testing = splits

## The Most Basic Model

In [3]:
# Run on the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Train and evaluate a TransE model on the split created above.
resultTransE = pipeline(
    training=training,
    testing=testing,
    model="TransE",
    model_kwargs={"embedding_dim": 128},
    training_kwargs={"num_epochs": 20},
    negative_sampler_kwargs={"num_negs_per_pos": 5},
    random_seed=2025,
    device=device,
)

Training epochs on cuda:0:   0%|          | 0/20 [00:00<?, ?epoch/s]

Training batches on cuda:0:   0%|          | 0.00/52.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/52.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/52.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/52.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/52.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/52.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/52.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/52.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/52.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/52.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/52.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/52.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/52.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/52.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/52.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/52.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/52.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/52.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/52.0 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/52.0 [00:00<?, ?batch/s]

Evaluating on cuda:0:   0%|          | 0.00/3.28k [00:00<?, ?triple/s]

  return sum(
INFO:pykeen.evaluation.evaluator:Evaluation took 12.53s seconds


We choose the paper with ID "4ab60d22-67d1-4683-8648-f1f2601d2ce0" as the citing paper for the queries below.

In [4]:
# Labels of the query paper and of the two relations used by the lookups.
paper = '4ab60d22-67d1-4683-8648-f1f2601d2ce0'
cites, written_by = 'cites', 'written_by'

# Pull the full entity / relation embedding tables out of the trained model,
# detached from the autograd graph.
trained_model = resultTransE.model
entity_embeddings = trained_model.entity_representations[0](indices=None).detach()
relation_embeddings = trained_model.relation_representations[0](indices=None).detach()

# Translate the labels into the integer IDs that index those tables.
train_factory = resultTransE.training
citing_paper_id = train_factory.entity_to_id[paper]
cites_id = train_factory.relation_to_id[cites]

# Embedding rows for the query paper and the `cites` relation.
citing_paper_embedding = entity_embeddings[citing_paper_id]
cites_embedding = relation_embeddings[cites_id]


In [None]:
# Estimate where the cited paper should sit in embedding space: the model is
# TransE, which scores a triple (h, r, t) by how close h + r is to t, so the
# tail of (paper, cites, ?) is approximated by paper + cites.
cited_paper_embedding = citing_paper_embedding + cites_embedding

# Euclidean distance from that estimate to every entity embedding.
distances = torch.norm(entity_embeddings - cited_paper_embedding.unsqueeze(0), p=2, dim=1)
sorted_distances, sorted_indices = torch.sort(distances)

# Nearest entity that is not the citing paper itself (-1 when there is none).
# Converting the indices to a plain list once avoids a tensor-vs-int comparison
# on every iteration.
top_cited_paper_id = next(
    (entity_id for entity_id in sorted_indices.tolist() if entity_id != citing_paper_id),
    -1,
)

# Output the result
if top_cited_paper_id != -1:
    # Report which entity was actually found, not only the query vector.
    top_cited_paper = resultTransE.training.entity_id_to_label[top_cited_paper_id]
    print('A likely cited paper was found.')
    print(f'Most likely cited paper: {top_cited_paper}')
    # NOTE: this vector is the estimate (citing paper + cites), not the stored
    # embedding of the entity found above.
    print(f'Most likely cited paper embedding vector:\n{cited_paper_embedding}\n')
else:
    print('No likely cited paper found.')

Exist the most likely cited paper
4ab9d02e-f259-4c8b-8a32-446f46e39b9d
The most likely cited paper embedding vector: 
 tensor([-0.4141, -0.0728, -0.0018, -0.1064,  0.0072,  0.1647,  0.3833,  0.0118,
        -0.0893, -0.0990, -0.0902,  0.0578, -0.0908, -0.2554,  0.1388, -0.3502,
         0.0323, -0.0440, -0.2122,  0.0627,  0.2828, -0.3726, -0.2846, -0.3105,
        -0.0636, -0.0487,  0.1014, -0.3503,  0.0586, -0.1336,  0.0543,  0.1321,
         0.4134, -0.3499, -0.1066, -0.1422,  0.0888, -0.0678,  0.0027, -0.0470,
         0.1017, -0.1785, -0.1072,  0.0879, -0.0465,  0.0072,  0.1511,  0.0588,
        -0.0484, -0.0184, -0.0692,  0.3753, -0.5091,  0.1053,  0.0048,  0.0741,
         0.0670, -0.1079,  0.2577, -0.1313,  0.0008, -0.0704, -0.1396,  0.0238,
         0.2597,  0.0568,  0.3082, -0.0633,  0.0732,  0.0029,  0.0502,  0.0682,
         0.1119, -0.0565,  0.1727, -0.0975,  0.2068,  0.2273, -0.0609, -0.0098,
         0.1960, -0.0964, -0.0171, -0.0784, -0.0602, -0.1564,  0.1163, -0.2336,
 

In [6]:
# Look up the `written_by` relation and its embedding row.
relation_ids = resultTransE.training.relation_to_id
written_by_id = relation_ids[written_by]
written_by_embedding = relation_embeddings[written_by_id]

# Second hop: shift the estimated cited-paper vector along `written_by` to get
# an estimate of the author's position.
author_embedding = cited_paper_embedding + written_by_embedding

print(f'The most likely author embedding vector: \n {author_embedding}')

The most likely author embedding vector: 
 tensor([-0.2291, -0.0750, -0.1683,  0.0036, -0.1720,  0.0972,  0.2532, -0.0023,
         0.0616, -0.0034, -0.0869, -0.0951,  0.0900, -0.3176,  0.1982, -0.4583,
        -0.0856,  0.1231, -0.0547, -0.0940,  0.4090, -0.3104, -0.2614, -0.3796,
         0.0322,  0.0903,  0.1911, -0.4285, -0.0134, -0.1983,  0.0956, -0.0377,
         0.3684, -0.1506, -0.2926, -0.0032,  0.0824,  0.1003,  0.1088,  0.0289,
         0.1076, -0.3791, -0.0201, -0.0281, -0.0638,  0.1286,  0.1393, -0.0857,
         0.0764, -0.1393, -0.0463,  0.2067, -0.4752,  0.0286, -0.0421,  0.1281,
        -0.0565, -0.0312,  0.2051, -0.0626,  0.0520, -0.2215, -0.2345,  0.0691,
         0.0910, -0.0806,  0.3323,  0.0194,  0.0602,  0.1336,  0.0536,  0.1749,
         0.0958, -0.0351,  0.1372, -0.2816,  0.2361,  0.2288,  0.0508, -0.0231,
         0.1109, -0.2008,  0.0111, -0.2359, -0.1036, -0.2390, -0.0220, -0.3131,
         0.0262, -0.0025, -0.0559,  0.1125, -0.0328, -0.1895, -0.0834, -0.103

In [7]:
# Euclidean distance from the estimated author vector to every entity embedding.
distances = torch.norm(entity_embeddings - author_embedding.unsqueeze(0), p=2, dim=1)
sorted_distances, sorted_indices = torch.sort(distances)

# The entity table holds papers as well as authors, so the nearest entity
# overall can be a paper. Restrict candidates to entities that occur as the
# tail of a `written_by` triple in the training data (columns of
# mapped_triples are head, relation, tail). Authors that only appear in the
# test triples are therefore not considered.
mapped_triples = resultTransE.training.mapped_triples
written_by_rows = mapped_triples[:, 1] == written_by_id
author_ids = set(mapped_triples[written_by_rows, 2].tolist())

# Closest candidate author, or -1 when no candidate exists.
top_author_id = next(
    (entity_id for entity_id in sorted_indices.tolist() if entity_id in author_ids),
    -1,
)

if top_author_id != -1:
    top_author = resultTransE.training.entity_id_to_label[top_author_id]

    print(f'The most likely author is: {top_author}')
else:
    print('No likely author found.')

The most likely author is: AUTHOR_331
