In [1]:
import openke
from openke.config import Trainer, Tester
from openke.module.model import TransE
from openke.module.loss import MarginLoss
from openke.module.strategy import NegativeSampling
from openke.data import TrainDataLoader, TestDataLoader, EmbeddingDataLoader
import torch
import torch.nn.functional as F
import random
import time
from tqdm import tqdm

In [2]:
def compute_embedding(model, data, head, tail):
    batch_h = data['batch_h']
    batch_t = data['batch_t']
    combined_tensor = torch.cat((batch_h, batch_t))
    unique_tensor = torch.unique(combined_tensor)

    mask = (unique_tensor != head) & (unique_tensor != tail)
    filtered_tensor = unique_tensor[mask]
    if len(filtered_tensor) <= 1:
        embedding = model.ent_embeddings(filtered_tensor[0]) - model.ent_embeddings(filtered_tensor[0])
    else:
        embed_1 = model.ent_embeddings(filtered_tensor[0])
        embed_2 = model.ent_embeddings(filtered_tensor[1])
        embedding = embed_1 - embed_2
    
    return embedding


In [3]:
device = torch.device('cuda')
Cosine_dataloader = TrainDataLoader(
    in_path=None,
    tri_file='./benchmarks/YAGO3-10/train2id.txt',
    ent_file="./benchmarks/YAGO3-10/entity2id.txt",
    rel_file="./benchmarks/YAGO3-10/relation2id.txt",
    nbatches=100,
    threads=8,
    sampling_mode="normal",
    bern_flag=1,
    filter_flag=1,
    neg_ent=25,
    neg_rel=0)
Cosine_model = TransE(
	ent_tot = Cosine_dataloader.get_ent_tot(),
	rel_tot = Cosine_dataloader.get_rel_tot(),
	dim = 200, 
	p_norm = 1, 
	norm_flag = True)

Cosine_model.to(device)
Cosine_model.load_checkpoint('./checkpoint/YAGO/YAGO_TransE.ckpt')

In [4]:
tri_file = './benchmarks/YAGO3-10/train2id.txt'
unlearn_file = './benchmarks/YAGO3-10/deleted_0.1.txt'
schema_file = './benchmarks/YAGO3-10/type_constrain.txt'
weight_file = './checkpoint/YAGO/YAGO_TransE.ckpt'

In [5]:
Schema_dataloader = EmbeddingDataLoader.CosineSchemaDataLoader(
    tri_file=tri_file,
    unlearn_file=unlearn_file,
    schema_file=schema_file,
    weight_file=weight_file)

  super()._check_params_vs_input(X, default_n_init=10)


In [6]:
query_triple = (67142,97190,9) #42784 51176 9

In [7]:
e1, e2, e3, e4 = Schema_dataloader.query_match(Schema_dataloader.triples, Schema_dataloader.adj_matrix, query_triple, Schema_dataloader.labels
                                              , device)

In [27]:
def compute_embedding(model, e1, e2):
    e1 = torch.tensor(e1, dtype=torch.long).to(device)
    e2 = torch.tensor(e2, dtype=torch.long).to(device)
    embed_1 = model.ent_embeddings(e1)
    embed_2 = model.ent_embeddings(e2)
    embedding = embed_1 - embed_2
    
    return embedding


In [29]:
Cosine_Sampling = NegativeSampling(
	model = Cosine_model, 
	loss = MarginLoss(margin = 5.0),
	batch_size = Cosine_dataloader.get_batch_size()
)

trainer = Trainer(model = Cosine_Sampling, data_loader = Cosine_dataloader, train_times = 1000, alpha = 0.001, use_gpu = True)

In [None]:
start_time = time.time()
total_loss = 0.0

trainer.optimizer = torch.optim.Adam(
    trainer.model.parameters(),
    lr=trainer.alpha,
    weight_decay=trainer.weight_decay,
)

with tqdm(total=len(Schema_dataloader.removed_triples), desc="Processing triples") as pbar:
    for idx, data in enumerate(Schema_dataloader.removed_triples):
        e1, e2, e3, e4 = Schema_dataloader.query_match(Schema_dataloader.triples, Schema_dataloader.adj_matrix, data, Schema_dataloader.labels
                                              , device)

        Embed_Query = compute_embedding(Cosine_model, batch_subgraph, query_head, query_tail)
        max_similarity = float('-inf')
        all_iterations = 10
        
        trainer.optimizer.zero_grad()
        
        for _ in range(all_iterations):
            mapping_head, mapping_tail, mapping_subgraph = Schema_dataloader.mapping_subgraph(query_subgraph=subgraph, adj_matrix=Schema_dataloader.adj_matrix, triples=Schema_dataloader.triples)
            mapping_subgraph = Schema_dataloader.convert_to_batch_data(mapping_subgraph, device)
            Embed_Map = compute_embedding(Cosine_model, mapping_subgraph, mapping_head, mapping_tail)
            cosine_similarity = torch.nn.functional.cosine_similarity(Embed_Query, Embed_Map, dim=0)
            average_cosine_similarity = cosine_similarity.max()
            if average_cosine_similarity > max_similarity:
                max_similarity = average_cosine_similarity
        # print(Embed_Query)   
        # loss_value = torch.tensor(1 - max_similarity)
        loss_value = torch.log(1 - max_similarity + 1e-10)  
        loss = loss_value.requires_grad_(True)
        loss.backward()
        total_loss += loss.item()
        trainer.optimizer.step()
        # print(Embed_Query)    
        if idx == 1:
            for name, param in Cosine_model.named_parameters():
                if param.grad is not None:
                    print(f"{name} has gradient with mean value {param.grad.mean().item()}")
                else:
                    print(f"{name} has no gradient")
        pbar.set_description(f"Processing triples (Loss: {total_loss / (idx + 1):.4f})")
        pbar.update(1)

print(f'Running Time: {time.time() - start_time}s')


In [6]:
# start_time = time.time()
# total_loss = 0.0

# trainer.optimizer = torch.optim.Adam(
#     trainer.model.parameters(),
#     lr=trainer.alpha,
#     weight_decay=trainer.weight_decay,
# )

# with tqdm(total=len(Schema_dataloader.removed_triples), desc="Processing triples") as pbar:
#     for idx, data in enumerate(Schema_dataloader.removed_triples):
#         query_head, query_tail = data[0], data[1]
#         subgraph = Schema_dataloader.find_subgraph(Schema_dataloader.adj_matrix, data)
#         batch_subgraph = Schema_dataloader.convert_to_batch_data(subgraph, device)
#         Embed_Query = compute_embedding(Cosine_model, batch_subgraph, query_head, query_tail)
#         max_similarity = float('-inf')
#         all_iterations = 10
        
#         trainer.optimizer.zero_grad()
        
#         for _ in range(all_iterations):
#             mapping_head, mapping_tail, mapping_subgraph = Schema_dataloader.mapping_subgraph(query_subgraph=subgraph, adj_matrix=Schema_dataloader.adj_matrix, triples=Schema_dataloader.triples)
#             mapping_subgraph = Schema_dataloader.convert_to_batch_data(mapping_subgraph, device)
#             Embed_Map = compute_embedding(Cosine_model, mapping_subgraph, mapping_head, mapping_tail)
#             cosine_similarity = torch.nn.functional.cosine_similarity(Embed_Query, Embed_Map, dim=0)
#             average_cosine_similarity = cosine_similarity.max()
#             if average_cosine_similarity > max_similarity:
#                 max_similarity = average_cosine_similarity
#         # print(Embed_Query)   
#         # loss_value = torch.tensor(1 - max_similarity)
#         loss_value = torch.log(1 - max_similarity + 1e-10)  
#         loss = loss_value.requires_grad_(True)
#         loss.backward()
#         total_loss += loss.item()
#         trainer.optimizer.step()
#         # print(Embed_Query)    
#         if idx == 1:
#             for name, param in Cosine_model.named_parameters():
#                 if param.grad is not None:
#                     print(f"{name} has gradient with mean value {param.grad.mean().item()}")
#                 else:
#                     print(f"{name} has no gradient")
#         pbar.set_description(f"Processing triples (Loss: {total_loss / (idx + 1):.4f})")
#         pbar.update(1)

# print(f'Running Time: {time.time() - start_time}s')


Processing triples:   0%|                                                                    | 0/107904 [00:00<?, ?it/s]

Training Files Path : ./benchmarks/YAGO3-10/train2id.txt
Entity Files Path : ./benchmarks/YAGO3-10/entity2id.txt
Relation Files Path : ./benchmarks/YAGO3-10/relation2id.txt
The toolkit is importing datasets.
The total of relations is 37.
The total of entities is 123182.
The total of train triples is 1079040.


Processing triples (Loss: -0.2499):   0%|                                          | 9/107904 [00:00<1:42:34, 17.53it/s]

zero_const has no gradient
pi_const has no gradient
ent_embeddings.weight has gradient with mean value -3.870997150753029e-14
rel_embeddings.weight has no gradient


Processing triples (Loss: nan): 100%|███████████████████████████████████████████| 107904/107904 [34:23<00:00, 52.28it/s]

Running Time: 2064.08597779274s





In [7]:
loss

tensor(-3.5197, device='cuda:0', grad_fn=<LogBackward0>)

In [8]:
test_dataloader = TestDataLoader("./benchmarks/YAGO3-10/", "link")

Input Files Path : ./benchmarks/YAGO3-10/
The total of test triples is 5000.
The total of valid triples is 5000.


In [9]:
tester = Tester(model = Cosine_model, data_loader = test_dataloader, use_gpu = True)
tester.run_link_prediction(type_constrain = False)

100%|███████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:50<00:00, 98.57it/s]

0.012800000607967377





(0.007288469932973385,
 39366.6171875,
 0.012800000607967377,
 0.00860000029206276,
 0.003399999812245369)

no type constraint results:
metric:			 MRR 		 MR 		 hit@10 	 hit@3  	 hit@1 
l(raw):			 0.005278 	 56201.601562 	 0.010800 	 0.007000 	 0.001800 
r(raw):			 0.006779 	 24450.035156 	 0.013200 	 0.007600 	 0.002400 
averaged(raw):		 0.006028 	 40325.820312 	 0.012000 	 0.007300 	 0.002100 

l(filter):		 0.006814 	 54287.296875 	 0.012000 	 0.009000 	 0.003200 
r(filter):		 0.007763 	 24445.933594 	 0.013600 	 0.008200 	 0.003600 
averaged(filter):	 0.007288 	 39366.617188 	 0.012800 	 0.008600 	 0.003400 
0.012800
