In [1]:
# 03_TransE_l1_edge_similarity_based_on_link_recommendation_results
#
# created by LuYF-Lemon-love <luyanfeng_nlp@qq.com> on February 6, 2023
# updated by LuYF-Lemon-love <luyanfeng_nlp@qq.com> on March 14, 2023
#
# 该脚本展示了如何分析 TransE_l1 的关系类型推荐相似性 (Link Type Recommendation Similarity).
#
# 需要的包:
#          numpy
#          csv
#          torch
#
# 需要的文件:
#          ../../data/drkg/drkg.tsv
#          ../../data/drkg/entities.tsv
#          ../../data/drkg/relations.tsv
#          ../01-model/ckpts/TransE_l1_All_DRKG_0/All_DRKG_TransE_l1_entity.npy
#          ../01-model/ckpts/TransE_l1_All_DRKG_0/All_DRKG_TransE_l1_relation.npy
#
# 源教程链接: https://github.com/gnn4dr/DRKG/blob/master/embedding_analysis/Edge_similarity_based_on_link_recommendation_results.ipynb

# DRKG Relation Similarity Analysis based on link recommendations

本笔记本基于 DRKG 中不同关系类型的推荐结果, 对其进行相似性分析. 具体而言, 对于某个节点, 我们预测某个关系类型的 K 个最相似的邻居. 然后, 我们对所有关系类型重复此预测. 预测的邻居有显著重叠的关系类型将更相似.

## 导入需要的库

In [2]:
import os
import csv
import random
import numpy as np
import torch

def seed_torch(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Value used for every generator; it is coerced with ``int()``
            first. Defaults to 42.

    Side effects:
        Sets the ``PYTHONHASHSEED`` environment variable, seeds ``random``,
        ``numpy.random`` and ``torch`` (CPU and CUDA), and switches the cuDNN
        backend flags to ``deterministic=True``, ``benchmark=False``,
        ``enabled=True``.
    """
    seed = int(seed)

    # Python-level sources of randomness.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators: CPU first, then the current and all CUDA devices.
    for seed_fn in (torch.manual_seed,
                    torch.cuda.manual_seed,
                    torch.cuda.manual_seed_all):
        seed_fn(seed)

    # cuDNN flags: deterministic kernels, auto-tuner (benchmark) off.
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


# Seed everything once with the default value before any random draw below.
seed_torch()

In [3]:
# Create the output directory (and any missing parents) for the result file.
# os.makedirs replaces the `!mkdir -p` shell escape so the cell does not depend
# on a POSIX shell; exist_ok=True keeps it safe to re-run.
os.makedirs("./result/overlapping", exist_ok=True)

定义用于对边进行评分的函数, 这应该与用于学习嵌入的函数相一致.

DGL-KE 官方实现 TransE_l1 评分函数的代码在:

- https://github.com/awslabs/dgl-ke/blob/master/python/dglke/models/ke_model.py 886 - 893 行

- https://github.com/awslabs/dgl-ke/blob/master/python/dglke/models/pytorch/score_fun.py 54 - 59 行

OpenKE 实现 TransE_l1 评分函数的代码在:

- https://github.com/thunlp/OpenKE/blob/OpenKE-PyTorch/openke/module/model/TransE.py

其中两者的实现代码基本一样.

In [4]:
import torch.nn.functional as fn

# Margin subtracted from in the TransE score below.
gamma = 18.0


def transE_l1(head, rel, tail):
    """Return the TransE (L1) score ``gamma - ||head + rel - tail||_1``.

    Args:
        head: Tensor of head-entity embeddings.
        rel: Tensor of relation embeddings.
        tail: Tensor of tail-entity embeddings.

    Returns:
        A tensor with the last dimension reduced away; larger values mean the
        translated head lies closer to the tail.
    """
    translation_error = (head + rel) - tail
    l1_distance = torch.norm(translation_error, p=1, dim=-1)
    return gamma - l1_distance

## Loading Mapping files

加载映射文件, 同时加载实体和关系嵌入.

In [5]:
def load_id_mapping(path, name_field):
    """Read a two-column TSV file and return a ``{name: int(id)}`` dictionary.

    The file has no header row; the first column is read as ``id`` and the
    second column as ``name_field``.

    Args:
        path: Path of the tab-separated mapping file.
        name_field: Label given to the second column (e.g. ``'entity'``).

    Returns:
        dict mapping the second-column string to the first column parsed as int.
    """
    mapping = {}
    with open(path, newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile, delimiter='\t', fieldnames=['id', name_field])
        for row in reader:
            # A local key lookup is used instead of a global named `id`, which
            # would shadow the builtin id() for the rest of the kernel session.
            mapping[row[name_field]] = int(row['id'])
    return mapping


entity2id = load_id_mapping("../../data/drkg/entities.tsv", 'entity')
print(len(entity2id))

rel2id = load_id_mapping("../../data/drkg/relations.tsv", 'relation')
print(len(rel2id))

# Pre-trained TransE_l1 embeddings saved as NumPy arrays by the training step.
node_emb = np.load('../01-model/ckpts/TransE_l1_All_DRKG_0/All_DRKG_TransE_l1_entity.npy')
rel_emb = np.load('../01-model/ckpts/TransE_l1_All_DRKG_0/All_DRKG_TransE_l1_relation.npy')

97238
107


## Loading triplets

加载三元组, 映射成 ID.

In [6]:
# Parse every (head, relation, tail) line of the DRKG triple file and translate
# the three strings to integer ids through entity2id / rel2id.
head_ids, rel_ids, tail_ids = [], [], []
with open("../../data/drkg/drkg.tsv", newline='', encoding='utf-8') as triple_file:
    triple_reader = csv.DictReader(triple_file, delimiter='\t',
                                   fieldnames=['head', 'rel', 'tail'])
    for triple in triple_reader:
        # Resolve all three ids before appending so a failed lookup never
        # leaves the three lists with different lengths.
        h_id, r_id, t_id = (entity2id[triple['head']],
                            rel2id[triple['rel']],
                            entity2id[triple['tail']])
        head_ids.append(h_id)
        rel_ids.append(r_id)
        tail_ids.append(t_id)

# Convert the id lists to NumPy arrays for later tensor conversion.
head_ids, rel_ids, tail_ids = (np.array(ids) for ids in (head_ids, rel_ids, tail_ids))

## Link prediction

指定进行链接预测种子节点的个数.

In [7]:
# Number of seed head nodes used for link prediction.
L = 100
device = torch.device('cpu')
with torch.no_grad():
    # torch.as_tensor accepts both NumPy arrays and tensors, so re-running this
    # cell (when the names already hold tensors) does not emit the
    # copy-construct warning that torch.tensor(tensor) produces.
    node_emb = torch.as_tensor(node_emb).to(device)
    rel_emb = torch.as_tensor(rel_emb).to(device)
    head_ids = torch.as_tensor(head_ids).to(device)
    rel_ids = torch.as_tensor(rel_ids).to(device)
    tail_ids = torch.as_tensor(tail_ids).to(device)

    # The per-triple gathers node_emb[head_ids], rel_emb[rel_ids] and
    # node_emb[tail_ids] were removed: they allocate one embedding row per
    # triple and their results are never read by the cells below.

    # Pick L random triples and use their heads as seeds; a head that occurs in
    # several sampled triples can therefore appear more than once.
    perm = torch.randperm(head_ids.shape[0])
    seeds = head_ids[perm[:L]]
    seed_heads = node_emb[seeds]

预测所选种子节点和所有其他节点之间每个关系类型的得分

In [8]:
# For every relation type, score each seed head against ALL nodes as tails.
# scores[rel] is a list with one 1-D tensor (length = number of nodes) per seed.
flag = True  # print the shapes for the first relation only
scores = {}
num_nodes = node_emb.shape[0]
for rel, rel_id in rel2id.items():
    # expand() returns a (num_nodes, dim) view without copying, unlike repeat(),
    # which materialised a full copy for every relation and every seed.
    rel_embedding = rel_emb[rel_id].expand(num_nodes, -1)

    # Each 1-D seed embedding broadcasts against the (num_nodes, dim) operands,
    # so the element-wise arithmetic is the same as with explicit repetition.
    scores[rel] = [transE_l1(seed_head, rel_embedding, node_emb)
                   for seed_head in seed_heads]
    if flag:
        print(rel_embedding.shape)
        print(len(scores[rel]), scores[rel][0].shape)
        flag = False

torch.Size([97238, 400])
100 torch.Size([97238])


### Top K link prediction

指定得分最高的邻居的数量, 以评估链接预测的重叠.

In [None]:
# Number of highest-scoring tail candidates kept per (relation, seed) pair.
K = 10
top_neighbors = {}
flag = True  # print the sizes for the first relation only
for rel, rel_scores in scores.items():
    ranked = []
    for score in rel_scores:
        # Node indices ordered from best to worst score; keep the first K.
        order = torch.argsort(score, descending=True)
        ranked.append(order[:K])
    top_neighbors[rel] = ranked
    if flag:
        print(len(top_neighbors[rel]), len(top_neighbors[rel][0]))
        flag = False

100 10


## Overlap among predicted neighbors

计算每种关系类型的预测邻居节点的重叠

In [None]:
overlap_of_predicted_neighbors = []
keys = list(scores.keys()) 

for i in range(len(keys)):
    for j in range(i + 1, len(keys)):
        rel_1 = keys[i]
        rel_2 = keys[j]
        neighbors_seed_heads_1 = top_neighbors[rel_1]
        neighbors_seed_heads_2 = top_neighbors[rel_2]
        jacard = 0
        for k in range(len(neighbors_seed_heads_1)):
            neighbors_1 = list(neighbors_seed_heads_1[k].cpu().numpy())
            neighbors_2 = list(neighbors_seed_heads_2[k].cpu().numpy())
            jacard += float(len(set(neighbors_1).intersection(set(neighbors_2)))
                            / len(set(neighbors_1).union(set(neighbors_2))))
        jacard = jacard / len(neighbors_seed_heads_1)
        overlap_of_predicted_neighbors.append([rel_1, rel_2, jacard])

存储排序的重叠结果

In [None]:
# Sort in descending order of overlap: ascending sort first, then flip the list
# in place (this keeps the same tie ordering as slicing with [::-1]).
results = sorted(overlap_of_predicted_neighbors, key=lambda x: float(x[2]))
results.reverse()

# One header line followed by one tab-separated line per relation pair.
results_store = ["relation1\trelation2\tpercentage of overlapping predicted edges\n"]
for result in results:
    results_store.append("{}\t{}\t{}\n".format(result[0], result[1], result[2]))

# K is appended directly to the file stem.
file = "./result/overlapping/TransE_l1_percentage_of_overlapping_predicted_edges_per_edge_pair" + str(K) + ".tsv"

with open(file, 'w+') as f:
    f.writelines(results_store)