In [1]:
# _*_ coding:utf-8 _*_

from sklearn.decomposition import TruncatedSVD
from tqdm import tqdm
from scipy import optimize
from evaluate import evaluate
import tensorflow as tf
import tensorflow.keras.backend as K
from utils import *
import tensorly
import json
import os

# One seed for every random number generator used in this notebook.
seed = 12306
np.random.seed(seed)

#choose the GPU, "-1" represents using the CPU
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
tensorly.set_backend('tensorflow')

# Ask TensorFlow to grow GPU memory on demand for every visible GPU.
gpus = tf.config.experimental.list_physical_devices(device_type="GPU")
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu,True)

# np.random.seed does not control tf.random.normal (used by random_projection),
# so TensorFlow's global seed has to be set as well for reproducible results.
tf.random.set_seed(seed)

#choose the base model and dataset
model = ["Dual_AMN","TransEdge","RSN"][0]
dataset = ["DBP_ZH_EN/","DBP_JA_EN/","DBP_FR_EN/","SRPRS_FR_EN/","SRPRS_DE_EN/"][0]

# TransEdge reads the "sharing/" copy of the datasets, the other models the
# "mapping/" copy; DBP datasets additionally live in a "0_3/" sub-folder.
path = "./EA_datasets/"+ ("sharing/" if model == "TransEdge" else "mapping/") + dataset
if "DBP" in dataset:
    path += "0_3/"

train_pair,test_pair = load_aligned_pair(path,ratio=0.3)

In [2]:
#build the adjacency sparse tensor of KGs and load the initial embeddings
triples = []

# For Dual_AMN every relation id is shifted up by one (True counts as 1, False as 0).
flag = model == "Dual_AMN"
# Both KG files hold tab-separated "head<TAB>relation<TAB>tail" integer rows;
# they are stored here in (head, tail, relation) order.
for triple_file in ("triples_1","triples_2"):
    with open(path + triple_file) as f:
        for line in f:
            h,r,t = map(int,line.strip().split("\t"))
            triples.append([h,t,r+flag])

# Build the final (head, tail, relation) array and load the pre-trained
# entity / relation embeddings of the selected base model.
if model != "TransEdge":
    # np.unique(axis=0) removes duplicate rows and returns them sorted.
    triples = np.array(triples)
    triples = np.unique(triples,axis=0)
    # NOTE(review): node_size is derived from the head column only; an entity that
    # only ever appears as a tail with a larger id would not be counted - confirm.
    node_size,rel_size = np.max(triples[:,0])+1 , np.max(triples[:,2])+1
    # Add the reversed edge of every triple, with relation ids offset by rel_size.
    triples = np.concatenate([triples,[(t,h,r+rel_size) for h,t,r in triples]],axis=0)
    rel_size = rel_size * 2
    
    if model == "RSN":
        emb_path = "Embeddings/RSN/%s"%dataset
        ent_emb = tf.cast(np.load(emb_path + "ent_emb.npy"),"float32")
        rel_emb = tf.cast(np.load(emb_path + "rel_emb.npy"),"float32")
        # Read the id translation tables inside `with` blocks so the file
        # handles are closed instead of being leaked.
        with open(emb_path+"ent_id2id.json") as ent_map_file:
            ent_dic = json.load(ent_map_file)
        with open(emb_path+"rel_id2id.json") as rel_map_file:
            rel_dic = json.load(rel_map_file)
        # Translate every entity / relation id through the tables, which are
        # keyed by the stringified id.
        new_triples,new_test = [],[]
        for h,t,r in triples:
            new_triples.append([int(ent_dic[str(h)]),int(ent_dic[str(t)]),int(rel_dic[str(r)])])
        for a,b in test_pair:
            new_test.append([int(ent_dic[str(a)]),int(ent_dic[str(b)])])
        triples = np.array(new_triples)
        test_pair = np.array(new_test)
    else:
        # Add a self-loop with relation id 0 for every node id below node_size.
        triples = np.concatenate([triples,[(t,t,0) for t in range(node_size)]],axis=0)
        ent_emb = tf.cast(np.load("Embeddings/Dual_AMN/%sent_emb.npy"%dataset),"float32")
        rel_emb = tf.cast(np.load("Embeddings/Dual_AMN/%srel_emb.npy"%dataset),"float32")
        
    triples = np.unique(triples,axis=0)
    
else:
    triples = np.array(triples)
    triples = np.unique(triples,axis=0)
    # NOTE(review): np.max(triples) spans all three columns, relation ids included - confirm this is intended.
    node_size,rel_size = np.max(triples)+1 , np.max(triples[:,2])+1
    # Reversed edges keep the same relation id here (no offset).
    triples = np.concatenate([triples,[(t,h,r) for h,t,r in triples]],axis=0)
    triples = np.unique(triples,axis=0)
    ent_emb = tf.cast(np.load("Embeddings/TransEdge/%sent_embeds.npy"%dataset),"float32")
    rel_emb = tf.cast(np.load("Embeddings/TransEdge/%srel_embeds.npy"%dataset),"float32")


# Collapse the triples into a list of distinct (head, tail) pairs plus, for each
# triple, the index of its pair and its relation id. Equal pairs must be adjacent
# in `triples`; the rows come out of np.unique(axis=0), which returns them sorted.
ent_ent,triples_idx = [],[]
pair_dict,ent_degree = {},{}
last,index = (-1,-1), -1

for h,t,r in triples:
    # number of triples in which h is the head
    ent_degree[h] = ent_degree.get(h,0) + 1

    # a pair different from the previous one opens a new entry in ent_ent
    if last != (h,t):
        index += 1
        last = (h,t)
        ent_ent.append([h,t])

    triples_idx.append([index,r])

ent_ent = np.array(ent_ent)
triples_idx = np.unique(np.array(triples_idx),axis=0)
# weight of each triple: 1 / (number of triples sharing its head entity)
triples_val = np.array([1/ent_degree[ent_ent[pair_id][0]] for pair_id,_ in triples_idx])

2021-11-16 10:50:38.975285: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-11-16 10:50:39.433999: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22320 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:01:00.0, compute capability: 8.6


In [3]:
%%time
# compute H^l_s and H^l_t and solve the assignment problem with the Sinkhorn operator
def random_projection(x,out_dim):
    """Multiply `x` by a freshly sampled random matrix with `out_dim` columns.

    A Gaussian matrix of shape (x.shape[-1], out_dim) is drawn, L2-normalised
    along its last axis, and used as the right operand of a dot product with `x`.

    Args:
        x: tensor whose last axis is projected.
        out_dim: size of the last axis of the result.

    Returns:
        The result of `K.dot(x, projection)`.
    """
    in_dim = x.shape[-1]
    gaussian = tf.random.normal((in_dim,out_dim),mean=0,stddev=(1/out_dim)**0.5)
    projection = K.l2_normalize(gaussian,axis=-1)
    return K.dot(x,projection)

def cal_sims(test_pair,feature):
    """Cosine-similarity matrix between the two sides of `test_pair`.

    Args:
        test_pair: 2-D integer array; column 0 and column 1 hold the row ids
            (into `feature`) of the two entities of each pair.
        feature: 2-D tensor with one feature vector per row.

    Returns:
        A matrix whose entry [i, j] is the dot product of the L2-normalised
        feature of test_pair[i, 0] with that of test_pair[j, 1].
    """
    unit_feature = tf.nn.l2_normalize(feature,axis = -1)
    left_ids,right_ids = test_pair[:,0],test_pair[:,1]
    left = tf.gather(params=unit_feature,indices=left_ids)
    right = tf.gather(params=unit_feature,indices=right_ids)
    return tf.matmul(left,tf.transpose(right,perm=[1,0]))

# First similarity term: the raw entity embeddings of the base model.
sims = cal_sims(test_pair,ent_emb)

reserve_ratio = 0.02
ent_size,rel_size = ent_emb.shape[-1],rel_emb.shape[-1]
# width of each random projection; computed once and reused below
proj_dim = int(ent_size*reserve_ratio)
total_feature_size = proj_dim*rel_size
num_ent = ent_emb.shape[0]

# One propagation per column ("head") of the relation embedding: weight every
# triple by that column's value for its relation, sum the weights of triples that
# share a (head, tail) pair, propagate the entity embeddings through the resulting
# sparse graph and keep a random projection of the result.
features = []
for head in tqdm(range(rel_size)):
    rel_weight = tf.gather(rel_emb[:,head],triples_idx[:,1])
    adj_value = tf.math.segment_sum(triples_val * rel_weight, triples_idx[:,0])
    sparse_graph = tf.SparseTensor(indices=ent_ent,values=adj_value,dense_shape=(num_ent,num_ent))
    feature = tf.sparse.sparse_dense_matmul(sparse_graph,ent_emb)
    features.append(random_projection(feature,proj_dim))
features = tf.concat(features,axis=1)
print(features.shape)
features = tensorly.truncated_svd(features,total_feature_size)[0]
# Second similarity term: the decomposed relation-aware features.
sims += cal_sims(test_pair,features)

depth = 3
# The unweighted graph does not change between iterations, so it is built and
# normalised once. Softmax over all-one values spreads each row's weight
# uniformly over its stored entries.
sparse_graph = tf.SparseTensor(indices=ent_ent,values=tf.ones(ent_ent.shape[0]),dense_shape=(num_ent,num_ent))
sparse_graph = tf.sparse.softmax(sparse_graph)
for i in tqdm(range(depth)):
    features = tf.sparse.sparse_dense_matmul(sparse_graph,features)
    sims += cal_sims(test_pair,features)

# Average of the 2 + depth similarity matrices accumulated above.
sims /= 2+depth

# Sinkhorn normalisation: exponentiate with a temperature, then alternately
# normalise along axis 1 and axis 0 for a fixed number of rounds.
temperature = 0.02
sinkhorn_iters = 15
sims = tf.exp(sims/temperature)
for k in range(sinkhorn_iters):
    sims = sims / tf.reduce_sum(sims,axis=1,keepdims=True)
    sims = sims / tf.reduce_sum(sims,axis=0,keepdims=True)
test(sims,"sinkhorn")

2021-11-16 10:50:43.309175: I tensorflow/stream_executor/cuda/cuda_blas.cc:1760] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
100%|████████████████████████████████████████| 128/128 [00:00<00:00, 195.18it/s]
2021-11-16 10:50:43.969572: I tensorflow/core/util/cuda_solvers.cc:180] Creating CudaSolver handles for stream 0x5642e7253df0


(38960, 1920)


100%|█████████████████████████████████████████████| 3/3 [00:00<00:00, 23.90it/s]


hits@1 : 83.50% hits@10 : 95.14% MRR : 87.96%
CPU times: user 3.22 s, sys: 767 ms, total: 3.99 s
Wall time: 4.03 s


In [4]:
# the results of base model
def csls_sims(test_pair,feature):
    """Evaluate `feature` on `test_pair` through the project's `evaluate` class.

    The features are L2-normalised, the rows of both sides of every test pair
    are gathered, and the two matrices are handed to `evaluate(test_pair).test`.
    Nothing is returned; reporting is presumably done by `evaluate.test` itself
    (verify against the `evaluate` module).

    Args:
        test_pair: 2-D integer array of aligned entity ids (column 0 / column 1).
        feature: 2-D tensor with one feature vector per row.
    """
    scorer = evaluate(test_pair)
    unit_feature = tf.nn.l2_normalize(feature,axis = -1)
    left = tf.gather(unit_feature,test_pair[:,0])
    right = tf.gather(unit_feature,test_pair[:,1])
    scorer.test(left,right)

csls_sims(test_pair=test_pair,feature=ent_emb)

2021-11-16 10:50:46.907276: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Hits@1:  0.8039047619047619   Hits@5:  0.9155238095238095   Hits@10:  0.9368571428571428   MRR:  0.8529688830473474


In [5]:
# the results of the Hungarian algorithm
# Uses only the base-model embedding similarity and solves the one-to-one
# assignment that maximises the total similarity.
sims = cal_sims(test_pair,ent_emb)
result = optimize.linear_sum_assignment(cost_matrix=sims,maximize=True)
test(result,"hungarian")

hits@1 : 79.95%
