In [None]:
# _*_ coding:utf-8 _*_

from tqdm import tqdm
from scipy import optimize
import tensorflow as tf
from utils import *
import json
import os

#device selection: "-1" means run on the CPU; put a GPU index here to use that GPU
os.environ.update({"CUDA_VISIBLE_DEVICES": "-1"})

#seed NumPy's global RNG so the random fallback vectors drawn later are repeatable
#(np is not imported explicitly in this notebook; it arrives through `from utils import *`)
seed = 12345
np.random.seed(seed)

In [None]:
#load the pre-trained word embeddings
#please download the zip file from "http://nlp.stanford.edu/data/glove.6B.zip" and choose "glove.6B.300d.txt" as the word vectors.

#word_vecs maps a token (first field of each line) to a NumPy vector built from the remaining fields
word_vecs = {}
with open("./glove.6B.300d.txt",encoding='UTF-8') as f:
    #iterate over the file object directly instead of f.readlines(), so the whole
    #embedding file is never held in memory at once (tqdm then shows a running
    #count rather than a percentage, because the total is not known up front)
    for raw_line in tqdm(f):
        parts = raw_line.split()
        if not parts:
            #a blank line (e.g. a trailing newline) has no token to index
            continue
        word_vecs[parts[0]] = np.array([float(x) for x in parts[1:]])

In [None]:
#load the translated entity names
#the file is opened in a `with` block so the handle is closed deterministically, and
#decoded explicitly as UTF-8 (like the embedding file) instead of the platform default

with open("translated_ent_name/dbp_ja_en.json","r",encoding="utf-8") as names_file:
    ent_names = json.load(names_file)

#load KGs and test set
#load_triples / load_aligned_pair come from utils; later cells use node_size as the
#number of rows of the feature matrices and index test_pair by columns 0 and 1

file_path = "KGs/dbp_ja_en/"
all_triples,node_size,rel_size = load_triples(file_path,True)
train_pair,test_pair = load_aligned_pair(file_path,ratio=0)

In [None]:
#generate the bigram dictionary
#d maps every lower-cased character bigram seen in the entity names to a dense id,
#assigned in first-seen order starting from 0; count ends up as the number of bigrams

d = {}
for _,name in ent_names:
    #each element of `name` is treated as one word -- presumably a list of tokens; TODO confirm
    for word in name:
        lowered = word.lower()
        for start in range(len(lowered)-1):
            #setdefault only inserts when the bigram is new, so len(d) is the next free id
            d.setdefault(lowered[start:start+2],len(d))
count = len(d)

In [None]:
#generate the word-level features and char-level features
#ent_vec[i]  : mean of the embeddings of the entity's words found in word_vecs
#char_vec[i] : counts of the entity's character bigrams, indexed through d
#both rows are scaled to unit L2 norm at the end of each iteration

ent_vec = np.zeros((node_size,300))
char_vec = np.zeros((node_size,len(d)))
for ent_id,name in ent_names:
    hits = 0
    for token in name:
        token = token.lower()
        if token in word_vecs:
            ent_vec[ent_id] += word_vecs[token]
            hits += 1
        #adjacent character pairs are exactly the length-2 slices of the token
        for left,right in zip(token,token[1:]):
            char_vec[ent_id,d[left+right]] += 1

    if hits == 0:
        #no word had an embedding: fall back to a random vector in [-0.5, 0.5)
        ent_vec[ent_id] = np.random.random(300)-0.5
    else:
        ent_vec[ent_id] /= hits

    if char_vec[ent_id].sum() == 0:
        #no bigram was counted: same random fallback for the char-level row
        char_vec[ent_id] = np.random.random(len(d))-0.5

    ent_vec[ent_id] /= np.linalg.norm(ent_vec[ent_id])
    char_vec[ent_id] /= np.linalg.norm(char_vec[ent_id])

In [None]:
#build the relational adjacency matrix
#every node gets a self-loop weighted log(#triples / #nodes); every triple (h, r, t)
#adds an entry at (h, t) weighted log(#triples / #triples using relation r), so
#rarer relations weigh more

#dr[r] = number of triples that use relation r
dr = {}
for _h,r,_t in all_triples:
    dr[r] = dr.get(r,0) + 1

num_triples = len(all_triples)

#keep integer indices and float weights in separate arrays: mixing them in a single
#np.array (as a list of [row, col, weight]) turns the indices into floats, while
#tf.SparseTensor documents its indices as an int64 tensor
rows = list(range(node_size))
cols = list(range(node_size))
weights = [np.log(num_triples/node_size)]*node_size
for h,r,t in all_triples:
    rows.append(h)
    cols.append(t)
    weights.append(np.log(num_triples/dr[r]))

rel_indices = np.array([rows,cols],dtype=np.int64).T
rel_values = np.array(weights,dtype=np.float64)

#stable sort by row index: entries of one row keep their insertion order
#(self-loop first, then triples in input order)
row_order = np.argsort(rel_indices[:,0],kind="stable")
sparse_rel_matrix = tf.SparseTensor(indices=rel_indices[row_order],values=rel_values[row_order],dense_shape=(node_size,node_size))

In [None]:
#feature selection
#mode picks which entity representation is propagated:
#  "word-level"   -> averaged word embeddings (ent_vec)
#  "char-level"   -> character-bigram counts (char_vec)
#  "hybrid-level" -> both, concatenated along the feature axis

mode = "hybrid-level"

if mode == "word-level":
    feature = ent_vec
elif mode == "char-level":
    feature = char_vec
elif mode == "hybrid-level":
    feature = np.concatenate([ent_vec,char_vec],-1)
else:
    #fail loudly: otherwise a typo in `mode` would silently reuse whatever `feature`
    #is left in the kernel from an earlier run (or raise an unrelated NameError)
    raise ValueError("unknown mode %r; expected 'word-level', 'char-level' or 'hybrid-level'" % (mode,))
feature = tf.nn.l2_normalize(feature,axis=-1)

In [None]:
%%time
#choose the graph depth L and feature propagation

depth = 2
def cal_sims(test_pair,feature):
    """Return the pairwise inner products between the two sides of test_pair.

    Rows of `feature` are gathered with column 0 of `test_pair` (left side) and
    with column 1 (right side); entry (a, b) of the result is the dot product of
    left row a and right row b. The features passed in this cell are
    L2-normalised beforehand, so the values are cosine similarities.
    """
    feature_a = tf.gather(indices=test_pair[:,0],params=feature)
    feature_b = tf.gather(indices=test_pair[:,1],params=feature)
    return tf.matmul(feature_a,tf.transpose(feature_b,[1,0]))

#propagate through a separate name so that `feature` keeps the un-propagated input:
#overwriting `feature` here would make every re-run of this cell add another
#`depth` hops on top of the previous run's output
propagated = feature
sims = cal_sims(test_pair,propagated)
for _hop in range(depth):
    propagated = tf.sparse.sparse_dense_matmul(sparse_rel_matrix,propagated)
    propagated = tf.nn.l2_normalize(propagated,axis=-1)
    sims += cal_sims(test_pair,propagated)
#average over the depth+1 similarity matrices (hop 0 .. hop depth)
sims /= depth+1

In [None]:
%%time
#solving by Hungarian algorithm, only for the CPU
#linear_sum_assignment returns the matched (row indices, column indices); maximize=True
#because sims holds similarities rather than costs
row_ind,col_ind = optimize.linear_sum_assignment(sims,maximize=True)
result = (row_ind,col_ind)
test(result,"hungarian")

In [None]:
%%time
#solving by Sinkhorn operation
#exponentiate the scaled similarities, then alternately normalise rows and columns

SINKHORN_SCALE = 50   #multiplier applied to sims before tf.exp
SINKHORN_ITERS = 10   #number of row/column normalisation rounds

#work on a separate tensor so `sims` keeps the raw similarities: overwriting it
#would make a re-run of this cell (or of the Hungarian cell) operate on already
#exponentiated and normalised values
sinkhorn_sims = tf.exp(sims*SINKHORN_SCALE)
for _step in range(SINKHORN_ITERS):
    sinkhorn_sims = sinkhorn_sims / tf.reduce_sum(sinkhorn_sims,axis=1,keepdims=True)
    sinkhorn_sims = sinkhorn_sims / tf.reduce_sum(sinkhorn_sims,axis=0,keepdims=True)
test(sinkhorn_sims,"sinkhorn")