In [1]:
#choose the GPU, "-1" represents using the CPU

import os 
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# import all the requirements
import faiss 
from utils import *
from tqdm import tqdm
import tensorflow as tf
import tensorflow.keras.backend as K

# Enumerate the GPUs made visible through CUDA_VISIBLE_DEVICES above and enable
# memory growth on each one, so TensorFlow allocates GPU memory on demand
# instead of reserving it all up front.
gpus = tf.config.experimental.list_physical_devices(device_type="GPU")
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu,True)

In [None]:
# choose the dataset and set the random seed
# the first run may be slow because the graph needs to be preprocessed into binary cache

# Seed every random number generator the notebook draws from. The feature
# extraction samples with tf.random.normal, so seeding NumPy alone would not
# make the results repeatable across "Restart & Run All".
SEED = 12306
np.random.seed(SEED)
tf.random.set_seed(SEED)

# change the trailing index to pick another dataset
dataset = ["DBP_ZH_EN/","DBP_JA_EN/","DBP_FR_EN/","SRPRS_FR_EN/","SRPRS_DE_EN/","DBP_WD/","DBP_YG/"][2]
path = "./EA_datasets/"+ dataset

# set hyper-parameters, load graphs and pre-aligned entity pairs
# if your GPU is out of memory, try to reduce the ent_dim

ent_dim, depth, top_k = 1024, 2, 500
mini_dim = 16
# datasets whose name contains "EN" use a larger relation dimension
rel_dim = ent_dim//2 if "EN" in dataset else ent_dim//3

node_size, rel_size, ent_tuple, triples_idx, ent_ent, ent_ent_val, rel_ent, ent_rel = load_graph(path)

train_pair,test_pair = load_aligned_pair(path,ratio=0.30)
# entities on each side of the test pairs; the iterative strategy removes
# entries from these sets as it generates pseudo-aligned pairs
candidates_x = {x for x,_ in test_pair}
candidates_y = {y for _,y in test_pair}

In [7]:
%%time

# main functions of LightEA

def random_projection(x,out_dim):
    """Project ``x`` onto ``out_dim`` dimensions with a fresh random matrix.

    A Gaussian matrix of shape ``(x.shape[-1], out_dim)`` is sampled on every
    call, each of its rows is scaled to unit L2 norm, and ``x`` is multiplied
    by it.

    Args:
        x: tensor whose last dimension is the one being reduced.
        out_dim: size of the last dimension of the result.

    Returns:
        The product of ``x`` and the normalized random matrix.
    """
    in_dim = x.shape[-1]
    gaussian = tf.random.normal((in_dim,out_dim))
    projection = K.l2_normalize(gaussian,axis=-1)
    return K.dot(x,projection)

def batch_sparse_matmul(sparse_tensor,dense_tensor,batch_size = 128,save_mem = False):
    """Multiply a sparse matrix by a dense one, ``batch_size`` columns at a time.

    The dense operand is sliced along its last axis, each slice is multiplied
    with ``tf.sparse.sparse_dense_matmul`` and the partial products are
    concatenated back along the last axis.

    Args:
        sparse_tensor: left operand, a ``tf.SparseTensor``.
        dense_tensor: right operand; sliced as ``dense_tensor[:, a:b]``.
        batch_size: number of columns of ``dense_tensor`` handled per matmul.
        save_mem: when true, every partial product is converted with
            ``.numpy()`` as soon as it is computed and the result is a NumPy
            array; otherwise the result is a tensor.

    Returns:
        The full product, as a NumPy array if ``save_mem`` else a tensor.
    """
    width = dense_tensor.shape[-1]
    # Step over the real column offsets. The previous ``width//batch_size + 1``
    # loop issued an extra matmul on an empty slice whenever ``width`` was an
    # exact multiple of ``batch_size``. A zero-width input still gets a single
    # call so the (empty) result keeps its shape.
    starts = range(0,width,batch_size) if width else [0]

    results = []
    for start in starts:
        temp_result = tf.sparse.sparse_dense_matmul(sparse_tensor,dense_tensor[:, start:start+batch_size])
        if save_mem:
            temp_result = temp_result.numpy()
        results.append(temp_result)

    if save_mem:
        return np.concatenate(results,-1)
    return K.concatenate(results,-1)

def get_features(train_pair,extra_feature = None):
    """Build entity features by propagating seed labels through the graphs.

    Reads the notebook globals set by the configuration cell: ``node_size``,
    ``rel_size``, ``ent_tuple``, ``triples_idx``, ``ent_ent``, ``ent_ent_val``,
    ``rel_ent``, ``ent_rel``, ``ent_dim``, ``rel_dim``, ``mini_dim`` and
    ``depth``.

    Args:
        train_pair: integer array of aligned entity pairs. It is flattened with
            ``reshape((-1, 1))`` and used as row indices, so both entities of a
            pair are seeded with the same random unit vector.
        extra_feature: optional initial entity features used instead of the
            random seed labels (presumably one row per entity -- confirm
            against ``load_name_features``). When given, the propagated entity
            features are prepended to the returned matrix.

    Returns:
        A NumPy array with one row per entity (``node_size`` rows). The
        relation-aware part is L2-normalized in place by
        ``faiss.normalize_L2``.
    """
    if extra_feature is not None:
        ent_feature = extra_feature
    else:
        # one random unit vector per aligned pair, written to both of its entities
        random_vec = K.l2_normalize(tf.random.normal((len(train_pair),ent_dim)),axis=-1)
        ent_feature = tf.tensor_scatter_nd_update(tf.zeros((node_size,ent_dim)),train_pair.reshape((-1,1)),tf.repeat(random_vec,2,axis=0))
    rel_feature = tf.zeros((rel_size,ent_feature.shape[-1]))

    ent_ent_graph = tf.SparseTensor(indices=ent_ent,values=ent_ent_val,dense_shape=(node_size,node_size))
    rel_ent_graph = tf.SparseTensor(indices=rel_ent,values=K.ones(rel_ent.shape[0]),dense_shape=(rel_size,node_size))
    ent_rel_graph = tf.SparseTensor(indices=ent_rel,values=K.ones(ent_rel.shape[0]),dense_shape=(node_size,rel_size))

    # Propagate for `depth` layers (previously hard-coded to 2, which ignored
    # the `depth` hyper-parameter). Each layer is computed from the previous
    # layer's entity AND relation features, so both are swapped in together.
    ent_list,rel_list = [ent_feature],[rel_feature]
    for _ in range(depth):
        new_rel_feature = batch_sparse_matmul(rel_ent_graph,ent_feature)
        new_rel_feature = tf.nn.l2_normalize(new_rel_feature,axis=-1)

        new_ent_feature = batch_sparse_matmul(ent_ent_graph,ent_feature)
        new_ent_feature += batch_sparse_matmul(ent_rel_graph,rel_feature)
        new_ent_feature = tf.nn.l2_normalize(new_ent_feature,axis=-1)

        ent_feature,rel_feature = new_ent_feature,new_rel_feature
        ent_list.append(ent_feature); rel_list.append(rel_feature)

    # concatenate the features of all layers, then compress the relation side
    ent_feature = K.l2_normalize(K.concatenate(ent_list,1),-1)
    rel_feature = K.l2_normalize(K.concatenate(rel_list,1),-1)
    rel_feature = random_projection(rel_feature,rel_dim)

    # Number of relation heads whose outputs are concatenated on the device
    # before being moved to NumPy. Clamped to 1 so narrow `extra_feature`
    # inputs (fewer than `mini_dim` columns) do not produce a zero step.
    heads_per_batch = max(1,ent_feature.shape[-1]//mini_dim)
    # NOTE(review): the row count is taken as the max over *all* of triples_idx;
    # this assumes the largest value is a row index -- confirm against load_graph.
    triple_rel_graph = tf.SparseTensor(indices=triples_idx,values=K.ones(triples_idx.shape[0]),dense_shape=(np.max(triples_idx)+1,rel_size))
    adj_value = batch_sparse_matmul(triple_rel_graph,rel_feature)

    features_list = []
    for start in range(0,rel_dim,heads_per_batch):
        temp_list = []
        for head in range(start,min(start+heads_per_batch,rel_dim)):
            # column `head` of adj_value weights the entity-entity edges;
            # a new random projection of the entity features is drawn per head
            head_graph = tf.SparseTensor(indices=ent_tuple,values=adj_value[:,head],dense_shape=(node_size,node_size))
            temp_list.append(batch_sparse_matmul(head_graph,random_projection(ent_feature,mini_dim)))
        features_list.append(K.concatenate(temp_list,-1).numpy())
    features = np.concatenate(features_list,axis=-1)

    faiss.normalize_L2(features)
    if extra_feature is not None:
        features = np.concatenate([ent_feature,features],axis=-1)
    return features

CPU times: user 19 µs, sys: 0 ns, total: 19 µs
Wall time: 37.4 µs


In [8]:
# obtain the literal features of entities; this only works on DBP15K & SRPRS
# for the first run, you need to download the pre-trained word embeddings from "http://nlp.stanford.edu/data/glove.6B.zip"
# unzip this file and put "glove.6B.300d.txt" into the root of LightEA

# Literal (name) features are opt-in and are only computed for datasets whose
# name contains "EN".
using_name_features = False
if using_name_features:
    if "EN" in dataset:
        name_features = load_name_features(dataset,"./glove.6B.300d.txt",mode = "hybrid-level")
        l_features = get_features(train_pair,extra_feature = name_features)

In [9]:
%%time 

# Obtain the structural features and iteratively generate semi-supervised data
# "epochs = 1" disables the iterative strategy

# NOTE: this cell extends `train_pair` and shrinks `candidates_x` / `candidates_y`
# in place, so re-running it without re-running the configuration cell starts
# from the already-augmented state.
epochs = 3
for epoch in range(epochs):
    print("Round %d start:"%(epoch+1))
    s_features = get_features(train_pair)
    if using_name_features and "EN" in dataset:
        features = np.concatenate([s_features,l_features],-1)
    else:
        features = s_features

    no_new_pairs = False
    if epoch < epochs-1:
        # Snapshot the candidate sets once: row i of `sims`/`index` corresponds
        # to left[i], and the values of `index` are positions in `right`.
        left,right = list(candidates_x),list(candidates_y)
        index,sims = sparse_sinkhorn_sims(left,right,features,top_k)
        ranks = tf.argsort(-sims,-1).numpy()
        sims = sims.numpy(); index = index.numpy()

        # keep the best match of every left entity when its similarity exceeds 0.5
        temp_pair = []
        for i in range(ranks.shape[0]):
            best = ranks[i,0]
            if sims[i,best] > 0.5:
                temp_pair.append((left[i],right[index[i,best]]))

        # matched entities are no longer candidates in later rounds
        for x,y in temp_pair:
            candidates_x.discard(x)
            candidates_y.discard(y)

        print("new generated pairs = %d"%(len(temp_pair)))
        print("rest pairs = %d"%(len(candidates_x)))

        if temp_pair:
            train_pair = np.concatenate([train_pair,np.array(temp_pair)])
        else:
            no_new_pairs = True

    # Always evaluate the features of this round. Previously an early stop
    # (no new pairs) jumped out before `test`, leaving the final features
    # unevaluated and `right_list` / `wrong_list` possibly undefined.
    right_list, wrong_list = test(test_pair,features,top_k)
    if no_new_pairs:
        break

Round 1 start:
new generated pairs = 7092
rest pairs = 3408
Hits@1: 0.754 Hits@10: 0.903 MRR: 0.808

Round 2 start:
new generated pairs = 1056
rest pairs = 2352
Hits@1: 0.806 Hits@10: 0.915 MRR: 0.847

Round 3 start:
Hits@1: 0.806 Hits@10: 0.912 MRR: 0.846

CPU times: user 1min 36s, sys: 6.34 s, total: 1min 42s
Wall time: 11.4 s
