In [2]:
import easydict
 
# Experiment configuration read by the cells below through attribute access
# (e.g. args.data_path, args.lr, args.epochs).
args = easydict.EasyDict(
    data_path = "./Dataset/Total",
    emb_path = "./Result_for_embedding/Total/Loss_1.2758",
    result_path = "./Result_for_HinSage/Total",
    weight_toggle = False,
    num_folds = 5,
    num_samples = [4, 2],
    layer_sizes = [16, 8],
    batch_size = 1000,
    epochs = 100,
    lr = 0.001,
    drop_out = 0.4,
    num_workers = -1,
)

from stellargraph import StellarGraph
from stellargraph.data import EdgeSplitter
from stellargraph.mapper import HinSAGELinkGenerator
from stellargraph.layer import HinSAGE, link_classification
from utils import *

from tensorflow.keras import Model, optimizers, losses, metrics




In [21]:
import tensorflow as tf
import tensorflow_addons as tfa
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers

In [5]:
# Load the two id mappings and the embedding pickle from the configured paths.
wiki2csv = load_pickle(os.path.join(args.data_path, "wiki2csv"))
csv2wiki = load_pickle(os.path.join(args.data_path, "csv2wiki"))
embedding = load_pickle(os.path.join(args.emb_path, "Embedding"))

# Number of entries in each mapping (presumably one key per node - confirm).
num_csv, num_wiki = len(csv2wiki), len(wiki2csv)
print(f"num_csv = {num_csv}, num_wiki = {num_wiki}")

# Build the graph and obtain the train/test graphs with their edge ids and labels.
graph = make_graph(csv2wiki, embedding, num_csv, args)
(
    G_train, edge_ids_train, edge_labels_train,
    G_test, edge_ids_test, edge_labels_test,
) = train_test_graph_split(graph)

# Order the head node types after the first id of the first training edge:
# "csv" first when that id contains "csv", otherwise "wiki" first.
head_node_types = ["csv", "wiki"] if "csv" in edge_ids_train[0][0] else ["wiki", "csv"]

train_gen = make_generate(G_train, head_node_types, args)
train_flow = train_gen.flow(edge_ids_train, edge_labels_train, shuffle = True)


num_csv = 77, num_wiki = 2512
edge_df.shape = (8120, 2)

[After applying drop_duplicate]
edge_df.shape = (7237, 2)

[After applying rename_for_value]
      csv       wiki
0  csv-26  wiki-2327
1  csv-26  wiki-1312
2  csv-26  wiki-1278
3  csv-26  wiki-1241
4  csv-26   wiki-913
node_csv =             0         1         2         3         4         5         6    \
csv-0 -0.240582 -0.096518 -0.204321 -0.057264  0.130768 -0.063019  0.180385   
csv-1  0.033454  0.065465  0.428110 -0.113888  0.536284 -0.092917  0.170378   
csv-2 -0.210087 -0.152295  0.210778 -0.025063  0.449653  0.291625  0.071882   
csv-3 -0.266294  0.060738 -0.420196  0.443523  0.224524 -0.249679  0.490691   
csv-4  0.253309  0.038146  0.021372  0.101454 -0.286800  0.230597 -0.146937   

            7         8         9    ...       118       119       120  \
csv-0  0.001783  0.212850  0.051950  ... -0.153015 -0.027282 -0.037565   
csv-1  0.053746  0.079655  0.144963  ... -0.077511 -0.286580 -0.161355   
csv-2  0.155274 

# model

In [14]:
# HinSAGE encoder built on top of the training link generator.
# NOTE(review): layer_sizes is hard-coded to [32, 8] while args.layer_sizes is
# [16, 8]; the literal is kept so the model architecture is unchanged -
# confirm which of the two is intended.
hinsage = HinSAGE(
                  layer_sizes = [32, 8],
                  generator = train_gen,
                  bias = True,
                  dropout = args.drop_out  # read from the config instead of repeating the 0.4 literal
                  )

# x_out is later unpacked as a pair (f1, f2) by get_loss, i.e. one embedding
# tensor per end of a link.
x_inp, x_out = hinsage.in_out_tensors()
score_prediction = link_classification(edge_embedding_method = "ip")(x_out) # ip = inner product

# The model exposes both the node embeddings and the link score as outputs.
model = Model(inputs = x_inp,
              outputs = [x_out, score_prediction])


link_classification: using 'ip' method to combine node embeddings into edge embeddings


In [15]:
def get_loss(x_out, train_flow, temperature = 1):
    """Contrastive softmax loss between the two sets of node embeddings.

    Rows of the two embedding matrices in ``x_out`` are grouped by the node
    ids found in ``train_flow.ids`` and averaged, giving one embedding per
    distinct node. Every first-side ("csv") embedding is scored against every
    second-side ("wiki") embedding by inner product, the scores of each csv
    node are turned into log-probabilities over all wiki nodes, and the loss
    is the negative log-probability of the wiki nodes actually paired with
    that csv node, averaged first over its pairs and then over csv nodes.

    Args:
        x_out: pair ``(f1, f2)`` of indexable embedding matrices; row ``i`` of
            ``f1`` / ``f2`` is attributed to the first / second id of
            ``train_flow.ids[i]``.
            NOTE(review): this assumes the rows are in the same order as
            ``train_flow.ids`` and that the first id of every pair is the csv
            node - confirm against how ``train_flow`` is built and shuffled.
        train_flow: object exposing ``ids``, an iterable of ``(csv_id, wiki_id)``
            pairs with hashable ids.
        temperature: positive scalar dividing the similarity scores before the
            softmax. The default of 1 leaves the scores unscaled.

    Returns:
        Scalar tensor holding the loss.

    Raises:
        ValueError: if ``temperature`` is a non-positive number or
            ``train_flow.ids`` is empty.
    """
    if isinstance(temperature, (int, float)) and temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")

    f1, f2 = x_out

    # Plain dicts keep first-seen order, which fixes the row/column order of
    # every matrix built below.
    csv_rows = {}
    wiki_rows = {}
    csv_wiki_id_matching = []

    for idx, (csv_id, wiki_id) in enumerate(train_flow.ids):
        csv_rows.setdefault(csv_id, []).append(f1[idx])
        wiki_rows.setdefault(wiki_id, []).append(f2[idx])
        csv_wiki_id_matching.append((csv_id, wiki_id))

    if not csv_wiki_id_matching:
        raise ValueError("train_flow.ids is empty; cannot compute the contrastive loss")

    csv_position = {csv_id: row for row, csv_id in enumerate(csv_rows)}
    wiki_position = {wiki_id: col for col, wiki_id in enumerate(wiki_rows)}

    # Binary mask: entry (row, col) is 1 when that csv/wiki pair occurs in ids.
    csv_wiki_map = [[0.0] * len(wiki_rows) for _ in range(len(csv_rows))]
    for csv_id, wiki_id in csv_wiki_id_matching:
        csv_wiki_map[csv_position[csv_id]][wiki_position[wiki_id]] = 1.0

    # One embedding per distinct node: the mean of all rows attributed to it.
    csv_embeddings = tf.convert_to_tensor(
        [tf.math.reduce_mean(tf.convert_to_tensor(rows), axis = 0) for rows in csv_rows.values()]
    )
    wiki_embeddings = tf.convert_to_tensor(
        [tf.math.reduce_mean(tf.convert_to_tensor(rows), axis = 0) for rows in wiki_rows.values()]
    )

    logits = tf.matmul(csv_embeddings, tf.transpose(wiki_embeddings)) / temperature

    # log_softmax equals log(exp(x) / sum(exp(x))) but does not overflow for
    # large scores.
    log_probs = tf.nn.log_softmax(logits, axis = 1)

    csv_wiki_map = tf.convert_to_tensor(csv_wiki_map, dtype = log_probs.dtype)

    # Every csv row has at least one positive (it only exists because a pair
    # mentioned it), so the division below cannot hit zero.
    positives_per_csv = tf.reduce_sum(csv_wiki_map, axis = 1)
    positive_log_prob = tf.reduce_sum(tf.math.multiply(log_probs, csv_wiki_map), axis = 1)
    return -tf.reduce_mean(positive_log_prob / positives_per_csv)


In [None]:
class ContrastiveLoss(keras.losses.Loss):
    """Callable that evaluates ``get_loss`` for a fixed ``train_flow`` and temperature.

    NOTE(review): ``__call__`` takes the embedding pair ``x_out`` directly
    rather than the ``(y_true, y_pred)`` arguments Keras passes to a loss, so
    an instance is meant to be invoked by hand (e.g. from a custom training
    loop) - confirm before handing one to ``model.compile``.

    Args:
        train_flow: object forwarded unchanged to ``get_loss``.
        temperature: positive scalar dividing the similarity scores.
        name: optional name forwarded to ``keras.losses.Loss``.

    Raises:
        ValueError: if ``temperature`` is not positive.
    """

    def __init__(self, train_flow, temperature = 1, name = None):
        super().__init__(name = name)
        if temperature <= 0:
            raise ValueError(f"temperature must be positive, got {temperature}")
        self.train_flow = train_flow
        self.temperature = temperature

    def __call__(self, x_out):
        """Return the contrastive loss for the embedding pair ``x_out``."""
        f1, f2 = x_out
        # get_loss(x_out, train_flow) scores pairs by inner products of
        # averaged embeddings; both steps are linear, so dividing one side by
        # the temperature divides every score by it.
        return get_loss((f1 / self.temperature, f2), self.train_flow)
    
    
    
    

In [16]:
# Run the model over train_flow. The result mirrors the model's outputs: the
# pair of node-embedding arrays (x_out) followed by the link scores.
# NOTE(review): as the last expression of the cell this displays the full
# arrays; consider binding the result and inspecting shapes instead.
model.predict(train_flow)

[[array([[-0.28893813,  0.5966094 , -0.0124186 , ..., -0.27756092,
           0.2731821 , -0.01758261],
         [-0.54426485,  0.11575687,  0.6242469 , ..., -0.00588385,
           0.05755868,  0.5390443 ],
         [-0.36878133,  0.00283373,  0.46419984, ..., -0.6029627 ,
          -0.02978542,  0.20677583],
         ...,
         [-0.87653315,  0.19256586,  0.21630451, ..., -0.0430028 ,
          -0.03459273, -0.05998769],
         [-0.6499131 , -0.13803956,  0.20086503, ..., -0.19584562,
           0.3588418 ,  0.2059231 ],
         [ 0.22625199,  0.29241568,  0.7084212 , ..., -0.35155287,
           0.28053364,  0.11138066]], dtype=float32),
  array([[ 0.25375697, -0.32260874,  0.53871274, ...,  0.20019461,
          -0.6692845 ,  0.22496323],
         [ 0.03965148, -0.21042545,  0.49646878, ...,  0.5866819 ,
           0.13504139,  0.30877545],
         [ 0.37531194,  0.3689295 ,  0.3989492 , ...,  0.14470008,
           0.01831088,  0.17906035],
         ...,
         [ 0.219153

In [20]:
# NOTE(review): this cell fails as written. get_loss is *called* here with a
# single argument instead of being passed as a callable, but it requires both
# x_out and train_flow, hence the TypeError recorded in the output below.
# NOTE(review): the dict key 'x_out' presumably has to match the name of one
# of the model's output layers - verify against model.output_names.
# NOTE(review): get_loss depends on train_flow.ids rather than on per-batch
# (y_true, y_pred) values, so it likely needs a custom training loop instead
# of compile/fit - confirm the intended training procedure.
model.compile(
    optimizer = optimizers.Adam(learning_rate = args.lr),
    loss = {'x_out': get_loss(train_flow)}
)

TypeError: get_loss() missing 2 required positional arguments: 'f2' and 'train_flow'

In [18]:
# Train on the link generator for args.epochs epochs.
# NOTE(review): this cell carries execution count 18 while the compile cell
# above carries 20 and raised a TypeError, so it was run out of order; on a
# fresh kernel it depends on the compile cell succeeding first.
model.fit(train_flow, epochs = args.epochs, shuffle = True)

0.001