In [154]:
import easydict
 
# Experiment configuration; later cells read these as attributes (args.data_path, args.emb_path, ...).
args = easydict.EasyDict({
        "data_path" : "./Dataset/Total",  # directory of the wiki2csv / csv2wiki pickles loaded below
        "emb_path" : "./Result_for_embedding/Total/Loss_1.2758",  # directory of the "Embedding" pickle loaded below
        "result_path" : "./Result_for_HinSage/Total",  # not read in the visible cells
        "weight_toggle" : False,
        "num_folds" : 5,
        # The entries below are not read directly in the visible cells; presumably they are
        # consumed inside make_graph / make_generate, which receive `args` — TODO confirm.
        "num_samples" : [4,2],
        # NOTE(review): the HinSAGE cell hard-codes layer_sizes = [32,8] rather than reading this value.
        "layer_sizes" : [16,8],
        "batch_size" : 1000,
        "epochs" : 100,
        "lr" : 0.001,
        # NOTE(review): the HinSAGE cell hard-codes dropout = 0.4 rather than reading this value.
        "drop_out" : 0.4,
        "num_workers" : -1
    })
    

In [155]:
from stellargraph import StellarGraph
from stellargraph.data import EdgeSplitter
from stellargraph.mapper import HinSAGELinkGenerator
from stellargraph.layer import HinSAGE, link_classification
from utils import *

from tensorflow.keras import Model, optimizers, losses, metrics


In [156]:
# Load the csv<->wiki mappings and the precomputed embedding, then report the mapping sizes.
# NOTE(review): `os` and `load_pickle` are not imported explicitly in the visible cells;
# presumably they arrive through `from utils import *` — TODO confirm.
# NOTE(review): if load_pickle wraps pickle.load, only point it at trusted files
# (unpickling untrusted data can execute arbitrary code).
wiki2csv = load_pickle(os.path.join(args.data_path,"wiki2csv"))
csv2wiki = load_pickle(os.path.join(args.data_path,"csv2wiki"))
embedding = load_pickle(os.path.join(args.emb_path,"Embedding"))
# Number of keys in each mapping.
num_csv = len(csv2wiki.keys())
num_wiki = len(wiki2csv.keys())
print(f"num_csv = {num_csv}, num_wiki = {num_wiki}")

num_csv = 77, num_wiki = 2512


In [157]:
# Build the graph from the csv->wiki mapping and the embedding, then split it into
# train/test graphs with their edge ids and edge labels.
# make_graph / train_test_graph_split are not defined in the visible cells (presumably from utils).
graph = make_graph(csv2wiki, embedding, num_csv, args)
G_train, edge_ids_train, edge_labels_train, G_test, edge_ids_test, edge_labels_test = train_test_graph_split(graph)

edge_df.shape = (8120, 2)

[After applying drop_duplicate]
edge_df.shape = (7237, 2)

[After applying rename_for_value]
      csv       wiki
0  csv-26  wiki-2327
1  csv-26  wiki-1312
2  csv-26  wiki-1278
3  csv-26  wiki-1241
4  csv-26   wiki-913
node_csv =             0         1         2         3         4         5         6    \
csv-0 -0.240582 -0.096518 -0.204321 -0.057264  0.130768 -0.063019  0.180385   
csv-1  0.033454  0.065465  0.428110 -0.113888  0.536284 -0.092917  0.170378   
csv-2 -0.210087 -0.152295  0.210778 -0.025063  0.449653  0.291625  0.071882   
csv-3 -0.266294  0.060738 -0.420196  0.443523  0.224524 -0.249679  0.490691   
csv-4  0.253309  0.038146  0.021372  0.101454 -0.286800  0.230597 -0.146937   

            7         8         9    ...       118       119       120  \
csv-0  0.001783  0.212850  0.051950  ... -0.153015 -0.027282 -0.037565   
csv-1  0.053746  0.079655  0.144963  ... -0.077511 -0.286580 -0.161355   
csv-2  0.155274  0.048890 -0.120483  ... -0.16

In [158]:
# Order the head node types to match the first training edge: ("csv", "wiki") when the
# edge's first endpoint id contains "csv", otherwise ("wiki", "csv").
head_node_types = (
    ["csv", "wiki"]
    if edge_ids_train[0][0].find("csv") >= 0
    else ["wiki","csv"]
)

In [159]:
# Build the link generator for the training graph (make_generate is not defined in the visible cells).
train_gen = make_generate(G_train, head_node_types, args)

In [160]:
# Create the (shuffled) training flow over the training edges and peek at its first batch.
train_flow = train_gen.flow(edge_ids_train, edge_labels_train, shuffle = True)
next(iter(train_flow))

([array([[[ 0.67447007, -0.06583045,  0.2126683 , ..., -0.27408352,
            0.00780342,  0.0046704 ]],
  
         [[ 0.79992306, -0.43949798,  0.3929974 , ..., -0.28042975,
           -0.17354643,  0.01253277]],
  
         [[ 0.10230416, -0.0596146 ,  0.47902945, ..., -0.19324736,
           -0.03901784,  0.35872605]],
  
         ...,
  
         [[ 0.07648198, -0.36519095,  0.12458345, ..., -0.45250326,
            0.10212797, -0.07068767]],
  
         [[ 0.79992306, -0.43949798,  0.3929974 , ..., -0.28042975,
           -0.17354643,  0.01253277]],
  
         [[ 0.0334538 ,  0.06546532,  0.42810956, ..., -0.1693636 ,
            0.33214802, -0.05017089]]], dtype=float32),
  array([[[-0.29129902, -0.55538267,  0.13675027, ..., -0.1997944 ,
            0.46844715, -0.00972283]],
  
         [[ 0.00393434, -0.13329993,  0.11578717, ..., -0.14251943,
            0.09655527,  0.19859964]],
  
         [[-0.14415954, -0.33194423,  0.36723498, ...,  0.1442004 ,
            0.2529240

In [161]:
# Inspect how many batch dimensions the generator reports.
train_gen.num_batch_dims()

1

In [162]:
# Build the HinSAGE encoder on top of the training generator and wrap its raw outputs in a Keras model.
# NOTE(review): layer_sizes and dropout are hard-coded here; args.layer_sizes ([16,8]) and
# args.drop_out (0.4) from the config cell are not used.
hinsage = HinSAGE(
                  layer_sizes = [32,8],
                  generator = train_gen,
                  bias = True,
                  dropout = 0.4
                  )
x_inp, x_out = hinsage.in_out_tensors()
# NOTE(review): score_prediction is built but never used — the model below outputs x_out directly.
score_prediction = link_classification(edge_embedding_method = "ip")(x_out) # ip = inner product
model = Model(inputs = x_inp,
              outputs = x_out)
model.summary()

link_classification: using 'ip' method to combine node embeddings into edge embeddings


In [191]:
# Run the encoder over the training flow; the model has two outputs (x_out), unpacked as f1 and f2.
f1,f2 = model.predict(train_flow)

In [306]:
def make_contrastive_model(hinsage, train_flow, temperature):
    """Build a Keras model whose output is the contrastive loss of the HinSAGE embeddings.

    Args:
        hinsage: object exposing ``in_out_tensors()`` that returns ``(x_inp, x_out)``,
            where ``x_out[0]`` and ``x_out[1]`` are the two embedding tensors.
        train_flow: flow object forwarded unchanged to ``get_loss``.
        temperature: positive scalar that divides the similarity logits.

    Returns:
        A ``Model`` mapping ``x_inp`` to the loss tensor returned by ``get_loss``.

    Raises:
        ValueError: if ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")

    x_inp, x_out = hinsage.in_out_tensors()

    f1 = tf.math.l2_normalize(x_out[0], axis = 1)
    f2 = tf.math.l2_normalize(x_out[1], axis = 1)

    # Apply the temperature (previously accepted but ignored). get_loss only averages rows
    # and takes dot products between the two sides, so scaling one side by 1/temperature
    # is the same as dividing every logit by temperature.
    f1 = tf.divide(f1, temperature)

    loss = get_loss(f1, f2, train_flow)

    return Model(inputs = x_inp,
                 outputs = loss)
    

In [307]:
# Build the loss-output model from the HinSAGE encoder with temperature 1.
c_model = make_contrastive_model(hinsage, train_flow, temperature = 1)

In [311]:
# Evaluate the loss-output model on the training flow; the cell displays the returned value.
c_model.predict(train_flow)

5.520757675170898

In [302]:
def get_loss(f1, f2, train_flow):
    """Contrastive loss between per-csv and per-wiki mean embeddings.

    Rows of ``f1`` / ``f2`` are grouped by the csv / wiki id found at the same position in
    ``train_flow.ids`` and averaged per id. The dot products between every csv mean and
    every wiki mean are turned into row-wise log-softmax scores; for each csv the scores
    of the wiki ids it is paired with are averaged, and the negated mean over all csv ids
    is returned.

    Args:
        f1: indexable tensor; ``f1[i]`` is the csv-side embedding of pair ``i``.
        f2: indexable tensor; ``f2[i]`` is the wiki-side embedding of pair ``i``.
        train_flow: object whose ``ids`` attribute iterates over ``(csv_id, wiki_id)`` pairs.

    Returns:
        Scalar loss tensor.

    NOTE(review): this assumes row ``i`` of ``f1`` / ``f2`` belongs to ``train_flow.ids[i]``;
    the flow in this notebook is created with ``shuffle = True`` — TODO confirm alignment.
    """
    # Plain dicts keep insertion order, so the position of an id in these dicts is the
    # row (csv) / column (wiki) index used for the embeddings and the mask alike.
    csv_rows = {}
    wiki_rows = {}
    positive_pairs = []

    for idx, (csv_id, wiki_id) in enumerate(train_flow.ids):
        csv_rows.setdefault(csv_id, []).append(f1[idx])
        wiki_rows.setdefault(wiki_id, []).append(f2[idx])
        positive_pairs.append((csv_id, wiki_id))

    csv_index = {csv_id: row for row, csv_id in enumerate(csv_rows)}
    wiki_index = {wiki_id: col for col, wiki_id in enumerate(wiki_rows)}

    # Binary mask: entry (row, col) is 1 when that csv id and wiki id occur together as a pair.
    mask_rows = [[0] * len(wiki_rows) for _ in range(len(csv_rows))]
    for csv_id, wiki_id in positive_pairs:
        mask_rows[csv_index[csv_id]][wiki_index[wiki_id]] = 1
    positives = tf.convert_to_tensor(mask_rows, dtype = tf.float32)

    # One mean embedding per distinct id.
    csv_embeddings = tf.convert_to_tensor(
        [tf.math.reduce_mean(tf.convert_to_tensor(rows), axis = 0) for rows in csv_rows.values()]
    )
    wiki_embeddings = tf.convert_to_tensor(
        [tf.math.reduce_mean(tf.convert_to_tensor(rows), axis = 0) for rows in wiki_rows.values()]
    )

    logits = tf.matmul(csv_embeddings, tf.transpose(wiki_embeddings))

    # log_softmax equals log(exp(x) / sum(exp(x))) but does not overflow for large logits.
    log_probs = tf.nn.log_softmax(logits, axis = 1)

    # Every csv row has at least one positive (each id comes from a pair), so the
    # per-row positive count in the denominator is never zero.
    masked = tf.math.multiply(log_probs, positives)
    per_csv = tf.reduce_sum(masked, axis = 1) / tf.reduce_sum(positives, axis = 1)
    return -tf.reduce_mean(per_csv)


In [110]:
# Shape of the second output (index 1) of model.predict on the training flow.
model.predict(train_flow)[1].shape

(1,)

In [111]:
import tensorflow as tf
import tensorflow_addons as tfa
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers

In [112]:
class SupervisedContrastiveLoss(keras.losses.Loss):
    """Temperature-scaled cosine-similarity logits between feature vectors.

    NOTE(review): this overrides ``__call__`` (not ``call``), takes no labels, ignores
    ``sample_weight`` and returns the logits matrix rather than a loss value; the
    ``tfa.losses.npairs_loss(tf.squeeze(labels), logits)`` step is left disabled.
    """

    def __init__(self, temperature = 1, name = None):
        """Store the temperature that divides the similarity logits."""
        super().__init__(name = name)
        self.temperature = temperature

    def __call__(self, feature_vectors, sample_weight = None):
        """Return pairwise dot products of the L2-normalized rows, divided by the temperature."""
        unit_vectors = tf.math.l2_normalize(feature_vectors, axis = 1)
        similarities = tf.matmul(unit_vectors, tf.transpose(unit_vectors))
        return tf.divide(similarities, self.temperature)
        

In [175]:
# Debug: print every id pair stored in train_flow.ids.
# NOTE(review): this prints one line per edge; consider iterating over a short slice instead.
for x,y in train_flow.ids:
    print(x,y)

50 2194
57 765
40 1050
76 816
41 333
28 868
5 2078
28 1065
23 2541
11 1302
40 2522
30 2561
32 990
25 1198
73 351
22 1794
18 1050
47 2355
30 305
51 648
52 915
55 2057
41 478
67 1147
73 843
26 990
75 506
32 1101
47 1804
25 118
59 1377
58 1809
33 2126
64 370
6 409
73 635
28 2031
61 1511
65 720
61 409
40 520
8 635
72 419
65 125
66 462
60 1147
27 1245
75 228
66 1957
23 2078
69 1191
56 458
70 1377
7 1203
19 125
56 462
59 462
69 2518
51 427
6 1282
70 996
27 2288
2 1377
72 1125
7 898
33 1203
47 478
55 1752
35 426
27 622
40 2367
29 1377
11 1304
23 665
47 1274
38 728
22 841
40 449
30 622
28 1406
7 228
32 312
32 520
10 791
56 990
30 1706
66 2021
76 1461
34 820
73 1944
27 2460
24 1707
53 635
27 409
70 385
28 1997
55 850
4 2242
4 1482
36 1218
35 1437
26 2022
62 2368
1 1556
49 288
65 2364
69 1906
13 728
9 449
37 1065
42 2194
44 2134
41 784
24 1803
41 225
51 1063
27 1782
37 1326
52 942
24 202
69 903
6 1752
52 2287
23 2022
67 1065
6 2019
75 1235
70 2551
69 636
66 1804
20 2057
23 1408
38 1891
29 784
60

In [183]:
from collections import defaultdict
# Scratch check of defaultdict(list): appending to a missing key creates its list first.
g = defaultdict(list)

g[0].append(0)
g[1].append(1)



In [185]:
# Display the scratch defaultdict.
g

defaultdict(list, {0: [0], 1: [1]})

In [184]:
# Keys of the scratch defaultdict, in insertion order.
list(g.keys())

[0, 1]

In [134]:
from collections import defaultdict


def make_contrastive(hinsage, temperature, train_flow = None):
    """Build a Keras model that outputs the two L2-normalized HinSAGE embeddings.

    Args:
        hinsage: object exposing ``in_out_tensors()`` that returns ``(x_inp, x_out)``,
            where ``x_out[0]`` and ``x_out[1]`` are the two embedding tensors.
        temperature: accepted for call compatibility; it does not affect the returned
            model, whose outputs are the normalized embeddings themselves.
        train_flow: optional and unused; kept so calls that pass it keep working.
            It used to be required even though nothing built from it reached the model,
            which made ``make_contrastive(hinsage, temperature = ...)`` raise TypeError.

    Returns:
        A ``Model`` mapping ``x_inp`` to ``[f1, f2]``, the row-wise L2-normalized
        versions of ``x_out[0]`` and ``x_out[1]``.
    """
    x_inp, x_out = hinsage.in_out_tensors()

    f1 = tf.math.l2_normalize(x_out[0], axis = 1)
    f2 = tf.math.l2_normalize(x_out[1], axis = 1)

    return Model(inputs = x_inp,
                 outputs = [f1,f2])

In [135]:
# Build the embedding model; train_flow is passed explicitly because make_contrastive
# declares it as a parameter.
m = make_contrastive(hinsage, temperature = 0.8, train_flow = train_flow)

In [136]:
# Shape of the second element (index 1) of the first batch; presumably the labels — TODO confirm.
next(iter(train_flow))[1].shape

(296,)

In [141]:
# Debug: print each position together with the id pair stored at that position in train_flow.ids.
# NOTE(review): this prints one line per edge; consider iterating over a short slice instead.
for idx, (x,y) in enumerate(train_flow.ids):
    print(idx,x,y)
    

0 38 2221
1 68 1101
2 26 1355
3 19 1202
4 22 1923
5 23 1523
6 35 1696
7 10 2413
8 71 389
9 52 896
10 75 2197
11 33 1245
12 23 941
13 71 2207
14 50 191
15 76 2022
16 23 1202
17 47 1131
18 54 2190
19 8 900
20 30 2225
21 41 1385
22 52 2320
23 30 1201
24 35 585
25 63 1773
26 49 720
27 73 275
28 10 2384
29 69 2022
30 8 156
31 55 850
32 25 1769
33 39 1990
34 28 1303
35 65 1053
36 27 2022
37 25 1556
38 55 409
39 24 1329
40 65 1187
41 52 2393
42 27 419
43 6 491
44 30 2522
45 25 1302
46 11 2159
47 38 572
48 60 1355
49 41 1958
50 48 1355
51 65 2022
52 47 2165
53 35 300
54 28 1953
55 25 1882
56 11 1760
57 13 2022
58 37 2459
59 66 173
60 28 1944
61 67 338
62 6 2522
63 50 2062
64 32 1203
65 56 201
66 28 664
67 66 949
68 32 1198
69 65 1670
70 52 2402
71 70 921
72 57 1662
73 76 1715
74 75 898
75 26 410
76 23 1203
77 5 2022
78 9 196
79 40 459
80 28 455
81 24 1065
82 40 83
83 28 1707
84 9 1408
85 4 1858
86 56 917
87 47 1823
88 52 2055
89 45 409
90 30 2367
91 73 765
92 7 97
93 18 125
94 52 635
95 67 132

In [None]:
# NOTE(review): fit() is called without any data or arguments, and this cell has no
# execution count (In [None]), i.e. it was never run — placeholder to be completed.
m.fit()

In [139]:
# Predict both embedding outputs in a single pass over the training flow.
f1,f2 = m.predict(train_flow)
# NOTE(review): only the last expression of a cell is displayed, so f1.shape is evaluated
# and discarded; the shown output is f2.shape.
f1.shape
f2.shape

(296, 8)

In [143]:
# Inspect row 0 of f1.
f1[0]

array([ 0.4042195 , -0.20395012,  0.3147617 , -0.69365185, -0.24146982,
       -0.03517517,  0.27411813,  0.28301486], dtype=float32)

In [144]:
# Inspect row 47 of f1.
f1[47]

array([ 0.33018923, -0.27720347,  0.23873505, -0.5902531 , -0.5801053 ,
        0.22291677, -0.14987074, -0.00805409], dtype=float32)

In [145]:
# Inspect row 105 of f1.
f1[105]

array([-0.06448361,  0.53577816, -0.39245713, -0.6800377 , -0.17270365,
        0.19710909, -0.07519139,  0.13408008], dtype=float32)

In [87]:
# First output of m.predict on the training flow.
# NOTE(review): the next cell calls predict again for f2; `f1,f2 = m.predict(train_flow)`
# would obtain both from one pass.
f1 = m.predict(train_flow)[0]

In [88]:
# Second output of m.predict on the training flow.
# NOTE(review): this is a second full predict pass; f1 came from a separate call.
f2 = m.predict(train_flow)[1]

In [91]:
# Pairwise dot products between the rows of f1 and f2, divided by 2.
# NOTE(review): the divisor 2 is a magic number (presumably a temperature — TODO confirm).
tf.matmul(f1, tf.transpose(f2)) / 2

<tf.Tensor: shape=(296, 296), dtype=float32, numpy=
array([[-0.20920447,  0.22074522,  0.07516934, ..., -0.02597416,
        -0.06924   ,  0.1152197 ],
       [-0.21209389,  0.00664073, -0.10639209, ...,  0.09749413,
        -0.23197694,  0.05408767],
       [-0.24445957, -0.183058  , -0.00791817, ...,  0.10845301,
        -0.06082166,  0.13253297],
       ...,
       [-0.14771488, -0.23719992, -0.14626664, ..., -0.01174234,
        -0.09364757, -0.00169909],
       [-0.14651774,  0.2402444 ,  0.11424123, ...,  0.15048374,
        -0.05038797,  0.18549879],
       [-0.20747185, -0.03672608, -0.27357656, ..., -0.01964769,
        -0.18882476,  0.19624804]], dtype=float32)>