# Unicredit training camp

---

## Imports

In [3]:
import pandas as pd
import node2vec
import networkx as nx

## Data loading

In [4]:
train_set = pd.read_csv('../data/kg_train.csv')
val_set = pd.read_csv('../data/kg_validation.csv')
test_set = pd.read_csv('../data/kg_test_nolabel.csv')
train_val_set = pd.read_csv('../data/kg_pykeen_trainvalidation.tsv', sep='\t', names=['source', 'relation', 'tail'])

In [5]:
train_set.head()

Unnamed: 0,Id,Triple,Label
0,0,34881 intercommunality 14230,0
1,1,9387 ownerOper 39573,0
2,2,12480 coach 24064,0
3,3,6871 branches 22010,0
4,4,13789 damsire 33095,0


In [6]:
train_set[['head', 'relation', 'tail']] = train_set['Triple'].str.split(' ', 2, expand=True)
val_set[['head', 'relation', 'tail']] = val_set['Triple'].str.split(' ', 2, expand=True)
test_set[['head', 'relation', 'tail']] = test_set['Triple'].str.split(' ', 2, expand=True)

In [7]:
train_set.head()

Unnamed: 0,Id,Triple,Label,head,relation,tail
0,0,34881 intercommunality 14230,0,34881,intercommunality,14230
1,1,9387 ownerOper 39573,0,9387,ownerOper,39573
2,2,12480 coach 24064,0,12480,coach,24064
3,3,6871 branches 22010,0,6871,branches,22010
4,4,13789 damsire 33095,0,13789,damsire,33095


In [8]:
val_set.head()

Unnamed: 0,Id,Triple,Label,head,relation,tail
0,91802,41074 host 9832,0,41074,host,9832
1,91803,12583 primaryLanguages 388,0,12583,primaryLanguages,388
2,91804,22259 seasonTopscorer 22261,1,22259,seasonTopscorer,22261
3,91805,8408 jurisdiction 646,0,8408,jurisdiction,646
4,91806,32 combatant 20690,0,32,combatant,20690


In [9]:
test_set.head()

Unnamed: 0,Id,Triple,head,relation,tail
0,114754,1322 operatingSystem 14477,1322,operatingSystem,14477
1,114755,5210 parent 11412,5210,parent,11412
2,114756,38658 leadersSeat 35321,38658,leadersSeat,35321
3,114757,41457 cableServ 4591,41457,cableServ,4591
4,114758,21579 visitorConference 214,21579,visitorConference,214


In [10]:
train_val_set.head()

Unnamed: 0,source,relation,tail
0,10001,champions,9985
1,10005,tributariesLeft,10008
2,10005,tributariesLeft,10014
3,10005,tributariesLeft,10015
4,10005,tributariesLeft,10016


### Compute embeddings

In [11]:
graph = nx.from_pandas_edgelist(train_val_set, source='source', target='tail', edge_attr='relation', create_using=nx.MultiDiGraph)

In [12]:
embedding = node2vec.Node2Vec(graph, dimensions=64)

Computing transition probabilities:   0%|          | 0/43990 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:12<00:00,  1.21s/it]


In [17]:
model = embedding.fit(window=10, min_count=1, batch_words=4) 

In [18]:
model.wv.most_similar('10001')

[('6314', 0.9881356358528137),
 ('24630', 0.9874294996261597),
 ('31025', 0.9869814515113831),
 ('27075', 0.9868086576461792),
 ('19860', 0.9866402745246887),
 ('16863', 0.9865047335624695),
 ('38446', 0.9864374399185181),
 ('10983', 0.9863516688346863),
 ('22769', 0.9862580895423889),
 ('15338', 0.9861183166503906)]

Edge embeddings

In [19]:
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder

In [20]:
edges_embs = HadamardEmbedder(keyed_vectors=model.wv)

In [30]:
for head, rel, tail in train_val_set.itertuples():
    

array([ 0.0624701 ,  0.03421304,  0.01889687,  0.0360032 ,  0.02340166,
        0.01081064,  0.00539609, -0.10514799,  0.0099972 , -0.08288552,
       -0.13095427, -0.03101761, -0.17819105, -0.00924894,  0.04178149,
        0.11270613, -0.09185932, -0.0489062 , -0.04081935,  0.1034681 ,
        0.06663747,  0.10671864,  0.07600545, -0.02598541, -0.09982543,
        0.14421503, -0.0334549 ,  0.06526703, -0.08632009, -0.03830667,
        0.00988579, -0.1199495 , -0.03753826,  0.0596816 ,  0.0090041 ,
       -0.02315225,  0.00169792,  0.00049679,  0.09358896, -0.06502618,
       -0.10051589,  0.03809523,  0.02182027,  0.00799868, -0.02158404,
        0.01745255,  0.02430105, -0.02636931, -0.03013489, -0.09939053,
        0.02390336, -0.04540624,  0.02313965,  0.09056495,  0.0515229 ,
        0.00967712, -0.09068506, -0.01114145, -0.1428173 , -0.03895466,
        0.00368675, -0.10857921,  0.12659143,  0.12857458], dtype=float32)

In [28]:
type(edges_embs)

node2vec.edges.HadamardEmbedder