### Step 1: Import Required Libraries

In [1]:
import csv
import glob
import random
import sklearn
import numpy as np
import networkx as nx
import pickle5 as pickle
from sklearn.svm import SVC
from itertools import combinations
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

### Step 2: Loading Embedded Locations

In [2]:
# Goal: predict 2019 collaborations from the 2017 and 2018 embeddings. Per the
# original note, each embedding is itself based on the preceding years, so the
# prediction effectively draws on 2015, 2016, 2017 and 2018.
embeddings = np.load('data/EMBEDDING/embeddeds_dim128_batchNumber200_length5_lookback1.npy')

# Index 1 is used as the 2017 embedding and index 2 as the 2018 embedding.
embed2017, embed2018 = embeddings[1], embeddings[2]

### Step 3: Sorted List of all Authors to have the same order in all Graphs

In [3]:
# To have sorted embedding in Output
def node_sorter(graph):
    """Return a new ``nx.Graph`` with the nodes of ``graph`` inserted in sorted order.

    Nodes (with their attribute dicts) are added in ascending order first, then
    every edge (with its attribute dict) is copied over. Iterating the nodes of
    the result therefore yields the same order for any graph with the same node set.
    """
    ordered = nx.Graph()
    ordered.add_nodes_from(sorted(graph.nodes(data=True)))
    ordered.add_edges_from(graph.edges(data=True))
    return ordered

### Step 4: Embedded Locations for 2017 and 2018

In [4]:
# The nodes of all yearly graphs are their intersection, so the node set is the
# same for every year.

graph_2019 = nx.read_pajek('data/Graphs/pajek_graph19.net')
SortedNodeG2019 = node_sorter(graph_2019)

# Materialise the sorted node order once. The previous version rebuilt
# list(SortedNodeG2019.nodes()) for every index, which is quadratic in the
# number of nodes.
sorted_nodes_2019 = list(SortedNodeG2019.nodes())

# Map each node name to its embedding row.
# NOTE(review): assumes row i of each embedding belongs to the i-th sorted node -- confirm.
node_emb_dict_2018 = {node: embed2018[i] for i, node in enumerate(sorted_nodes_2019)}
node_emb_dict_2017 = {node: embed2017[i] for i, node in enumerate(sorted_nodes_2019)}

### Step 5: Get Authors' names as Key for Embedded Locations Dictionary

In [5]:
# Keys of the 2018 embedding dictionary, in insertion order.
node_emb_dict18 = list(node_emb_dict_2018)
nodes_num = len(node_emb_dict18)
# `names` is an independent copy of that key list.
names = list(node_emb_dict18)

### Step 6: Computing (Predicting) Future Locations (2019) using 2017, 2018

In [6]:
# Linear extrapolation per node: loc_2019 = loc_2018 + (loc_2018 - loc_2017)
#                                         = 2 * loc_2018 - loc_2017
all_velbased_loc = [
    2 * node_emb_dict_2018[name] - node_emb_dict_2017[name]
    for name in names
]

### Similarity Computation for a Sample of Size 20,000 (almost one-third of the whole data)

In [12]:
import time

# Pairwise cosine similarity between the extrapolated (velocity-based) locations
# of the first 20,000 nodes, timed.
a = all_velbased_loc[:20000]
b = all_velbased_loc[:20000]

# Start the clock right before the computation (the earlier extra timestamp was
# overwritten before use). perf_counter() is intended for measuring short
# elapsed intervals.
t = time.perf_counter()
c = cosine_similarity(a, b)
x = time.perf_counter() - t

print(x)

1.4296760559082031


In [57]:
# Baseline locations: the 2018 embedding of every node, in `names` order.
base_loc = [node_emb_dict_2018[name] for name in names]

In [58]:
import time

# Pairwise cosine similarity between the baseline (2018) locations of the first
# 20,000 nodes, timed.
a_b = base_loc[:20000]
b_b = base_loc[:20000]

# Start the clock right before the computation (the earlier extra timestamp was
# overwritten before use). perf_counter() is intended for measuring short
# elapsed intervals.
t = time.perf_counter()
c_b = cosine_similarity(a_b, b_b)
x = time.perf_counter() - t

print(x)

0.8210158348083496


In [59]:
# Ground-truth adjacency labels for the first `samp_size` nodes of the sorted
# 2019 graph: grand_truth[i][j] == 1 iff nodes_[j] is a neighbour of nodes_[i].
# (The name "grand_truth" is kept because later cells reference it.)
samp_size = 20000
w, h = samp_size, samp_size

# A compact NumPy matrix replaces the nested Python lists (w * h Python ints).
# It still supports grand_truth[i][j] indexing and np.array(grand_truth).
grand_truth = np.zeros((h, w), dtype=np.uint8)

nodes_ = list(SortedNodeG2019.nodes())
sample_nodes = nodes_[:samp_size]
# Position of every sampled node, for O(1) lookup of a neighbour's column.
node_position = {node: pos for pos, node in enumerate(sample_nodes)}

# Walk each sampled node's adjacency instead of testing all w * h pairs;
# neighbours outside the sample have no column and are skipped.
for i, node in enumerate(sample_nodes):
    for neighbour in SortedNodeG2019[node]:
        j = node_position.get(neighbour)
        if j is not None:
            grand_truth[i, j] = 1

In [60]:
# Ground-truth adjacency labels as a NumPy array.
n_grand_truth = np.array(grand_truth)

In [61]:
# NumPy array of the velocity-based cosine-similarity matrix `c`.
n_c = np.array(c)

In [62]:
# NumPy array of the baseline (2018 locations) cosine-similarity matrix `c_b`.
n_c_b = np.array(c_b)

In [63]:
# Boolean labels: True where the 2019 graph has an edge between the two nodes.
n_grand_truth = n_grand_truth > 0

In [64]:
# Predict a link wherever the velocity-based cosine similarity is at least 0.8.
# NOTE(review): this turns the scores into hard True/False predictions before the
# ROC-AUC / average-precision cells below; those metrics are normally computed
# on non-thresholded scores -- confirm this is intended.
n_c = n_c >= 0.8

In [65]:
# Predict a link wherever the baseline cosine similarity is at least 0.8.
# NOTE(review): this turns the scores into hard True/False predictions before the
# ROC-AUC / average-precision cells below; those metrics are normally computed
# on non-thresholded scores -- confirm this is intended.
n_c_b = n_c_b >= 0.8

In [66]:
# A node is never predicted to link to itself: clear the first `samp_size`
# diagonal entries of both prediction matrices in place.
diagonal = np.arange(samp_size)
n_c[diagonal, diagonal] = False
n_c_b[diagonal, diagonal] = False

In [67]:
# Flatten the velocity-based prediction matrix to 1-D (one entry per node pair).
n_c = n_c.flatten()

In [68]:
# Flatten the baseline prediction matrix to 1-D (one entry per node pair).
n_c_b = n_c_b.flatten()

In [69]:
# Flatten the ground-truth labels to 1-D so they line up with the flattened predictions.
n_grand_truth = n_grand_truth.flatten()

In [70]:
from sklearn.metrics import roc_auc_score

In [71]:
# ROC AUC of the velocity-based predictions against the 2019 ground truth.
# NOTE(review): `n_c` holds thresholded True/False values, not similarity scores.
roc_auc_score(n_grand_truth, n_c)

0.5730506152174495

In [72]:
# ROC AUC of the baseline (2018 locations) predictions against the 2019 ground truth.
# NOTE(review): `n_c_b` holds thresholded True/False values, not similarity scores.
roc_auc_score(n_grand_truth, n_c_b)

0.5938053956499748

In [79]:
# Same call with average='micro'; the recorded output is identical to the call
# without `average`.
roc_auc_score(n_grand_truth, n_c, average='micro')

0.5730506152174495

In [80]:
# Same call with average='micro'; the recorded output is identical to the call
# without `average`.
roc_auc_score(n_grand_truth, n_c_b, average='micro')

0.5938053956499748

In [81]:
from sklearn import metrics

In [84]:
# Average precision of the velocity-based predictions against the 2019 ground truth.
vel_auprc = sklearn.metrics.average_precision_score(n_grand_truth, n_c)

In [85]:
# Average precision of the baseline (2018 locations) predictions against the 2019 ground truth.
base_auprc = sklearn.metrics.average_precision_score(n_grand_truth, n_c_b)

In [87]:
# Velocity-based average precision first, baseline second, one value per line.
print(vel_auprc, base_auprc, sep='\n')

0.03436041142668613
0.01835590212738386
