## Cosine Similarity

### Step 1: Import Required Libraries

In [None]:
import csv
import glob
import random
import sklearn
import numpy as np
import networkx as nx
import pickle5 as pickle
from sklearn.svm import SVC
from itertools import combinations
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

### Step 2: Loading Embedded Locations

In [3]:
embeddings = np.load('data/EMBEDDING/embeddeds_dim128_batchNumber200_length5_lookback1.npy')

# 2019 collaborations are predicted from the 2017 and 2018 embeddings. Each
# yearly embedding is itself built on the preceding years, so the prediction
# effectively draws on all of 2015, 2016, 2017 and 2018.
embed2017, embed2018 = embeddings[1], embeddings[2]

### Step 3: Sort Graph Nodes (Authors) so that All Graphs Share the Same Order

In [26]:
def node_sorter(graph):
    """Return a copy of *graph* whose nodes were inserted in sorted order.

    Used so that the node order (and therefore the order of the embeddings
    produced from the graph) is reproducible.

    Args:
        graph: A networkx graph; its nodes must be mutually comparable.

    Returns:
        A new ``nx.Graph`` containing every node of *graph* (with its
        attribute dict), added in sorted order, and every edge of *graph*
        (with its data). The result is always a plain ``nx.Graph``,
        whatever the class of the input graph.
    """
    ordered = nx.Graph()
    # Insert the nodes first so iteration order follows the sorted order,
    # not the order in which the edges happen to mention them.
    ordered.add_nodes_from(sorted(graph.nodes(data=True)))
    ordered.add_edges_from(graph.edges(data=True))
    return ordered

### Step 4: Embedded Locations for 2017 and 2018

In [27]:
# The nodes of all yearly graphs are their intersection, so the node set is
# the same for every year and the 2019 graph can supply the node order.

graph_2019 = nx.read_pajek('data/Graphs/pajek_graph19.net')
SortedNodeG2019 = node_sorter(graph_2019)

# Materialise the sorted node list once instead of rebuilding it for every
# index (which would be quadratic in the number of nodes).
sorted_nodes_2019 = list(SortedNodeG2019.nodes())

# Map each author (node) to its embedding row.
# NOTE(review): assumes row i of each yearly embedding belongs to the i-th
# node in sorted order -- confirm against the embedding pipeline.
node_emb_dict_2018 = {node: embed2018[i] for i, node in enumerate(sorted_nodes_2019)}
node_emb_dict_2017 = {node: embed2017[i] for i, node in enumerate(sorted_nodes_2019)}

### Step 5: Get Authors' names as Key for Embedded Locations Dictionary

In [29]:
# Author names, in the insertion order of the 2018 embedding dictionary.
node_emb_dict18 = list(node_emb_dict_2018)
nodes_num = len(node_emb_dict18)

# Independent copy of the key list, used to look up embeddings by author.
names = list(node_emb_dict18)

### Step 6: Computing (Predicting) Future Locations (2019) using 2017, 2018

In [30]:
# Linear ("velocity-based") extrapolation of every author's location:
#   loc_2019 = loc_2018 + (loc_2018 - loc_2017) = 2 * loc_2018 - loc_2017
all_velbased_loc = [
    2 * node_emb_dict_2018[author] - node_emb_dict_2017[author]
    for author in names
]

### Step 6 (continued): Partitioning Data to Handle Heavy Computation

In [43]:
# Split the predicted locations into consecutive batches of at most `stp`
# entries so the pairwise similarity can be computed block by block.
stp = 1000

# Ceiling division: exactly as many batches as are needed. This avoids the
# empty trailing batch that `int(nodes_num / stp) + 1` produces whenever
# nodes_num is an exact multiple of stp.
num_batch = (nodes_num + stp - 1) // stp

# Batch b holds entries [b * stp, (b + 1) * stp); the last one may be shorter.
batch_ = [
    all_velbased_loc[start:start + stp]
    for start in range(0, num_batch * stp, stp)
]

### Step 7: Computing Cosine Similarity between each pair of Authors using their 2019 Predicted Location

In [None]:
# Dense pairwise cosine-similarity matrix: Matrix[r][c] is the similarity
# between the predicted 2019 locations of authors r and c, in the order of
# the concatenated batches. Needs nodes_num * nodes_num floats of memory.
w, h = nodes_num, nodes_num
Matrix = np.zeros((h, w))

row_start = 0
for row_batch in batch_:
    if len(row_batch) == 0:
        # An empty batch contributes no rows (and cannot be passed to
        # cosine_similarity).
        continue
    # Stack the per-author vectors into one (batch_size, dim) array so a
    # whole block of similarities is computed in a single call.
    row_block = np.asarray(row_batch)
    row_stop = row_start + len(row_batch)

    col_start = 0
    for col_batch in batch_:
        if len(col_batch) == 0:
            continue
        col_stop = col_start + len(col_batch)
        # cosine_similarity(X, Y)[a, b] compares row a of X with row b of Y;
        # store the block at the matching offsets of the full matrix.
        Matrix[row_start:row_stop, col_start:col_stop] = cosine_similarity(
            row_block, np.asarray(col_batch)
        )
        col_start = col_stop

    row_start = row_stop

### Save the batched 2019 locations so that Steps 1-6 can be skipped later

In [56]:
# Pickle every batch to its own file, "batch_<index>", in the working directory.
# NOTE(review): the loading cell reads from "Data_/batch_<index>" -- presumably
# the files are moved there by hand; confirm.
for batch_idx in range(num_batch):
    batch_path = "batch_{}".format(batch_idx)
    with open(batch_path, "wb") as batch_file:
        pickle.dump(batch_[batch_idx], batch_file)

### Step 8: Load Data and Compute Cosine Similarity

In [None]:
# Read the pickled batches back, in index order, from the "Data_" directory.
# NOTE: unpickling can execute arbitrary code -- only load files that were
# produced by the saving cell of this notebook.
loaded_batch_ = []
for batch_idx in range(num_batch):
    batch_path = "Data_/batch_" + str(batch_idx)
    with open(batch_path, "rb") as batch_file:
        loaded_batch_.append(pickle.load(batch_file))

# From here on, work with the reloaded batches.
batch_ = loaded_batch_

# Dense pairwise cosine-similarity matrix: Matrix[r][c] is the similarity
# between the predicted 2019 locations of authors r and c, in the order of
# the concatenated batches. Needs nodes_num * nodes_num floats of memory.
w, h = nodes_num, nodes_num
Matrix = np.zeros((h, w))

row_start = 0
for row_batch in batch_:
    if len(row_batch) == 0:
        # An empty batch contributes no rows (and cannot be passed to
        # cosine_similarity).
        continue
    # Stack the per-author vectors into one (batch_size, dim) array so a
    # whole block of similarities is computed in a single call.
    row_block = np.asarray(row_batch)
    row_stop = row_start + len(row_batch)

    col_start = 0
    for col_batch in batch_:
        if len(col_batch) == 0:
            continue
        col_stop = col_start + len(col_batch)
        # cosine_similarity(X, Y)[a, b] compares row a of X with row b of Y;
        # store the block at the matching offsets of the full matrix.
        Matrix[row_start:row_stop, col_start:col_stop] = cosine_similarity(
            row_block, np.asarray(col_batch)
        )
        col_start = col_stop

    row_start = row_stop