In [18]:
%%capture
import json
import networkx as nx
from node2vec import Node2Vec
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.notebook import tqdm
import time
import numpy as np

# Node2Vec implementation used in this notebook: https://github.com/eliorc/node2vec

In [2]:
# Paths for the AAN dataset, all derived from a single dataset name.
# NOTE(review): the DBLP configuration cell that follows assigns the same
# path constants again, so on a top-to-bottom run the DBLP values win.
# Run only the configuration cell for the dataset you want to evaluate.
DATASET = 'aan'
FULL_SET_PATH = './data/{}_full.json'.format(DATASET)
TEST_SET_PATH = './data/{}_test.json'.format(DATASET)
DIRECTED_OUTPUT_PATH = './results/base_node2veccosine_directed_{}.json'.format(DATASET)
UNDIRECTED_OUTPUT_PATH = './results/base_node2veccosine_undirected_{}.json'.format(DATASET)

In [19]:
# Paths for the DBLP dataset, all derived from a single dataset name so that
# switching datasets is a one-line change.
DATASET = 'dblp'
FULL_SET_PATH = './data/{}_full.json'.format(DATASET)
TEST_SET_PATH = './data/{}_test.json'.format(DATASET)
DIRECTED_OUTPUT_PATH = './results/base_node2veccosine_directed_{}.json'.format(DATASET)
UNDIRECTED_OUTPUT_PATH = './results/base_node2veccosine_undirected_{}.json'.format(DATASET)

In [20]:
# Read every paper record of the configured dataset.
with open(FULL_SET_PATH) as full_set_file:
    full_set = json.load(full_set_file)

# One edge per citation, pointing from the citing paper to the referenced one.
directed_citation_graph = nx.DiGraph()
for paper in full_set:
    directed_citation_graph.add_edges_from(
        (paper['id'], ref_id) for ref_id in paper['references']
    )

# Undirected copy of the same citation graph.
undirected_citation_graph = directed_citation_graph.to_undirected()

In [21]:
# Candidate ids in full_set order; the ranking cells pair them positionally
# with the columns of the score matrix.
paper_ids = [record['id'] for record in full_set]

In [22]:
# Build the Node2Vec object for the directed citation graph and time it.
start = time.time()
node2vec_directed = Node2Vec(
    directed_citation_graph,
    dimensions=128,
    walk_length=80,
    num_walks=10,
    p=1,
    q=2,
    workers=4,
)
# Last expression of the cell: elapsed wall-clock seconds.
time.time() - start

Computing transition probabilities: 100%|██████████| 22726/22726 [00:02<00:00, 10083.24it/s]


39.81916356086731

In [23]:
# Fit the embedding model for the directed variant and time it.
# NOTE(review): the keyword arguments are presumably forwarded to gensim's
# Word2Vec - confirm against the installed node2vec version.
start = time.time()
model_directed = node2vec_directed.fit(
    window=10,
    min_count=1,
    batch_words=4,
)
# Last expression of the cell: elapsed wall-clock seconds.
time.time() - start

70.95656251907349

In [24]:
# Look up one embedding per paper, in full_set order ...
directed_paper_node2vec_vectors = []
for paper in full_set:
    directed_paper_node2vec_vectors.append(model_directed.wv[paper['id']])

# ... and stack them row-wise so that row k belongs to full_set[k].
directed_paper_node2vec_array = np.vstack(directed_paper_node2vec_vectors)

In [25]:
# Build the Node2Vec object for the undirected variant and time it.
# The walks must be generated on undirected_citation_graph: passing the
# directed graph here would make this variant a second run of the directed
# configuration.
start = time.time()
node2vec_undirected = Node2Vec(undirected_citation_graph, dimensions=128, walk_length=80, num_walks=10, p=1, q=2, workers=4)
# Last expression of the cell: elapsed wall-clock seconds.
time.time() - start

Computing transition probabilities: 100%|██████████| 22726/22726 [00:02<00:00, 10179.95it/s]


39.8017942905426

In [26]:
# Fit the embedding model for the undirected variant and time it.
# NOTE(review): the keyword arguments are presumably forwarded to gensim's
# Word2Vec - confirm against the installed node2vec version.
start = time.time()
model_undirected = node2vec_undirected.fit(
    window=10,
    min_count=1,
    batch_words=4,
)
# Last expression of the cell: elapsed wall-clock seconds.
time.time() - start

71.74264287948608

In [27]:
# Look up one embedding per paper, in full_set order ...
undirected_paper_node2vec_vectors = []
for paper in full_set:
    undirected_paper_node2vec_vectors.append(model_undirected.wv[paper['id']])

# ... and stack them row-wise so that row k belongs to full_set[k].
undirected_paper_node2vec_array = np.vstack(undirected_paper_node2vec_vectors)

In [28]:
# Load the test queries; the scoring cells index each entry as a sequence of
# input-paper records that carry an 'id' key.
with open(TEST_SET_PATH) as test_set_file:
    test_set = json.load(test_set_file)

# directed variant

In [29]:
# Number of input papers scored per test query; every test entry is indexed
# at positions 0 .. NUM_INPUT_PAPERS - 1.
NUM_INPUT_PAPERS = 3

paper_node2vec_array = directed_paper_node2vec_array

# score_array[r, c] is the sum, over the input papers of query r, of the cosine
# similarity between that input paper's embedding and candidate c's embedding.
# The sum is accumulated in place so that only one partial similarity matrix
# is alive at a time instead of all of them.
score_array = None
for i in tqdm(range(NUM_INPUT_PAPERS)):
    # Embedding of the i-th input paper of every query, one row per query.
    partial_input_ids = [input_papers[i]['id'] for input_papers in test_set]
    input_node2vec_vectors = [model_directed.wv[paper_id] for paper_id in partial_input_ids]
    input_node2vec_array = np.vstack(input_node2vec_vectors)

    partial_scores = cosine_similarity(input_node2vec_array, paper_node2vec_array)
    if score_array is None:
        score_array = partial_scores
    else:
        score_array += partial_scores

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




In [30]:
# Number of recommendations kept per query.
TOP_K = 100

results = []

# zip() has no length, so tqdm needs an explicit total to show real progress.
# Rows are converted to Python lists one at a time rather than materialising
# the whole score matrix with score_array.tolist().
for input_papers, score_row in tqdm(zip(test_set, score_array), total=len(test_set)):
    input_ids = [input_paper['id'] for input_paper in input_papers]
    input_paper_ids_set = set(input_ids)

    # Highest score first; the sort is stable, so ties keep paper_ids order.
    candidate_scores = list(zip(paper_ids, score_row.tolist()))
    candidate_scores.sort(key=lambda x: x[1], reverse=True)

    # A query's own input papers are never recommended back to it.
    filtered_candidate_scores = [cs for cs in candidate_scores if cs[0] not in input_paper_ids_set]

    results.append({
        'input': input_ids,
        'output': [cs[0] for cs in filtered_candidate_scores[:TOP_K]],
    })

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [31]:
# Save the current results list as JSON at the directed-variant output path.
with open(DIRECTED_OUTPUT_PATH, 'w') as output_file:
    json.dump(results, output_file)

# undirected variant

In [32]:
# Number of input papers scored per test query; every test entry is indexed
# at positions 0 .. NUM_INPUT_PAPERS - 1.
NUM_INPUT_PAPERS = 3

paper_node2vec_array = undirected_paper_node2vec_array

# score_array[r, c] is the sum, over the input papers of query r, of the cosine
# similarity between that input paper's embedding and candidate c's embedding.
# The sum is accumulated in place so that only one partial similarity matrix
# is alive at a time instead of all of them.
score_array = None
for i in tqdm(range(NUM_INPUT_PAPERS)):
    # Embedding of the i-th input paper of every query, one row per query.
    partial_input_ids = [input_papers[i]['id'] for input_papers in test_set]
    input_node2vec_vectors = [model_undirected.wv[paper_id] for paper_id in partial_input_ids]
    input_node2vec_array = np.vstack(input_node2vec_vectors)

    partial_scores = cosine_similarity(input_node2vec_array, paper_node2vec_array)
    if score_array is None:
        score_array = partial_scores
    else:
        score_array += partial_scores

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




In [33]:
# Number of recommendations kept per query.
TOP_K = 100

results = []

# zip() has no length, so tqdm needs an explicit total to show real progress.
# Rows are converted to Python lists one at a time rather than materialising
# the whole score matrix with score_array.tolist().
for input_papers, score_row in tqdm(zip(test_set, score_array), total=len(test_set)):
    input_ids = [input_paper['id'] for input_paper in input_papers]
    input_paper_ids_set = set(input_ids)

    # Highest score first; the sort is stable, so ties keep paper_ids order.
    candidate_scores = list(zip(paper_ids, score_row.tolist()))
    candidate_scores.sort(key=lambda x: x[1], reverse=True)

    # A query's own input papers are never recommended back to it.
    filtered_candidate_scores = [cs for cs in candidate_scores if cs[0] not in input_paper_ids_set]

    results.append({
        'input': input_ids,
        'output': [cs[0] for cs in filtered_candidate_scores[:TOP_K]],
    })

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [34]:
# Save the current results list as JSON at the undirected-variant output path.
with open(UNDIRECTED_OUTPUT_PATH, 'w') as output_file:
    json.dump(results, output_file)