In [7]:
%%capture
import json
import networkx as nx
from node2vec import Node2Vec
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.notebook import tqdm
import time
import numpy as np

# node2vec implementation used in this notebook: https://github.com/eliorc/node2vec

In [8]:
# Load the full corpus. JSON is UTF-8 by specification, so the encoding is
# stated explicitly instead of relying on the platform's default.
with open('./data/aan_full.json', encoding='utf-8') as f:
    full_set = json.load(f)

# Directed citation graph: one edge from each paper to every id listed in its
# 'references'. Nodes are created implicitly by the edges, so a paper only
# appears in the graph if it takes part in at least one citation.
directed_citation_graph = nx.DiGraph()
directed_citation_graph.add_edges_from(
    (paper['id'], ref_id)
    for paper in full_set
    for ref_id in paper['references']
)

# Same nodes and citations with edge direction dropped.
undirected_citation_graph = directed_citation_graph.to_undirected()

In [9]:
# Corpus paper ids, listed in the same order as full_set.
paper_ids = [corpus_paper['id'] for corpus_paper in full_set]

In [10]:
# Set up node2vec on the directed citation graph; the cell's output is the
# elapsed wall-clock time in seconds.
start = time.time()
node2vec_directed = Node2Vec(
    directed_citation_graph,
    dimensions=128,
    walk_length=80,
    num_walks=10,
    p=1,
    q=2,
    workers=4,
)
elapsed_seconds = time.time() - start
elapsed_seconds

Computing transition probabilities: 100%|██████████| 15366/15366 [00:02<00:00, 5680.65it/s]


20.508866786956787

In [11]:
# Train the embedding model for the directed graph; the cell's output is the
# elapsed wall-clock time in seconds.
start = time.time()
model_directed = node2vec_directed.fit(
    window=10,
    min_count=1,
    batch_words=4,
)
elapsed_seconds = time.time() - start
elapsed_seconds

64.60270595550537

In [12]:
# Look up the directed-model embedding of every corpus paper (full_set order)
# and stack the vectors row-wise into a single array.
directed_paper_node2vec_vectors = [
    model_directed.wv[corpus_paper['id']]
    for corpus_paper in full_set
]
directed_paper_node2vec_array = np.vstack(directed_paper_node2vec_vectors)

In [13]:
# Set up node2vec on the undirected citation graph; the cell's output is the
# elapsed wall-clock time in seconds.
# The undirected model must be built from undirected_citation_graph: feeding it
# the directed graph would only produce a second directed model, and every
# "*_undirected" result file would then hold directed-graph recommendations.
start = time.time()
node2vec_undirected = Node2Vec(undirected_citation_graph, dimensions=128, walk_length=80, num_walks=10, p=1, q=2, workers=4)
time.time() - start

Computing transition probabilities: 100%|██████████| 15366/15366 [00:02<00:00, 5333.72it/s]


20.062878131866455

In [14]:
# Train the embedding model from node2vec_undirected; the cell's output is the
# elapsed wall-clock time in seconds.
start = time.time()
model_undirected = node2vec_undirected.fit(
    window=10,
    min_count=1,
    batch_words=4,
)
elapsed_seconds = time.time() - start
elapsed_seconds

63.0034966468811

In [15]:
# Look up the model_undirected embedding of every corpus paper (full_set order)
# and stack the vectors row-wise into a single array.
undirected_paper_node2vec_vectors = [
    model_undirected.wv[corpus_paper['id']]
    for corpus_paper in full_set
]
undirected_paper_node2vec_array = np.vstack(undirected_paper_node2vec_vectors)

# aan_test_single.json

In [7]:
# Load the test set from aan_test_single.json; the cells below read one 'id'
# per entry. JSON is UTF-8 by specification, so the encoding is stated
# explicitly instead of relying on the platform's default.
with open('./data/aan_test_single.json', encoding='utf-8') as f:
    test_set = json.load(f)

In [8]:
# Query side: ids of the test papers and their directed-model embeddings.
input_ids = [query_paper['id'] for query_paper in test_set]
input_node2vec_vectors = [
    model_directed.wv[query_paper['id']]
    for query_paper in test_set
]
input_node2vec_array = np.vstack(input_node2vec_vectors)

# Candidate side: the directed-model embeddings of the whole corpus.
paper_node2vec_array = directed_paper_node2vec_array

# Pairwise cosine similarity: one row per query, one column per corpus paper.
score_array = cosine_similarity(input_node2vec_array, paper_node2vec_array)

In [9]:
# For every query paper, rank all corpus papers by similarity and keep the
# ids of the 100 best candidates (the query itself excluded).
results = []

# zip() has no length, so tqdm needs an explicit total to show real progress
# instead of an indeterminate bar.
for input_paper_id, scores in tqdm(zip(input_ids, score_array.tolist()), total=len(input_ids)):
    # Highest score first; the sort is stable, so ties keep corpus order.
    ranked_candidates = sorted(zip(paper_ids, scores), key=lambda pair: pair[1], reverse=True)

    # Drop the query paper before truncating so that 100 real candidates remain.
    top_candidate_ids = [
        candidate_id
        for candidate_id, _score in ranked_candidates
        if candidate_id != input_paper_id
    ][:100]

    results.append({'input': [input_paper_id], 'output': top_candidate_ids})

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [10]:
# Write the current `results` list to disk as JSON.
results_path = './results/single_base_node2veccosine_directed.json'
with open(results_path, 'w') as out_file:
    json.dump(results, out_file)

In [8]:
# Query side: ids of the test papers and their model_undirected embeddings.
input_ids = [query_paper['id'] for query_paper in test_set]
input_node2vec_vectors = [
    model_undirected.wv[query_paper['id']]
    for query_paper in test_set
]
input_node2vec_array = np.vstack(input_node2vec_vectors)

# Candidate side: the model_undirected embeddings of the whole corpus.
paper_node2vec_array = undirected_paper_node2vec_array

# Pairwise cosine similarity: one row per query, one column per corpus paper.
score_array = cosine_similarity(input_node2vec_array, paper_node2vec_array)

In [9]:
# For every query paper, rank all corpus papers by similarity and keep the
# ids of the 100 best candidates (the query itself excluded).
results = []

# zip() has no length, so tqdm needs an explicit total to show real progress
# instead of an indeterminate bar.
for input_paper_id, scores in tqdm(zip(input_ids, score_array.tolist()), total=len(input_ids)):
    # Highest score first; the sort is stable, so ties keep corpus order.
    ranked_candidates = sorted(zip(paper_ids, scores), key=lambda pair: pair[1], reverse=True)

    # Drop the query paper before truncating so that 100 real candidates remain.
    top_candidate_ids = [
        candidate_id
        for candidate_id, _score in ranked_candidates
        if candidate_id != input_paper_id
    ][:100]

    results.append({'input': [input_paper_id], 'output': top_candidate_ids})

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [10]:
# Write the current `results` list to disk as JSON.
results_path = './results/single_base_node2veccosine_undirected.json'
with open(results_path, 'w') as out_file:
    json.dump(results, out_file)

# aan_test_triplet.json

In [11]:
# Load the test set from aan_test_triplet.json; the cells below index
# positions 0-2 of every entry and read each item's 'id'. JSON is UTF-8 by
# specification, so the encoding is stated explicitly instead of relying on
# the platform's default.
with open('./data/aan_test_triplet.json', encoding='utf-8') as f:
    test_set = json.load(f)

In [12]:
# Candidate side: the directed-model embeddings of the whole corpus.
paper_node2vec_array = directed_paper_node2vec_array

# Score the corpus against the 1st, 2nd and 3rd paper of every test entry
# separately, then add the three similarity matrices together.
partial_score_arrays = []
for position in tqdm(range(3)):
    partial_input_ids = [triplet[position]['id'] for triplet in test_set]
    input_node2vec_vectors = [
        model_directed.wv[query_id]
        for query_id in partial_input_ids
    ]
    input_node2vec_array = np.vstack(input_node2vec_vectors)

    partial_scores = cosine_similarity(input_node2vec_array, paper_node2vec_array)
    partial_score_arrays.append(partial_scores)

# Element-wise sum: one combined score per (test entry, corpus paper).
score_array = sum(partial_score_arrays)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




In [13]:
# For every test entry, rank all corpus papers by combined score and keep the
# ids of the 100 best candidates (the entry's own input papers excluded).
results = []

# zip() has no length, so tqdm needs an explicit total to show real progress
# instead of an indeterminate bar.
for input_papers, scores in tqdm(zip(test_set, score_array.tolist()), total=len(test_set)):
    input_paper_ids = [input_paper['id'] for input_paper in input_papers]
    input_paper_ids_set = set(input_paper_ids)  # set for O(1) membership tests

    # Highest score first; the sort is stable, so ties keep corpus order.
    ranked_candidates = sorted(zip(paper_ids, scores), key=lambda pair: pair[1], reverse=True)

    # Drop the input papers before truncating so that 100 real candidates remain.
    top_candidate_ids = [
        candidate_id
        for candidate_id, _score in ranked_candidates
        if candidate_id not in input_paper_ids_set
    ][:100]

    results.append({'input': input_paper_ids, 'output': top_candidate_ids})

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [14]:
# Write the current `results` list to disk as JSON.
results_path = './results/triplet_base_node2veccosine_directed.json'
with open(results_path, 'w') as out_file:
    json.dump(results, out_file)

In [12]:
# Candidate side: the model_undirected embeddings of the whole corpus.
paper_node2vec_array = undirected_paper_node2vec_array

# Score the corpus against the 1st, 2nd and 3rd paper of every test entry
# separately, then add the three similarity matrices together.
partial_score_arrays = []
for position in tqdm(range(3)):
    partial_input_ids = [triplet[position]['id'] for triplet in test_set]
    input_node2vec_vectors = [
        model_undirected.wv[query_id]
        for query_id in partial_input_ids
    ]
    input_node2vec_array = np.vstack(input_node2vec_vectors)

    partial_scores = cosine_similarity(input_node2vec_array, paper_node2vec_array)
    partial_score_arrays.append(partial_scores)

# Element-wise sum: one combined score per (test entry, corpus paper).
score_array = sum(partial_score_arrays)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




In [13]:
# For every test entry, rank all corpus papers by combined score and keep the
# ids of the 100 best candidates (the entry's own input papers excluded).
results = []

# zip() has no length, so tqdm needs an explicit total to show real progress
# instead of an indeterminate bar.
for input_papers, scores in tqdm(zip(test_set, score_array.tolist()), total=len(test_set)):
    input_paper_ids = [input_paper['id'] for input_paper in input_papers]
    input_paper_ids_set = set(input_paper_ids)  # set for O(1) membership tests

    # Highest score first; the sort is stable, so ties keep corpus order.
    ranked_candidates = sorted(zip(paper_ids, scores), key=lambda pair: pair[1], reverse=True)

    # Drop the input papers before truncating so that 100 real candidates remain.
    top_candidate_ids = [
        candidate_id
        for candidate_id, _score in ranked_candidates
        if candidate_id not in input_paper_ids_set
    ][:100]

    results.append({'input': input_paper_ids, 'output': top_candidate_ids})

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [14]:
# Write the current `results` list to disk as JSON.
results_path = './results/triplet_base_node2veccosine_undirected.json'
with open(results_path, 'w') as out_file:
    json.dump(results, out_file)

# aan_test_tripletfromref.json

In [16]:
# Load the test set from aan_test_tripletfromref.json; the cells below index
# positions 0-2 of every entry and read each item's 'id'. JSON is UTF-8 by
# specification, so the encoding is stated explicitly instead of relying on
# the platform's default.
with open('./data/aan_test_tripletfromref.json', encoding='utf-8') as f:
    test_set = json.load(f)

In [17]:
# Candidate side: the directed-model embeddings of the whole corpus.
paper_node2vec_array = directed_paper_node2vec_array

# Score the corpus against the 1st, 2nd and 3rd paper of every test entry
# separately, then add the three similarity matrices together.
partial_score_arrays = []
for position in tqdm(range(3)):
    partial_input_ids = [triplet[position]['id'] for triplet in test_set]
    input_node2vec_vectors = [
        model_directed.wv[query_id]
        for query_id in partial_input_ids
    ]
    input_node2vec_array = np.vstack(input_node2vec_vectors)

    partial_scores = cosine_similarity(input_node2vec_array, paper_node2vec_array)
    partial_score_arrays.append(partial_scores)

# Element-wise sum: one combined score per (test entry, corpus paper).
score_array = sum(partial_score_arrays)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




In [18]:
# For every test entry, rank all corpus papers by combined score and keep the
# ids of the 100 best candidates (the entry's own input papers excluded).
results = []

# zip() has no length, so tqdm needs an explicit total to show real progress
# instead of an indeterminate bar.
for input_papers, scores in tqdm(zip(test_set, score_array.tolist()), total=len(test_set)):
    input_paper_ids = [input_paper['id'] for input_paper in input_papers]
    input_paper_ids_set = set(input_paper_ids)  # set for O(1) membership tests

    # Highest score first; the sort is stable, so ties keep corpus order.
    ranked_candidates = sorted(zip(paper_ids, scores), key=lambda pair: pair[1], reverse=True)

    # Drop the input papers before truncating so that 100 real candidates remain.
    top_candidate_ids = [
        candidate_id
        for candidate_id, _score in ranked_candidates
        if candidate_id not in input_paper_ids_set
    ][:100]

    results.append({'input': input_paper_ids, 'output': top_candidate_ids})

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [19]:
# Write the current `results` list to disk as JSON.
results_path = './results/tripletfromref_base_node2veccosine_directed.json'
with open(results_path, 'w') as out_file:
    json.dump(results, out_file)

In [20]:
# Candidate side: the model_undirected embeddings of the whole corpus.
paper_node2vec_array = undirected_paper_node2vec_array

# Score the corpus against the 1st, 2nd and 3rd paper of every test entry
# separately, then add the three similarity matrices together.
partial_score_arrays = []
for position in tqdm(range(3)):
    partial_input_ids = [triplet[position]['id'] for triplet in test_set]
    input_node2vec_vectors = [
        model_undirected.wv[query_id]
        for query_id in partial_input_ids
    ]
    input_node2vec_array = np.vstack(input_node2vec_vectors)

    partial_scores = cosine_similarity(input_node2vec_array, paper_node2vec_array)
    partial_score_arrays.append(partial_scores)

# Element-wise sum: one combined score per (test entry, corpus paper).
score_array = sum(partial_score_arrays)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




In [21]:
# For every test entry, rank all corpus papers by combined score and keep the
# ids of the 100 best candidates (the entry's own input papers excluded).
results = []

# zip() has no length, so tqdm needs an explicit total to show real progress
# instead of an indeterminate bar.
for input_papers, scores in tqdm(zip(test_set, score_array.tolist()), total=len(test_set)):
    input_paper_ids = [input_paper['id'] for input_paper in input_papers]
    input_paper_ids_set = set(input_paper_ids)  # set for O(1) membership tests

    # Highest score first; the sort is stable, so ties keep corpus order.
    ranked_candidates = sorted(zip(paper_ids, scores), key=lambda pair: pair[1], reverse=True)

    # Drop the input papers before truncating so that 100 real candidates remain.
    top_candidate_ids = [
        candidate_id
        for candidate_id, _score in ranked_candidates
        if candidate_id not in input_paper_ids_set
    ][:100]

    results.append({'input': input_paper_ids, 'output': top_candidate_ids})

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [22]:
# Write the current `results` list to disk as JSON.
results_path = './results/tripletfromref_base_node2veccosine_undirected.json'
with open(results_path, 'w') as out_file:
    json.dump(results, out_file)