In [36]:
%%capture
import json
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.notebook import tqdm
import numpy as np
import networkx as nx
import random
from gensim.models import Word2Vec

In [37]:
# Load the full AAN corpus. JSON is UTF-8 by specification, so the encoding is
# stated explicitly rather than inherited from the platform default.
with open('./data/aan_full.json', encoding='utf-8') as f:
    full_set = json.load(f)

# Directed citation graph: one edge from each paper to every paper it references.
directed_citation_graph = nx.DiGraph()
for paper in full_set:
    for ref_id in paper['references']:
        directed_citation_graph.add_edge(paper['id'], ref_id)

# A paper that has no references and is never referenced appears in no edge and
# would therefore be missing from the graph (and later from the embedding
# vocabulary). Register every corpus paper as a node. Doing this after the edge
# loop only appends the missing nodes and leaves the existing node order as is.
directed_citation_graph.add_nodes_from(paper['id'] for paper in full_set)

# Same nodes and links, with edge direction dropped.
undirected_citation_graph = directed_citation_graph.to_undirected()

In [4]:
# Ids of all corpus papers, in full_set order.
paper_ids = []
for paper in full_set:
    paper_ids.append(paper['id'])

In [38]:
# Inspired by https://github.com/phanein/deepwalk/

def _build_walk_corpus(G, number_walks, walk_length, rng):
    """Generate truncated random walks starting from every node of ``G``.

    Makes ``number_walks`` passes over the nodes, shuffling the node order
    before each pass. A walk stops early when it reaches a node without
    neighbors (``G[node]`` is empty).

    Args:
        G: graph exposing ``G.nodes()`` and ``G[node]`` (iterable of neighbors).
        number_walks: number of walks started from each node.
        walk_length: maximum number of nodes in a single walk.
        rng: ``random.Random`` instance that drives shuffling and step choice.

    Returns:
        A list of walks; each walk is a list of node ids.
    """
    nodes = list(G.nodes())
    # The graph is not modified while walking, so build each neighbor list
    # once instead of on every step of every walk.
    neighbors = {node: list(G[node]) for node in nodes}

    walks = []
    for _ in range(number_walks):
        rng.shuffle(nodes)
        for node in nodes:
            path = [node]
            while len(path) < walk_length:
                candidates = neighbors[path[-1]]
                if not candidates:
                    break
                path.append(rng.choice(candidates))
            walks.append(path)
    return walks


def deepwalk(G, number_walks=10, representation_size=64, seed=0, walk_length=40, window_size=5, workers=1):
    """Learn DeepWalk node embeddings for graph ``G``.

    Builds a corpus of random walks over ``G`` and trains a skip-gram Word2Vec
    model (``sg=1``, ``hs=1``, ``min_count=0``) on it, treating each walk as a
    sentence and each node id as a token.

    Args:
        G: graph exposing ``G.nodes()`` and ``G[node]``.
        number_walks: number of walks started from each node.
        representation_size: dimensionality of the learned vectors.
        seed: seed for the walk generation. It is not forwarded to Word2Vec.
        walk_length: maximum number of nodes per walk.
        window_size: Word2Vec context window.
        workers: number of Word2Vec worker threads.

    Returns:
        The ``wv`` attribute of the trained model.
    """
    # A private generator yields the same walks as seeding the module-level
    # RNG, without resetting global random state for the rest of the session.
    rng = random.Random(seed)
    walks = _build_walk_corpus(G, number_walks, walk_length, rng)

    # build model
    w2v_options = dict(window=window_size, min_count=0, sg=1, hs=1, workers=workers)
    try:
        # gensim >= 4.0 names the embedding dimensionality `vector_size`.
        model = Word2Vec(walks, vector_size=representation_size, **w2v_options)
    except TypeError:
        # gensim < 4.0 only accepts the older keyword `size`.
        model = Word2Vec(walks, size=representation_size, **w2v_options)

    return model.wv

In [40]:
# Train one embedding model per graph variant (undirected first, then directed),
# both with deepwalk's default hyper-parameters.
model_undirected, model_directed = (
    deepwalk(graph)
    for graph in (undirected_citation_graph, directed_citation_graph)
)

In [41]:
# Directed-graph embedding of every corpus paper, one row per paper in
# full_set order.
directed_paper_deepwalk_vectors = []
for paper in full_set:
    directed_paper_deepwalk_vectors.append(model_directed.get_vector(paper['id']))
directed_paper_deepwalk_array = np.vstack(directed_paper_deepwalk_vectors)

In [42]:
# Undirected-graph embedding of every corpus paper, one row per paper in
# full_set order.
undirected_paper_deepwalk_vectors = []
for paper in full_set:
    undirected_paper_deepwalk_vectors.append(model_undirected.get_vector(paper['id']))
undirected_paper_deepwalk_array = np.vstack(undirected_paper_deepwalk_vectors)

# aan_test_single.json

In [43]:
# Test queries with a single input paper each. JSON is UTF-8 by specification,
# so the encoding is stated explicitly rather than inherited from the platform
# default.
with open('./data/aan_test_single.json', encoding='utf-8') as f:
    test_set = json.load(f)

In [44]:
# Query side: ids of the test papers and their directed-graph embeddings.
input_ids = [paper['id'] for paper in test_set]
input_deepwalk_vectors = [model_directed.get_vector(paper_id) for paper_id in input_ids]
input_deepwalk_array = np.vstack(input_deepwalk_vectors)

# Candidate side: directed-graph embeddings of the whole corpus.
paper_deepwalk_array = directed_paper_deepwalk_array

# One row per test paper, one column per corpus paper.
score_array = cosine_similarity(input_deepwalk_array, paper_deepwalk_array)

In [45]:
# Rank all corpus papers for each test paper and keep the 100 best matches.
results = []

# zip() has no length, so pass total explicitly; otherwise tqdm can only show
# an indeterminate bar without progress or ETA.
for input_paper_id, scores in tqdm(zip(input_ids, score_array.tolist()), total=len(input_ids)):
    # Drop the query paper itself, then order the candidates by descending
    # similarity. The sort is stable, so ties keep their paper_ids order.
    candidate_scores = [cs for cs in zip(paper_ids, scores) if cs[0] != input_paper_id]
    candidate_scores.sort(key=lambda cs: cs[1], reverse=True)

    results.append({
        'input': [input_paper_id],
        'output': [cs[0] for cs in candidate_scores[:100]],
    })

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [46]:
# Persist the directed-graph rankings for the single-paper test set as JSON.
with open('./results/single_base_deepwalkcosine_directed.json', 'w') as f:
    f.write(json.dumps(results))

In [47]:
# Query side: ids of the test papers and their undirected-graph embeddings.
input_ids = [paper['id'] for paper in test_set]
input_deepwalk_vectors = [model_undirected.get_vector(paper_id) for paper_id in input_ids]
input_deepwalk_array = np.vstack(input_deepwalk_vectors)

# Candidate side: undirected-graph embeddings of the whole corpus.
paper_deepwalk_array = undirected_paper_deepwalk_array

# One row per test paper, one column per corpus paper.
score_array = cosine_similarity(input_deepwalk_array, paper_deepwalk_array)

In [48]:
# Rank all corpus papers for each test paper and keep the 100 best matches.
results = []

# zip() has no length, so pass total explicitly; otherwise tqdm can only show
# an indeterminate bar without progress or ETA.
for input_paper_id, scores in tqdm(zip(input_ids, score_array.tolist()), total=len(input_ids)):
    # Drop the query paper itself, then order the candidates by descending
    # similarity. The sort is stable, so ties keep their paper_ids order.
    candidate_scores = [cs for cs in zip(paper_ids, scores) if cs[0] != input_paper_id]
    candidate_scores.sort(key=lambda cs: cs[1], reverse=True)

    results.append({
        'input': [input_paper_id],
        'output': [cs[0] for cs in candidate_scores[:100]],
    })

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [49]:
# Persist the undirected-graph rankings for the single-paper test set as JSON.
with open('./results/single_base_deepwalkcosine_undirected.json', 'w') as f:
    f.write(json.dumps(results))

# aan_test_triplet.json

In [50]:
# Test queries made of three input papers each. JSON is UTF-8 by specification,
# so the encoding is stated explicitly rather than inherited from the platform
# default.
with open('./data/aan_test_triplet.json', encoding='utf-8') as f:
    test_set = json.load(f)

In [51]:
# Candidate side: directed-graph embeddings of the whole corpus.
paper_deepwalk_array = directed_paper_deepwalk_array

# Every test item holds three input papers. Score the corpus against the
# papers at position 0, 1 and 2 separately, then add the three score matrices.
partial_score_arrays = []
for i in tqdm(range(3)):
    partial_input_ids = []
    for input_papers in test_set:
        partial_input_ids.append(input_papers[i]['id'])

    input_deepwalk_vectors = list(map(model_directed.get_vector, partial_input_ids))
    input_deepwalk_array = np.vstack(input_deepwalk_vectors)

    partial_scores = cosine_similarity(input_deepwalk_array, paper_deepwalk_array)
    partial_score_arrays.append(partial_scores)

score_array = sum(partial_score_arrays)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




In [52]:
# Rank all corpus papers for each input triplet and keep the 100 best matches.
results = []

# zip() has no length, so pass total explicitly; otherwise tqdm can only show
# an indeterminate bar without progress or ETA.
for input_papers, scores in tqdm(zip(test_set, score_array.tolist()), total=len(test_set)):
    input_paper_ids = [input_paper['id'] for input_paper in input_papers]
    input_paper_ids_set = set(input_paper_ids)

    # Drop the query papers themselves, then order the candidates by descending
    # score. The sort is stable, so ties keep their paper_ids order.
    candidate_scores = [cs for cs in zip(paper_ids, scores) if cs[0] not in input_paper_ids_set]
    candidate_scores.sort(key=lambda cs: cs[1], reverse=True)

    results.append({
        'input': input_paper_ids,
        'output': [cs[0] for cs in candidate_scores[:100]],
    })

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [53]:
# Persist the directed-graph rankings for the triplet test set as JSON.
with open('./results/triplet_base_deepwalkcosine_directed.json', 'w') as f:
    f.write(json.dumps(results))

In [54]:
# Candidate side: undirected-graph embeddings of the whole corpus.
paper_deepwalk_array = undirected_paper_deepwalk_array

# Every test item holds three input papers. Score the corpus against the
# papers at position 0, 1 and 2 separately, then add the three score matrices.
partial_score_arrays = []
for i in tqdm(range(3)):
    partial_input_ids = []
    for input_papers in test_set:
        partial_input_ids.append(input_papers[i]['id'])

    input_deepwalk_vectors = list(map(model_undirected.get_vector, partial_input_ids))
    input_deepwalk_array = np.vstack(input_deepwalk_vectors)

    partial_scores = cosine_similarity(input_deepwalk_array, paper_deepwalk_array)
    partial_score_arrays.append(partial_scores)

score_array = sum(partial_score_arrays)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




In [55]:
# Rank all corpus papers for each input triplet and keep the 100 best matches.
results = []

# zip() has no length, so pass total explicitly; otherwise tqdm can only show
# an indeterminate bar without progress or ETA.
for input_papers, scores in tqdm(zip(test_set, score_array.tolist()), total=len(test_set)):
    input_paper_ids = [input_paper['id'] for input_paper in input_papers]
    input_paper_ids_set = set(input_paper_ids)

    # Drop the query papers themselves, then order the candidates by descending
    # score. The sort is stable, so ties keep their paper_ids order.
    candidate_scores = [cs for cs in zip(paper_ids, scores) if cs[0] not in input_paper_ids_set]
    candidate_scores.sort(key=lambda cs: cs[1], reverse=True)

    results.append({
        'input': input_paper_ids,
        'output': [cs[0] for cs in candidate_scores[:100]],
    })

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [56]:
# Persist the undirected-graph rankings for the triplet test set as JSON.
with open('./results/triplet_base_deepwalkcosine_undirected.json', 'w') as f:
    f.write(json.dumps(results))

# aan_test_tripletfromref.json

In [57]:
# Test queries made of three input papers each (the "tripletfromref" set).
# JSON is UTF-8 by specification, so the encoding is stated explicitly rather
# than inherited from the platform default.
with open('./data/aan_test_tripletfromref.json', encoding='utf-8') as f:
    test_set = json.load(f)

In [58]:
# Candidate side: directed-graph embeddings of the whole corpus.
paper_deepwalk_array = directed_paper_deepwalk_array

# Every test item holds three input papers. Score the corpus against the
# papers at position 0, 1 and 2 separately, then add the three score matrices.
partial_score_arrays = []
for i in tqdm(range(3)):
    partial_input_ids = []
    for input_papers in test_set:
        partial_input_ids.append(input_papers[i]['id'])

    input_deepwalk_vectors = list(map(model_directed.get_vector, partial_input_ids))
    input_deepwalk_array = np.vstack(input_deepwalk_vectors)

    partial_scores = cosine_similarity(input_deepwalk_array, paper_deepwalk_array)
    partial_score_arrays.append(partial_scores)

score_array = sum(partial_score_arrays)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




In [59]:
# Rank all corpus papers for each input triplet and keep the 100 best matches.
results = []

# zip() has no length, so pass total explicitly; otherwise tqdm can only show
# an indeterminate bar without progress or ETA.
for input_papers, scores in tqdm(zip(test_set, score_array.tolist()), total=len(test_set)):
    input_paper_ids = [input_paper['id'] for input_paper in input_papers]
    input_paper_ids_set = set(input_paper_ids)

    # Drop the query papers themselves, then order the candidates by descending
    # score. The sort is stable, so ties keep their paper_ids order.
    candidate_scores = [cs for cs in zip(paper_ids, scores) if cs[0] not in input_paper_ids_set]
    candidate_scores.sort(key=lambda cs: cs[1], reverse=True)

    results.append({
        'input': input_paper_ids,
        'output': [cs[0] for cs in candidate_scores[:100]],
    })

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [60]:
# Persist the directed-graph rankings for the tripletfromref test set as JSON.
with open('./results/tripletfromref_base_deepwalkcosine_directed.json', 'w') as f:
    f.write(json.dumps(results))

In [61]:
# Candidate side: undirected-graph embeddings of the whole corpus.
paper_deepwalk_array = undirected_paper_deepwalk_array

# Every test item holds three input papers. Score the corpus against the
# papers at position 0, 1 and 2 separately, then add the three score matrices.
partial_score_arrays = []
for i in tqdm(range(3)):
    partial_input_ids = []
    for input_papers in test_set:
        partial_input_ids.append(input_papers[i]['id'])

    input_deepwalk_vectors = list(map(model_undirected.get_vector, partial_input_ids))
    input_deepwalk_array = np.vstack(input_deepwalk_vectors)

    partial_scores = cosine_similarity(input_deepwalk_array, paper_deepwalk_array)
    partial_score_arrays.append(partial_scores)

score_array = sum(partial_score_arrays)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




In [62]:
# Rank all corpus papers for each input triplet and keep the 100 best matches.
results = []

# zip() has no length, so pass total explicitly; otherwise tqdm can only show
# an indeterminate bar without progress or ETA.
for input_papers, scores in tqdm(zip(test_set, score_array.tolist()), total=len(test_set)):
    input_paper_ids = [input_paper['id'] for input_paper in input_papers]
    input_paper_ids_set = set(input_paper_ids)

    # Drop the query papers themselves, then order the candidates by descending
    # score. The sort is stable, so ties keep their paper_ids order.
    candidate_scores = [cs for cs in zip(paper_ids, scores) if cs[0] not in input_paper_ids_set]
    candidate_scores.sort(key=lambda cs: cs[1], reverse=True)

    results.append({
        'input': input_paper_ids,
        'output': [cs[0] for cs in candidate_scores[:100]],
    })

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [63]:
# Persist the undirected-graph rankings for the tripletfromref test set as JSON.
with open('./results/tripletfromref_base_deepwalkcosine_undirected.json', 'w') as f:
    f.write(json.dumps(results))