# Method 39 : Paper2vec: Combining graph and text information for scientific paper representation (2017)

Source : S. Ganguly and V. Pudi, “Paper2vec: Combining graph and text information for scientific paper representation,” Lect. Notes Comput. Sci. (including Subser. Lect. Notes Artif. Intell. Lect. Notes Bioinformatics), vol. 10193 LNCS, pp. 383–395, 2017.

Original code : https://github.com/asxzy/paper2vec-gensim/

In [14]:
%%capture
import json
import os
import random

import networkx as nx
import numpy as np
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.notebook import tqdm

In [2]:
# Dataset configuration (aan files): full corpus, test queries, and the file the
# recommendations are written to.
# NOTE(review): a later cell assigns these same three names to the dblp paths, so on a
# top-to-bottom run the values below are overwritten before they are used.
FULL_SET_PATH = './data/aan_full.json'
TEST_SET_PATH = './data/aan_test.json'
OUTPUT_PATH = './results/stateofart_method39_aan.json'

In [15]:
# Dataset configuration (dblp files): full corpus, test queries, and the file the
# recommendations are written to.
# NOTE(review): these assignments replace the aan paths set by the earlier configuration
# cell; on a top-to-bottom run this is the dataset that is actually processed.
FULL_SET_PATH = './data/dblp_full.json'
TEST_SET_PATH = './data/dblp_test.json'
OUTPUT_PATH = './results/stateofart_method39_dblp.json'

### Step 1 : Load data

In [16]:
# Full corpus: later cells read the 'id', 'title', 'abstract' and 'references' keys of each entry.
with open(FULL_SET_PATH) as full_set_file:
    full_set = json.loads(full_set_file.read())

# Test queries: later cells index each entry by position and read the 'id' of each input paper.
with open(TEST_SET_PATH) as test_set_file:
    test_set = json.loads(test_set_file.read())

### Step 2 : Train doc2vec

In [17]:
# One TaggedDocument per paper: tokenised "title abstract" text, tagged with the paper id.
documents = []
for ref in full_set:
    tokens = simple_preprocess(ref['title'] + ' ' + ref['abstract'])
    documents.append(TaggedDocument(tokens, [ref['id']]))

doc2vec_model = Doc2Vec(size=100, window=10, min_count=10, alpha=0.025, min_alpha=0.025)

# Seed Python's RNG so the document shuffles below are repeatable.
random.seed(42)
doc2vec_model.build_vocab(documents)

# Manual learning-rate schedule: ten rounds, each on a freshly shuffled corpus,
# lowering alpha by 0.002 per round. alpha and min_alpha are set to the same value
# for every train() call, and each call runs `doc2vec_model.epochs` passes.
for training_round in tqdm(range(10)):
    random.shuffle(documents)
    round_alpha = 0.025 - 0.002 * training_round
    doc2vec_model.alpha = round_alpha
    doc2vec_model.min_alpha = round_alpha
    doc2vec_model.train(
        documents,
        total_examples=doc2vec_model.corpus_count,
        epochs=doc2vec_model.epochs,
    )



HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




### Step 3 : Build graph

In [18]:
# The citation network is modelled as an undirected graph.
citation_graph = nx.Graph()

# Citation links: one edge between each paper and every id listed in its 'references'.
citation_graph.add_edges_from(
    (paper['id'], cited_id)
    for paper in full_set
    for cited_id in paper['references']
)

# Text-similarity links: connect each document to its 2 most similar documents
# according to the doc2vec document vectors.
for doc in documents:
    doc_id = doc.tags[0]
    nearest_docs = doc2vec_model.docvecs.most_similar(doc.tags, topn=2)
    for nearest_id, _similarity in nearest_docs:
        citation_graph.add_edge(doc_id, nearest_id)

### Step 4 : Random walks

In [19]:
# Inspired by https://github.com/phanein/deepwalk/
# Build the DeepWalk corpus: `number_walks` passes over the graph, each pass starting
# one walk of at most `walk_length` nodes from every node (visited in shuffled order).

number_walks = 10
walk_length = 40
nodes = list(citation_graph.nodes())
walks = []

for _walk_round in range(number_walks):
    random.shuffle(nodes)
    for start_node in nodes:
        walk = [start_node]

        # Extend the walk one uniformly chosen neighbour at a time.
        for _step in range(walk_length - 1):
            neighbours = list(citation_graph[walk[-1]])
            if not neighbours:
                # Node without neighbours: the walk ends early.
                break
            walk.append(random.choice(neighbours))

        walks.append(walk)

### Step 5 : Train final model (paper2vec)

In [20]:
# Prepare doc2vec embeddings
# Export the document vectors in word2vec text format so they can be merged into the
# paper2vec model: a "<count> <dimension>" header, then one "<id> <v1> ... <vn>" line per paper.

# Create the scratch directory first so a fresh checkout / Restart & Run All does not
# fail with FileNotFoundError.
os.makedirs('./temp', exist_ok=True)

# Written explicitly as UTF-8 because gensim reads word2vec-format files with
# encoding='utf8' by default; the platform default encoding could otherwise differ.
with open('./temp/doc2vec.embd', 'w', encoding='utf-8') as f:
    # Take the dimension from the model instead of repeating the literal 100, so the
    # header cannot go stale if the doc2vec size is changed.
    f.write("%s %s\n" % (len(documents), doc2vec_model.vector_size))
    for document in documents:
        paper_id = document.tags[0]
        vector = doc2vec_model.docvecs[paper_id]
        f.write(paper_id + " " + " ".join(str(x) for x in vector) + "\n")

In [21]:
# Skip-gram style model over the random walks; every graph node is a "word"
# (min_count=0 keeps all of them in the vocabulary).
paper2vec_model = Word2Vec(size=100, window=5, min_count=0)
paper2vec_model.build_vocab(walks)

# Load doc2vec embeddings into paper2vec model as the starting point for training.
# lockf=1.0 is required: the default lockf=0.0 freezes every imported vector, so the
# training below would leave the paper embeddings identical to their doc2vec values
# and the graph information would never reach them.
paper2vec_model.intersect_word2vec_format('./temp/doc2vec.embd', lockf=1.0)

# Kept as the last expression so the cell still displays train()'s return value.
paper2vec_model.train(walks, total_examples=paper2vec_model.corpus_count, epochs=paper2vec_model.epochs)

(44550881, 45452000)

### Step 6 : Recommendations (test_set)

In [22]:
# Candidate side of the similarity computation: ids of all papers in the full set and
# their paper2vec vectors stacked row-wise, in the same order as `paper_ids`.
paper_ids = [paper['id'] for paper in full_set]
paper2vec_vectors = list(map(paper2vec_model.wv.get_vector, paper_ids))
paper2vec_array = np.vstack(paper2vec_vectors)

In [23]:
# Compute scores
# For each of the first 3 input positions, compute the cosine similarity between the
# input paper at that position (one per test case) and every candidate paper; the
# final score of a candidate is the sum of these per-position similarities.
# NOTE(review): assumes every test case holds at least 3 input papers — confirm against the data.

partial_score_arrays = []
for position in tqdm(range(3)):
    position_ids = [test_case[position]['id'] for test_case in test_set]
    position_matrix = np.vstack(
        [paper2vec_model.wv.get_vector(input_id) for input_id in position_ids]
    )
    position_scores = cosine_similarity(position_matrix, paper2vec_array)
    partial_score_arrays.append(position_scores)

score_array = sum(partial_score_arrays)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




In [24]:
# Get recommendations
# For every test case keep the `top_k` best-scoring candidate papers, skipping the
# input papers themselves.

top_k = 100  # number of recommendations kept per test case

results = []

# zip() has no length, so total= is passed explicitly to give tqdm a real progress bar.
# Rows are taken straight from the numpy array instead of converting the whole score
# matrix to nested Python lists.
for input_papers, scores in tqdm(zip(test_set, score_array), total=len(test_set)):
    input_ids = [input_paper['id'] for input_paper in input_papers]
    input_paper_ids_set = set(input_ids)

    # Descending order of score. The stable sort keeps tied candidates in `paper_ids`
    # order, the same tie order a stable list.sort(reverse=True) produces.
    ranking = np.argsort(-np.asarray(scores), kind='stable')

    recommended = []
    for candidate_index in ranking:
        candidate_id = paper_ids[candidate_index]
        if candidate_id in input_paper_ids_set:
            continue
        recommended.append(candidate_id)
        if len(recommended) == top_k:
            # Enough recommendations collected; no need to walk the rest of the ranking.
            break

    results.append({'input': input_ids, 'output': recommended})

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [25]:
# Save the recommendations as JSON.
# Create the output directory first so the results of the whole pipeline are not lost
# to a FileNotFoundError when the directory does not exist yet.
os.makedirs(os.path.dirname(OUTPUT_PATH) or '.', exist_ok=True)
with open(OUTPUT_PATH, 'w') as f:
    json.dump(results, f)