In [1]:
%%capture
import json
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess
from tqdm.notebook import tqdm
import numpy as np

# Source: gensim Doc2Vec tutorial — https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html#sphx-glr-auto-examples-tutorials-run-doc2vec-lee-py

In [2]:
# Load the full paper collection; later cells read each record through its
# 'id', 'title' and 'abstract' keys.
# JSON text is UTF-8 by specification, so the encoding is pinned here instead
# of relying on the platform default, which is not UTF-8 on every system.
with open('./data/aan_full.json', encoding='utf-8') as f:
    full_set = json.load(f)

In [3]:
# Tokenise "<title> <abstract>" of every paper with gensim's simple_preprocess.
corpus = [
    simple_preprocess(' '.join((paper['title'], paper['abstract'])))
    for paper in full_set
]
# Tag each token list with its position in full_set.
documents = [
    TaggedDocument(words=tokens, tags=[position])
    for position, tokens in enumerate(corpus)
]
# Train Doc2Vec on the tagged corpus.
model = Doc2Vec(
    documents,
    vector_size=100,
    epochs=40,
    workers=4,
)

In [4]:
# Infer an embedding for every paper with the trained model and store it on
# the record itself, reshaped to a single-row 2-D array so the rows can be
# stacked into a matrix later.
for ref in tqdm(full_set):
    document = simple_preprocess(' '.join((ref['title'], ref['abstract'])))
    inferred = model.infer_vector(document)
    ref['doc2vec_vector'] = inferred.reshape(1, -1)

HBox(children=(FloatProgress(value=0.0, max=15366.0), HTML(value='')))




In [5]:
# Index the paper records by their 'id' for direct lookup
# (if an id occurred twice, the later record would win).
full_set_dict = {paper['id']: paper for paper in full_set}

In [6]:
# Candidate pool: ids and embeddings of every paper, collected in the same
# order so that row k of the matrix belongs to paper_ids[k].
paper_ids = [paper['id'] for paper in full_set]
paper_doc2vec_vectors = [paper['doc2vec_vector'] for paper in full_set]

# The stored vectors are single-row 2-D arrays; joining them along axis 0
# yields one matrix with one row per paper.
paper_doc2vec_array = np.concatenate(paper_doc2vec_vectors, axis=0)

# aan_test_single.json

In [14]:
# Load the single-paper test queries; each entry is read below through its 'id' key.
# JSON text is UTF-8 by specification, so the encoding is pinned here instead
# of relying on the platform default, which is not UTF-8 on every system.
with open('./data/aan_test_single.json', encoding='utf-8') as f:
    test_set = json.load(f)

In [15]:
# Ids of the query papers, in test-set order.
input_ids = [paper['id'] for paper in test_set]
# Look up the embeddings already stored on the full-set records.
input_doc2vec_vectors = [
    full_set_dict[paper_id]['doc2vec_vector'] for paper_id in input_ids
]

# One row per query paper.
input_doc2vec_array = np.concatenate(input_doc2vec_vectors, axis=0)

# Row r holds the cosine similarity of query r against every candidate paper,
# in paper_ids order.
score_array = cosine_similarity(input_doc2vec_array, paper_doc2vec_array)

In [16]:
# Rank every candidate paper for each query and keep the 100 most similar.
results = []

# zip() has no length, so tqdm is given an explicit total to show real progress.
# Rows are converted to Python lists one at a time rather than calling
# .tolist() on the whole score matrix up front.
for input_paper_id, score_row in tqdm(zip(input_ids, score_array), total=len(input_ids)):
    scores = score_row.tolist()

    # Sort candidates by descending similarity, then drop the query paper itself.
    candidate_scores = sorted(zip(paper_ids, scores), key=lambda cs: cs[1], reverse=True)
    filtered_candidate_scores = [cs for cs in candidate_scores if cs[0] != input_paper_id]

    result = {
        'input': [input_paper_id],
        'output': [cs[0] for cs in filtered_candidate_scores[:100]],
    }
    results.append(result)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [17]:
# Write the current `results` list to disk as JSON.
with open('./results/single_base_doc2veccosine.json', 'w') as out_file:
    json.dump(results, fp=out_file)

# aan_test_triplet.json

In [7]:
# Load the triplet test queries; each entry is iterated below as a group of
# papers carrying 'id', 'title' and 'abstract' keys.
# JSON text is UTF-8 by specification, so the encoding is pinned here instead
# of relying on the platform default, which is not UTF-8 on every system.
with open('./data/aan_test_triplet.json', encoding='utf-8') as f:
    test_set = json.load(f)

In [20]:
# Build one query embedding per group of input papers by inferring a vector
# for the text of all its papers taken together.
input_doc2vec_vectors = []
for input_papers in tqdm(test_set):
    # Join every "<title> <abstract>" with a single space. A separator between
    # consecutive papers is required: without it the last word of one abstract
    # can fuse with the first word of the next title into a single bogus token.
    input_document = ' '.join(
        input_paper['title'] + ' ' + input_paper['abstract']
        for input_paper in input_papers
    )

    input_doc2vec_vectors.append(model.infer_vector(simple_preprocess(input_document)).reshape(1, -1))

# One row per query group.
input_doc2vec_array = np.vstack(input_doc2vec_vectors)

# Row r holds the cosine similarity of query r against every candidate paper,
# in paper_ids order.
score_array = cosine_similarity(input_doc2vec_array, paper_doc2vec_array)

HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))




In [21]:
# Rank every candidate paper for each query group and keep the 100 most similar.
results = []

# zip() has no length, so tqdm is given an explicit total to show real progress.
# Rows are converted to Python lists one at a time rather than calling
# .tolist() on the whole score matrix up front.
for input_papers, score_row in tqdm(zip(test_set, score_array), total=len(test_set)):
    scores = score_row.tolist()
    input_paper_ids = [input_paper['id'] for input_paper in input_papers]
    input_paper_ids_set = set(input_paper_ids)

    # Sort candidates by descending similarity, then drop the query papers themselves.
    candidate_scores = sorted(zip(paper_ids, scores), key=lambda cs: cs[1], reverse=True)
    filtered_candidate_scores = [cs for cs in candidate_scores if cs[0] not in input_paper_ids_set]

    result = {
        'input': input_paper_ids,
        'output': [cs[0] for cs in filtered_candidate_scores[:100]],
    }
    results.append(result)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [22]:
# Write the current `results` list to disk as JSON.
with open('./results/triplet_base_doc2veccosine_concat.json', 'w') as out_file:
    json.dump(results, fp=out_file)

In [8]:
# Score each of the three positions of a query group separately, then add the
# per-position similarity matrices element-wise.
partial_score_arrays = []
for i in tqdm(range(3)):
    # Id of the i-th paper of every query group.
    partial_input_ids = [group[i]['id'] for group in test_set]
    # Look up the embeddings already stored on the full-set records.
    input_doc2vec_vectors = [
        full_set_dict[paper_id]['doc2vec_vector'] for paper_id in partial_input_ids
    ]

    input_doc2vec_array = np.concatenate(input_doc2vec_vectors, axis=0)

    partial_scores = cosine_similarity(input_doc2vec_array, paper_doc2vec_array)
    partial_score_arrays.append(partial_scores)

score_array = sum(partial_score_arrays)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




In [9]:
# Rank every candidate paper for each query group and keep the 100 most similar.
results = []

# zip() has no length, so tqdm is given an explicit total to show real progress.
# Rows are converted to Python lists one at a time rather than calling
# .tolist() on the whole score matrix up front.
for input_papers, score_row in tqdm(zip(test_set, score_array), total=len(test_set)):
    scores = score_row.tolist()
    input_paper_ids = [input_paper['id'] for input_paper in input_papers]
    input_paper_ids_set = set(input_paper_ids)

    # Sort candidates by descending similarity, then drop the query papers themselves.
    candidate_scores = sorted(zip(paper_ids, scores), key=lambda cs: cs[1], reverse=True)
    filtered_candidate_scores = [cs for cs in candidate_scores if cs[0] not in input_paper_ids_set]

    result = {
        'input': input_paper_ids,
        'output': [cs[0] for cs in filtered_candidate_scores[:100]],
    }
    results.append(result)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [10]:
# Write the current `results` list to disk as JSON.
with open('./results/triplet_base_doc2veccosine_sum.json', 'w') as out_file:
    json.dump(results, fp=out_file)

# aan_test_tripletfromref.json

In [7]:
# Load the "triplet from ref" test queries; each entry is iterated below as a
# group of papers carrying 'id', 'title' and 'abstract' keys.
# JSON text is UTF-8 by specification, so the encoding is pinned here instead
# of relying on the platform default, which is not UTF-8 on every system.
with open('./data/aan_test_tripletfromref.json', encoding='utf-8') as f:
    test_set = json.load(f)

In [8]:
# Build one query embedding per group of input papers by inferring a vector
# for the text of all its papers taken together.
input_doc2vec_vectors = []
for input_papers in tqdm(test_set):
    # Join every "<title> <abstract>" with a single space. A separator between
    # consecutive papers is required: without it the last word of one abstract
    # can fuse with the first word of the next title into a single bogus token.
    input_document = ' '.join(
        input_paper['title'] + ' ' + input_paper['abstract']
        for input_paper in input_papers
    )

    input_doc2vec_vectors.append(model.infer_vector(simple_preprocess(input_document)).reshape(1, -1))

# One row per query group.
input_doc2vec_array = np.vstack(input_doc2vec_vectors)

# Row r holds the cosine similarity of query r against every candidate paper,
# in paper_ids order.
score_array = cosine_similarity(input_doc2vec_array, paper_doc2vec_array)

HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))




In [9]:
# Rank every candidate paper for each query group and keep the 100 most similar.
results = []

# zip() has no length, so tqdm is given an explicit total to show real progress.
# Rows are converted to Python lists one at a time rather than calling
# .tolist() on the whole score matrix up front.
for input_papers, score_row in tqdm(zip(test_set, score_array), total=len(test_set)):
    scores = score_row.tolist()
    input_paper_ids = [input_paper['id'] for input_paper in input_papers]
    input_paper_ids_set = set(input_paper_ids)

    # Sort candidates by descending similarity, then drop the query papers themselves.
    candidate_scores = sorted(zip(paper_ids, scores), key=lambda cs: cs[1], reverse=True)
    filtered_candidate_scores = [cs for cs in candidate_scores if cs[0] not in input_paper_ids_set]

    result = {
        'input': input_paper_ids,
        'output': [cs[0] for cs in filtered_candidate_scores[:100]],
    }
    results.append(result)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [10]:
# Write the current `results` list to disk as JSON.
with open('./results/tripletfromref_base_doc2veccosine_concat.json', 'w') as out_file:
    json.dump(results, fp=out_file)

In [11]:
# Score each of the three positions of a query group separately, then add the
# per-position similarity matrices element-wise.
partial_score_arrays = []
for i in tqdm(range(3)):
    # Id of the i-th paper of every query group.
    partial_input_ids = [group[i]['id'] for group in test_set]
    # Look up the embeddings already stored on the full-set records.
    input_doc2vec_vectors = [
        full_set_dict[paper_id]['doc2vec_vector'] for paper_id in partial_input_ids
    ]

    input_doc2vec_array = np.concatenate(input_doc2vec_vectors, axis=0)

    partial_scores = cosine_similarity(input_doc2vec_array, paper_doc2vec_array)
    partial_score_arrays.append(partial_scores)

score_array = sum(partial_score_arrays)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




In [12]:
# Rank every candidate paper for each query group and keep the 100 most similar.
results = []

# zip() has no length, so tqdm is given an explicit total to show real progress.
# Rows are converted to Python lists one at a time rather than calling
# .tolist() on the whole score matrix up front.
for input_papers, score_row in tqdm(zip(test_set, score_array), total=len(test_set)):
    scores = score_row.tolist()
    input_paper_ids = [input_paper['id'] for input_paper in input_papers]
    input_paper_ids_set = set(input_paper_ids)

    # Sort candidates by descending similarity, then drop the query papers themselves.
    candidate_scores = sorted(zip(paper_ids, scores), key=lambda cs: cs[1], reverse=True)
    filtered_candidate_scores = [cs for cs in candidate_scores if cs[0] not in input_paper_ids_set]

    result = {
        'input': input_paper_ids,
        'output': [cs[0] for cs in filtered_candidate_scores[:100]],
    }
    results.append(result)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [13]:
# Write the current `results` list to disk as JSON.
with open('./results/tripletfromref_base_doc2veccosine_sum.json', 'w') as out_file:
    json.dump(results, fp=out_file)