In [3]:
%%capture
import json

import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.notebook import tqdm

# source : https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html#sphx-glr-auto-examples-tutorials-run-doc2vec-lee-py

In [4]:
# Read the train and test splits from their JSON files.
with open('./datasets/train_set_aan.json') as train_file:
    train_set = json.load(train_file)

with open('./datasets/test_set_aan.json') as test_file:
    test_set = json.load(test_file)

# Combined list (train entries first, then test entries). It holds the same
# entry objects as the two splits, not copies.
full_set = [*train_set, *test_set]

In [5]:
# One text per paper: title and abstract joined by a space.
corpus = [ref['title'] + ' ' + ref['abstract'] for ref in full_set]

# TaggedDocument expects `words` to be a list of string tokens. Handing it the
# raw string makes the model iterate over individual characters, so each text
# is tokenized first. The tag is the paper's position in `full_set`.
documents = [
    TaggedDocument(words=simple_preprocess(text), tags=[i])
    for i, text in enumerate(corpus)
]

# Train the Doc2Vec model on the whole collection (train + test papers).
model = Doc2Vec(documents, vector_size=100, window=2, min_count=1, workers=4)

In [15]:
# Attach an inferred Doc2Vec embedding to every paper. `full_set` was built by
# concatenating `train_set` and `test_set`, so the entries updated here are the
# same objects reachable through those two lists.
for ref in tqdm(full_set):
    # infer_vector expects a list of word tokens. Wrapping the raw string in a
    # list would present the entire text to the model as one single token.
    tokens = simple_preprocess(ref['title'] + ' ' + ref['abstract'])
    # Stored as a single-row 2-D array so it can be passed straight to
    # cosine_similarity, which works on 2-D inputs.
    ref['doc2vec_vector'] = model.infer_vector(tokens).reshape(1, -1)

HBox(children=(FloatProgress(value=0.0, max=15602.0), HTML(value='')))




In [16]:
# Sanity check: display the shape of the vector stored on the first paper.
first_ref = full_set[0]
first_ref['doc2vec_vector'].shape

(1, 100)

In [17]:
# Ground truth for evaluation: each test paper id mapped to its reference list.
ground_truth = {ref['id']: ref['references'] for ref in test_set}
recommendation = {}
top_k = 100  # number of candidates kept per test paper

# The candidate pool (training papers) is the same for every query, so the ids
# and the stacked vector matrix are built once instead of once per test paper.
train_ids = [ref['id'] for ref in train_set]
# Every stored vector is flattened to one row before stacking -> (n_train, dim).
train_matrix = (
    np.vstack([ref['doc2vec_vector'].reshape(1, -1) for ref in train_set])
    if train_set else None
)

for input_ref in tqdm(test_set):
    if train_matrix is None:
        # No candidates to rank: same result as slicing an empty score list.
        recommendation[input_ref['id']] = []
        continue

    # One vectorized call scores the query against every candidate; row 0 of
    # the (1, n_train) result is a 1-D array of similarities.
    scores = cosine_similarity(input_ref['doc2vec_vector'], train_matrix)[0]

    # Stable argsort on the negated scores gives descending similarity while
    # keeping the original candidate order on ties.
    top_indices = np.argsort(-scores, kind='stable')[:top_k]
    recommendation[input_ref['id']] = [train_ids[i] for i in top_indices]

# Difference in entry counts; 0 means both dicts have the same number of keys.
len(ground_truth) - len(recommendation)

HBox(children=(FloatProgress(value=0.0, max=3085.0), HTML(value='')))




0

In [18]:
# Persist the evaluation inputs as JSON: ground truth first, then the
# recommendations, each to its own file.
eval_outputs = (
    ('./evaloffsets/base_doc2veccosine_gt.json', ground_truth),
    ('./evaloffsets/base_doc2veccosine_rec.json', recommendation),
)

for out_path, payload in eval_outputs:
    with open(out_path, 'w') as out_file:
        json.dump(payload, out_file)