In [1]:
%%capture
import json
import time

from scipy.sparse import vstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.notebook import tqdm

In [2]:
# Load the train and test splits from JSON.
# The encoding is pinned to UTF-8 (the encoding JSON interchange files use) so
# that decoding does not depend on the platform's locale default.
with open('./datasets/train_set_aan.json', encoding='utf-8') as train_file:
    train_set = json.load(train_file)

with open('./datasets/test_set_aan.json', encoding='utf-8') as test_file:
    test_set = json.load(test_file)

# Concatenation of both splits; train entries come first, then test entries.
full_set = train_set + test_set

In [3]:
# One text document per entry: title and abstract joined by a single space.
corpus = [ref['title'] + ' ' + ref['abstract'] for ref in full_set]

# Fit the TF-IDF vocabulary and IDF weights on every document in full_set,
# with English stop words removed.
vectorizer = TfidfVectorizer(stop_words='english').fit(corpus)

# Vocabulary size. `vocabulary_` (term -> column index) is used instead of
# `get_feature_names()`, which was removed in scikit-learn 1.2.
print(len(vectorizer.vocabulary_))

82950


In [4]:
# Vectorise all documents in a single batched call instead of invoking
# `transform` once per document; the per-call overhead dominates otherwise.
documents = [ref['title'] + ' ' + ref['abstract'] for ref in full_set]
tfidf_matrix = vectorizer.transform(documents)

for row_index, ref in enumerate(tqdm(full_set)):
    # A one-row slice keeps the 2-D (1 x n_features) sparse shape that
    # transforming a single-document list would have produced.
    ref['tfidf_vector'] = tfidf_matrix[row_index:row_index + 1]

HBox(children=(FloatProgress(value=0.0, max=15602.0), HTML(value='')))




In [5]:
# Number of top-ranked candidates kept for each test entry.
TOP_K = 100

# Expected references per test entry, keyed by entry id.
ground_truth = {ref['id']: ref['references'] for ref in test_set}
recommendation = {}

# Loop-invariant candidate data, built once: the ids of all train entries and
# their TF-IDF rows stacked into one sparse matrix (row i <-> train_ids[i]).
train_ids = [ref['id'] for ref in train_set]
train_matrix = vstack([ref['tfidf_vector'] for ref in train_set]).tocsr()

for input_ref in tqdm(test_set):
    # Score the query against every candidate in one call; the result has
    # shape (1, len(train_set)), so row 0 holds one similarity per candidate.
    scores = cosine_similarity(input_ref['tfidf_vector'], train_matrix)[0].tolist()

    # Stable descending sort on plain floats: candidates with equal scores
    # keep their train_set order.
    ranked = sorted(zip(train_ids, scores), key=lambda pair: pair[1], reverse=True)

    recommendation[input_ref['id']] = [ref_id for ref_id, _ in ranked[:TOP_K]]


# Sanity check: 0 means every ground-truth id received a recommendation list.
len(ground_truth) - len(recommendation)

HBox(children=(FloatProgress(value=0.0, max=3085.0), HTML(value='')))




0

In [6]:
# Write the ground truth and the recommendations to their JSON files.
outputs = (
    ('./evaloffsets/base_tfidfcosine_gt.json', ground_truth),
    ('./evaloffsets/base_tfidfcosine_rec.json', recommendation),
)

for out_path, payload in outputs:
    with open(out_path, 'w') as out_file:
        json.dump(payload, out_file)