In [1]:
%%capture
import json
import os

import numpy as np
import scipy as sp
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.notebook import tqdm

In [2]:
# Load the full paper collection. JSON text is UTF-8 by specification, so the
# encoding is pinned instead of relying on the platform's locale default.
with open('./data/aan_full.json', encoding='utf-8') as f:
    full_set = json.load(f)

In [3]:
# Each paper is represented by its title followed by its abstract.
corpus = [ref['title'] + ' ' + ref['abstract'] for ref in full_set]

# Learn the vocabulary and IDF weights on the whole collection, ignoring
# English stop words.
vectorizer = TfidfVectorizer(stop_words='english').fit(corpus)

# `vocabulary_` maps every learned term to its column index, so its length is
# the number of features. It is used instead of `get_feature_names()`, which
# was deprecated in scikit-learn 1.0 and removed in 1.2.
print(len(vectorizer.vocabulary_))

81817


In [4]:
# Vectorise every paper in one batched call instead of one `transform` call
# per paper; each TF-IDF row is computed independently of the others, so the
# per-paper vectors are unchanged.
documents = [ref['title'] + ' ' + ref['abstract'] for ref in full_set]
tfidf_matrix = vectorizer.transform(documents)

for row_index, ref in enumerate(tqdm(full_set)):
    # Slicing a single row keeps the vector as a one-row sparse matrix, which
    # is what the later `sp.sparse.vstack` calls consume.
    ref['tfidf_vector'] = tfidf_matrix[row_index]

HBox(children=(FloatProgress(value=0.0, max=15366.0), HTML(value='')))




In [5]:
# Lookup table from paper id to the paper record. If an id occurs more than
# once, the last record with that id is the one kept.
full_set_dict = {paper['id']: paper for paper in full_set}

In [6]:
# Collect ids and TF-IDF rows in one pass so that position k in `paper_ids`
# always corresponds to position k in `paper_tfidf_vectors`.
paper_ids = []
paper_tfidf_vectors = []
for paper in full_set:
    paper_ids.append(paper['id'])
    paper_tfidf_vectors.append(paper['tfidf_vector'])

# Stack the per-paper sparse rows into one sparse matrix (one row per paper).
paper_tfidf_array = sp.sparse.vstack(paper_tfidf_vectors)

# aan_test_tripletfromref.json

In [7]:
# Load the evaluation queries. JSON text is UTF-8 by specification, so the
# encoding is pinned instead of relying on the platform's locale default.
with open('./data/aan_test_tripletfromref.json', encoding='utf-8') as f:
    test_set = json.load(f)

In [8]:
def _jaccard(left, right):
    """Return the Jaccard similarity of two sets.

    The result is ``len(left & right) / len(left | right)``. When both sets
    are empty the union is empty as well; 0.0 is returned in that case
    instead of dividing by zero.
    """
    union_size = len(left | right)
    if union_size == 0:
        return 0.0
    return len(left & right) / union_size


# The set of papers a candidate references or is cited by does not depend on
# the query, so it is built once here rather than once per query.
candidate_link_sets = [
    set(paper['references'] + paper['citations']) for paper in full_set
]

# jaccard_scores_list[q][k] is the sum, over the input papers of query q, of
# the Jaccard similarity between that input paper's reference/citation set and
# the one of the k-th paper in full_set.
jaccard_scores_list = []
for input_papers in tqdm(test_set):
    input_ids_list = [
        set(input_paper['references'] + input_paper['citations'])
        for input_paper in input_papers
    ]

    # Sum over every input paper of the query, in input order (no hard-coded
    # count of three).
    current_scores = [
        sum(_jaccard(candidate_ids, input_ids) for input_ids in input_ids_list)
        for candidate_ids in candidate_link_sets
    ]

    jaccard_scores_list.append(current_scores)

HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))




In [9]:
# Sanity check: one list of scores was appended per entry of test_set.
len(jaccard_scores_list)

3000

In [10]:
# Sanity check: each score list holds one entry per paper in full_set.
len(jaccard_scores_list[0])

15366

In [11]:
# For each of the three positions in a query, compare the paper at that
# position (for every query at once) against all papers in the collection.
partial_score_arrays = []
for slot in tqdm(range(3)):
    slot_ids = [query[slot]['id'] for query in test_set]
    slot_vectors = [full_set_dict[slot_id]['tfidf_vector'] for slot_id in slot_ids]

    # One row per query, holding the TF-IDF vector of its paper at `slot`.
    slot_matrix = sp.sparse.vstack(slot_vectors)

    slot_similarities = cosine_similarity(slot_matrix, paper_tfidf_array)
    partial_score_arrays.append(slot_similarities)

# Element-wise sum of the three similarity matrices.
tfidfcosine_score_array = sum(partial_score_arrays)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




In [12]:
# Sanity check: rows follow test_set, columns follow the rows of paper_tfidf_array.
tfidfcosine_score_array.shape

(3000, 15366)

In [13]:
# Number of recommendations kept per query.
TOP_K = 100

results = []

# zip() has no length, so the number of queries is passed to tqdm explicitly;
# without it the progress bar cannot show a total.
score_rows = zip(test_set, tfidfcosine_score_array, jaccard_scores_list)
for input_papers, tfidfcosine_scores, jaccard_scores in tqdm(score_rows, total=len(test_set)):
    input_ids = [input_paper['id'] for input_paper in input_papers]
    input_paper_ids_set = set(input_ids)

    # Convert one row at a time to Python floats instead of materialising the
    # whole score matrix as nested lists up front.
    final_scores = [
        tf_score + jac_score
        for tf_score, jac_score in zip(tfidfcosine_scores.tolist(), jaccard_scores)
    ]

    # Drop the query's own papers, then rank by combined score (highest
    # first). The sort is stable, so ties keep the order of `paper_ids`.
    candidate_scores = [
        cs for cs in zip(paper_ids, final_scores) if cs[0] not in input_paper_ids_set
    ]
    candidate_scores.sort(key=lambda x: x[1], reverse=True)

    results.append({
        'input': input_ids,
        'output': [cs[0] for cs in candidate_scores[:TOP_K]],
    })

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [14]:
# Make sure the output directory exists so that the write cannot fail with
# FileNotFoundError after all the scores have been computed.
os.makedirs('./results', exist_ok=True)

# Persist the ranked recommendations, one {'input', 'output'} record per query.
with open('./results/tripletfromref_custom_tfidfcosine_jaccard.json', 'w') as f:
    json.dump(results, f)