In [3]:
%%capture
import json
from gensim.summarization import bm25
from gensim.utils import tokenize
from tqdm.notebook import tqdm

# source : https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/summarization/bm25.py

In [4]:
# Load the paper collection from aan_full.json; the cells below index it with BM25.
# The encoding is pinned to UTF-8 (the standard JSON encoding) so non-ASCII text is
# decoded the same way on every platform instead of depending on the locale default.
with open('./data/aan_full.json', encoding='utf-8') as f:
    full_set = json.load(f)

In [8]:
# full_set_ids[i] is the id of the paper whose tokens are stored in corpus[i];
# both lists follow the order of full_set, so BM25 scores can be zipped with the ids.
full_set_ids = [paper['id'] for paper in full_set]

# One token list per paper, built from its title followed by its abstract.
corpus = [
    list(tokenize(paper['title'] + ' ' + paper['abstract']))
    for paper in full_set
]

# BM25 index over the tokenised collection.
model = bm25.BM25(corpus)

# aan_test_single.json

In [27]:
# Load the single-paper test queries.
# The encoding is pinned to UTF-8 (the standard JSON encoding) so non-ASCII text is
# decoded the same way on every platform instead of depending on the locale default.
with open('./data/aan_test_single.json', encoding='utf-8') as f:
    test_set = json.load(f)

In [28]:
# For every test paper, rank all papers of the collection by BM25 score against the
# paper's title + abstract and keep the ids of the 100 best candidates.
results = []

for input_paper in tqdm(test_set):
    result = {}
    result['input'] = [input_paper['id']]

    # Tokenise the query with the same tokenizer that produced `corpus`. A plain
    # str.split() keeps punctuation glued to words (e.g. 'parsing.'), and such
    # tokens do not line up with the terms stored in the index.
    document = list(tokenize(input_paper['title'] + ' ' + input_paper['abstract']))
    scores = model.get_scores(document)

    # Pair each score with its paper id, leaving out the query paper itself.
    candidate_scores = [
        (paper_id, score)
        for paper_id, score in zip(full_set_ids, scores)
        if paper_id != input_paper['id']
    ]
    # Highest score first; the sort is stable, so ties keep collection order.
    candidate_scores.sort(key=lambda x: x[1], reverse=True)

    result['output'] = [paper_id for paper_id, _ in candidate_scores[:100]]
    results.append(result)

HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))




In [29]:
# Persist the single-paper rankings (a list of {'input': [...], 'output': [...]} dicts).
with open('./results/single_base_bm25.json', 'w') as out_file:
    json.dump(results, fp=out_file)

# aan_test_triplet.json

In [64]:
# Load the triplet test queries (each entry is a group of input papers).
# The encoding is pinned to UTF-8 (the standard JSON encoding) so non-ASCII text is
# decoded the same way on every platform instead of depending on the locale default.
with open('./data/aan_test_triplet.json', encoding='utf-8') as f:
    test_set = json.load(f)

In [31]:
# "Concat" strategy: merge the title + abstract of all input papers into one query,
# score the whole collection against it and keep the ids of the 100 best candidates.
results = []

for input_papers in tqdm(test_set):
    result = {}
    result['input'] = [input_paper['id'] for input_paper in input_papers]
    input_paper_ids_set = set(result['input'])

    # Join the papers with a space: plain concatenation would fuse the last word of
    # one abstract with the first word of the next title into a single bogus token.
    input_text = ' '.join(
        input_paper['title'] + ' ' + input_paper['abstract']
        for input_paper in input_papers
    )
    # Tokenise the query with the same tokenizer that produced `corpus`. A plain
    # str.split() keeps punctuation glued to words (e.g. 'parsing.'), and such
    # tokens do not line up with the terms stored in the index.
    input_document = list(tokenize(input_text))
    scores = model.get_scores(input_document)

    # Pair each score with its paper id, leaving out the input papers themselves.
    candidate_scores = [
        (paper_id, score)
        for paper_id, score in zip(full_set_ids, scores)
        if paper_id not in input_paper_ids_set
    ]
    # Highest score first; the sort is stable, so ties keep collection order.
    candidate_scores.sort(key=lambda x: x[1], reverse=True)

    result['output'] = [paper_id for paper_id, _ in candidate_scores[:100]]
    results.append(result)

HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))




In [32]:
# Persist the rankings produced by the concatenated-query run on the triplet test set.
with open('./results/triplet_base_bm25_concat.json', 'w') as out_file:
    json.dump(results, fp=out_file)

In [65]:
# "Sum" strategy: score the collection once per input paper, add the per-paper scores
# for every candidate and keep the ids of the 100 candidates with the highest total.
results = []

for input_papers in tqdm(test_set):
    result = {}
    result['input'] = [input_paper['id'] for input_paper in input_papers]
    input_paper_ids_set = set(result['input'])

    # One score list per input paper. Each query is tokenised with the same tokenizer
    # that produced `corpus`; a plain str.split() keeps punctuation glued to words
    # (e.g. 'parsing.'), and such tokens do not line up with the terms in the index.
    partial_scores = [
        model.get_scores(list(tokenize(input_paper['title'] + ' ' + input_paper['abstract'])))
        for input_paper in input_papers
    ]

    # Add the scores candidate by candidate. zip(*...) works for any number of input
    # papers, so the group size is no longer hard-coded to three.
    complete_scores = [sum(per_paper_scores) for per_paper_scores in zip(*partial_scores)]

    # Pair each total with its paper id, leaving out the input papers themselves.
    candidate_scores = [
        (paper_id, score)
        for paper_id, score in zip(full_set_ids, complete_scores)
        if paper_id not in input_paper_ids_set
    ]
    # Highest total first; the sort is stable, so ties keep collection order.
    candidate_scores.sort(key=lambda x: x[1], reverse=True)

    result['output'] = [paper_id for paper_id, _ in candidate_scores[:100]]
    results.append(result)

HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))




In [66]:
# Persist the rankings produced by the summed-scores run on the triplet test set.
with open('./results/triplet_base_bm25_sum.json', 'w') as out_file:
    json.dump(results, fp=out_file)

# aan_test_tripletfromref.json

In [9]:
# Load the "tripletfromref" test queries (each entry is a group of input papers).
# The encoding is pinned to UTF-8 (the standard JSON encoding) so non-ASCII text is
# decoded the same way on every platform instead of depending on the locale default.
with open('./data/aan_test_tripletfromref.json', encoding='utf-8') as f:
    test_set = json.load(f)

In [10]:
# "Concat" strategy: merge the title + abstract of all input papers into one query,
# score the whole collection against it and keep the ids of the 100 best candidates.
results = []

for input_papers in tqdm(test_set):
    result = {}
    result['input'] = [input_paper['id'] for input_paper in input_papers]
    input_paper_ids_set = set(result['input'])

    # Join the papers with a space: plain concatenation would fuse the last word of
    # one abstract with the first word of the next title into a single bogus token.
    input_text = ' '.join(
        input_paper['title'] + ' ' + input_paper['abstract']
        for input_paper in input_papers
    )
    # Tokenise the query with the same tokenizer that produced `corpus`. A plain
    # str.split() keeps punctuation glued to words (e.g. 'parsing.'), and such
    # tokens do not line up with the terms stored in the index.
    input_document = list(tokenize(input_text))
    scores = model.get_scores(input_document)

    # Pair each score with its paper id, leaving out the input papers themselves.
    candidate_scores = [
        (paper_id, score)
        for paper_id, score in zip(full_set_ids, scores)
        if paper_id not in input_paper_ids_set
    ]
    # Highest score first; the sort is stable, so ties keep collection order.
    candidate_scores.sort(key=lambda x: x[1], reverse=True)

    result['output'] = [paper_id for paper_id, _ in candidate_scores[:100]]
    results.append(result)

HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))




In [11]:
# Persist the rankings produced by the concatenated-query run on the tripletfromref test set.
with open('./results/tripletfromref_base_bm25_concat.json', 'w') as out_file:
    json.dump(results, fp=out_file)

In [12]:
# "Sum" strategy: score the collection once per input paper, add the per-paper scores
# for every candidate and keep the ids of the 100 candidates with the highest total.
results = []

for input_papers in tqdm(test_set):
    result = {}
    result['input'] = [input_paper['id'] for input_paper in input_papers]
    input_paper_ids_set = set(result['input'])

    # One score list per input paper. Each query is tokenised with the same tokenizer
    # that produced `corpus`; a plain str.split() keeps punctuation glued to words
    # (e.g. 'parsing.'), and such tokens do not line up with the terms in the index.
    partial_scores = [
        model.get_scores(list(tokenize(input_paper['title'] + ' ' + input_paper['abstract'])))
        for input_paper in input_papers
    ]

    # Add the scores candidate by candidate. zip(*...) works for any number of input
    # papers, so the group size is no longer hard-coded to three.
    complete_scores = [sum(per_paper_scores) for per_paper_scores in zip(*partial_scores)]

    # Pair each total with its paper id, leaving out the input papers themselves.
    candidate_scores = [
        (paper_id, score)
        for paper_id, score in zip(full_set_ids, complete_scores)
        if paper_id not in input_paper_ids_set
    ]
    # Highest total first; the sort is stable, so ties keep collection order.
    candidate_scores.sort(key=lambda x: x[1], reverse=True)

    result['output'] = [paper_id for paper_id, _ in candidate_scores[:100]]
    results.append(result)

HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))




In [13]:
# Persist the rankings produced by the summed-scores run on the tripletfromref test set.
with open('./results/tripletfromref_base_bm25_sum.json', 'w') as out_file:
    json.dump(results, fp=out_file)