In [1]:
%%capture
import json
from pathlib import Path

from tqdm.notebook import tqdm

In [2]:
# Paths for the AAN dataset: the full corpus, the test queries, and the file
# the ranked results are written to.
# NOTE: the next cell assigns the same three names for the DBLP dataset; if
# both cells are executed, whichever ran last determines the paths used below.
FULL_SET_PATH = './data/aan_full.json'
TEST_SET_PATH = './data/aan_test.json'
OUTPUT_PATH = './results/custom_citationcosine_aan.json'

In [14]:
# Paths for the DBLP dataset: the full corpus, the test queries, and the file
# the ranked results are written to.
# NOTE: these rebind the same three names as the AAN cell above, so running
# this cell switches the rest of the notebook to DBLP.
FULL_SET_PATH = './data/dblp_full.json'
TEST_SET_PATH = './data/dblp_test.json'
OUTPUT_PATH = './results/custom_citationcosine_dblp.json'

In [15]:
# Load the full paper corpus (the pool of recommendation candidates).
# The encoding is pinned to UTF-8 because open() otherwise falls back to the
# platform locale, which can mis-decode or reject non-ASCII text in the JSON.
with open(FULL_SET_PATH, encoding='utf-8') as f:
    full_set = json.load(f)

# Lookup table from paper id to its record, plus the ids in corpus order.
full_set_dict = {paper['id']: paper for paper in full_set}
paper_ids = [paper['id'] for paper in full_set]

In [16]:
# Load the test queries. The encoding is pinned to UTF-8 because open()
# otherwise falls back to the platform locale when decoding the JSON text.
with open(TEST_SET_PATH, encoding='utf-8') as f:
    test_set = json.load(f)

In [17]:
def compute_citation_cosine_similarity(paper_set1, paper_set2):
    """Return the cosine similarity between two sets of paper ids.

    The score is ``|A & B| / sqrt(|A| * |B|)``: the number of shared papers
    divided by the geometric mean of the two set sizes. It is 1.0 for
    identical non-empty sets and 0.0 for disjoint sets.

    Args:
        paper_set1: A ``set`` of paper ids.
        paper_set2: A ``set`` of paper ids.

    Returns:
        A float in ``[0.0, 1.0]``. If either set is empty the similarity is
        defined as 0.0 (nothing can be shared) instead of dividing by zero.
    """
    # An empty set makes the denominator zero; treat that as "no similarity".
    if not paper_set1 or not paper_set2:
        return 0.0

    shared_count = len(paper_set1.intersection(paper_set2))
    degree_geometric_mean = (len(paper_set1) * len(paper_set2)) ** 0.5

    return shared_count / degree_geometric_mean

In [18]:
# Sanity checks for compute_citation_cosine_similarity. These are assertions
# so that a regression stops a full re-run instead of relying on someone
# reading the displayed value.

# Disjoint sets share nothing.
assert compute_citation_cosine_similarity({'paper1'}, {'paper2'}) == 0.0
# One shared paper, two papers on each side: 1 / sqrt(2 * 2) = 0.5.
assert compute_citation_cosine_similarity({'paper1', 'paper2'}, {'paper2', 'paper3'}) == 0.5

# Identical sets must score exactly 1.0.
paper_set1 = {'paper1', 'paper2', 'paper3'}
paper_set2 = {'paper1', 'paper2', 'paper3'}
assert compute_citation_cosine_similarity(paper_set1, paper_set2) == 1.0

compute_citation_cosine_similarity(paper_set1, paper_set2)

1.0

In [19]:
# Score every paper in the corpus against every test query.
#
# Each candidate's neighbourhood (its references plus its citations) does not
# depend on the query, so it is built once here instead of once per
# (query, candidate) pair inside the loop.
candidate_ref_cit_sets = [
    (paper['id'], set(paper['references'] + paper['citations']))
    for paper in full_set
]

# One entry per test query: a list of (candidate id, score) tuples in
# full_set order.
candidate_score_tuples_list = []
for input_papers in tqdm(test_set):
    # A query is a group of input papers; merge their neighbourhoods into a
    # single set of paper ids.
    input_ref_cit_set = set()
    for input_paper in input_papers:
        input_ref_cit_set.update(input_paper['references'] + input_paper['citations'])

    candidate_score_tuples = [
        (candidate_id, compute_citation_cosine_similarity(input_ref_cit_set, candidate_ref_cit_set))
        for candidate_id, candidate_ref_cit_set in candidate_ref_cit_sets
    ]

    candidate_score_tuples_list.append(candidate_score_tuples)

HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))




In [20]:
# Number of scored queries: the scoring loop appends one list per test-set entry.
len(candidate_score_tuples_list)

2000

In [21]:
# Number of (paper id, score) tuples for the first query: one per paper in full_set.
len(candidate_score_tuples_list[0])

22726

In [22]:
# Peek at the first ten (paper id, score) tuples for the first query. At this
# point in the notebook they are still in full_set order, not ranked by score.
candidate_score_tuples_list[0][:10]

[('6468916', 0.0),
 ('12824513', 0.0),
 ('16066432', 0.0),
 ('19790595', 0.0),
 ('20948110', 0.0),
 ('22861983', 0.0),
 ('24640360', 0.0),
 ('24836904', 0.0),
 ('28988658', 0.0),
 ('34559920', 0.0)]

In [23]:
# Number of recommendations kept per test query.
TOP_K = 100

results = []

# zip() has no length, so tqdm needs an explicit total to show real progress
# instead of an indeterminate bar.
for candidate_scores, input_papers in tqdm(
    zip(candidate_score_tuples_list, test_set), total=len(test_set)
):
    result = {}
    result['input'] = [input_paper['id'] for input_paper in input_papers]
    input_paper_ids_set = set(result['input'])

    # Rank candidates by score, highest first. The sort is stable, so ties
    # keep their full_set order. It is deliberately done in place: it reorders
    # the lists held in candidate_score_tuples_list, which later cells display.
    candidate_scores.sort(key=lambda x: x[1], reverse=True)
    # Never recommend a paper that is part of the query itself.
    filtered_candidate_scores = [cs for cs in candidate_scores if cs[0] not in input_paper_ids_set]

    result['output'] = [cs[0] for cs in filtered_candidate_scores[:TOP_K]]
    results.append(result)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [24]:
# Same slice as inspected before ranking. The ranking cell sorts each score
# list in place, so this now shows the ten highest-scoring candidates for the
# first query (query papers themselves are not filtered out of this list).
candidate_score_tuples_list[0][:10]

[('2951755740', 0.8459824315609656),
 ('2606987267', 0.6340037731068526),
 ('2624503621', 0.4983184185907058),
 ('2950642167', 0.3549711258126779),
 ('2556802233', 0.2993924754260479),
 ('2560609797', 0.27517203630084297),
 ('2792096654', 0.26403934479377983),
 ('2793173694', 0.2619684159977919),
 ('2769473888', 0.25332019855244947),
 ('2989599677', 0.24715576637149037)]

In [25]:
# Create the destination folder if needed, so a missing directory does not
# discard the results of the long scoring run above.
Path(OUTPUT_PATH).parent.mkdir(parents=True, exist_ok=True)

# Write one {'input': [...], 'output': [...]} record per test query.
with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
    json.dump(results, f)