In [1]:
%%capture
import json
from pathlib import Path

import numpy as np
from tqdm.notebook import tqdm

In [2]:
# Configuration for the AAN dataset.
# NOTE: a later configuration cell re-assigns these same three names to the
# DBLP dataset, so on a top-to-bottom run the values below are overridden.
# Run only the configuration cell for the dataset you want to evaluate.
DATASET_NAME = 'aan'

# Full candidate corpus, the test queries, and where the recommendations go.
FULL_SET_PATH = './data/{}_full.json'.format(DATASET_NAME)
TEST_SET_PATH = './data/{}_test.json'.format(DATASET_NAME)
OUTPUT_PATH = './results/custom_fosjaccard_{}.json'.format(DATASET_NAME)

In [12]:
# Configuration for the DBLP dataset.
# NOTE: this cell re-assigns the same three names as the earlier AAN
# configuration cell, so when both are run the DBLP paths are the ones used.
DATASET_NAME = 'dblp'

# Full candidate corpus, the test queries, and where the recommendations go.
FULL_SET_PATH = './data/{}_full.json'.format(DATASET_NAME)
TEST_SET_PATH = './data/{}_test.json'.format(DATASET_NAME)
OUTPUT_PATH = './results/custom_fosjaccard_{}.json'.format(DATASET_NAME)

In [13]:
# Load the full candidate corpus. Each record is indexed by 'id' below and by
# 'fos' in the scoring cell, so every paper is expected to carry both keys.
# The encoding is given explicitly: JSON is UTF-8, while open() would
# otherwise fall back to the platform default (e.g. cp1252 on Windows).
with open(FULL_SET_PATH, encoding='utf-8') as f:
    full_set = json.load(f)

# Lookup helpers: paper id -> paper record, and the ids in corpus order.
# If an id occurs more than once, the last record wins in the dict.
full_set_dict = {paper['id']: paper for paper in full_set}
paper_ids = [paper['id'] for paper in full_set]

In [14]:
# Load the test queries. Each element is a group of input papers; the later
# cells read 'fos' and 'id' from every paper in a group.
# The encoding is given explicitly: JSON is UTF-8, while open() would
# otherwise fall back to the platform default (e.g. cp1252 on Windows).
with open(TEST_SET_PATH, encoding='utf-8') as f:
    test_set = json.load(f)

In [15]:
# Score every paper in the corpus against each test query using the Jaccard
# similarity |A & B| / |A | B| of their field-of-study ('fos') sets.
# Result: candidate_score_tuples_list[i] is a list of (paper_id, score)
# tuples for test query i, in corpus order (one tuple per paper in full_set).

# A candidate's fos set does not depend on the query, so build each set once
# here instead of once per (query, candidate) pair inside the loop.
candidate_fos_sets = [(paper['id'], set(paper['fos'])) for paper in full_set]

candidate_score_tuples_list = []
for input_papers in tqdm(test_set):
    # A query is a group of papers; pool all of their fos into a single set.
    input_fos_set = set(fos for paper in input_papers for fos in paper['fos'])

    candidate_score_tuples = []
    for candidate_id, candidate_fos_set in candidate_fos_sets:
        union_size = len(input_fos_set | candidate_fos_set)
        if union_size:
            jaccard_similarity = len(input_fos_set & candidate_fos_set) / union_size
        else:
            # Both sets are empty: there is no evidence of similarity, so score
            # 0.0 rather than dividing by zero and aborting the whole run.
            jaccard_similarity = 0.0
        candidate_score_tuples.append((candidate_id, jaccard_similarity))

    candidate_score_tuples_list.append(candidate_score_tuples)

HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))




In [16]:
# Sanity check: the scoring loop appends one score list per test query.
len(candidate_score_tuples_list)

2000

In [17]:
# Sanity check: each score list holds one (paper_id, score) tuple per paper in full_set.
len(candidate_score_tuples_list[0])

22726

In [18]:
# Peek at the first ten (paper_id, score) tuples for the first query.
# At this point the list is still in corpus order (not yet sorted by score).
candidate_score_tuples_list[0][:10]

[('6468916', 0.15625),
 ('12824513', 0.15625),
 ('16066432', 0.16129032258064516),
 ('19790595', 0.125),
 ('20948110', 0.09090909090909091),
 ('22861983', 0.17857142857142858),
 ('24640360', 0.1935483870967742),
 ('24836904', 0.125),
 ('28988658', 0.13793103448275862),
 ('34559920', 0.125)]

In [19]:
# Build the final recommendation lists: for every query, keep the ids of the
# TOP_K highest-scoring candidates that are not themselves part of the query.
TOP_K = 100  # number of recommendations kept per query

results = []

# zip() has no length, so tqdm cannot infer a total on its own; pass it
# explicitly so the progress bar shows real progress instead of a bare counter.
num_queries = min(len(test_set), len(candidate_score_tuples_list))
for input_papers, candidate_scores in tqdm(
    zip(test_set, candidate_score_tuples_list), total=num_queries
):
    result = {}
    result['input'] = [input_paper['id'] for input_paper in input_papers]
    input_paper_ids_set = set(result['input'])

    # Highest score first. This sorts IN PLACE, so the lists held by
    # candidate_score_tuples_list are reordered as a side effect. The sort is
    # stable: candidates with equal scores keep their corpus order.
    candidate_scores.sort(key=lambda x: x[1], reverse=True)

    # Never recommend a paper that is already one of the query's inputs.
    filtered_candidate_scores = [cs for cs in candidate_scores if cs[0] not in input_paper_ids_set]

    result['output'] = [cs[0] for cs in filtered_candidate_scores[:TOP_K]]
    results.append(result)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [20]:
# Peek at the first query's scores again: the previous cell sorted each score
# list in place, so these are now the ten highest-scoring candidates.
candidate_score_tuples_list[0][:10]

[('2963830382', 0.48),
 ('2606987267', 0.44),
 ('2793173694', 0.44),
 ('2951755740', 0.44),
 ('2962928871', 0.37037037037037035),
 ('2775682873', 0.3333333333333333),
 ('2109992539', 0.32142857142857145),
 ('2770997764', 0.3103448275862069),
 ('2734558172', 0.2857142857142857),
 ('2795374598', 0.2857142857142857)]

In [21]:
# Persist the recommendations as JSON: a list of {'input': [...], 'output': [...]}
# records, one per test query.
# Create the output directory first so that a fresh checkout without the
# results folder does not fail with FileNotFoundError after the long scoring run.
Path(OUTPUT_PATH).parent.mkdir(parents=True, exist_ok=True)
with open(OUTPUT_PATH, 'w') as f:
    json.dump(results, f)