In [1]:
%%capture
import json
import os

import numpy as np
from tqdm.notebook import tqdm

In [2]:
# Dataset configuration (AAN): the full paper collection and the test queries
# (both JSON inputs), plus the JSON file the ranked results are written to.
# NOTE: the DBLP configuration cell further down reassigns these same three
# names, so when both cells are run in order the DBLP values are the ones used.
FULL_SET_PATH = './data/aan_full_with_fos_w.json'
TEST_SET_PATH = './data/aan_test.json'
OUTPUT_PATH = './results/custom_foscosine_aan.json'

In [15]:
# Dataset configuration (DBLP): the full paper collection and the test queries
# (both JSON inputs), plus the JSON file the ranked results are written to.
# NOTE: these assignments replace the AAN paths set in the earlier
# configuration cell; run only the cell for the dataset being evaluated.
FULL_SET_PATH = './data/dblp_full_with_fos_w.json'
TEST_SET_PATH = './data/dblp_test.json'
OUTPUT_PATH = './results/custom_foscosine_dblp.json'

In [16]:
# Load the full paper collection. JSON is UTF-8 by specification, so the
# encoding is stated explicitly instead of relying on the platform default
# (which is not UTF-8 on every system).
with open(FULL_SET_PATH, encoding='utf-8') as f:
    full_set = json.load(f)

# Index the papers by id for direct lookup; if an id occurs more than once,
# the last occurrence wins. paper_ids keeps the ids in file order.
full_set_dict = {paper['id']: paper for paper in full_set}
paper_ids = [paper['id'] for paper in full_set]

In [17]:
# Load the test queries. JSON is UTF-8 by specification, so the encoding is
# stated explicitly instead of relying on the platform default.
with open(TEST_SET_PATH, encoding='utf-8') as f:
    test_set = json.load(f)

In [18]:
def merge_fos_lists(papers):
    """Merge the weighted field-of-study lists of several papers.

    Args:
        papers: iterable of dicts, each with a 'fos_w' list whose entries are
            dicts carrying a 'name' and a weight 'w'.

    Returns:
        dict mapping each field name to the mean of the weights recorded for
        it. The mean is taken only over the entries that mention the name;
        papers that lack it do not contribute a zero. Keys appear in order of
        first occurrence.
    """
    # Gather every weight seen for each field name.
    weights_by_name = {}
    for entry in (item for paper in papers for item in paper['fos_w']):
        weights_by_name.setdefault(entry['name'], []).append(entry['w'])

    # Collapse each list of weights into its average.
    return {name: np.mean(weights) for name, weights in weights_by_name.items()}

In [19]:
# testing merge_fos_lists
list1 = [{'name':'name1','w':0}, {'name':'name2','w':0}, {'name':'name3','w':0}]
list2 = [{'name':'name2','w':1}, {'name':'name3','w':1}, {'name':'name4','w':1}]

# Fail loudly on a regression instead of relying on someone eyeballing the
# displayed value: names present in both lists are averaged, names present in
# only one list keep their single weight.
assert merge_fos_lists([{'fos_w':list1}, {'fos_w':list2}]) == {
    'name1': 0.0, 'name2': 0.5, 'name3': 0.5, 'name4': 1.0
}

merge_fos_lists([{'fos_w':list1}, {'fos_w':list2}])

{'name1': 0.0, 'name2': 0.5, 'name3': 0.5, 'name4': 1.0}

In [20]:
def compute_cosine_similarity(fos_dict1, fos_dict2):
    """Cosine similarity between two sparse vectors stored as name -> weight dicts.

    Args:
        fos_dict1: dict mapping field name to numeric weight.
        fos_dict2: dict mapping field name to numeric weight.

    Returns:
        The dot product over the names shared by both dicts, divided by the
        product of the two Euclidean norms. Returns 0.0 when either vector has
        zero norm (empty dict or all-zero weights), where the cosine is
        undefined.
    """
    # Only names present in both dicts contribute to the dot product.
    dot_product = sum(
        weight1 * fos_dict2[name1]
        for name1, weight1 in fos_dict1.items()
        if name1 in fos_dict2
    )

    norm1 = sum(weight * weight for weight in fos_dict1.values()) ** 0.5
    norm2 = sum(weight * weight for weight in fos_dict2.values()) ** 0.5

    denominator = norm1 * norm2
    # Without this guard a zero norm raises ZeroDivisionError for plain floats
    # and yields NaN for numpy floats; NaN scores make a later sort by score
    # order-dependent and unreliable.
    if denominator == 0:
        return 0.0

    return dot_product / denominator

In [21]:
# testing compute_cosine_similarity
fos_dict1 = {'name1': 0.0, 'name2': 0.5, 'name3': 0.5, 'name4': 1.0}
fos_dict2 = {'name1': 0.0, 'name2': 0.5, 'name3': 0.5, 'name4': 1.0}

# Identical vectors must have cosine similarity 1; compare with a tolerance
# because floating-point rounding gives a value a hair off 1.0.
assert np.isclose(compute_cosine_similarity(fos_dict1, fos_dict2), 1.0)

compute_cosine_similarity(fos_dict1, fos_dict2)

1.0000000000000002

In [22]:
# A candidate's name -> weight mapping does not depend on the query, so build
# it once per paper here instead of once per (query, candidate) pair.
candidate_fos_dicts = [
    (paper['id'], {t['name']: t['w'] for t in paper['fos_w']})
    for paper in full_set
]

# For every test query, score every paper in the full set against the merged
# field-of-study profile of the query's input papers. The result has one list
# per query, each holding (paper id, cosine similarity) tuples in full_set order.
candidate_score_tuples_list = []
for input_papers in tqdm(test_set):
    # Test entries only need an 'id'; the weighted fields are looked up in full_set_dict.
    input_fos_dict = merge_fos_lists([full_set_dict[paper['id']] for paper in input_papers])

    candidate_score_tuples = [
        (candidate_id, compute_cosine_similarity(input_fos_dict, candidate_fos_dict))
        for candidate_id, candidate_fos_dict in candidate_fos_dicts
    ]

    candidate_score_tuples_list.append(candidate_score_tuples)

HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))




In [23]:
# Sanity check: one list of candidate scores was appended per entry of test_set.
len(candidate_score_tuples_list)

2000

In [24]:
# Sanity check: the first query has one (paper id, score) tuple per paper in full_set.
len(candidate_score_tuples_list[0])

22726

In [25]:
# Peek at the first ten (paper id, cosine similarity) tuples of the first query.
# When cells are run in order, the list is still in full_set order at this point.
candidate_score_tuples_list[0][:10]

[('6468916', 0.2462609911408718),
 ('12824513', 0.21308908819188255),
 ('16066432', 0.25215196973939374),
 ('19790595', 0.19793567174173568),
 ('20948110', 0.1463034732055883),
 ('22861983', 0.32956461818199273),
 ('24640360', 0.2760641358530575),
 ('24836904', 0.17452447280132255),
 ('28988658', 0.23537007533879512),
 ('34559920', 0.16594477086745157)]

In [26]:
# Number of top-ranked candidates kept per query.
TOP_K = 100

results = []

# zip() has no length, so tqdm needs an explicit total to show real progress
# instead of an indeterminate bar.
for input_papers, candidate_scores in tqdm(zip(test_set, candidate_score_tuples_list),
                                           total=len(test_set)):
    result = {}
    result['input'] = [input_paper['id'] for input_paper in input_papers]
    input_paper_ids_set = set(result['input'])

    # Sorts in place, best score first. This deliberately mutates the lists in
    # candidate_score_tuples_list, so later inspection of that variable shows
    # ranked order. The sort is stable, so ties keep their original order.
    candidate_scores.sort(key=lambda x: x[1], reverse=True)

    # Drop the query's own papers before cutting to the top TOP_K, so the
    # output still has TOP_K entries whenever enough candidates exist.
    filtered_candidate_scores = [cs for cs in candidate_scores if cs[0] not in input_paper_ids_set]

    result['output'] = [cs[0] for cs in filtered_candidate_scores[:TOP_K]]
    results.append(result)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [27]:
# Same expression as the earlier peek, but the ranking cell sorted each list in
# place by descending score, so this now shows the ten best-scoring candidates
# for the first query (input papers included; they are only filtered in `results`).
candidate_score_tuples_list[0][:10]

[('2963830382', 0.7133015006016702),
 ('2793173694', 0.6871250749253732),
 ('2951755740', 0.6333915450842204),
 ('2606987267', 0.6101222266557221),
 ('2981138795', 0.5322518639531668),
 ('2962928871', 0.5214278641685693),
 ('2775682873', 0.49987175534665007),
 ('2109992539', 0.49649211810497135),
 ('2624503621', 0.47803381646663706),
 ('2752506724', 0.4710190742249424)]

In [28]:
# Create the output directory first so the results of the long scoring run are
# not lost to a FileNotFoundError when the directory does not exist yet.
output_dir = os.path.dirname(OUTPUT_PATH)
if output_dir:  # dirname is '' for a bare filename; makedirs('') would raise
    os.makedirs(output_dir, exist_ok=True)

# Persist one {'input': [...], 'output': [...]} record per test query.
with open(OUTPUT_PATH, 'w') as f:
    json.dump(results, f)