# Method 20 : Collaborative approach for research paper recommender system (2017)

Source : Haruna, K., Akmar Ismail, M., Damiasih, D., Sutopo, J., & Herawan, T. (2017). A collaborative approach for research paper recommender system. PLOS ONE, 12(10), e0184516.

In [1]:
%%capture
import json
import networkx as nx
from tqdm.notebook import tqdm

In [2]:
# Load the full corpus. JSON is UTF-8 by specification, so decode it
# explicitly instead of relying on the platform's default encoding.
with open('./data/aan_full.json', encoding='utf-8') as f:
    full_set = json.load(f)

# Directed citation graph: an edge (u, v) means "paper u references paper v".
directed_citation_graph = nx.DiGraph()
for paper in full_set:
    # Register the paper even when its reference list is empty; otherwise a
    # paper that neither cites nor is cited would be absent from the graph
    # and neighbour queries on it would raise instead of returning nothing.
    directed_citation_graph.add_node(paper['id'])
    directed_citation_graph.add_edges_from(
        (paper['id'], ref_id) for ref_id in paper['references']
    )

# Same nodes and citation links with the direction dropped.
undirected_citation_graph = directed_citation_graph.to_undirected()

# Lookup table from paper id to its full record.
full_set_dict = {paper['id']: paper for paper in full_set}

In [3]:
def compute_recommendations(G_dir, G_undir, targets, top_n=100):
    """Recommend papers for ``targets`` from citation relations only.

    Implements the steps of Haruna et al. (2017): candidates are papers that
    share a reference with a target, kept only if they are referenced by a
    paper that is co-cited with a target, and ranked by Jaccard similarity.

    Args:
        G_dir: Directed citation graph; ``successors(p)`` are the papers that
            ``p`` references and ``predecessors(p)`` the papers citing ``p``.
        G_undir: Undirected graph handed to ``nx.jaccard_coefficient`` to
            score (candidate, target) pairs.
        targets: Sequence of target paper ids. Ids that are not nodes of both
            graphs carry no citation information and are ignored.
        top_n: Maximum number of recommendations to return (default 100).
            ``None`` returns every qualified candidate.

    Returns:
        List of candidate paper ids, best first. A candidate's score is the
        sum of its Jaccard coefficients with each target; equal scores are
        ordered by the string form of the id so the result is reproducible.
    """
    # Targets absent from the graphs would make the neighbour lookups and the
    # Jaccard computation raise; they cannot contribute anything, so skip them.
    known_targets = [t for t in targets if t in G_dir and t in G_undir]
    excluded = set(targets)

    # 1. Retrieve all the references Rfj of the target paper pi from the
    #    paper-citation relation.
    rfj = set()
    for target in known_targets:
        rfj.update(G_dir.successors(target))

    # a. For each of the references Rfj, extract all other papers pci that
    #    also cited Rfj, other than the target paper pi.
    pci = set()
    for paper in rfj:
        pci.update(G_dir.predecessors(paper))
    pci -= excluded

    # 2. Retrieve all the citations Cfj of the target paper pi from the
    #    paper-citation relation.
    cfj = set()
    for target in known_targets:
        cfj.update(G_dir.predecessors(target))

    # a. For each of the citations Cfj, extract all other papers pri that
    #    Cfj referenced, other than the target paper pi.
    pri = set()
    for paper in cfj:
        pri.update(G_dir.successors(paper))
    pri -= excluded

    # 3. Qualify the candidate papers pc from pci that have been referenced
    #    by at least one of the pri.
    pri_references = set()
    for paper in pri:
        pri_references.update(G_dir.successors(paper))
    qualified_candidates = pci & pri_references

    # 4. Measure the Jaccard similarity between the target paper(s) pi and
    #    each qualified candidate paper pc.
    scores = []
    for candidate in qualified_candidates:
        node_pairs = [(candidate, target) for target in known_targets]
        score = sum(
            coefficient
            for _, _, coefficient in nx.jaccard_coefficient(G_undir, node_pairs)
        )
        scores.append((candidate, score))

    # 5. Recommend the top-N most similar papers to the user.
    # Candidates come out of a set, whose iteration order for string ids
    # changes between interpreter runs; break score ties on the id so the
    # top-N cut does not depend on that order.
    scores.sort(key=lambda entry: (-entry[1], str(entry[0])))
    return [candidate for candidate, _ in scores[:top_n]]

# aan_test_single.json

In [28]:
# Load the single-paper queries; each entry is a paper record with an 'id'.
# Decode explicitly as UTF-8 rather than with the platform default encoding.
with open('./data/aan_test_single.json', encoding='utf-8') as f:
    test_set = json.load(f)

In [29]:
# Run the recommender once per single-paper query and count the queries
# that got no recommendation at all or fewer than 100 of them.
results = []
empty_recs = 0
uncomplete_recs = 0

for input_paper in tqdm(test_set):
    query_ids = [input_paper['id']]
    result = {
        'input': query_ids,
        'output': compute_recommendations(
            directed_citation_graph, undirected_citation_graph, query_ids
        ),
    }
    results.append(result)

    n_recommended = len(result['output'])
    if n_recommended == 0:
        empty_recs += 1
    elif n_recommended < 100:
        uncomplete_recs += 1

print('empty recs', empty_recs)
print('uncomplet recs', uncomplete_recs)

HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


empty recs 1635
uncomplet recs 1163


In [30]:
# Persist the single-paper recommendations as JSON.
with open('./results/single_stateofart_method20.json', mode='w') as out_file:
    json.dump(results, fp=out_file)

# aan_test_triplet.json

In [31]:
# Load the triplet queries; each entry is a group of paper records.
# Decode explicitly as UTF-8 rather than with the platform default encoding.
with open('./data/aan_test_triplet.json', encoding='utf-8') as f:
    test_set = json.load(f)

In [32]:
# Run the recommender once per group of input papers and count the queries
# that got no recommendation at all or fewer than 100 of them.
results = []
empty_recs = 0
uncomplete_recs = 0

for input_papers in tqdm(test_set):
    query_ids = [input_paper['id'] for input_paper in input_papers]
    result = {
        'input': query_ids,
        'output': compute_recommendations(
            directed_citation_graph, undirected_citation_graph, query_ids
        ),
    }
    results.append(result)

    n_recommended = len(result['output'])
    if n_recommended == 0:
        empty_recs += 1
    elif n_recommended < 100:
        uncomplete_recs += 1

print('empty recs', empty_recs)
print('uncomplet recs', uncomplete_recs)

HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


empty recs 372
uncomplet recs 1837


In [33]:
# Persist the triplet recommendations as JSON.
with open('./results/triplet_stateofart_method20.json', mode='w') as out_file:
    json.dump(results, fp=out_file)

# aan_test_tripletfromref.json

In [4]:
# Load the "triplet from references" queries; each entry is a group of
# paper records. Decode explicitly as UTF-8 rather than with the platform
# default encoding.
with open('./data/aan_test_tripletfromref.json', encoding='utf-8') as f:
    test_set = json.load(f)

In [5]:
# Run the recommender once per group of input papers and count the queries
# that got no recommendation at all or fewer than 100 of them.
results = []
empty_recs = 0
uncomplete_recs = 0

for input_papers in tqdm(test_set):
    query_ids = [input_paper['id'] for input_paper in input_papers]
    result = {
        'input': query_ids,
        'output': compute_recommendations(
            directed_citation_graph, undirected_citation_graph, query_ids
        ),
    }
    results.append(result)

    n_recommended = len(result['output'])
    if n_recommended == 0:
        empty_recs += 1
    elif n_recommended < 100:
        uncomplete_recs += 1

print('empty recs', empty_recs)
print('uncomplet recs', uncomplete_recs)

HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


empty recs 58
uncomplet recs 938


In [6]:
# Persist the "triplet from references" recommendations as JSON.
with open('./results/tripletfromref_stateofart_method20.json', mode='w') as out_file:
    json.dump(results, fp=out_file)