# Method 21 : Academic paper recommender system using multilevel simultaneous citation networks (2018)

Source : Son, J., & Kim, S. B. (2018). Academic paper recommender system using multilevel simultaneous citation networks. Decision Support Systems, 105, 24–33.

In [12]:
%%capture
import json
import networkx as nx
from tqdm.notebook import tqdm
import time
from collections import defaultdict

In [13]:
# Load the full corpus. The encoding is given explicitly because the records
# contain non-ASCII text (e.g. accented author names) and the platform default
# encoding used by open() is not guaranteed to be UTF-8.
with open('./data/aan_full.json', encoding='utf-8') as f:
    full_set = json.load(f)

# One edge per citation, pointing from the citing paper to the referenced paper.
# Only papers that appear in at least one citation edge become nodes.
directed_citation_graph = nx.DiGraph()
directed_citation_graph.add_edges_from(
    (paper['id'], ref_id)
    for paper in full_set
    for ref_id in paper['references']
)

undirected_citation_graph = directed_citation_graph.to_undirected()

In [35]:
# Generating multilevel citation networks
def helper_generate_for_one_node(G, target_node, depth_level):
    """Collect the nodes within ``depth_level`` hops of ``target_node``.

    The graph is walked in two independent directions starting from the
    target: backwards along ``G.predecessors`` and forwards along
    ``G.successors``. Each direction is expanded ``depth_level`` times.

    Parameters
    ----------
    G : graph object exposing ``predecessors(node)`` and ``successors(node)``.
    target_node : node from which the expansion starts.
    depth_level : int, number of expansion rounds per direction.

    Returns
    -------
    (set, defaultdict)
        The set of reached nodes (always containing ``target_node``) and a
        mapping from node to the first round (1-based) in which it was
        reached in either direction. Looking up a node that was never
        recorded yields ``depth_level + 1``; this includes ``target_node``
        itself unless it is reached again during the expansion.
    """
    hop_of = defaultdict(lambda: depth_level+1)
    reached = {target_node}
    citing_frontier = {target_node}
    cited_frontier = {target_node}

    hop = 1
    while hop <= depth_level:
        # Advance each direction by exactly one hop from its previous frontier.
        citing_frontier = {pred for node in citing_frontier for pred in G.predecessors(node)}
        cited_frontier = {succ for node in cited_frontier for succ in G.successors(node)}

        reached |= citing_frontier
        reached |= cited_frontier

        # Keep the smallest hop count: only record a node the first time it shows up.
        for node in citing_frontier.union(cited_frontier):
            hop_of.setdefault(node, hop)

        hop += 1

    return reached, hop_of

def generate_multilevel_citation_networks(G, target_nodes, depth_level):
    """Build the multilevel citation network around a set of target papers.

    For every target, ``helper_generate_for_one_node`` is used to collect the
    nodes within ``depth_level`` hops and their hop distances. The candidate
    set is the union over all targets, and each candidate's distance is the
    arithmetic mean of its per-target distances (a target that did not reach
    the node contributes that helper's default, ``depth_level + 1``).

    Parameters
    ----------
    G : directed graph exposing ``predecessors``, ``successors`` and ``subgraph``.
    target_nodes : iterable of nodes of ``G``.
    depth_level : int, number of hops explored around each target.

    Returns
    -------
    (subgraph, defaultdict)
        ``G.subgraph`` restricted to the candidate nodes, and a mapping from
        every candidate node to its mean distance. Unknown keys default to
        ``depth_level + 1``. With no targets, both are empty.
    """
    partial_distances = []
    candidate_nodes = set()
    for target_node in target_nodes:
        candidates, target_distances = helper_generate_for_one_node(G, target_node, depth_level)
        partial_distances.append(target_distances)
        candidate_nodes.update(candidates)

    # Write the averages into a fresh mapping. Reusing the last per-target map
    # would overwrite values that are still being read for the averaging, and
    # no such map exists at all when target_nodes is empty.
    distances = defaultdict(lambda: depth_level + 1)
    for node in candidate_nodes:
        distances[node] = sum(d[node] for d in partial_distances) / len(partial_distances)

    return G.subgraph(candidate_nodes), distances

In [25]:
# Display one record to show the fields stored for each paper.
full_set[100]

{'id': 'E09-1020',
 'author': ['Cromierès,Fabien', 'Kurohashi,Sadao'],
 'title': 'An Alignment Algorithm Using Belief Propagation and a Structure-Based Distortion Model',
 'venue': 'EACL',
 'year': 2009,
 'citations': ['D12-1105',
  'N12-1004',
  'P11-1042',
  'P11-1043',
  'P11-1151',
  'P13-1003',
  'P13-1106',
  'P14-1139'],
 'references': ['P01-1067', 'P03-1010', 'P03-1011', 'P06-3003', 'W03-0301'],
 'abstract': 'In this paper, we first demonstrate the interest of the Loopy Belief Propagation algorithm to train and use a simple alignment model where the expected marginalvalues needed for an efficient EM-trainingare not easily computable. We then improve this model with a distortion modelbased on structure conservation.'}

In [37]:
# Try the multilevel expansion on a single paper with depth 2; the cell displays
# the returned (subgraph, distances) pair.
generate_multilevel_citation_networks(directed_citation_graph, ['E09-1020'], 2)

(<networkx.classes.digraph.DiGraph at 0x1dd7075c2c8>,
 defaultdict(<function __main__.helper_generate_for_one_node.<locals>.<lambda>()>,
             {'W03-0301': 1.0,
              'P11-1042': 1.0,
              'P11-1043': 1.0,
              'P01-1067': 1.0,
              'P14-1139': 1.0,
              'D12-1105': 1.0,
              'P03-1010': 1.0,
              'P06-3003': 1.0,
              'P13-1106': 1.0,
              'P11-1151': 1.0,
              'N12-1004': 1.0,
              'P03-1011': 1.0,
              'P13-1003': 1.0,
              'W04-1118': 2.0,
              'P13-1012': 2.0,
              'W03-0304': 2.0,
              'N12-1052': 2.0,
              'W12-3810': 2.0,
              'D13-1205': 2.0,
              'P14-1138': 2.0,
              'W03-0309': 2.0,
              'P13-1165': 2.0,
              'A92-1021': 2.0,
              'P05-1032': 2.0,
              'W03-0305': 2.0,
              'Q13-1001': 2.0,
              'C02-1002': 2.0,
              'C94-2175': 

In [4]:
# Selection of candidate papers
def select_candidate_papers(G, target_nodes, limit, distances):
    """Keep the ``limit`` best-scoring candidate papers of ``G``.

    Every node of ``G`` that is not in ``target_nodes`` is a candidate. Its
    score is ``(bc + cc) / distances[candidate]`` where

    * ``bc`` (bibliographic coupling) is the total number of successors the
      candidate shares with every other node of ``G``;
    * ``cc`` (co-citation coupling) is the total number of predecessors the
      candidate shares with every other node of ``G``.

    Parameters
    ----------
    G : directed graph exposing ``nodes``, ``successors``, ``predecessors``
        and ``subgraph``.
    target_nodes : container of nodes excluded from the candidates.
    limit : int, maximum number of candidates to keep.
    distances : mapping from node to a non-zero distance.

    Returns
    -------
    ``G.subgraph`` restricted to the selected candidates (targets are not
    included). Ties keep the iteration order of ``G.nodes``.

    Raises
    ------
    ZeroDivisionError
        If ``distances[candidate]`` is 0 for some candidate.
    """
    # Build each adjacency set once instead of once per (candidate, node) pair.
    successors_of = {node: set(G.successors(node)) for node in G.nodes}
    predecessors_of = {node: set(G.predecessors(node)) for node in G.nodes}

    candidates_scores = []
    for candidate in G.nodes:
        if candidate in target_nodes:
            continue

        # bibliographic coupling: a shared successor `ref` is counted once for
        # every *other* node pointing at it, i.e. |pred(ref)| - 1 times (the
        # candidate itself is one of the predecessors of `ref`).
        bc_score = sum(len(predecessors_of[ref]) - 1 for ref in successors_of[candidate])

        # co-citation coupling: symmetric argument on the predecessor side.
        cc_score = sum(len(successors_of[citer]) - 1 for citer in predecessors_of[candidate])

        # final score
        score = (bc_score + cc_score) / distances[candidate]
        candidates_scores.append((candidate, score))

    # list.sort is stable, so equal scores keep their original relative order.
    candidates_scores.sort(key=lambda t: t[1], reverse=True)
    limit = min(limit, len(candidates_scores))

    return G.subgraph([t[0] for t in candidates_scores[:limit]])

In [42]:
# generate_multilevel_citation_networks returns (subgraph, distances); unpack both
# and hand the distance map (not the depth) to select_candidate_papers.
sub_graph, distances = generate_multilevel_citation_networks(directed_citation_graph, ['E09-1020'], 1)
select_candidate_papers(sub_graph, ['E09-1020'], 500, distances).nodes

NodeView(('P11-1043', 'P13-1003', 'P03-1011', 'P13-1106', 'P06-3003', 'P14-1139', 'N12-1004', 'P03-1010', 'P11-1151', 'P01-1067', 'D12-1105', 'P11-1042', 'W03-0301'))

In [5]:
# Determination of recommended papers
def recommend_papers(G, target_nodes, limit):
    """Rank the non-target nodes of ``G`` by their mean centrality rank.

    Four centralities are computed on ``G`` (degree, closeness, betweenness
    and eigenvector). For each one, the non-target nodes are ordered from the
    highest to the lowest value and numbered from 1. A node's final score is
    the average of its four positions; lower is better.

    Parameters
    ----------
    G : networkx graph.
    target_nodes : container of nodes that must not be recommended.
    limit : int, maximum number of nodes to return.

    Returns
    -------
    list
        At most ``limit`` nodes, best mean rank first.
    """
    centrality_measures = (
        nx.degree_centrality,
        nx.closeness_centrality,
        nx.betweenness_centrality,
        nx.eigenvector_centrality_numpy,
    )

    positions = {node: [] for node in G.nodes if node not in target_nodes}

    for measure in centrality_measures:
        by_value = sorted(measure(G).items(), key=lambda item: item[1], reverse=True)
        position = 0
        for node, _value in by_value:
            # Targets are skipped without consuming a rank position.
            if node in target_nodes:
                continue
            position += 1
            positions[node].append(position)

    mean_ranks = sorted(
        ((node, sum(ranks) / len(ranks)) for node, ranks in positions.items()),
        key=lambda pair: pair[1],
    )
    keep = min(limit, len(mean_ranks))

    return [node for node, _mean in mean_ranks[:keep]]

In [67]:
# Full pipeline on one paper: expand to depth 4, keep the 500 best candidates,
# then rank them. The expansion returns (subgraph, distances), and the distance
# map (not the depth) is what select_candidate_papers expects.
sub_graph, distances = generate_multilevel_citation_networks(directed_citation_graph, ['E09-1020'], 4)
sub_graph = select_candidate_papers(sub_graph, ['E09-1020'], 500, distances)
recommend_papers(sub_graph.to_undirected(), ['E09-1020'], 10)

['J93-2004',
 'N12-1052',
 'N13-1039',
 'N03-1017',
 'P97-1003',
 'C96-2141',
 'P11-1043',
 'P01-1067',
 'W96-0213',
 'P02-1040']

# aan_test_tripletfromref.json

In [6]:
# Each test case is a list of paper records; only their 'id' field is used below.
# The encoding is explicit so decoding does not depend on the platform default.
with open('./data/aan_test_tripletfromref.json', encoding='utf-8') as f:
    test_set = json.load(f)

In [38]:
results = []
error_count = 0

# NOTE: only the first 10 test cases are evaluated here.
for input_papers in tqdm(test_set[:10]):
    result = {'input': [input_paper['id'] for input_paper in input_papers]}

    try:
        first_sub_graph, distances = generate_multilevel_citation_networks(directed_citation_graph, result['input'], 3)
        second_sub_graph = select_candidate_papers(first_sub_graph, result['input'], 500, distances)
        result['output'] = recommend_papers(second_sub_graph.to_undirected(), result['input'], 100)
    except Exception:
        # Catch Exception rather than everything, so that interrupting the
        # kernel (KeyboardInterrupt) still stops the loop.
        error_count += 1
        result['output'] = []

    # Record every test case, including failed ones (with an empty output), so
    # that the saved results stay aligned with the evaluated inputs.
    results.append(result)

error_count

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




2

In [8]:
# Persist the recommendations collected for the evaluated test cases.
with open('./results/tripletfromref_stateofart_method21.json', 'w') as results_file:
    json.dump(results, results_file)