# Method 21 simplified

Original method: Academic paper recommender system using multilevel simultaneous citation networks (2018)

Source: Son, J., & Kim, S. B. (2018). Academic paper recommender system using multilevel simultaneous citation networks. Decision Support Systems, 105, 24–33.

Algorithm (using an undirected citation graph):
1. Generate the multilevel citation networks
2. Select candidate papers based on the Jaccard coefficient and eigenvector centrality

In [1]:
%%capture
import json
import os
from collections import defaultdict

import networkx as nx
from tqdm.notebook import tqdm

In [2]:
# Input/output locations for the AAN run (dataset name as it appears in the
# file names): full paper collection, evaluation queries, and result file.
# NOTE: a later cell assigns these same three names again, so whichever
# configuration cell was executed last decides what the notebook uses.
FULL_SET_PATH = './data/aan_full.json'
TEST_SET_PATH = './data/aan_test.json'
OUTPUT_PATH = './results/stateofart_method21simplified_aan.json'

In [15]:
# Input/output locations for the DBLP run (dataset name as it appears in the
# file names): full paper collection, evaluation queries, and result file.
# NOTE: an earlier cell assigns these same three names for another dataset;
# whichever configuration cell was executed last decides what the notebook uses.
FULL_SET_PATH = './data/dblp_full.json'
TEST_SET_PATH = './data/dblp_test.json'
OUTPUT_PATH = './results/stateofart_method21simplified_dblp.json'

In [16]:
# Load the full paper collection. JSON is read explicitly as UTF-8 so the
# result does not depend on the platform's default encoding.
with open(FULL_SET_PATH, encoding='utf-8') as f:
    full_set = json.load(f)

# Directed citation graph: one edge paper -> reference for every entry of a
# paper's 'references' list.
citation_graph = nx.DiGraph()
for paper in full_set:
    # Register the paper itself even when its reference list is empty;
    # otherwise a paper that cites nothing and is cited by nobody would be
    # missing from the graph and neighbourhood queries on it would fail.
    citation_graph.add_node(paper['id'])
    citation_graph.add_edges_from(
        (paper['id'], ref_id) for ref_id in paper['references']
    )

In [17]:
# Load the evaluation queries. Read explicitly as UTF-8 so the result does
# not depend on the platform's default encoding.
with open(TEST_SET_PATH, encoding='utf-8') as f:
    test_set = json.load(f)

In [18]:
# Generating multilevel citation networks
def helper_generate_for_one_node(G, target_node, depth_level):
    """Collect the neighbourhood of one target node, level by level.

    Two independent walks start at ``target_node``: one only follows
    ``G.predecessors`` and the other only follows ``G.successors``. Each walk
    advances ``depth_level`` times; every node met on either walk becomes a
    candidate.

    Args:
        G: graph object exposing ``predecessors(node)`` and
            ``successors(node)``.
        target_node: node the walks start from.
        depth_level: number of levels to expand in each direction.

    Returns:
        A ``(candidate_nodes, distances)`` pair. ``candidate_nodes`` is a set
        containing the target and every node reached. ``distances`` is a
        ``defaultdict`` mapping each reached node to the first level at which
        it was seen; looking up any other node yields ``depth_level + 1``.
        The target itself only gets an explicit entry if one of the walks
        reaches it again.
    """
    # Anything never reached counts as one step beyond the deepest level.
    distances = defaultdict(lambda: depth_level + 1)
    candidate_nodes = {target_node}
    upstream = {target_node}
    downstream = {target_node}

    for level in range(1, depth_level + 1):
        # Advance both walks by exactly one hop.
        upstream = {
            pred for node in upstream for pred in G.predecessors(node)
        }
        downstream = {
            succ for node in downstream for succ in G.successors(node)
        }

        reached = upstream | downstream
        candidate_nodes |= reached

        for node in reached:
            # Only the first (smallest) level is recorded for a node.
            distances.setdefault(node, level)

    return candidate_nodes, distances

def generate_multilevel_citation_networks(G, target_nodes, depth_level):
    """Build the joint neighbourhood sub-graph of several target nodes.

    For every target the per-node helper returns its candidate set and a
    distance mapping. The candidate sets are merged, and each candidate's
    final distance is the mean of its distances to all targets (a target that
    did not reach the node contributes the helper's default value).

    Args:
        G: graph accepted by ``helper_generate_for_one_node`` that also
            provides ``subgraph(nodes)``.
        target_nodes: iterable of start nodes.
        depth_level: number of levels expanded around each target.

    Returns:
        A ``(sub_graph, distances)`` pair: ``G.subgraph`` restricted to all
        candidates, and a ``defaultdict`` mapping every candidate to its mean
        distance. Looking up any other node yields ``depth_level + 1``. With
        no targets the sub-graph and the mapping are both empty.
    """
    partial_distances = []
    candidate_nodes = set()
    for target_node in target_nodes:
        candidates, target_distances = helper_generate_for_one_node(G, target_node, depth_level)
        partial_distances.append(target_distances)
        candidate_nodes.update(candidates)

    # Averages go into a fresh mapping. Previously they were written into the
    # last target's own mapping (which is also one of the inputs being
    # averaged), and the name was unbound when there were no targets.
    distances = defaultdict(lambda: depth_level + 1)
    for node in candidate_nodes:
        distances[node] = sum(d[node] for d in partial_distances) / len(partial_distances)

    return G.subgraph(candidate_nodes), distances

In [13]:
# test
#len(full_set[100]['references']+full_set[100]['citations'])

In [14]:
# test
#len(generate_multilevel_citation_networks(citation_graph, ['E09-1020'], 1)[1])

In [19]:
# Selection of candidate papers
def select_candidate_papers(G, target_nodes, alpha, limit, distances):
    """Rank the non-target nodes of ``G`` and return the best ones.

    Each candidate is scored as::

        (alpha * mean_jaccard + (1 - alpha) * eigenvector) / distances[node]

    where ``mean_jaccard`` is the mean Jaccard coefficient between the
    candidate and every target, and ``eigenvector`` is the candidate's
    eigenvector centrality in ``G``.

    Args:
        G: graph accepted by ``nx.jaccard_coefficient`` and
            ``nx.eigenvector_centrality_numpy`` (the notebook passes an
            undirected graph).
        target_nodes: the query nodes; they are excluded from the result.
        alpha: balance between the Jaccard term (``alpha``) and the
            eigenvector term (``1 - alpha``).
        limit: maximum number of nodes to return.
        distances: mapping node -> distance used as divisor; assumed to hold
            a non-zero value for every candidate.

    Returns:
        Candidate nodes sorted by decreasing score, at most ``limit`` long.
    """
    targets = list(target_nodes)
    target_set = set(targets)  # constant-time membership test
    candidates = [node for node in G.nodes if node not in target_set]

    jaccard_scores = {}
    if targets:
        for candidate in candidates:
            node_pairs = [(candidate, target) for target in targets]
            partial_scores = [coef for _, _, coef in nx.jaccard_coefficient(G, node_pairs)]
            jaccard_scores[candidate] = sum(partial_scores) / len(partial_scores)
    else:
        # No target to compare with: the Jaccard term contributes nothing
        # (previously this case divided by zero).
        jaccard_scores = dict.fromkeys(candidates, 0.0)

    try:
        eigenvector_scores = nx.eigenvector_centrality_numpy(G)
    except Exception:
        # Best effort: if the centrality cannot be computed for this graph,
        # every node gets centrality 0. Only ordinary errors are caught, so
        # KeyboardInterrupt / SystemExit still stop a long run.
        eigenvector_scores = defaultdict(int)

    # final score
    candidates_scores = [
        (
            candidate,
            (alpha * jaccard_scores[candidate] + (1 - alpha) * eigenvector_scores[candidate])
            / distances[candidate],
        )
        for candidate in candidates
    ]
    candidates_scores.sort(key=lambda t: t[1], reverse=True)

    # Slicing already copes with a limit larger than the list.
    return [candidate for candidate, _ in candidates_scores[:limit]]

In [12]:
# test
#sub_graph, distances = generate_multilevel_citation_networks(citation_graph, ['E09-1020'], 5)
#select_candidate_papers(sub_graph.to_undirected(), ['E09-1020'], 0.5, 10, distances)

In [20]:
# Run parameters, named once so that the recommendation limit and the
# "incomplete list" check below cannot drift apart.
DEPTH_LEVEL = 4              # levels expanded around each input paper
ALPHA = 0.5                  # weight of the Jaccard term vs. eigenvector term
MAX_RECOMMENDATIONS = 100    # size of a complete recommendation list

results = []
empty_recs = 0        # queries that produced no recommendation at all
uncomplete_recs = 0   # queries with fewer than MAX_RECOMMENDATIONS results

for input_papers in tqdm(test_set):
    result = {}
    result['input'] = [input_paper['id'] for input_paper in input_papers]

    # Neighbourhoods are collected on the directed graph; candidates are
    # then scored on the undirected view of the resulting sub-graph.
    sub_graph, distances = generate_multilevel_citation_networks(
        citation_graph, result['input'], DEPTH_LEVEL
    )
    result['output'] = select_candidate_papers(
        sub_graph.to_undirected(), result['input'], ALPHA, MAX_RECOMMENDATIONS, distances
    )
    results.append(result)

    if not result['output']:
        empty_recs += 1
    elif len(result['output']) < MAX_RECOMMENDATIONS:
        uncomplete_recs += 1

print('empty recs', empty_recs)
print('uncomplet recs', uncomplete_recs)

HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


empty recs 0
uncomplet recs 50


In [21]:
# Create the destination folder first so that a missing directory cannot
# make the write fail after the whole evaluation has already run.
output_dir = os.path.dirname(OUTPUT_PATH)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# One JSON list with an {'input': [...], 'output': [...]} entry per query.
with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
    json.dump(results, f)