# Method 7 : DiSCern: A diversified citation recommendation system for scientific queries (2015)

Source : Chakraborty, T., Modani, N., Narayanam, R., & Nagar, S. (2015). DiSCern: A diversified citation recommendation system for scientific queries. In 2015 IEEE 31st International Conference on Data Engineering (Vol. 2015–May, pp. 555–566).

In [1]:
%%capture
import json
import networkx as nx
from tqdm.notebook import tqdm
import community as louvain

In [2]:
# Input and output file locations for a run on the files named aan_*.
# NOTE(review): a later cell rebinds these same four names to the dblp_* paths,
# so only the most recently executed configuration cell takes effect.
FULL_SET_PATH = './data/aan_full.json'
TEST_SET_PATH = './data/aan_test.json'
LOC_OUTPUT_PATH = './results/stateofart_method7_loc_aan.json'
GLO_OUTPUT_PATH = './results/stateofart_method7_glo_aan.json'

In [12]:
# Input and output file locations for a run on the files named dblp_*.
# NOTE(review): these assignments rebind the names set by the earlier aan_*
# configuration cell; run only the cell for the dataset you want to evaluate.
FULL_SET_PATH = './data/dblp_full.json'
TEST_SET_PATH = './data/dblp_test.json'
LOC_OUTPUT_PATH = './results/stateofart_method7_loc_dblp.json'
GLO_OUTPUT_PATH = './results/stateofart_method7_glo_dblp.json'

In [13]:
# Load the full corpus: a JSON list of paper records.
with open(FULL_SET_PATH) as full_file:
    full_set = json.load(full_file)

# Index the paper records by their 'id' field.
full_set_dict = {paper['id']: paper for paper in full_set}

# Inverted index: keyword ('fos' entry) -> ids of the papers that carry it,
# in corpus order.
keyword_paper_dict = {}
for paper in full_set:
    paper_id = paper['id']
    for fos in paper['fos']:
        keyword_paper_dict.setdefault(fos, []).append(paper_id)

# Load the evaluation queries; each entry is iterated later as a list of
# input papers.
with open(TEST_SET_PATH) as test_file:
    test_set = json.load(test_file)

### Step 1 : Citation network construction

In [14]:
# Directed citation network: one edge paper -> reference for every entry of
# a paper's 'references' list.
citation_graph = nx.DiGraph()
for paper in full_set:
    source_id = paper['id']
    # The self-loop is inserted first so every corpus paper is its own
    # successor; the references follow in their listed order.
    citation_graph.add_edges_from(
        (source_id, target_id)
        for target_id in [source_id, *paper['references']]
    )

### Step 2 : Keyword network construction

In [15]:
# Undirected keyword co-occurrence network: every known keyword is a node and
# an edge's 'weight' counts the papers in which both keywords appear.
keyword_graph = nx.Graph()
keyword_graph.add_nodes_from(keyword_paper_dict.keys())

for paper in full_set:
    paper_keywords = paper['fos']

    # Walk the keyword list from the last entry to the first, pairing each
    # keyword with every keyword that precedes it, so each unordered pair of
    # positions is visited exactly once.
    for position in range(len(paper_keywords) - 1, -1, -1):
        current_keyword = paper_keywords[position]
        for other_keyword in paper_keywords[:position]:
            if not keyword_graph.has_edge(current_keyword, other_keyword):
                keyword_graph.add_edge(current_keyword, other_keyword, weight=1)
            else:
                keyword_graph[current_keyword][other_keyword]['weight'] += 1

# Partition the keywords into communities (keyword -> community label) ...
keyword_community_dict = louvain.best_partition(keyword_graph)

# ... and build the reverse mapping (community label -> member keywords).
community_keyword_dict = {
    community: [] for community in set(keyword_community_dict.values())
}
for keyword, community in keyword_community_dict.items():
    community_keyword_dict[community].append(keyword)

### Step 3 : Query expansion by clustering keywords

In [16]:
def expand_query(G, input_papers):
    """Return the subgraph of ``G`` induced by the expanded query.

    The keywords ('fos') of the input papers are widened to every keyword
    sharing a community with one of them; the papers carrying any of those
    keywords form the node set of the returned subgraph.

    Relies on the module-level ``keyword_community_dict``,
    ``community_keyword_dict`` and ``keyword_paper_dict`` mappings.

    Args:
        G: graph exposing ``subgraph(nodes)``.
        input_papers: iterable of paper records, each with an 'fos' list.

    Returns:
        ``G.subgraph(...)`` restricted to the expanded set of paper ids.

    Raises:
        KeyError: if a keyword or community is missing from the mappings.
    """
    seed_keywords = {
        keyword for paper in input_papers for keyword in paper['fos']
    }
    seed_communities = {
        keyword_community_dict[keyword] for keyword in seed_keywords
    }
    widened_keywords = {
        keyword
        for community in seed_communities
        for keyword in community_keyword_dict[community]
    }
    paper_ids = {
        paper_id
        for keyword in widened_keywords
        for paper_id in keyword_paper_dict[keyword]
    }
    return G.subgraph(paper_ids)

### Step 4 : Retrieving diverse and relevant citations

In [17]:
def recommend(G, input_papers, alpha_c = 0.25, lambda_c = 0.9, n_iter = 100, limit = 100):
    """Rank the nodes of ``G`` with a vertex-reinforced random walk.

    Every node starts with the uniform probability ``p_star``. At each step
    the visit probability of a node becomes the probability mass flowing into
    it along its incoming edges, and the transition probabilities are then
    re-weighted in favour of successors that are already visited often.

    Args:
        G: directed graph exposing ``nodes()``, ``neighbors(u)`` (successors
            of ``u``) and ``out_degree(u)``.
        input_papers: iterable of records with an 'id' key; these ids are
            excluded from the returned ranking.
        alpha_c: share of the prior transition mass spread evenly over a
            node's successors; ``1 - alpha_c`` is kept for staying on the node.
        lambda_c: weight of the reinforced term against the uniform term
            ``p_star`` in the updated transition probabilities.
        n_iter: accepted for interface compatibility but currently unused;
            the walk always runs the fixed 20 steps below.
        limit: maximum number of node ids to return.

    Returns:
        List of node ids sorted by decreasing visit probability, without the
        input paper ids, truncated to ``limit`` entries. An empty graph
        yields an empty list.
    """
    nodes = list(G.nodes())
    if not nodes:
        # Nothing to rank; also avoids dividing by zero for the uniform prior.
        return []

    input_paper_ids = {p['id'] for p in input_papers}
    p_star = 1 / len(nodes)

    # Materialise the successor lists once: they are read on every step.
    successors = {u: list(G.neighbors(u)) for u in nodes}

    # Prior transition probabilities p0(u, v).
    p_0_uv = {}
    for u in nodes:
        node_degree = G.out_degree(u)
        p_0_uv[u] = {v: alpha_c / node_degree for v in successors[u]}
        # Overrides the self-loop entry when u is one of its own successors.
        p_0_uv[u][u] = 1 - alpha_c

    # init for first iter
    p_t_u = dict.fromkeys(nodes, p_star)
    p_t_uv = p_0_uv
    # NOTE(review): the step count is fixed at 20 and `n_iter` is ignored.
    # Wiring it in would raise the default workload to 100 steps, so that
    # change is left as a deliberate decision.
    for _ in range(20):
        # Normaliser D_t(u) = sum_v p0(u, v) * p_t(v) over the successors of u.
        d_t_u = {
            u: sum(p_0_uv[u][v] * p_t_u[v] for v in successors[u])
            for u in nodes
        }

        # p_{t+1}(v) = sum_u p_t(u, v) * p_t(u): each node collects the mass
        # arriving over its incoming edges. Summing a node's own outgoing
        # transitions instead would rank nodes by out-degree only.
        p_t1_u = dict.fromkeys(nodes, 0.0)
        for u in nodes:
            for v in successors[u]:
                p_t1_u[v] += p_t_uv[u][v] * p_t_u[u]

        # Reinforced transition probabilities for the next step.
        p_t1_uv = {}
        for u in nodes:
            normaliser = d_t_u[u]
            p_t1_uv[u] = {}
            for v in successors[u]:
                if normaliser:
                    reinforced = p_0_uv[u][v] * p_t_u[v] / normaliser
                else:
                    # All successors currently carry zero mass: fall back to
                    # the prior weight rather than dividing by zero.
                    reinforced = p_0_uv[u][v]
                p_t1_uv[u][v] = (1 - lambda_c) * p_star + lambda_c * reinforced

        p_t_u = p_t1_u
        p_t_uv = p_t1_uv

    candidate_scores = [
        (p, score) for p, score in p_t_u.items() if p not in input_paper_ids
    ]
    # Stable sort: ties keep the graph's node order.
    candidate_scores.sort(key=lambda e: e[1], reverse=True)

    return [p for p, _ in candidate_scores[:limit]]

## Step 5 : Compute recommendations

In [18]:
# LocDiSCern: rank inside the community-expanded subgraph of each query.
results = []

for input_papers in tqdm(test_set):
    query_ids = [input_paper['id'] for input_paper in input_papers]

    # Restrict the citation graph to the papers reached by query expansion,
    # then rank the candidates on that subgraph.
    subgraph = expand_query(citation_graph, input_papers)
    ranked_ids = recommend(subgraph, input_papers)

    results.append({'input': query_ids, 'output': ranked_ids})

HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))




In [19]:
# Persist the LocDiSCern results (list of {'input', 'output'} records) as JSON.
with open(LOC_OUTPUT_PATH, 'w') as loc_output_file:
    json.dump(results, loc_output_file)

In [20]:
# GloDiSCern: rank among the papers sharing a keyword with the query papers,
# without the community-based expansion.
results = []

for input_papers in tqdm(test_set):
    query_ids = [input_paper['id'] for input_paper in input_papers]

    # Candidate pool: every paper indexed under one of the query keywords.
    candidate_nodes = {
        paper_id
        for input_paper in input_papers
        for keyword in input_paper['fos']
        for paper_id in keyword_paper_dict[keyword]
    }

    ranked_ids = recommend(citation_graph.subgraph(candidate_nodes), input_papers)

    results.append({'input': query_ids, 'output': ranked_ids})

HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))




In [21]:
# Persist the GloDiSCern results (list of {'input', 'output'} records) as JSON.
with open(GLO_OUTPUT_PATH, 'w') as glo_output_file:
    json.dump(results, glo_output_file)