# Method 7 : DiSCern: A diversified citation recommendation system for scientific queries (2015)

Source : Chakraborty, T., Modani, N., Narayanam, R., & Nagar, S. (2015). DiSCern: A diversified citation recommendation system for scientific queries. In 2015 IEEE 31st International Conference on Data Engineering (Vol. 2015–May, pp. 555–566).

In [17]:
%%capture
import json
import networkx as nx
from tqdm.notebook import tqdm
import pke
import community as louvain
from string import punctuation
import time
from nltk.corpus import stopwords
import warnings
# Silence all warnings raised after this point (no category or module filter is given).
warnings.filterwarnings('ignore')

In [18]:
# Load the paper records. JSON text is UTF-8 by specification, so decode it
# explicitly instead of relying on the platform's default locale encoding.
with open('./data/aan_full.json', encoding='utf-8') as f:
    full_set = json.load(f)

# Index the records by their 'id' field for direct lookup.
full_set_dict = {paper['id']: paper for paper in full_set}

### Step 1 : Citation network construction

In [19]:
# Directed graph holding, for every paper, a self-loop followed by one edge
# paper -> reference for each entry of its 'references' list.
citation_graph = nx.DiGraph()
for paper in full_set:
    source_id = paper['id']
    for target_id in [source_id, *paper['references']]:
        citation_graph.add_edge(source_id, target_id)

### Step 2 : Keyword network construction

In [20]:
# Write one plain-text file per paper (title, a space, then abstract) into the
# corpus directory that the keyphrase-extraction steps read from.
# The files are written as UTF-8 explicitly: pke's document loader defaults to
# UTF-8 (see its load_document documentation), whereas a bare open() would use
# the platform's locale encoding.
for paper in tqdm(full_set):
    with open('./temp/corpus/'+paper['id']+'.txt', 'w', encoding='utf-8') as f:
        f.write(" ".join([paper['title'], paper['abstract']]))

HBox(children=(FloatProgress(value=0.0, max=15366.0), HTML(value='')))




##### Step 2.1 : Compute Document Frequency (no normalization)

In [7]:
# Build the document-frequency table over the corpus files, with no token
# normalization, and write it to './temp/pke_df.tsv.gz'.
# source : https://boudinfl.github.io/pke/build/html/tutorials/df.html
# 90 min duration

start = time.time()

# Only punctuation characters are passed as the stoplist.
stoplist = list(punctuation)

pke.compute_document_frequency(
    input_dir='./temp/corpus/',
    output_file='./temp/pke_df.tsv.gz',
    extension='txt',
    language='en',
    normalization=None,
    stoplist=stoplist,
)

# Elapsed wall-clock seconds; left as the last expression so the cell shows it.
time.time() - start

5413.600337266922

##### Step 2.1 : Compute Document Frequency (stemming)

In [8]:
# Build the document-frequency table over the corpus files with
# normalization='stemming', and write it to './temp/pke_df_stem.tsv.gz'.
# source : https://boudinfl.github.io/pke/build/html/tutorials/df.html
# 90 min duration

start = time.time()

# Only punctuation characters are passed as the stoplist.
stoplist = list(punctuation)

pke.compute_document_frequency(
    input_dir='./temp/corpus/',
    output_file='./temp/pke_df_stem.tsv.gz',
    extension='txt',
    language='en',
    normalization='stemming',
    stoplist=stoplist,
)

# Elapsed wall-clock seconds; left as the last expression so the cell shows it.
time.time() - start

5445.91331744194

##### Step 2.2 : Extract keyphrases (KPMiner)

In [24]:
# Step 2.2 : Extract keyphrases (KPMiner), keeping at most 5 per paper
# source : https://boudinfl.github.io/pke/build/html/unsupervised.html#kpminer

# Number of papers for which no keyphrase was returned.
no_keyword_count = 0

# Document-frequency table computed without normalization. To use the stemmed
# variant instead, load './temp/pke_df_stem.tsv.gz' here and pass
# normalization='stemming' to load_document below.
df = pke.load_document_frequency_file(input_file='./temp/pke_df.tsv.gz')

# keyphrase -> list of ids of the papers it was extracted from
keyword_kpminer_paper_dict = {}
for paper in tqdm(full_set):
    document_path = './temp/corpus/' + paper['id'] + '.txt'

    extractor = pke.unsupervised.KPMiner()
    extractor.load_document(input=document_path,
                            language='en',
                            normalization=None)
    extractor.candidate_selection(lasf=2, cutoff=500)
    extractor.candidate_weighting(df=df, alpha=2.3, sigma=3.0)

    # Only the first element of each returned entry (the phrase) is kept.
    keywords = [best[0] for best in extractor.get_n_best(n=5)]
    paper['keywords_kpminer'] = keywords

    if not keywords:
        no_keyword_count += 1

    # Record this paper under each of its keyphrases.
    for keyword in keywords:
        keyword_kpminer_paper_dict.setdefault(keyword, []).append(paper['id'])

# Left as the last expression so the cell displays the count.
no_keyword_count

HBox(children=(FloatProgress(value=0.0, max=15366.0), HTML(value='')))






29

In [25]:
# Number of distinct keyphrases that key the KPMiner keyphrase -> papers index.
len(keyword_kpminer_paper_dict)

24649

##### Step 2.2 : Extract keyphrases (KEA)

In [26]:
# Step 2.2 : Extract keyphrases (KEA), keeping at most 5 per paper
# source : https://boudinfl.github.io/pke/build/html/supervised.html#kea

# NLTK's English stopword list is handed to candidate selection.
stoplist = stopwords.words('english')

# Number of papers for which no keyphrase was returned.
no_keyword_count = 0

# Document-frequency table computed without normalization.
df = pke.load_document_frequency_file(input_file='./temp/pke_df.tsv.gz')

# keyphrase -> list of ids of the papers it was extracted from
keyword_kea_paper_dict = {}
for paper in tqdm(full_set):
    document_path = './temp/corpus/' + paper['id'] + '.txt'

    extractor = pke.supervised.Kea()
    extractor.load_document(input=document_path,
                            language='en',
                            normalization=None)
    extractor.candidate_selection(stoplist=stoplist)
    extractor.candidate_weighting(df=df)

    # Only the first element of each returned entry (the phrase) is kept.
    keywords = [best[0] for best in extractor.get_n_best(n=5)]
    paper['keywords_kea'] = keywords

    if not keywords:
        no_keyword_count += 1

    # Record this paper under each of its keyphrases.
    for keyword in keywords:
        keyword_kea_paper_dict.setdefault(keyword, []).append(paper['id'])

# Left as the last expression so the cell displays the count.
no_keyword_count

HBox(children=(FloatProgress(value=0.0, max=15366.0), HTML(value='')))




0

In [27]:
# Number of distinct keyphrases that key the KEA keyphrase -> papers index.
len(keyword_kea_paper_dict)

43980

In [28]:
# Persist the paper records, which now also carry the 'keywords_kpminer' and
# 'keywords_kea' lists attached by the extraction loops.
with open('./data/aan_full_with_keywords.json', mode='w') as f:
    json.dump(full_set, fp=f)

##### Step 2.3 : Create the keyword graph

In [37]:
# Undirected keyphrase co-occurrence graph: every KPMiner keyphrase is a node
# and an edge's 'weight' counts the papers in which its two keyphrases appear
# together. (For the KEA variant, use keyword_kea_paper_dict and
# paper['keywords_kea'] instead.)
keyword_graph = nx.Graph()
keyword_graph.add_nodes_from(keyword_kpminer_paper_dict.keys())

for paper in full_set:
    paper_keywords = paper['keywords_kpminer']

    # Visit keyphrases from last to first and pair each one with those that
    # precede it, so every unordered pair is handled exactly once.
    for position in range(len(paper_keywords) - 1, -1, -1):
        current_keyword = paper_keywords[position]
        for other_keyword in paper_keywords[:position]:
            if not keyword_graph.has_edge(current_keyword, other_keyword):
                keyword_graph.add_edge(current_keyword, other_keyword, weight=1)
            else:
                keyword_graph[current_keyword][other_keyword]['weight'] += 1

# create communities
# keyword_community_dict : keyphrase -> community label
keyword_community_dict = louvain.best_partition(keyword_graph)

# community_keyword_dict : community label -> list of its keyphrases
community_keyword_dict = {
    community: [] for community in set(keyword_community_dict.values())
}
for keyword, community in keyword_community_dict.items():
    community_keyword_dict[community].append(keyword)

### Step 3 : Query expansion by clustering keywords

In [38]:
def expand_query(G, input_papers):
    """Return the subgraph of ``G`` induced by the papers related to the query.

    The query is expanded in three hops through the module-level indexes:
    the KPMiner keyphrases of ``input_papers`` -> the communities those
    keyphrases belong to (``keyword_community_dict``) -> every keyphrase of
    those communities (``community_keyword_dict``) -> every paper listed under
    those keyphrases (``keyword_kpminer_paper_dict``).

    Args:
        G: graph exposing ``subgraph(nodes)``.
        input_papers: iterable of paper dicts with a 'keywords_kpminer' list.

    Returns:
        ``G.subgraph(...)`` restricted to the collected paper ids.
    """
    # For the KEA variant, read paper['keywords_kea'] and
    # keyword_kea_paper_dict instead.
    seed_keywords = {
        keyword
        for paper in input_papers
        for keyword in paper['keywords_kpminer']
    }

    seed_communities = {keyword_community_dict[keyword] for keyword in seed_keywords}

    related_keywords = {
        keyword
        for community in seed_communities
        for keyword in community_keyword_dict[community]
    }

    related_papers = set()
    for keyword in related_keywords:
        related_papers.update(keyword_kpminer_paper_dict[keyword])

    return G.subgraph(related_papers)

### Step 4 : Retrieving diverse and relevant citations

In [39]:
def recommend(G, input_papers, alpha_c = 0.25, lambda_c = 0.9, n_iter = 100, limit = 100):
    """Rank the nodes of ``G`` with a vertex-reinforced random walk.

    Fixes over the previous version:
      * ``n_iter`` is honoured (the loop used to run a hard-coded 20 times).
      * The score update now moves probability mass along edges,
        ``p_{t+1}(v) = sum_u p_t(u, v) * p_t(u)``; previously each node's
        score was only rescaled by its own outgoing row sum, so no mass ever
        flowed between nodes.
      * An empty graph returns ``[]`` instead of raising ZeroDivisionError.

    Args:
        G: directed graph exposing ``nodes()``, ``number_of_nodes()``,
            ``out_degree(u)`` and ``neighbors(u)``; every neighbour must itself
            be a node of ``G``.
        input_papers: iterable of dicts with an 'id' key; these ids are removed
            from the returned ranking.
        alpha_c: total weight of the initial transition that is split evenly
            (``alpha_c / out_degree``) over a node's outgoing edges; the
            self-transition is set to ``1 - alpha_c``.
        lambda_c: weight of the reinforced term; ``1 - lambda_c`` weights the
            uniform term ``1 / number_of_nodes``.
        n_iter: number of walk iterations.
        limit: maximum number of node ids returned.

    Returns:
        List of node ids sorted by decreasing score, without the input ids,
        truncated to ``limit`` entries.
    """
    input_paper_ids = {p['id'] for p in input_papers}

    n_nodes = G.number_of_nodes()
    if n_nodes == 0:
        # Nothing to rank; also avoids dividing by zero below.
        return []
    p_star = 1 / n_nodes

    nodes = list(G.nodes())
    # Materialise successor lists once; they are read several times per iteration.
    successors = {u: list(G.neighbors(u)) for u in nodes}

    # Initial transition table p_0(u, v).
    p_0_uv = {}
    for u in nodes:
        node_degree = G.out_degree(u)
        row = {v: alpha_c / node_degree for v in successors[u]}
        # The self-transition overrides the even split for u itself.
        row[u] = 1 - alpha_c
        p_0_uv[u] = row

    # init for first iter: uniform scores, initial transitions
    p_t_u = dict.fromkeys(nodes, p_star)
    p_t_uv = p_0_uv
    for _ in range(n_iter):
        # Normaliser D_t(u) = sum_v p_0(u, v) * p_t(v).
        d_t_u = {
            u: sum(p_0_uv[u][v] * p_t_u[v] for v in successors[u])
            for u in nodes
        }

        # Push each node's current score along its outgoing edges.
        p_t1_u = dict.fromkeys(nodes, 0.0)
        for u in nodes:
            mass = p_t_u[u]
            for v in successors[u]:
                p_t1_u[v] += p_t_uv[u][v] * mass

        # Reinforced transitions for the next iteration, built from the
        # current scores.
        # NOTE(review): the uniform term is only added on existing edges, so a
        # row need not sum to 1 - confirm against the reference formulation.
        p_t1_uv = {}
        for u in nodes:
            normaliser = d_t_u[u]
            row = {}
            for v in successors[u]:
                # If every score reachable from u has underflowed to 0, keep
                # only the uniform term rather than dividing by zero.
                reinforced = p_0_uv[u][v] * p_t_u[v] / normaliser if normaliser else 0.0
                row[v] = (1 - lambda_c) * p_star + lambda_c * reinforced
            p_t1_uv[u] = row

        p_t_u = p_t1_u
        p_t_uv = p_t1_uv

    candidate_scores = [
        (node, score) for node, score in p_t_u.items()
        if node not in input_paper_ids
    ]
    # Stable sort: equally scored nodes keep their order in G.nodes().
    candidate_scores.sort(key=lambda e: e[1], reverse=True)
    limit = min(limit, len(candidate_scores))

    return [e[0] for e in candidate_scores[:limit]]

## Compute tests

In [40]:
# Load the test queries. Each query is a list of paper stubs; every stub is
# replaced by the full record looked up by id in full_set_dict.
# The JSON file is decoded as UTF-8 explicitly rather than with the platform's
# default locale encoding.
with open('./data/aan_test_tripletfromref.json', encoding='utf-8') as f:
    test_set = [
        [full_set_dict[paper['id']] for paper in input_papers]
        for input_papers in json.load(f)
    ]

In [41]:
# LocDiSCern
# For every query, rank the papers of the community-expanded subgraph and
# store the query ids together with the ranked recommendations.
results = []

for input_papers in tqdm(test_set):
    subgraph = expand_query(citation_graph, input_papers)
    results.append({
        'input': [input_paper['id'] for input_paper in input_papers],
        'output': recommend(subgraph, input_papers),
    })

HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))




In [42]:
# Save the LocDiSCern results for the KPMiner keyphrases. For the KEA variant
# write to './results/tripletfromref_stateofart_method7_kea_loc.json' instead.
with open('./results/tripletfromref_stateofart_method7_kpminer_loc.json', mode='w') as f:
    json.dump(results, fp=f)

In [43]:
# GloDiSCern
# For every query, rank the papers that share at least one KPMiner keyphrase
# with an input paper (no community expansion). For the KEA variant, read
# input_paper['keywords_kea'] and keyword_kea_paper_dict instead.
results = []

for input_papers in tqdm(test_set):
    candidate_nodes = {
        paper_id
        for input_paper in input_papers
        for keyword in input_paper['keywords_kpminer']
        for paper_id in keyword_kpminer_paper_dict[keyword]
    }

    results.append({
        'input': [input_paper['id'] for input_paper in input_papers],
        'output': recommend(citation_graph.subgraph(candidate_nodes), input_papers),
    })

HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))




In [44]:
# Save the GloDiSCern results for the KPMiner keyphrases. For the KEA variant
# write to './results/tripletfromref_stateofart_method7_kea_glo.json' instead.
with open('./results/tripletfromref_stateofart_method7_kpminer_glo.json', mode='w') as f:
    json.dump(results, fp=f)