In [10]:
%%capture
import json
import os

import networkx as nx
from tqdm.notebook import tqdm

In [2]:
# Dataset selector: set to 'aan' or 'dblp'. Every path below is derived from
# it, so switching corpora is a one-line change rather than a matter of which
# of two competing config cells happened to be executed last.
DATASET = 'dblp'
if DATASET not in ('aan', 'dblp'):
    raise ValueError("DATASET must be 'aan' or 'dblp', got {!r}".format(DATASET))

# Input files: the full corpus (used to build the citation graph) and the
# test queries.
FULL_SET_PATH = './data/{}_full.json'.format(DATASET)
TEST_SET_PATH = './data/{}_test.json'.format(DATASET)

# Output files, one per graph variant.
DIRECTED_OUTPUT_PATH = './results/base_personalizedpagerank_directed_{}.json'.format(DATASET)
UNDIRECTED_OUTPUT_PATH = './results/base_personalizedpagerank_undirected_{}.json'.format(DATASET)

In [12]:
# Load the full corpus. The code below reads each record's 'id' and
# 'references' fields.
# Decode explicitly as UTF-8 (the standard JSON encoding) instead of relying
# on the platform's default locale encoding.
with open(FULL_SET_PATH, encoding='utf-8') as f:
    full_set = json.load(f)

# Directed citation graph: one edge from each paper to every id it references.
# Only ids that take part in at least one such edge become nodes.
directed_citation_graph = nx.DiGraph()
directed_citation_graph.add_edges_from(
    (paper['id'], ref_id)
    for paper in full_set
    for ref_id in paper['references']
)

# Undirected copy of the same graph, used by the undirected variant.
undirected_citation_graph = directed_citation_graph.to_undirected()

In [13]:
# Display whether the undirected citation graph is connected.
nx.is_connected(undirected_citation_graph)

True

In [14]:
# Load the test queries; each entry is iterated as a list of input papers
# whose 'id' field is read by the ranking cells.
# Decode explicitly as UTF-8 (the standard JSON encoding) instead of relying
# on the platform's default locale encoding.
with open(TEST_SET_PATH, encoding='utf-8') as f:
    test_set = json.load(f)

# directed variant

In [15]:
# `nx.pagerank_scipy` was deprecated in NetworkX 2.6 and removed in 3.0, where
# `nx.pagerank` itself is the SciPy-based solver. Use the old name when it
# still exists and fall back to `nx.pagerank` otherwise, so this cell runs on
# both old and current NetworkX releases.
pagerank_fn = getattr(nx, 'pagerank_scipy', None) or nx.pagerank

# Number of recommendations kept per query.
TOP_K = 100

results = []
# Personalization template: weight 0 for every paper in the corpus. The query
# papers are switched to 1 for one PageRank run and then switched back.
personalization_dict = {paper['id']: 0 for paper in full_set}

for input_papers in tqdm(test_set):
    input_ids = [input_paper['id'] for input_paper in input_papers]
    input_id_set = set(input_ids)

    # Restart the random walk only at the query papers.
    # NOTE(review): if none of the query ids is a node of the graph the
    # personalization vector is all zeros, which NetworkX is expected to
    # reject -- confirm the test set never contains such a query.
    for paper_id in input_ids:
        personalization_dict[paper_id] = 1

    scores = pagerank_fn(directed_citation_graph, personalization=personalization_dict)

    # Restore the all-zero template for the next query.
    for paper_id in input_ids:
        personalization_dict[paper_id] = 0

    # Rank every node by score (highest first) and drop the query papers
    # themselves before keeping the top TOP_K ids.
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    recommended_ids = [node_id for node_id, _score in ranked if node_id not in input_id_set]

    results.append({'input': input_ids, 'output': recommended_ids[:TOP_K]})

HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))




In [16]:
# Create the output directory first so that a missing folder does not throw
# away the rankings computed above. `or '.'` covers a bare filename, for which
# dirname() returns an empty string.
os.makedirs(os.path.dirname(DIRECTED_OUTPUT_PATH) or '.', exist_ok=True)
with open(DIRECTED_OUTPUT_PATH, 'w') as f:
    json.dump(results, f)

# undirected variant

In [17]:
# `nx.pagerank_scipy` was deprecated in NetworkX 2.6 and removed in 3.0, where
# `nx.pagerank` itself is the SciPy-based solver. Use the old name when it
# still exists and fall back to `nx.pagerank` otherwise, so this cell runs on
# both old and current NetworkX releases.
pagerank_fn = getattr(nx, 'pagerank_scipy', None) or nx.pagerank

# Number of recommendations kept per query.
TOP_K = 100

results = []
# Personalization template: weight 0 for every paper in the corpus. The query
# papers are switched to 1 for one PageRank run and then switched back.
personalization_dict = {paper['id']: 0 for paper in full_set}

for input_papers in tqdm(test_set):
    input_ids = [input_paper['id'] for input_paper in input_papers]
    input_id_set = set(input_ids)

    # Restart the random walk only at the query papers.
    # NOTE(review): if none of the query ids is a node of the graph the
    # personalization vector is all zeros, which NetworkX is expected to
    # reject -- confirm the test set never contains such a query.
    for paper_id in input_ids:
        personalization_dict[paper_id] = 1

    scores = pagerank_fn(undirected_citation_graph, personalization=personalization_dict)

    # Restore the all-zero template for the next query.
    for paper_id in input_ids:
        personalization_dict[paper_id] = 0

    # Rank every node by score (highest first) and drop the query papers
    # themselves before keeping the top TOP_K ids.
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    recommended_ids = [node_id for node_id, _score in ranked if node_id not in input_id_set]

    results.append({'input': input_ids, 'output': recommended_ids[:TOP_K]})

HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))




In [18]:
# Create the output directory first so that a missing folder does not throw
# away the rankings computed above. `or '.'` covers a bare filename, for which
# dirname() returns an empty string.
os.makedirs(os.path.dirname(UNDIRECTED_OUTPUT_PATH) or '.', exist_ok=True)
with open(UNDIRECTED_OUTPUT_PATH, 'w') as f:
    json.dump(results, f)