In [40]:
# Done with Google Colab
# Installs the `infomap` package, which the import cell below depends on, by
# shelling out to pip.
# NOTE(review): no version is pinned, so a later run may pick up a different
# infomap release; consider `%pip install infomap==<version>` — confirm which
# release the saved outputs were produced with.
!pip install infomap



In [None]:
%%capture
import json
import numpy as np
import networkx as nx
from sklearn.preprocessing import normalize
import infomap
from tqdm.notebook import tqdm
import random

In [2]:
# Input and output file locations for the `aan` files.
# FULL_SET_PATH and TEST_SET_PATH are read with json.load further down; the
# three *_OUTPUT_PATH files are written by the classic / expert / serendipity
# cells at the end of the notebook.
# NOTE(review): the next cell rebinds these same five names to the `dblp`
# files, so when the notebook is run top to bottom the values assigned here
# are overridden. Run only one of the two path cells per experiment.
FULL_SET_PATH = './aan_full.json'
TEST_SET_PATH = './aan_test.json'
CLASSIC_OUTPUT_PATH = './stateofart_method11_classic_aan.json'
EXPERT_OUTPUT_PATH = './stateofart_method11_expert_aan.json'
SERENDIPITY_OUTPUT_PATH = './stateofart_method11_serendipity_aan.json'

In [15]:
# Input and output file locations for the `dblp` files.
# NOTE(review): this cell rebinds the five names already assigned by the
# previous cell (the `aan` paths); whichever of the two cells ran last decides
# which files the rest of the notebook reads and writes.
FULL_SET_PATH = './dblp_full.json'
TEST_SET_PATH = './dblp_test.json'
CLASSIC_OUTPUT_PATH = './stateofart_method11_classic_dblp.json'
EXPERT_OUTPUT_PATH = './stateofart_method11_expert_dblp.json'
SERENDIPITY_OUTPUT_PATH = './stateofart_method11_serendipity_dblp.json'

In [42]:
# Load the two JSON inputs.
# full_set: iterated below as records carrying 'id', 'references' and
#           'citations' (used to build the citation graph).
# test_set: iterated below as groups of records carrying 'id' (one group per
#           recommendation query).
# The encoding is given explicitly: JSON text is UTF-8, while open() without
# an encoding falls back to the platform's locale default, which can fail or
# mis-decode non-ASCII titles/names on some systems.
with open(FULL_SET_PATH, encoding='utf-8') as f:
    full_set = json.load(f)

with open(TEST_SET_PATH, encoding='utf-8') as f:
    test_set = json.load(f)

15366

In [None]:
# Build the directed citation network: an edge (u, v) means "u cites v".
citation_graph = nx.DiGraph()
nodes_set = set()
for paper in full_set:
    references, citations = paper['references'], paper['citations']
    # Keep track of every id that appears as a reference or as a citation.
    nodes_set.update(references, citations)
    # Outgoing edges: this paper -> each paper it references.
    citation_graph.add_edges_from((paper['id'], ref_id) for ref_id in references)
    # Incoming edges: each citing paper -> this paper.
    citation_graph.add_edges_from((cit_id, paper['id']) for cit_id in citations)

# Node view in graph insertion order; reused later to pair nodes with scores.
nodelist = citation_graph.nodes()

In [None]:
# compute ALEF scores and rank nodes
#
# Z is the sparse adjacency matrix of the citation graph (Z[i, j] != 0 when
# node i cites node j); rows/columns follow citation_graph.nodes() order, which
# is the same order as `nodelist`.
Z = nx.adjacency_matrix(citation_graph)

# w[i] = number of citation links touching node i (outgoing + incoming).
# Depending on the NetworkX/SciPy versions, the row sum comes back either as an
# (n, 1) np.matrix or as a 1-D array; flatten it so both cases behave the same.
w = np.asarray((Z + Z.transpose()).sum(axis=1)).ravel()

# H: adjacency matrix with each row scaled to sum to 1 (all-zero rows stay zero).
H = normalize(Z, norm='l1', axis=1)

# Use `@` for the matrix-vector product. `*` only means matrix product for the
# legacy scipy sparse *matrix* classes; for sparse *arrays* it is element-wise,
# which would silently compute something else (or fail on .tolist()).
raw_scores = np.asarray(H.transpose() @ w).ravel()

# Scale so that the scores sum to the number of nodes.
ALEF = len(citation_graph) * raw_scores / raw_scores.sum()

# (node, score) pairs with scalar scores, sorted from lowest to highest score.
ALEF_score_tuples = list(zip(nodelist, ALEF.tolist()))
ALEF_score_tuples.sort(key=lambda e: e[1])
ranked_nodes = [e[0] for e in ALEF_score_tuples]

# rank 1 = lowest ALEF score, rank n = highest ALEF score.
rank_dict = {node: i for i, node in enumerate(ranked_nodes, start=1)}
reversed_rank_dict = dict(enumerate(ranked_nodes, start=1))

In [62]:
# Cluster the citation network with Infomap (see https://www.mapequation.org/ and the infomap package).
# Adapted from: https://github.com/mapequation/infomap/blob/master/examples/python/infomap-networkx.py
#
# Infomap is fed the integer ALEF ranks as node ids, so its results are mapped
# back to the original paper ids through reversed_rank_dict afterwards.

im = infomap.Infomap("--two-level")

print("Building Infomap network from a NetworkX graph...")
for citing, cited in citation_graph.edges:
    im.add_link(rank_dict[citing], rank_dict[cited])

print("Find communities with Infomap...")
im.run()

print(f"Found {im.num_top_modules} modules with codelength: {im.codelength}")

# Module assignment of every node at depth 1 and at depth 2, keyed by rank.
com_lvl1 = im.get_modules(depth_level=1)
com_lvl2 = im.get_modules(depth_level=2)

# Same assignments, re-keyed by the original paper id.
com_lvl1_dict = {reversed_rank_dict[rank]: module for rank, module in com_lvl1.items()}
com_lvl2_dict = {reversed_rank_dict[rank]: module for rank, module in com_lvl2.items()}

Building Infomap network from a NetworkX graph...
Find communities with Infomap...
Found 495 modules with codelength: 10.337280777367477


In [80]:
def _group_by_community(node_to_community):
    """Invert a node -> community mapping into community -> [nodes].

    Nodes are appended in the iteration order of the input mapping.
    """
    grouped = {}
    for node_id, community_id in node_to_community.items():
        grouped.setdefault(community_id, []).append(node_id)
    return grouped


# community -> list of member paper ids, for each of the two levels
reversed_com_lvl1_dic = _group_by_community(com_lvl1_dict)
reversed_com_lvl2_dic = _group_by_community(com_lvl2_dict)

print('nb communities of lvl 1:', len(reversed_com_lvl1_dic))
print('nb communities of lvl 2:', len(reversed_com_lvl2_dic))

nb communities of lvl 1: 495
nb communities of lvl 2: 1


In [None]:
# compute recommendations
def compute_recs(input_papers, com_dict, reversed_com_dict, rank_dict, limit, serendipity_mode):
    """Recommend papers that share a community with the input papers.

    Candidates are all members of the communities the input papers belong to,
    minus the input papers themselves (in both modes, so a query never gets
    its own papers recommended back).

    Args:
        input_papers: iterable of records, each with an 'id' key.
        com_dict: mapping paper id -> community.
        reversed_com_dict: mapping community -> iterable of member paper ids.
        rank_dict: mapping paper id -> rank; a larger rank is recommended
            first. Only consulted when serendipity_mode is false.
        limit: maximum number of recommendations to return.
        serendipity_mode: if true, return a random sample of the candidates
            (drawn with the global `random` state, so seed it for
            reproducibility); otherwise return the top-ranked candidates.

    Returns:
        A list of at most `limit` paper ids.

    Raises:
        KeyError: if an input paper id is missing from com_dict, a community
            is missing from reversed_com_dict, or (ranked mode) a candidate
            is missing from rank_dict.
    """
    input_paper_ids = {paper['id'] for paper in input_papers}
    candidate_communities = {com_dict[paper_id] for paper_id in input_paper_ids}

    candidate_papers = set()
    for community in candidate_communities:
        candidate_papers.update(reversed_com_dict[community])
    # Never recommend the query papers themselves; do this before computing
    # the limit so the sample size matches what can actually be returned.
    candidate_papers -= input_paper_ids

    limit = min(limit, len(candidate_papers))

    if serendipity_mode:
        # random.sample requires a sequence (sets raise TypeError on
        # Python >= 3.11). Sorting also fixes the population order, so a given
        # seed yields the same sample regardless of set iteration order.
        population = sorted(candidate_papers, key=str)
        return random.sample(population, limit)

    ranked = sorted(candidate_papers, key=lambda paper_id: rank_dict[paper_id], reverse=True)
    return ranked[:limit]

In [76]:
# classic mode: top-ranked papers from the level-2 communities of the inputs
results = [
    {
        'input': [input_paper['id'] for input_paper in input_papers],
        'output': compute_recs(input_papers, com_lvl2_dict, reversed_com_lvl2_dic, rank_dict, 100, False),
    }
    for input_papers in tqdm(test_set)
]

# One {'input': [...], 'output': [...]} record per test query.
with open(CLASSIC_OUTPUT_PATH, 'w') as f:
    json.dump(results, f)

HBox(children=(IntProgress(value=0, max=3000), HTML(value='')))




In [77]:
# expert mode: top-ranked papers from the level-1 communities of the inputs
results = [
    {
        'input': [input_paper['id'] for input_paper in input_papers],
        'output': compute_recs(input_papers, com_lvl1_dict, reversed_com_lvl1_dic, rank_dict, 100, False),
    }
    for input_papers in tqdm(test_set)
]

# One {'input': [...], 'output': [...]} record per test query.
with open(EXPERT_OUTPUT_PATH, 'w') as f:
    json.dump(results, f)

HBox(children=(IntProgress(value=0, max=3000), HTML(value='')))




In [78]:
# serendipity mode: random papers from the level-1 communities of the inputs
# Seed the global generator first so the sampled recommendations are repeatable.
random.seed(42)
results = [
    {
        'input': [input_paper['id'] for input_paper in input_papers],
        'output': compute_recs(input_papers, com_lvl1_dict, reversed_com_lvl1_dic, rank_dict, 100, True),
    }
    for input_papers in tqdm(test_set)
]

# One {'input': [...], 'output': [...]} record per test query.
with open(SERENDIPITY_OUTPUT_PATH, 'w') as f:
    json.dump(results, f)

HBox(children=(IntProgress(value=0, max=3000), HTML(value='')))


