In [1]:
%%capture
import json
import os
import numpy as np
from tqdm.notebook import tqdm

In [2]:
# Directory scanned for the per-method recommendation files evaluated by this notebook.
RESULTS_DIR_PATH = './results/'
# Number of recommendation positions considered when building the filtered results.
TEST_SET_SIZE = 2000

In [3]:
# Dataset selection for 'aan': DATASET_NAME is matched against result file
# names, FULL_SET_PATH is the paper collection to load, and the two output
# paths receive the raw and filtered metric files.
# NOTE: the following cell assigns these same four names for 'dblp'; when the
# notebook is run top to bottom, those later values are the ones in effect.
DATASET_NAME = 'aan'
FULL_SET_PATH = './data/aan_full_with_fos_w.json'
RAW_OUTPUT_PATH = './results/raw_results_aan_v2.json'
FILTERED_OUTPUT_PATH = './results/filtered_results_aan_v2.json'

In [24]:
# Dataset selection for 'dblp': DATASET_NAME is matched against result file
# names, FULL_SET_PATH is the paper collection to load, and the two output
# paths receive the raw and filtered metric files.
# NOTE: these assignments replace the 'aan' values set in the previous cell,
# so a top-to-bottom run evaluates 'dblp'. Run only one of the two dataset
# cells to choose which dataset is evaluated.
DATASET_NAME = 'dblp'
FULL_SET_PATH = './data/dblp_full_with_fos_w.json'
RAW_OUTPUT_PATH = './results/raw_results_dblp_v2.json'
FILTERED_OUTPUT_PATH = './results/filtered_results_dblp_v2.json'

In [25]:
# Load every paper of the selected dataset and attach the lookups that the
# metric cells rely on.
with open(FULL_SET_PATH) as full_set_file:
    full_set = json.load(full_set_file)

for paper in full_set:
    # Field-of-study name -> weight mapping for this paper.
    paper['fos_w_dict'] = dict((topic['name'], topic['w']) for topic in paper['fos_w'])
    # Union of the ids listed under the paper's 'references' and 'citations'.
    paper['ref_cit_set'] = set(paper['references']).union(paper['citations'])

# Index the papers by their id.
full_set_dict = {paper['id']: paper for paper in full_set}

In [26]:
# Load the recommendation lists produced by each method for the selected dataset.
#
# A file is picked up when its name ends in '.json', contains DATASET_NAME and
# does not contain 'results' (the metric files written by this notebook live in
# the same directory and carry 'results' in their names).
#
# os.listdir returns names in arbitrary order, so the list is sorted: the key
# order of `results` (and of the JSON written from it) and the winner among
# files that map to the same method name are then the same on every run.
result_files = sorted(
    f for f in os.listdir(RESULTS_DIR_PATH)
    if f.endswith('.json') and 'results' not in f and DATASET_NAME in f
)

results = {}
for filename in result_files:
    # The method name is the part of the file name that precedes '_<dataset>'.
    method_name = filename.split('_' + DATASET_NAME)[0]

    with open(os.path.join(RESULTS_DIR_PATH, filename)) as file:
        recs = json.load(file)

    # A file mapping to an already-registered method replaces its recommendations.
    results.setdefault(method_name, {'method_name': method_name})['recs'] = recs

In [27]:
def merge_fos_lists(papers):
    """Build a single field-of-study profile out of several papers.

    Every weight found under a given field-of-study name across ``papers`` is
    collected, and the profile maps that name to the arithmetic mean of the
    collected weights. Papers that do not list a field contribute nothing to
    that field's mean.

    Args:
        papers: iterable of paper dicts, each holding a ``'fos_w'`` list of
            ``{'name': ..., 'w': ...}`` entries.

    Returns:
        dict mapping each field-of-study name to its mean weight, keyed in
        order of first appearance.
    """
    weights_by_name = {}
    for paper in papers:
        for entry in paper['fos_w']:
            weights_by_name.setdefault(entry['name'], []).append(entry['w'])

    return {name: np.mean(weights) for name, weights in weights_by_name.items()}

In [28]:
def content_similarity(fos_dict1, fos_dict2):
    """Cosine similarity between two sparse field-of-study weight vectors.

    Each argument maps a field-of-study name to a weight; names missing from a
    dict are treated as weight zero.

    Args:
        fos_dict1: name -> weight mapping of the first item.
        fos_dict2: name -> weight mapping of the second item.

    Returns:
        The dot product over the shared names divided by the product of the
        two Euclidean norms. When either vector has zero norm (empty dict or
        only zero weights) the similarity is undefined and 0.0 is returned
        instead of dividing by zero.
    """
    dot_product = sum(weight1 * fos_dict2[name1]
                      for name1, weight1 in fos_dict1.items()
                      if name1 in fos_dict2)

    norm1 = sum(weight * weight for weight in fos_dict1.values()) ** 0.5
    norm2 = sum(weight * weight for weight in fos_dict2.values()) ** 0.5

    denominator = norm1 * norm2
    if denominator == 0:
        # Nothing to compare against: treat as "no similarity".
        return 0.0

    return dot_product / denominator

In [29]:
def graph_similarity(paper_set1, paper_set2):
    """Overlap of two sets of paper ids, as intersection size over union size.

    Args:
        paper_set1: set of paper ids.
        paper_set2: set (or other iterable) of paper ids.

    Returns:
        ``len(intersection) / len(union)``, a value between 0 and 1. When both
        sets are empty the ratio is undefined and 0.0 is returned instead of
        dividing by zero.
    """
    union = paper_set1.union(paper_set2)
    if not union:
        # Two items without any link share nothing measurable.
        return 0.0

    intersection = paper_set1.intersection(paper_set2)
    return len(intersection) / len(union)

# Accuracy

### Content similarity with user profile

In [30]:
# Content accuracy: for every recommendation list, the mean similarity between
# the merged field-of-study profile of the input papers and each recommended
# paper. Lists without any recommendation score 0.
for result in tqdm(results.values()):
    per_user_scores = []

    for rec in result['recs']:
        if not rec['output']:
            per_user_scores.append(0)
            continue

        profile_fos = merge_fos_lists([full_set_dict[input_id] for input_id in rec['input']])
        similarities = [
            content_similarity(profile_fos, full_set_dict[rec_id]['fos_w_dict'])
            for rec_id in rec['output']
        ]
        per_user_scores.append(np.mean(similarities))

    result['accuracy_content_foscosine'] = per_user_scores

HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




### Graph similarity with user profile

In [31]:
# Graph accuracy: for every recommendation list, the mean overlap between the
# references/citations of the input papers (pooled into one set) and the
# references/citations of each recommended paper. Lists without any
# recommendation score 0.
# Note: the result key says 'cosine', but graph_similarity computes an
# intersection-over-union ratio.
for result in tqdm(results.values()):
    per_user_scores = []

    for rec in result['recs']:
        if not rec['output']:
            per_user_scores.append(0)
            continue

        profile_links = set()
        for input_id in rec['input']:
            input_paper = full_set_dict[input_id]
            profile_links.update(input_paper['references'], input_paper['citations'])

        overlaps = [
            graph_similarity(profile_links, full_set_dict[rec_id]['ref_cit_set'])
            for rec_id in rec['output']
        ]
        per_user_scores.append(np.mean(overlaps))

    result['accuracy_graph_cosine'] = per_user_scores

HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




# Coverage

### Global items coverage

In [32]:
# Item coverage: share of the dataset's papers that appear in at least one
# recommendation list of the method.
for result in tqdm(results.values()):
    recommended_ids = set().union(*(rec['output'] for rec in result['recs']))

    result['coverage_item_global'] = len(recommended_ids) / len(full_set)

HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




### Global users coverage

In [33]:
# User coverage: how many recommendation lists are complete (exactly 100
# items), empty, or somewhere in between.
# The lists are counted explicitly rather than by collecting `_` in a list
# comprehension: `_` is IPython's "last output" variable, so the previous form
# only worked inside IPython and raised NameError anywhere else.
for result in tqdm(results.values()):
    recs = result['recs']
    n_complete = sum(1 for rec in recs if len(rec['output']) == 100)
    n_empty = sum(1 for rec in recs if not rec['output'])

    result['coverage_user_global'] = {
        'n_complete': n_complete,
        'n_empty': n_empty,
        # Everything that is neither complete nor empty.
        'n_incomplete': len(recs) - n_complete - n_empty,
    }

HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




# Diversity

### Intra-list content dissimilarity

In [34]:
# Content diversity: for every recommendation list, the mean of
# (1 - content similarity) over all unordered pairs of recommended papers.
# Lists with fewer than two items score 0.
for result in tqdm(results.values()):
    list_diversities = []

    for rec in result['recs']:
        rec_ids = rec['output']
        if not rec_ids or len(rec_ids) <= 1:
            list_diversities.append(0)
            continue

        pair_dissimilarities = []
        # Walk from the last item backwards and pair it with every earlier
        # item, so each unordered pair is visited exactly once.
        for anchor_pos in range(len(rec_ids) - 1, 0, -1):
            anchor_fos = full_set_dict[rec_ids[anchor_pos]]['fos_w_dict']
            for other_id in rec_ids[:anchor_pos]:
                other_fos = full_set_dict[other_id]['fos_w_dict']
                pair_dissimilarities.append(1 - content_similarity(anchor_fos, other_fos))

        list_diversities.append(np.mean(pair_dissimilarities))

    result['diversity_content_foscosine'] = list_diversities

HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




### Intra-list graph dissimilarity

In [35]:
# Graph diversity: for every recommendation list, the mean of
# (1 - graph similarity) over all unordered pairs of recommended papers, using
# each paper's combined references/citations set. Lists with fewer than two
# items score 0.
for result in tqdm(results.values()):
    scores = []

    for rec in result['recs']:
        if rec['output'] and len(rec['output']) > 1:
            partial_scores = []

            # Work on a copy: the stack is consumed while enumerating pairs.
            ids_stack = rec['output'].copy()
            current_id = ids_stack.pop()
            while ids_stack:
                # Compare the current paper with every paper still on the stack.
                for other_id in ids_stack:
                    partial_scores.append(1-graph_similarity(full_set_dict[current_id]['ref_cit_set'],
                                                             full_set_dict[other_id]['ref_cit_set']))

                # Advance only after the inner loop has finished. Popping inside
                # the loop shrank the list while it was being iterated, so only
                # a subset of the pairs was scored.
                current_id = ids_stack.pop()

            score = np.mean(partial_scores)
            scores.append(score)
        else:
            scores.append(0)

    result['diversity_graph_cosine'] = scores

HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




### Global diversity

In [36]:
# Global diversity: Shannon entropy (base 2) of the distribution of
# recommendations over papers, computed from how often each paper of the
# dataset is recommended by the method.
for result in tqdm(results.values()):
    rec_counts = dict.fromkeys((paper['id'] for paper in full_set), 0)

    for rec in result['recs']:
        for rec_id in rec['output']:
            rec_counts[rec_id] += 1

    n_recs_total = sum(rec_counts.values())

    # Papers that are never recommended are left out of the sum.
    shares = [count / n_recs_total for count in rec_counts.values() if count > 0]

    entropy_terms = [share * np.log2(share) for share in shares]
    result['diversity_global'] = -1 * sum(entropy_terms)

HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




# Novelty

### Average publication year

In [37]:
# Novelty by recency: mean 'year' of the recommended papers for every
# recommendation list. Lists without any recommendation score 0.
for result in tqdm(results.values()):
    mean_years = []

    for rec in result['recs']:
        if not rec['output']:
            mean_years.append(0)
            continue

        years = [full_set_dict[rec_id]['year'] for rec_id in rec['output']]
        mean_years.append(np.mean(years))

    result['novelty_average_pub_year'] = mean_years

HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




### Inverse popularity (approx. by citations)

In [38]:
# Novelty by inverse popularity: for every recommendation list, the negated
# mean of log2((citation count + 1) / dataset size) over the recommended
# papers. Lists without any recommendation score 0.
for result in tqdm(results.values()):
    novelty_scores = []

    for rec in result['recs']:
        if not rec['output']:
            novelty_scores.append(0)
            continue

        log_popularities = [
            np.log2((len(full_set_dict[rec_id]['citations']) + 1) / len(full_set))
            for rec_id in rec['output']
        ]
        novelty_scores.append(np.mean(log_popularities) * -1)

    result['novelty_inverse_popularity'] = novelty_scores

HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




# generate results file

### raw results

In [39]:
# Remove the recommendation lists so that only the computed metrics remain in
# `results` when it is written out. pop() is given a default so that running
# this cell a second time does not raise KeyError once 'recs' is already gone.
for result in results.values():
    result.pop('recs', None)

In [40]:
# Write the metrics of every method to the raw results file.
with open(RAW_OUTPUT_PATH, 'w') as raw_output_file:
    json.dump(results, raw_output_file)

### filtered results

In [41]:
# A position is invalid as soon as any method produced no list, or a list of
# fewer than 50 items, at that position; the remaining positions are kept for
# the filtered results.
invalid_recs_pos = set()

for filename in result_files:
    with open(RESULTS_DIR_PATH + filename) as file:
        method_recs = json.load(file)

    invalid_recs_pos.update(
        pos for pos, rec in enumerate(method_recs)
        if not rec['output'] or len(rec['output']) < 50
    )

valid_recs_pos = [pos for pos in range(TEST_SET_SIZE) if pos not in invalid_recs_pos]

In [42]:
# Metrics that are a single value per method are copied unchanged; metrics
# that hold one value per recommendation list are restricted to the valid
# positions. The key order below is the order written to the output file.
global_metrics = ('coverage_item_global', 'coverage_user_global', 'diversity_global')
per_list_metrics = (
    'accuracy_content_foscosine',
    'accuracy_graph_cosine',
    'novelty_average_pub_year',
    'novelty_inverse_popularity',
    'diversity_graph_cosine',
    'diversity_content_foscosine',
)

filtered_results = {}

for method_name, result in results.items():
    filtered_result = {'method_name': method_name}

    for metric in global_metrics:
        filtered_result[metric] = result[metric]

    for metric in per_list_metrics:
        values = result[metric]
        filtered_result[metric] = [values[pos] for pos in valid_recs_pos]

    filtered_results[method_name] = filtered_result

In [43]:
# Write the filtered metrics of every method to the filtered results file.
with open(FILTERED_OUTPUT_PATH, 'w') as filtered_output_file:
    json.dump(filtered_results, filtered_output_file)