In [1]:
%%capture
import json
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
import numpy as np
import scipy as sp
from tqdm.notebook import tqdm

In [2]:
# Load the full paper collection from disk.
with open('./data/aan_full.json') as dataset_file:
    full_set = json.load(dataset_file)

# Index every record by its 'id' so later cells can look papers up directly.
full_set_dict = {paper['id']: paper for paper in full_set}

In [3]:
# Load the recommendation lists of the methods under evaluation.
# `results` maps a method name to {'method_name': ..., 'recs': <parsed JSON>};
# the metric cells further down attach one extra key per metric.
results = {}
results_dir_path = './results/'

# Keep only files whose name ends in '.json' (a substring test would also pick
# up names such as 'x.json.bak'), skip aggregated '*results*' files and restrict
# the run to the selected methods. Sorting makes the load order independent of
# the order in which the filesystem lists the directory.
result_files = sorted(
    f for f in os.listdir(results_dir_path)
    if f.endswith('.json')
    and 'results' not in f
    and ('method7_kea' in f or 'method20' in f)
)

for filename in result_files:
    # The method name is everything before the first dot of the file name.
    method_name = filename.split('.')[0]

    with open(os.path.join(results_dir_path, filename)) as file:
        recs = json.load(file)

    # Create the entry the first time a method is seen; a later file that maps
    # to the same method name only replaces its 'recs'.
    results.setdefault(method_name, {'method_name': method_name})['recs'] = recs

In [5]:
# One text per paper: title and abstract joined by a single space.
corpus = []
for paper in full_set:
    corpus.append(paper['title'] + ' ' + paper['abstract'])

# Fit the TF-IDF model (English stop words removed) on the whole collection.
vectorizer = TfidfVectorizer(stop_words='english')
vectorizer.fit(corpus)

In [6]:
# Cache each paper's TF-IDF representation on its own record under
# 'tfidf_vector' (the result of transforming a one-document list).
for paper in tqdm(full_set):
    paper_text = paper['title'] + ' ' + paper['abstract']
    paper['tfidf_vector'] = vectorizer.transform([paper_text])

HBox(children=(FloatProgress(value=0.0, max=15366.0), HTML(value='')))




# Accuracy

### Content similarity with user profile

In [8]:
# Content accuracy: for every recommendation, the mean cosine similarity between
# the TF-IDF vector of the concatenated input papers and the TF-IDF vector of
# each recommended paper. Recommendations with an empty output are scored 0.
for method_result in tqdm(results.values()):
    content_scores = []

    for rec in method_result['recs']:
        if not rec['output']:
            content_scores.append(0)
            continue

        # The user profile is the text of all input papers joined by spaces.
        profile_text = ' '.join(
            full_set_dict[pid]['title'] + ' ' + full_set_dict[pid]['abstract']
            for pid in rec['input']
        )
        profile_matrix = vectorizer.transform([profile_text])

        # Stack the cached vectors of the recommended papers into one matrix.
        recommended_matrix = sp.sparse.vstack(
            [full_set_dict[pid]['tfidf_vector'] for pid in rec['output']]
        )

        similarities = cosine_similarity(profile_matrix, recommended_matrix)
        content_scores.append(np.mean(similarities))

    method_result['accuracy_content_tfidf'] = content_scores

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




### Graph similarity with user profile

In [9]:
# Graph accuracy: for every recommendation, the mean Jaccard similarity between
# the citation neighbourhood of the input papers (all their references and
# citations pooled together) and the neighbourhood of each recommended paper.
# Recommendations with an empty output are scored 0.
for result in tqdm(results.values()):
    scores = []

    for rec in result['recs']:
        if rec['output']:
            # Pooled references + citations of every input paper.
            input_ref_set = set()
            for paper_id in rec['input']:
                input_ref_set.update(full_set_dict[paper_id]['references'])
                input_ref_set.update(full_set_dict[paper_id]['citations'])

            partial_scores = []
            for paper_id in rec['output']:
                paper = full_set_dict[paper_id]
                ref_set = set(paper['references']) | set(paper['citations'])

                union = input_ref_set | ref_set
                if union:
                    partial_scores.append(len(input_ref_set & ref_set) / len(union))
                else:
                    # Both neighbourhoods are empty: the Jaccard ratio is 0/0.
                    # Score it 0 (no shared neighbour) instead of raising
                    # ZeroDivisionError and aborting the whole cell.
                    partial_scores.append(0)

            scores.append(np.mean(partial_scores))
        else:
            scores.append(0)

    result['accuracy_graph_jaccard'] = scores

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




# Coverage

### Global items coverage

In [10]:
# Item coverage: fraction of the collection that appears in at least one
# recommendation list of the method.
for method_result in tqdm(results.values()):
    recommended_ids = set().union(*(rec['output'] for rec in method_result['recs']))
    method_result['coverage_item_global'] = len(recommended_ids) / len(full_set)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




### Global users coverage

In [11]:
# User coverage: how many recommendation lists are complete, empty or in between.
# A list counts as complete when it holds exactly 100 items.
for result in tqdm(results.values()):
    recs = result['recs']

    # Count with sum() over a generator. The previous `[_ for rec in ...]` form
    # only worked because IPython happens to define `_` (its last-output
    # variable); the same code raises NameError in a plain Python run.
    n_complete = sum(1 for rec in recs if len(rec['output']) == 100)
    n_empty = sum(1 for rec in recs if not rec['output'])

    result['coverage_user_global'] = {
        'n_complete': n_complete,
        'n_empty': n_empty,
        'n_incomplete': len(recs) - n_complete - n_empty,
    }

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




# Diversity

### Intra-list content dissimilarity

In [12]:
# Content diversity: for every recommendation, take each recommended paper in
# turn, average its cosine distance to the other recommended papers, then
# average those per-paper values. Lists with fewer than two items are scored 0.
for method_result in tqdm(results.values()):
    diversity_scores = []

    for rec in method_result['recs']:
        recommended_ids = rec['output']
        if not (recommended_ids and len(recommended_ids) > 1):
            diversity_scores.append(0)
            continue

        per_paper_distances = []
        for anchor_id in recommended_ids:
            anchor_vector = full_set_dict[anchor_id]['tfidf_vector']
            # Every recommended paper whose id differs from the anchor's.
            rest_matrix = sp.sparse.vstack([
                full_set_dict[pid]['tfidf_vector']
                for pid in recommended_ids
                if pid != anchor_id
            ])
            distances = cosine_distances(anchor_vector, rest_matrix)
            per_paper_distances.append(np.mean(distances))

        diversity_scores.append(np.mean(per_paper_distances))

    method_result['diversity_content_tfidf'] = diversity_scores

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




### Intra-list graph dissimilarity

In [14]:
# Graph diversity: for every recommendation, the mean Jaccard distance
# (1 - Jaccard similarity) between the citation neighbourhoods (references +
# citations) of every ordered pair of distinct recommended papers.
# Lists with fewer than two items are scored 0.
for result in tqdm(results.values()):
    scores = []

    for rec in result['recs']:
        if rec['output'] and len(rec['output']) > 1:
            # Build each recommended paper's neighbourhood once instead of
            # rebuilding it inside the pairwise loop.
            neighbour_sets = {}
            for paper_id in rec['output']:
                paper = full_set_dict[paper_id]
                neighbour_sets[paper_id] = set(paper['references']) | set(paper['citations'])

            partial_scores = []
            for current_paper_id in rec['output']:
                current_ref_set = neighbour_sets[current_paper_id]

                for other_paper_id in rec['output']:
                    if other_paper_id == current_paper_id:
                        continue
                    other_ref_set = neighbour_sets[other_paper_id]

                    # Compare the two recommended papers with each other. The
                    # earlier code compared against a set left over from a
                    # different cell, so the result was not a pairwise
                    # intra-list measure.
                    union = current_ref_set | other_ref_set
                    if union:
                        similarity = len(current_ref_set & other_ref_set) / len(union)
                    else:
                        # Both neighbourhoods are empty (0/0): count the pair
                        # as sharing nothing rather than dividing by zero.
                        similarity = 0

                    partial_scores.append(1 - similarity)

            score = np.mean(partial_scores)
            scores.append(score)
        else:
            scores.append(0)

    result['diversity_graph_jaccard'] = scores

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




### Global diversity

In [15]:
# Global diversity: Shannon entropy (base 2) of the distribution of recommended
# papers over all recommendation lists of the method.
for method_result in tqdm(results.values()):
    # Start every paper of the collection at zero recommendations.
    rec_counts = {paper['id']: 0 for paper in full_set}

    for rec in method_result['recs']:
        for pid in rec['output']:
            rec_counts[pid] += 1

    n_total = sum(rec_counts.values())

    # Papers that were never recommended are left out of the entropy sum.
    probabilities = [count / n_total for count in rec_counts.values() if count > 0]
    entropy_terms = [p * np.log2(p) for p in probabilities]

    method_result['diversity_global'] = -1 * sum(entropy_terms)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




# Novelty

### Average publication year

In [16]:
# Novelty (recency): mean 'year' of the recommended papers per recommendation.
# Recommendations with an empty output are scored 0.
for method_result in tqdm(results.values()):
    year_scores = []

    for rec in method_result['recs']:
        if rec['output']:
            years = [full_set_dict[pid]['year'] for pid in rec['output']]
            year_scores.append(np.mean(years))
        else:
            year_scores.append(0)

    method_result['novelty_average_pub_year'] = year_scores

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




### Inverse popularity (approx. by citations)

In [17]:
# Novelty (inverse popularity): per recommendation, the negated mean of
# log2((citation count + 1) / collection size) over the recommended papers.
# Recommendations with an empty output are scored 0.
for method_result in tqdm(results.values()):
    novelty_scores = []

    for rec in method_result['recs']:
        if not rec['output']:
            novelty_scores.append(0)
            continue

        log_popularities = [
            np.log2((len(full_set_dict[pid]['citations']) + 1) / len(full_set))
            for pid in rec['output']
        ]
        novelty_scores.append(np.mean(log_popularities) * -1)

    method_result['novelty_inverse_popularity'] = novelty_scores

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




# generate results file

In [18]:
# Drop the raw recommendation lists so only the method name and the computed
# metrics are written out. The default argument keeps this cell safe to re-run:
# a second execution no longer raises KeyError once 'recs' is already gone.
for result in results.values():
    result.pop('recs', None)

In [19]:
# Merge the freshly computed metrics into the previously stored ones.
# The path is built from `results_dir_path`, the same directory the results
# file is written to below.
old_results_path = os.path.join(results_dir_path, 'evaloff_results.json')

if os.path.exists(old_results_path):
    with open(old_results_path) as f:
        old_results = json.load(f)
else:
    # First run: nothing has been stored yet, so start from an empty mapping.
    old_results = {}

# Entries for methods evaluated in this run replace any stored entry of the same name.
old_results.update(results)

In [20]:
# Write the merged metrics back to the results file.
evaloff_path = results_dir_path + 'evaloff_results.json'
with open(evaloff_path, 'w') as out_file:
    json.dump(old_results, out_file)

# generate filtered results file

In [23]:
# Find the recommendation positions that are usable for every method: a position
# is invalid as soon as any method returned fewer than 50 items for it.
invalid_recs_pos = set()

# Only files whose name ends in '.json'; aggregated '*results*' files are skipped.
result_files = [
    f for f in os.listdir(results_dir_path)
    if f.endswith('.json') and 'results' not in f
]

n_recs_per_file = []
for filename in result_files:
    with open(os.path.join(results_dir_path, filename)) as file:
        recs = json.load(file)

    n_recs_per_file.append(len(recs))
    for i, rec in enumerate(recs):
        if not rec['output'] or len(rec['output']) < 50:
            invalid_recs_pos.add(i)

# Derive the number of positions from the data instead of assuming 3000. Using
# the shortest file keeps every retained position inside the range of every
# recommendation file that was read. 3000 is kept only as the fallback when no
# recommendation file is present.
n_positions = min(n_recs_per_file) if n_recs_per_file else 3000

valid_recs_pos = [i for i in range(n_positions) if i not in invalid_recs_pos]

In [26]:
# Build, for every stored method, a copy of its metrics in which the
# per-recommendation score lists are restricted to the valid positions.
filtered_results = {}

# Metrics holding one value per method: copied unchanged.
global_metric_keys = (
    'coverage_item_global',
    'coverage_user_global',
    'diversity_global',
)

# Metrics holding one score per recommendation: indexed by `valid_recs_pos`.
per_rec_metric_keys = (
    'accuracy_content_tfidf',
    'accuracy_graph_jaccard',
    'novelty_average_pub_year',
    'novelty_inverse_popularity',
    'diversity_graph_jaccard',
    'diversity_content_tfidf',
)

for method_name, old_result in old_results.items():
    filtered_result = {'method_name': method_name}

    for key in global_metric_keys:
        filtered_result[key] = old_result[key]

    for key in per_rec_metric_keys:
        per_rec_scores = old_result[key]
        filtered_result[key] = [per_rec_scores[i] for i in valid_recs_pos]

    filtered_results[method_name] = filtered_result

In [28]:
# Write the filtered metrics next to the unfiltered results file.
filtered_path = results_dir_path + 'evaloff_filtered_results.json'
with open(filtered_path, 'w') as out_file:
    json.dump(filtered_results, out_file)