In [None]:
import math
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import scipy as sp
import seaborn as sns
from IPython.display import display, HTML
from matplotlib.backends.backend_pdf import PdfPages
from os import listdir
from os.path import isfile, join

In [None]:
# Test collections to process. The processing loops further down iterate over the upper-case
# name `COLLECTIONS`, which used to be undefined (NameError on a fresh kernel); the original
# lower-case name is kept as an alias of the same list.
COLLECTIONS = ['TREC3', 'TREC5', 'TREC6', 'TREC7', 'TREC8', 'TREC2001', 'R04', 'TB06', 'TB06M', 'WEB14']
collections = COLLECTIONS

In [None]:
# Cut-off depth: each run is truncated to at most this many rows per topic.
pool_D = 1_000

# Create "annotated" runs

In [None]:
# Support methods and variables used in building the runs and computing WUC.
# `weight` starts as an empty float array; it is replaced by the full table of
# per-rank weights at the end of this cell.
weight = np.empty(0)

# Custom function to add to a dictionary
def myadd(d, key, system, position):
    """Record ``position`` for ``system`` under ``key`` in the nested dictionary ``d``.

    ``d`` maps key -> {system name: value} and is modified in place; nothing is returned.
    The system name is always stored as ``str(system)``. Previously the first insertion for a
    key used the raw ``system`` object while later insertions used ``str(system)``, so a
    non-string system could end up under two different keys.
    """
    d.setdefault(key, {})[str(system)] = position

# Compute weights of reference documents according to WUC formula
def get_weight(j):
    """Return the weight of the document at 1-based rank ``j``.

    Ranks are handled in groups of five. With ``m`` the group of ``j`` (ranks 5m-4 .. 5m):
      * last rank of the group (j == 5 * m):
            round(zeta_function(200) - zeta_function(m - 1), 2)
      * any other rank of the group:
            round(get_weight(5 * m) - 1 / m + 5 / j, 2)

    Returns None when ``j`` is not a whole number in 1..5000, which is what the previous
    exhaustive search over m in 1..1000 did when it found no match.

    The previous version began with a lookup in the global ``weight`` array
    (``j in weight`` / ``weight.get(j)``). That test compared ``j`` with the stored weight
    values rather than with ranks, and ``numpy.ndarray`` has no ``get`` method, so the branch
    could only raise AttributeError. It has been removed; the value is now computed directly.
    """
    max_rank = 5 * 1000  # the original search covered m = 1..1000
    try:
        is_whole = (j == int(j))
    except (TypeError, ValueError, OverflowError):
        return None
    if not is_whole or not 1 <= j <= max_rank:
        return None

    j = int(j)
    m = (j + 4) // 5  # group index: ceil(j / 5)
    if j == 5 * m:
        return round(zeta_function(200) - zeta_function(m - 1), 2)
    return round(get_weight(5 * m) - (1 / m) + (5 / j), 2)

# Compute Zeta function (used in computing the weights)
def zeta_function(m):
    """Return 0 when ``m`` is 0, otherwise 1 + (1/1 + 1/2 + ... + 1/m).

    For negative ``m`` no term is added and the result is 1.

    NOTE(review): the running total starts at 1, so for m >= 1 the result is one more than the
    plain sum 1/1 + ... + 1/m, while m == 0 yields 0. Confirm that this offset is intended.
    """
    if m == 0:
        return 0
    total = 1
    for step in range(m):
        total = total + (1 / (step + 1))
    return total

# Set up the lookup table of weights: weight[r - 1] is the weight of 1-based rank r, for
# r = 1..1000. It is a numpy array (not a dictionary) and is built in a single pass; calling
# np.append once per rank copied the whole array on every iteration.
weight = np.array([get_weight(rank) for rank in range(1, 1001)])

In [None]:
def get_runs(systems, topics, RUN_PATH, pool_depth, collection=None,
             out_dir='../../pickles/pickles_results'):
    '''
    Read the gzipped run files in RUN_PATH, build the dictionaries used by the WUC
    implementations and save them as pickle files.

    Parameters
    ----------
    systems : collection of str
        Systems to process. A file is used when its name, stripped of "input." and ".gz",
        is one of these.
    topics : iterable
        Topic identifiers; each one is matched as str against the first column of the runs.
    RUN_PATH : str
        Directory containing the run files (tab-separated, gzip-compressed, six columns:
        topic, unused, doc, rank, rel_value, system).
    pool_depth : int
        Maximum number of rows kept per (system, topic), taken in file order.
    collection : str, optional
        Prefix of the output file names. When omitted, the notebook-level variable
        COLLECTION is used, as before (NameError if it is not defined).
    out_dir : str, optional
        Directory receiving the pickles; defaults to the previously hard-coded location.

    Output files (in out_dir)
    -------------------------
    {collection}_WUC_df_run.pickle
        Indexed by (topic, system); column rank_list holds the array of retained doc ids.
    {collection}_WUC_df_doc_sys.pickle
        Indexed by topic; column doc_system_dict maps doc -> {system: 1-based position}.
    {collection}_WUC_df_doc_score.pickle
        Columns topic and doc_system_dict, one row per processed (system, topic); every row of
        a topic refers to the same mapping doc -> {system: normalized relevance score}. This
        layout is unchanged because the WUC computation reads it.

    Raises
    ------
    KeyError
        If a processed run contains no row for one of the topics.
    '''
    if collection is None:
        # Backward compatibility: callers used to set COLLECTION before calling.
        collection = COLLECTION

    # Load the runs' file names in a list
    files = [f for f in listdir(RUN_PATH) if isfile(join(RUN_PATH, f)) and f != ".DS_Store"]
    wanted_systems = set(systems)

    run_rows = []            # (topic, system, retained doc ids)
    score_rows = []          # (topic, shared score dictionary of that topic)
    topic_labels = {}        # str(topic) -> topic as given by the caller
    positions_by_topic = {}  # str(topic) -> {doc: {system: position}}
    scores_by_topic = {}     # str(topic) -> {doc: {system: normalized score}}

    # Read the runs
    for i_file, file in enumerate(files):
        system = file.replace("input.", "").replace(".gz", "")
        if system not in wanted_systems:
            continue
        print("system {}/{}, ".format(i_file + 1, len(files)), end='')

        file_content = pd.read_csv(join(RUN_PATH, file), compression='gzip', header=None, sep='\t')
        file_content.columns = ['topic', 'zero', 'doc', 'rank', 'rel_value', 'system']
        # Keyword form: the positional axis argument of drop() is gone in pandas 2.
        file_content = file_content.drop(columns='zero')
        file_content['topic'] = file_content['topic'].astype(str)
        file_content['system'] = file_content['system'].astype(str)
        file_content = file_content.set_index('topic')

        for topic in topics:
            topic_key = str(topic)
            topic_labels.setdefault(topic_key, topic)
            # The per-topic dictionaries are created the first time a topic is seen. They used
            # to be created only while handling the first listed file, which failed with an
            # IndexError whenever that file was not one of the requested systems.
            doc_positions = positions_by_topic.setdefault(topic_key, {})
            doc_scores = scores_by_topic.setdefault(topic_key, {})

            # Selecting with a list always yields a DataFrame, even for a single row, so no
            # special case (previously a bare except) is needed.
            top_rows = file_content.loc[[topic_key]].head(pool_depth)
            docs = top_rows['doc'].to_numpy()
            run_rows.append((topic, system, docs))

            rel_values = top_rows['rel_value']
            rel_value_mean = rel_values.mean()
            rel_value_std = rel_values.std()
            # A single row gives a NaN standard deviation and constant scores give 0; use 1
            # so that the normalization below stays defined.
            if rel_value_std == 0 or math.isnan(rel_value_std):
                rel_value_std = 1
            if math.isnan(rel_value_mean):
                rel_value_mean = 0

            # WUC V4 uses a "normalized score", we interpret this as the normalized relevance
            # score of the documents retrieved from a system (this value can be found in the
            # runs and varies from system to system)
            for position, (doc, rel_value) in enumerate(zip(docs, rel_values.tolist()), start=1):
                rel_value = round(((rel_value - rel_value_mean) / rel_value_std), 3)
                if math.isnan(rel_value):
                    rel_value = 0
                doc_positions.setdefault(doc, {})[system] = position
                doc_scores.setdefault(doc, {})[system] = rel_value

            score_rows.append((topic, doc_scores))

    # Assemble each table once instead of growing DataFrames row by row
    df_run = pd.DataFrame(run_rows, columns=['topic', 'system', 'rank_list'])
    df_run = df_run.set_index(['topic', 'system'])
    df_doc_sys = pd.DataFrame(
        [(topic_labels[key], positions) for key, positions in positions_by_topic.items()],
        columns=['topic', 'doc_system_dict'],
    ).set_index('topic').sort_index()
    df_doc_score = pd.DataFrame(score_rows, columns=['topic', 'doc_system_dict'])

    # Save the dictionaries in .pickles files
    df_run.to_pickle('{}/{}_WUC_df_run.pickle'.format(out_dir, collection))
    df_doc_sys.to_pickle('{}/{}_WUC_df_doc_sys.pickle'.format(out_dir, collection))
    df_doc_score.to_pickle('{}/{}_WUC_df_doc_score.pickle'.format(out_dir, collection))

In [None]:
# Create the dictionaries and save them, one collection at a time.
# Iterates the `collections` list from the configuration cell; the loop used to reference
# `COLLECTIONS`, which is not defined there and raised NameError on a fresh kernel.
# The loop variable must stay a notebook-level `COLLECTION`: get_runs reads it to name its
# output files.
for COLLECTION in collections:
    print('Working on {}'.format(COLLECTION))

    # Real table and run path
    CSV_TABLE = '../../src/Tables/{}.csv'.format(COLLECTION)
    RUN_PATH = '../run/{}-input/'.format(COLLECTION)

    # Read real table: its index gives the systems, its columns the topics
    real_table = pd.read_csv(CSV_TABLE, sep=',', header=0, index_col=0)
    systems = real_table.index.values
    topics = real_table.columns.values

    get_runs(systems, topics, RUN_PATH, pool_D)

    print('done for {}'.format(COLLECTION))

# Compute WUC

In [None]:
# Compute WUC for each collection

def _wuc_sums(system, ranked_list, doc_positions, doc_scores, rank_weights):
    """Return (BASIC, V1, V2, V3, V4) for one system on one topic.

    system       : name of the system being scored; its own entries are excluded.
    ranked_list  : documents retrieved by `system`, in the order stored in the run table.
    doc_positions: doc -> {system name: position at which that system retrieved the doc}.
    doc_scores   : doc -> {system name: normalized score of the doc in that system}.
    rank_weights : rank_weights[i] is the weight applied to the document at position i
                   (0-based) of ranked_list.

    The weight of a document is looked up by its position in `ranked_list`. It used to be
    looked up by the document's position in the pooled per-topic dictionary, whose order
    depends on the order in which the runs were read rather than on this system's ranking.

    NOTE(review): 1501 is a hard-coded offset kept from the original code; confirm that it is
    the intended constant.
    """
    sum_basic = 0
    sum_V1 = 0
    sum_V2 = 0
    sum_V3 = 0
    sum_V4 = 0

    for i_doc, doc in enumerate(ranked_list):
        if doc not in doc_positions:
            # As before, documents missing from the pooled dictionary contribute nothing.
            continue
        doc_weight = rank_weights[i_doc]
        dict_syst_rank = doc_positions[doc]

        # WUC Basic counts how many other systems retrieve a specific document
        ref_count = len(dict_syst_rank) - 1
        sum_basic += ref_count

        # WUC V2 adds a weight to the number of systems that retrieve a specific document
        sum_V2 += ref_count * doc_weight

        for sys_name, rank_of_doc_retr in dict_syst_rank.items():
            if sys_name != system:
                # WUC V1 weights the rank at which a document is retrieved
                sum_V1 += (1501 - rank_of_doc_retr)
                # WUC V3 weights both the rank at which a document is retrieved and the rank
                # at which the document is found in the specific system's ranked list
                sum_V3 += (1501 - rank_of_doc_retr) * doc_weight

        for sys_name, score_of_doc_retr in doc_scores.get(doc, {}).items():
            if sys_name != system:
                # WUC V4 weights the normalized score of a document according to its rank in
                # the specific system's ranked list
                sum_V4 += score_of_doc_retr * doc_weight

    return sum_basic, sum_V1, sum_V2, sum_V3, sum_V4


# Iterates the `collections` list from the configuration cell; the loop used to reference
# `COLLECTIONS`, which is not defined there and raised NameError on a fresh kernel.
for COLLECTION in collections:
    print('Working on {}'.format(COLLECTION))

    # Real table path
    CSV_TABLE = '../../src/Tables/{}.csv'.format(COLLECTION)

    # Read real table and get the topics' numbers
    real_table = pd.read_csv(CSV_TABLE, sep=',', header=0, index_col=0)
    topics = real_table.columns.values

    # Read the previously computed dictionaries
    df_run = pd.read_pickle('../../pickles/pickles_results/{}_WUC_df_run.pickle'.format(COLLECTION))
    df_doc_sys = pd.read_pickle('../../pickles/pickles_results/{}_WUC_df_doc_sys.pickle'.format(COLLECTION))
    df_doc_score = pd.read_pickle('../../pickles/pickles_results/{}_WUC_df_doc_score.pickle'.format(COLLECTION))
    df_doc_score = df_doc_score.set_index('topic')

    # Collect one record per (topic, system) and build the table once at the end
    records = []

    # For each topic, compute the five WUC variants for each system
    for topic in topics:
        df_run_top = df_run.loc[topic]
        dict_doc_sys_top = df_doc_sys.loc[topic, 'doc_system_dict']
        # List selection always yields a Series, whether the topic has one row or many;
        # the first row is used, as before.
        dict_doc_score_top = df_doc_score.loc[[topic], 'doc_system_dict'].iloc[0]

        for system, run_row in df_run_top.iterrows():
            sums = _wuc_sums(system, run_row['rank_list'], dict_doc_sys_top, dict_doc_score_top, weight)
            records.append((topic, system) + sums)

    df_total = pd.DataFrame(records, columns=['topic', 'system', 'BASIC', 'V1', 'V2', 'V3', 'V4'])

    # Split the table into one system x topic table per WUC version and save each to csv.
    # pivot() is called with keywords: its arguments are keyword-only in pandas 2.
    for variant in ['BASIC', 'V1', 'V2', 'V3', 'V4']:
        table = df_total.pivot(index='system', columns='topic', values=variant)
        table.to_csv('../../pickles/WUC_1000_bis/Table/{}_{}.csv'.format(COLLECTION, variant),
                     index=True, header=True)

    print('\ndone for {}.'.format(COLLECTION))