In [None]:
import numpy as np
import pandas as pd
from os.path import isfile, join

# Names of the test collections processed by the cells below. Each name is
# used to build the per-collection table, run-directory and qrels file paths.
COLLECTIONS = [
    'TREC3', 'TREC5', 'TREC6', 'TREC7', 'TREC8',
    'TREC2001', 'R04', 'TB06', 'TB06M', 'WEB14',
]

# Directory holding one '<collection>.csv' table per collection
# (read below with systems as the row index and topics as the columns).
CSV_TABLE_PATH = '../../src/Tables/'
# Parent directory of the per-collection run directories
# (each run is read as 'input.<system>.gz').
RUN_PATH = '../../run/'
# Directory holding the 'qrels.<collection>.txt' relevance judgments.
QRELS_PATH = '../../src/qrels/'

In [None]:
def get_pools(systems, topics, RUN_PATH, top=False, top_k=100):
    '''
    Read the run file of every system and stack them into a single pool.

    Each run is read from ``<RUN_PATH>/input.<system>.gz``: a gzip-compressed,
    whitespace-separated file without header whose six columns are named here
    ``topic, zero, doc, rank, rel_value, system``. The ``zero`` column is
    discarded and ``topic`` / ``system`` are converted to strings.

    Parameters
    ----------
    systems : iterable
        System names; each one selects the file ``input.<system>.gz``.
    topics : iterable
        Topic ids to keep when ``top`` is true (compared as strings). Ignored
        when ``top`` is false.
    RUN_PATH : str
        Directory containing the run files.
    top : bool, default False
        If true, keep for every system only the first ``top_k`` rows of each
        requested topic; otherwise keep the whole run.
    top_k : int, default 100
        Number of rows kept per (system, topic) when ``top`` is true.

    Returns
    -------
    pandas.DataFrame
        Columns ``topic, doc, rank, rel_value, system`` with a fresh integer
        index. Empty (but with those columns) when nothing was collected.
    '''
    frames = []
    for system in systems:
        file = 'input.{}'.format(system)
        print('system {}'.format(system))
        file_content = pd.read_csv(join(RUN_PATH, file + '.gz'), compression='gzip', header=None, delimiter=r"\s+")
        file_content.columns = ['topic', 'zero', 'doc', 'rank', 'rel_value', 'system']
        # Keyword form: the positional ``axis`` argument is gone in pandas 2.
        file_content = file_content.drop(columns='zero')
        file_content['topic'] = file_content['topic'].astype(str)
        file_content['system'] = file_content['system'].astype(str)
        if top:
            # Split the run once instead of re-scanning it for every topic.
            rows_by_topic = {
                key: rows for key, rows in file_content.groupby('topic', sort=False)
            }
            # Iterate over ``topics`` (not over the groups) so that the pool
            # keeps the caller's topic order and ignores unrequested topics.
            for topic in topics:
                rows = rows_by_topic.get(str(topic))
                if rows is not None:
                    # NOTE(review): this keeps the first ``top_k`` rows in file
                    # order; it assumes the run is already sorted by rank
                    # within each topic -- TODO confirm.
                    frames.append(rows.head(top_k))
        else:
            frames.append(file_content)

    if not frames:
        return pd.DataFrame(columns=['topic', 'doc', 'rank', 'rel_value', 'system'])
    # A single concat replaces the repeated (quadratic, and removed in
    # pandas 2.0) ``DataFrame.append`` calls.
    return pd.concat(frames, ignore_index=True)

In [None]:
# Build "our" qrels: take the top 100 documents retrieved by each system on
# each topic, pool them and remove duplicates. mu and sigma are then computed
# from the relevance those documents have in the real qrels; pooled documents
# that do not appear in the real qrels are discarded.
for collection in COLLECTIONS:
    real_table = pd.read_csv(CSV_TABLE_PATH + collection + '.csv', sep=',', header=0, index_col=0)
    systems = real_table.index.values
    topics = real_table.columns.values

    df_run = get_pools(systems, topics, RUN_PATH + collection, top=True)
    # One row per distinct (topic, doc) pair, whichever systems retrieved it.
    pool = df_run[['topic', 'doc']].astype({'topic': str}).drop_duplicates(keep='first')

    # Any run of whitespace separates fields, as for the run files, so tab- or
    # multi-space-delimited qrels are read correctly too.
    qrels = pd.read_csv(QRELS_PATH + 'qrels.' + collection + '.txt', sep=r'\s+', header=None)
    qrels.columns = ['topic', 'zero', 'doc', 'relevant']
    qrels['topic'] = qrels['topic'].astype(str)

    # Inner join on (topic, doc): keeps only pooled documents that have a
    # judgment. Doing it once for the whole collection avoids per-topic
    # ``.loc[topic]`` lookups, which return a Series when a topic has a single
    # row and raise KeyError when a pooled topic has no judgments at all.
    judged = pool.merge(qrels[['topic', 'doc', 'relevant']], on=['topic', 'doc'], how='inner')
    judged['relevant'] = judged['relevant'].fillna(0).astype(int)

    # Mean relevance per pooled topic, in first-seen topic order. A topic with
    # no judged pooled document becomes NaN, which deliberately propagates to
    # the printed summary instead of being silently skipped.
    # NOTE(review): the mean of ``relevant`` equals the fraction of relevant
    # documents only if judgments are binary 0/1 -- TODO confirm for
    # collections with graded or negative judgments.
    means = (
        judged.groupby('topic')['relevant'].mean()
        .reindex(pd.unique(pool['topic']))
        .values
    )

    print(collection, len(topics), np.mean(means), np.std(means))

In [None]:
# Compute mu and sigma of the per-topic mean relevance using the real qrels.
# mu is printed as a percentage (x100); sigma is printed on the original
# scale, rounded to three decimals.
for collection in COLLECTIONS:
    # Any run of whitespace separates fields, so tab- or multi-space-delimited
    # qrels files are read correctly too.
    qrels = pd.read_csv(QRELS_PATH + 'qrels.' + collection + '.txt', sep=r'\s+', header=None)
    qrels.columns = ['topic', 'zero', 'doc', 'relevant']

    # One mean per topic, in first-seen topic order. The means are kept at full
    # precision: rounding each of them to two decimals before aggregating (as
    # was done previously) distorts both mu and sigma; only the printed sigma
    # is rounded.
    # NOTE(review): the mean of ``relevant`` equals the fraction of relevant
    # documents only if judgments are binary 0/1 -- TODO confirm for
    # collections with graded or negative judgments.
    means = qrels.groupby('topic', sort=False)['relevant'].mean().values

    print(collection, 100 * np.mean(means), round(np.std(means), 3))