In [None]:
import pandas as pd
import numpy as np
import scipy as sp
from IPython.display import display, HTML
import seaborn as sns
import pylab as plt
from matplotlib.backends.backend_pdf import PdfPages
from tqdm import tqdm_notebook as tqdm
from os import listdir
from os.path import isfile, join
sns.set_style('whitegrid')
import matplotlib
%matplotlib inline
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42
pd.set_option('display.max_columns', 500)
import warnings
warnings.filterwarnings('ignore')

from collections import Counter
from pathlib import Path
import multiprocessing as mp

def makedir(path):
    """Create the directory `path`, together with any missing parent directories.

    Nothing happens (and no error is raised) when the directory already exists.
    """
    target_dir = Path(path)
    target_dir.mkdir(parents=True, exist_ok=True)

def majority_vote(x):
    """Return the element that occurs most often in the iterable `x`.

    Ties are resolved by the ordering of `Counter.most_common`.
    An empty `x` raises IndexError.
    """
    return Counter(x).most_common(1)[0][0]
def nothing(x):
    """Pass-through aggregator: return the one value shared by every entry of `x`.

    `x` must expose a `.values` array (e.g. a pandas Series). An AssertionError
    is raised when `x` does not contain exactly one distinct value.
    """
    distinct_values = np.unique(x.values)
    if len(distinct_values) == 1:
        return x.values[0]
    assert False, "error on aggregation"
import itertools
    
# Topic ids (as strings) that are kept when the evaluation results are filtered.
topics_to_consider = [
    '402', '403', '405', '407', '408', '410',
    '415', '416', '418', '420', '421', '427',
    '428', '431', '440', '442', '445', '448',
]
# Subset of the ids above, used when correlations are restricted to the moffat topics.
topics_to_consider_moffat = [
    '402', '403', '405', '407', '408',
    '415', '416', '431', '440',
]
#topics_to_consider = topics_to_consider_moffat[:]

In [None]:
import json
import pickle

def save_json(obj, path):
    """Write `obj` as JSON (indented by 4 spaces) to the file at `path`, replacing its content."""
    with open(path, "w") as out_file:
        json.dump(obj, fp=out_file, indent=4)

def load_json(path):
    """Read the JSON file at `path` and return the decoded Python object."""
    with open(path, "r") as in_file:
        return json.load(in_file)

def save_pickle(obj, path):
    """Pickle `obj` into the binary file at `path`, replacing its content."""
    with open(path, "wb") as out_file:
        pickle.dump(obj, file=out_file)

def load_pickle(path):
    """Return the object unpickled from the binary file at `path`.

    Unpickling can execute arbitrary code, so only use this on files you trust.
    """
    with open(path, "rb") as in_file:
        return pickle.load(in_file)

def makedir(PATH):
    """Create the directory `PATH` and any missing parents; an existing directory is left as is."""
    import pathlib
    directory = pathlib.Path(PATH)
    directory.mkdir(parents=True, exist_ok=True)
    
import xml.etree.ElementTree

topics = '../../src/topics.xml' # <- File from which the information is taken
e = xml.etree.ElementTree.parse(topics).getroot()

# gold_dict maps (topic id, "N") and (topic id, "H") to the text of the topic's
# <N> and <H> child elements. Its values are matched against document ids in
# later cells to drop those documents from the qrels.
gold_dict = {}

for child in e:
    topic = child.attrib["id"]
    gold_dict[topic, "N"] = child.find('N').text
    gold_dict[topic, "H"] = child.find('H').text

# Two entries (N and H) are stored per topic and 36 entries are expected in total;
# checking the exact count catches a truncated or wrong topics file.
assert len(gold_dict) == 36, f"Gold should be 36, found {len(gold_dict)}"

In [None]:
''' do the qrels '''
# Build a trec_eval-style qrels file (topic, 0, doc_id, relevance) from the crowd data.
DATASET = 'ME'
rel_col = f'{DATASET}_rel'
df = pd.read_csv(f'../../src/crowd-data/{DATASET}.csv').reset_index()
#df[f'{DATASET}_rel'] = df[f'{DATASET}_rel'].astype(int)

# Drop the rows whose document is one of the gold documents.
is_gold = df["doc_id"].isin(gold_dict.values())
df = df[~is_gold]

# One row per (topic, document): average the scores of all rows for that pair.
df = (
    df[['topic', 'doc_id', rel_col, 'rel_TREC', 'rel_Sormunen']]
    .groupby(['topic', 'doc_id'])
    .agg('mean')
    .reset_index()
    .assign(zero=0)
)
df = df[['topic', 'zero', 'doc_id', rel_col]]
topics = np.unique(df['topic'])

df.to_csv(f'./pers_script/dataset-evaluation/qrels_{DATASET}.txt', header=False, index=False, sep=' ')

In [None]:
''' do the qrels for moffat '''
# Build a trec_eval-style qrels file (topic, 0, doc, relevance) from one score
# column (DD) of the moffat result file.
DD = 'ratio'
#DD = 'rele'
#DD = 'pref'
DATASET = f'moffat{DD}'
df = pd.read_csv(f'../../../AAA_DATASET/Moffat-and-co/OneDrive_1_25-11-2020/result_files/qrel_pref+rele+ratio.csv').reset_index()

# Normalise the document ids BEFORE comparing them with the gold documents:
# an id carrying surrounding whitespace would otherwise never match and the
# gold document would silently stay in the qrels.
df['doc'] = df['doc'].astype(str).str.strip()
df = df[~df['doc'].isin(gold_dict.values())]

df = df[['topic', 'doc', DD]].copy()#.groupby(['topic', 'doc']).agg('mean').reset_index()
df['zero'] = 0

# Scale the scores (rounded to two decimals) by 100. The product is rounded again
# because e.g. 0.29 * 100 == 28.999999999999996 in floating point.
# NOTE(review): trec_eval is understood to read the relevance field as an
# integer, which would turn such a value into 28 — confirm against trec_eval.
df[DD] = np.round(np.round(df[DD], 2) * 100)
df = df[['topic', 'zero', 'doc', DD]]

topics = np.unique(df['topic'])
df.to_csv(f'./pers_script/dataset-evaluation/qrels_{DATASET}.txt', header=False, index=False, sep=' ')
df.head()

In [None]:
''' do the qrels for S100toPref '''
# NOTE: despite the label above, this cell reads the S4 pairwise scores and
# writes the qrels for DATASET = 'S4toPref'.
DATASET = 'S4toPref'
df = pd.read_csv(f'../../src/S100toPreference/S4_pairwise_full_scores.csv')

# Drop the rows whose document is one of the gold documents.
is_gold = df["document"].isin(gold_dict.values())

# One row per (topic, document) holding the mean score, plus the constant 0 column.
df = (
    df[~is_gold][['topic', 'document', f'score']]
    .groupby(['topic', 'document'])
    .agg('mean')
    .reset_index()
    .assign(zero=0)
)
df = df[['topic', 'zero', 'document', f'score']]
topics = np.unique(df['topic'])

df.to_csv(f'./pers_script/dataset-evaluation/qrels_{DATASET}.txt', header=False, index=False, sep=' ')

In [None]:
"""--- run trec_eval  --- """
import os
import subprocess

OFFICIAL_QRELS_PATH = '../../../AAA_TREC-DATA/TREC-qrels/qrels.AH99.txt'
UNCOMPRESSED_PATH = '../../../AAA_TREC-DATA/Uncompressed/AH99/input/'

# The list of run files depends neither on the dataset nor on the cutoff,
# so it is built once instead of once per dataset.
files = [f for f in listdir(UNCOMPRESSED_PATH) if isfile(join(UNCOMPRESSED_PATH, f)) and f != ".DS_Store"]
runs = [f.replace('input.','') for f in files]


def _trec_eval_ndcg(qrels_path, run_path, k, out_path):
    """Run `trec_eval -q -m ndcg -M k qrels_path run_path` and store its stdout in `out_path`.

    The command is passed as an argument list (no shell), so paths containing
    spaces are handled. A non-zero exit status is reported on stdout but does
    not interrupt the caller; the status is returned.
    """
    command = ['trec_eval', '-q', '-m', 'ndcg', '-M', str(k), qrels_path, run_path]
    with open(out_path, 'w') as out_file:
        status = subprocess.run(command, stdout=out_file).returncode
    if status != 0:
        print(f'trec_eval exited with status {status} while writing {out_path}')
    return status


#for DATASET in tqdm([f'S100','ME','S100toPref','MEtoPref','moffatratio'], desc="DATASET loop", leave=True):
for dataset_pos, DATASET in enumerate(tqdm([f'S2','S4','S2toPref','S4toPref'], desc="DATASET loop", leave=True)):

    QRELS_PATH = f'./pers_script/dataset-evaluation/qrels_{DATASET}.txt'

    for k in tqdm([2,3,4,5,6,7,8,9,10], desc="K loop", leave=False):
        SAVE_PATH = f'./pers_script/dataset-evaluation/eval_{k}/'
        makedir(f'{SAVE_PATH}/official')
        makedir(f'{SAVE_PATH}/{DATASET}')
        for run in tqdm(runs, desc="run loop", leave=False):
            run_path = join(UNCOMPRESSED_PATH, f'input.{run}')

            # The official-qrels scores are the same for every DATASET, so they
            # are computed during the first dataset pass only.
            if dataset_pos == 0:
                _trec_eval_ndcg(OFFICIAL_QRELS_PATH, run_path, k, join(SAVE_PATH, 'official', f'{run}.txt'))
            _trec_eval_ndcg(QRELS_PATH, run_path, k, join(SAVE_PATH, DATASET, f'{run}.txt'))

print('\t done computing teval')

In [None]:
"""--- evaluate (also multiple datasets)  --- """

df_ev = pd.DataFrame(columns=['kind','topic', 'system', 'metric','score'])

def parses(path, kind, k):
    """Collect the per-topic ndcg scores of every result file in the directory `path`.

    Each regular file in `path` (except ".DS_Store") is read as a tab-separated,
    header-less table with the columns metric / topic / score. Only rows whose
    metric is 'ndcg' and whose topic is not 'all' are kept.

    Parameters:
        path: directory containing one result file per system.
        kind: label stored in the 'kind' column; it is also part of the
            'summary.<kind>.input.' prefix removed from file names.
        k:    cutoff, used only to label the metric as 'ndcg@<k>'.

    Returns:
        DataFrame with the columns kind, topic, system, metric, score; it has
        no rows when `path` contains no result files.
    """
    columns = ['kind', 'topic', 'system', 'metric', 'score']
    # The empty frame comes first so that the column order is fixed and a
    # directory without files still yields a frame with the expected columns.
    frames = [pd.DataFrame(columns=columns)]
    for f in listdir(path):
        if not isfile(join(path, f)) or f == ".DS_Store":
            continue
        system = f.replace('summary.{}.input.'.format(kind), '').replace('.txt', '')
        df = pd.read_csv(join(path, f), sep='\t', header=None)
        df.columns = ['metric', 'topic', 'score']
        # Metric names may be padded with blanks in the file.
        df['metric'] = df['metric'].str.strip()
        df = df[(df['metric'] == 'ndcg') & (df['topic'] != 'all')].copy()
        df['kind'] = kind
        df['metric'] = f'ndcg@{k}'
        df['system'] = system
        frames.append(df)
    print(f"done for {kind}--{k}")
    # Concatenate once instead of growing the frame inside the loop.
    return pd.concat(frames)

SAVE_PATH = f'./pers_script/dataset-evaluation/'

# (sub-directory of eval_<k>/, label stored in the 'kind' column), listed in the
# order in which the parsed results are concatenated.
# 'moffatrele' and 'moffatpref' are not parsed; add them here to include them.
EVAL_KINDS = [
    ('official', 'TREC8'),
    ('S2', 'S2'),
    ('S4', 'S4'),
    ('S100', 'S100'),
    ('ME', 'ME'),
    ('moffatratio', 'moffatratio'),
    ('S100toPref', 'S100toPref'),
    ('MEtoPref', 'MEtoPref'),
    ('S2toPref', 'S2toPref'),
    ('S4toPref', 'S4toPref'),
]
# ndcg cutoffs for which an eval_<k>/ directory is parsed: 2, 3, ..., 10.
EVAL_CUTOFFS = range(2, 11)

# For every kind, parse all cutoffs in increasing order.
df_eval = pd.concat([
    parses(f'./{SAVE_PATH}/eval_{k}/{subdir}/', kind, k)
    for subdir, kind in EVAL_KINDS
    for k in EVAL_CUTOFFS
])
df_eval['topic'] = df_eval['topic'].astype(str)
# .copy() so that the column assignment below works on an independent frame
# rather than on a slice of the unfiltered one.
df_eval = df_eval[df_eval['topic'].isin(topics_to_consider)].copy()
df_eval['score'] = df_eval['score'].astype(float)

df_eval.to_csv(f'./pers_script/dataset-evaluation/evaluation-results.txt', header=True, index=False, sep=',')

display(df_eval.head())
display(df_eval.tail())

In [None]:
''' compute correlations '''
from tauAP import tauAP, rank
import scipy.stats

df_eval = pd.read_csv(f'./pers_script/dataset-evaluation/evaluation-results.txt', sep=',')

# Show which kinds, topics and metrics are present in the stored results.
for column in ('kind', 'topic', 'metric'):
    print(np.unique(df_eval[column]))

# Mean score per (kind, system, metric), i.e. averaged over the topics.
df_plot = df_eval.groupby(['kind', 'system','metric']).agg('mean').reset_index()
ddres = pd.DataFrame(columns=['metric','x','y','rho','tau','tauAP','tau10'])

# Pairs of kinds (x, y) whose system scores are correlated with each other.
ords = [('TREC8',f'S100'),('TREC8',f'ME'),('TREC8',f'moffatratio'),('TREC8',f'S100toPref'),('TREC8',f'MEtoPref'),
       ('TREC8',f'S2'),('TREC8',f'S4'),('TREC8',f'S2toPref'),('TREC8',f'S4toPref')]
#ords = [('TREC8',f'S100')]

for metric in [f'ndcg@{cutoff}' for cutoff in range(2, 11)]:
    sub = df_plot[df_plot['metric']==metric]

    for x_val, y_val in ords:
        # x and y are paired by position; groupby sorts by system within each
        # kind, which lines the two arrays up as long as both kinds cover the
        # same set of systems.
        x = sub.loc[sub['kind']==x_val, 'score'].values
        y = sub.loc[sub['kind']==y_val, 'score'].values

        rho, rhoval = sp.stats.pearsonr(x,y)
        tau, tauval = sp.stats.kendalltau(x,y)

        tauap = tauAP(x,y, top_heavy=True)

        # Kendall's tau restricted to the ten (x, y) pairs with the largest x.
        top_pairs = sorted(zip(x, y))[::-1][:10]
        xten = [pair[0] for pair in top_pairs]
        yten = [pair[1] for pair in top_pairs]
        tauten, tauvalten = sp.stats.kendalltau(xten,yten)

        ddres.loc[len(ddres)] = [metric,x_val, y_val, rho, tau, tauap, tauten]

display(ddres)
ddres.to_csv(f'./pers_script/dataset-evaluation/correlations-all-topics.csv', index=False)

In [None]:
''' compute correlations (moffat topics only) '''
from tauAP import tauAP, rank
import scipy.stats

df_eval = pd.read_csv(f'./pers_script/dataset-evaluation/evaluation-results.txt', sep=',')

# The topic column is compared as text: topics_to_consider_moffat holds string
# ids, while numeric-looking topics come back from the CSV as integers, and an
# integer never matches a string in isin().
is_moffat_topic = df_eval['topic'].astype(str).isin(topics_to_consider_moffat)

# Mean score per (kind, system, metric), averaged over the moffat topics.
df_plot = df_eval[is_moffat_topic].groupby(['kind', 'system','metric']).agg('mean').reset_index()
ddres = pd.DataFrame(columns=['metric','x','y','rho','tau','tauAP','tau10'])

# Pairs of kinds (x, y) whose system scores are correlated with each other.
# Each pair is listed once so that the result table has no duplicated rows.
# NOTE(review): S100toPref and MEtoPref are not compared here — confirm whether
# they were meant to be included.
ords = [('TREC8',f'S100'),('TREC8',f'ME'),('TREC8',f'S2'),('TREC8',f'S4'),('TREC8',f'moffatratio'),
       ('TREC8',f'S2toPref'),('TREC8',f'S4toPref')]

for metric in ['ndcg@3','ndcg@5','ndcg@10']:
    sub = df_plot[df_plot['metric']==metric]

    for x_val, y_val in ords:
        print(x_val,y_val)
        # x and y are paired by position; groupby sorts by system within each
        # kind, which lines the two arrays up as long as both kinds cover the
        # same set of systems.
        x = sub[sub['kind']==x_val]['score'].values
        y = sub[sub['kind']==y_val]['score'].values

        rho, rhoval = sp.stats.pearsonr(x,y)
        tau, tauval = sp.stats.kendalltau(x,y)

        tauap = tauAP(x,y, top_heavy=True)

        # Kendall's tau restricted to the ten (x, y) pairs with the largest x.
        xten, yten = zip(*sorted(zip(x, y)))
        xten = list(xten[::-1])[:10]
        yten = list(yten[::-1])[:10]
        tauten, tauvalten = sp.stats.kendalltau(xten,yten)

        ddres.loc[len(ddres)] = [metric,x_val, y_val, rho, tau, tauap, tauten]

display(ddres)
ddres.to_csv(f'./pers_script/dataset-evaluation/correlations-moffat-topics.csv', index=False)