### Imports and configuration

In [1]:
import re
import json
from pathlib import Path
from functools import partial

import numpy as np
import pandas as pd
import scipy.stats
import pytrec_eval

from IPython.display import display


# Default relevance-judgement file; it is opened by read_qrels further down.
QRELS_PATH = "../../data/test/word2vec/qrels.txt"
# Every "*.txt" inside any "runfiles" directory below the word2vec test folder.
# Sorted because glob yields entries in filesystem order, which is not
# guaranteed to be the same between machines or re-runs.
RUN_FILE_PATHS = sorted(Path("../../data/test/word2vec/").glob("**/runfiles/*.txt"))

### Helper functions

In [76]:
def get_vector_size(index):
    """Extract the vector size from a run identifier.

    Searches ``index`` for the first ``d<digits>`` token (e.g. ``d200``).

    Args:
        index: Run identifier string, e.g.
            ``"run_Doc2Vec(dm-c,d200,n10,w2,mc5,s1e-05,t4,ep20).(r3.0e-01).summary"``.

    Returns:
        The digits following ``d`` as an ``int``.

    Raises:
        ValueError: If ``index`` contains no ``d<digits>`` token.
    """
    # Raw string: "\d" inside a normal literal is an invalid escape sequence.
    match = re.search(r"d(\d+)", index)
    if match is None:
        raise ValueError("no vector size ('d<digits>') found in {!r}".format(index))
    return int(match.group(1))


def get_negative_samples(index):
    """Extract the negative-sample count from a run identifier.

    Searches ``index`` for the first ``n<digits>`` token (e.g. ``n10``).

    Args:
        index: Run identifier string, e.g.
            ``"run_Doc2Vec(dm-c,d200,n10,w2,mc5,s1e-05,t4,ep20).(r3.0e-01).summary"``.

    Returns:
        The digits following ``n`` as an ``int``.

    Raises:
        ValueError: If ``index`` contains no ``n<digits>`` token.
    """
    # Raw string: "\d" inside a normal literal is an invalid escape sequence.
    match = re.search(r"n(\d+)", index)
    if match is None:
        raise ValueError("no negative-sample count ('n<digits>') found in {!r}".format(index))
    return int(match.group(1))


def get_window_size(index):
    """Extract the window size from a run identifier.

    Searches ``index`` for the first ``w<digits>`` token (e.g. ``w2``).

    Args:
        index: Run identifier string, e.g.
            ``"run_Doc2Vec(dm-c,d200,n10,w2,mc5,s1e-05,t4,ep20).(r3.0e-01).summary"``.

    Returns:
        The digits following ``w`` as an ``int``.

    Raises:
        ValueError: If ``index`` contains no ``w<digits>`` token.
    """
    # Raw string: "\d" inside a normal literal is an invalid escape sequence.
    match = re.search(r"w(\d+)", index)
    if match is None:
        raise ValueError("no window size ('w<digits>') found in {!r}".format(index))
    return int(match.group(1))


def get_epochs(index):
    """Extract the epoch count from a run identifier.

    Searches ``index`` for the first ``ep<digits>`` token (e.g. ``ep20``).

    Args:
        index: Run identifier string, e.g.
            ``"run_Doc2Vec(dm-c,d200,n10,w2,mc5,s1e-05,t4,ep20).(r3.0e-01).summary"``.

    Returns:
        The digits following ``ep`` as an ``int``.

    Raises:
        ValueError: If ``index`` contains no ``ep<digits>`` token.
    """
    # Raw string: "\d" inside a normal literal is an invalid escape sequence.
    match = re.search(r"ep(\d+)", index)
    if match is None:
        raise ValueError("no epoch count ('ep<digits>') found in {!r}".format(index))
    return int(match.group(1))


def get_ratio(index):
    """Extract the ratio from a run identifier.

    Searches ``index`` for the first ``(r<number>)`` group, e.g. ``(r3.0e-01)``,
    and converts the text between ``(r`` and the next ``)`` to a float.

    Args:
        index: Run identifier string, e.g.
            ``"run_Doc2Vec(dm-c,d200,n10,w2,mc5,s1e-05,t4,ep20).(r3.0e-01).summary"``.

    Returns:
        The captured text converted with ``float``.

    Raises:
        ValueError: If ``index`` has no ``(r...)`` group, or its content is not
            a valid float literal.
    """
    # "[^)]*" stops at the first closing parenthesis; a greedy ".*" would run
    # on to the LAST ")" in the string and swallow anything in between.
    match = re.search(r"\(r([^)]*)\)", index)
    if match is None:
        raise ValueError("no ratio ('(r<number>)') found in {!r}".format(index))
    return float(match.group(1))

    
def get_stategy(index):
    """Return whatever follows the last "." in ``index``.

    When ``index`` contains no dot the whole string is returned; when it ends
    with a dot the result is the empty string.
    """
    _head, _dot, strategy = index.rpartition(".")
    return strategy


def filter_irrelevant_columns(df, suffixes=("_100", "_200", "_500", "_1000")):
    """Return a copy of ``df`` without the columns whose name ends in a given suffix.

    Args:
        df: DataFrame to filter; it is not modified.
        suffixes: Iterable of string suffixes. Any column whose (string) label
            ends with one of them is dropped. Lists and tuples are both accepted.

    Returns:
        A new DataFrame with the matching columns removed.
    """
    # str.endswith accepts a tuple of candidates, so one call covers them all.
    suffixes = tuple(suffixes)
    cols_to_drop = [
        col
        for col in df.columns
        # Non-string labels (e.g. integer column names) can never match a suffix.
        if isinstance(col, str) and col.endswith(suffixes)
    ]
    return df.drop(columns=cols_to_drop)


def infer_columns(df):
    """Add hyper-parameter columns parsed from the index and drop unwanted metrics.

    Each index label is parsed by the ``get_*`` helpers to fill the columns
    ``vector_size``, ``ns``, ``w``, ``epochs``, ``ratio`` and ``strategy``.
    The result is then passed through ``filter_irrelevant_columns``.

    Args:
        df: DataFrame whose index labels are run identifier strings.
            It is left unmodified.

    Returns:
        A new DataFrame with the six parsed columns appended and the
        irrelevant columns removed.
    """
    run_ids = df.index
    # assign() works on a copy, so the caller's frame does not silently gain
    # columns as a side effect of calling this function.
    enriched = df.assign(
        vector_size=run_ids.map(get_vector_size),
        ns=run_ids.map(get_negative_samples),
        w=run_ids.map(get_window_size),
        epochs=run_ids.map(get_epochs),
        ratio=run_ids.map(get_ratio),
        strategy=run_ids.map(get_stategy),
    )
    return filter_irrelevant_columns(enriched)


def read_qrels(qrels_path=QRELS_PATH):
    """Parse a qrels file with ``pytrec_eval.parse_qrel``.

    Args:
        qrels_path: Path of the qrels file to read. Defaults to ``QRELS_PATH``.

    Returns:
        Whatever ``pytrec_eval.parse_qrel`` returns for the opened file.
    """
    # Open the path that was passed in, not the module-level constant, so a
    # caller-supplied ``qrels_path`` is actually honoured.
    with open(qrels_path, "r") as fp:
        return pytrec_eval.parse_qrel(fp)


def read_runfiles(runfile_paths=RUN_FILE_PATHS, pattern=None):
    """Parse run files with ``pytrec_eval.parse_run``, keyed by file stem.

    Args:
        runfile_paths: Iterable of run file paths (``pathlib.Path`` or ``str``).
            Defaults to ``RUN_FILE_PATHS``.
        pattern: Optional plain substring (not a regex or glob). When truthy,
            only paths whose string form contains it are read.

    Returns:
        Dict mapping each file's stem (file name without its last suffix) to the
        parsed run. If two selected files share a stem, the later one replaces
        the earlier one.
    """
    # Coerce to Path so plain string paths work too; ``.stem`` below needs it.
    candidates = [Path(p) for p in runfile_paths]
    if pattern:
        candidates = [p for p in candidates if pattern in str(p)]

    runs = {}
    for run_file in candidates:
        with open(run_file, "r") as fp:
            runs[run_file.stem] = pytrec_eval.parse_run(fp)
    return runs
    
    
def evaluate(qrels, runs, measures):
    """Evaluate every run against ``qrels`` and average each measure over queries.

    Args:
        qrels: Relevance judgements, passed to ``pytrec_eval.RelevanceEvaluator``.
        runs: Dict mapping run id to a run accepted by the evaluator's
            ``evaluate`` method.
        measures: Measure names, passed to ``pytrec_eval.RelevanceEvaluator``.

    Returns:
        A ``(run_results_by_query, run_results)`` tuple:

        * ``run_results_by_query[run_id]`` is the evaluator output for that run
          (query id -> measure name -> value).
        * ``run_results[run_id]`` maps each measure name to its mean over all
          evaluated queries; it is an empty dict when no query was evaluated.
    """
    # The evaluator depends only on qrels and measures, so build it once.
    evaluator = pytrec_eval.RelevanceEvaluator(qrels, measures)

    run_results_by_query = {}
    run_results = {}
    for run_id, run in runs.items():
        results = evaluator.evaluate(run)
        per_query = list(results.values())
        # Read the measure names from whichever query comes first instead of
        # assuming that a query with the id "0" exists.
        measure_names = list(per_query[0].keys()) if per_query else []
        run_results[run_id] = {
            measure: np.mean([scores[measure] for scores in per_query])
            for measure in measure_names
        }
        run_results_by_query[run_id] = results

    return run_results_by_query, run_results


### Reading qrels and listing supported evaluation measures

In [77]:
# Load the relevance judgements using read_qrels' default path.
qrels = read_qrels()
# List the measure names exposed by pytrec_eval.supported_measures.
display(pytrec_eval.supported_measures)

{'11pt_avg',
 'G',
 'P',
 'Rndcg',
 'Rprec',
 'Rprec_mult',
 'binG',
 'bpref',
 'gm_bpref',
 'gm_map',
 'infAP',
 'iprec_at_recall',
 'map',
 'map_cut',
 'ndcg',
 'ndcg_cut',
 'ndcg_rel',
 'num_nonrel_judged_ret',
 'num_q',
 'num_rel',
 'num_rel_ret',
 'num_ret',
 'recall',
 'recip_rank',
 'relative_P',
 'relstring',
 'runid',
 'set_F',
 'set_P',
 'set_map',
 'set_recall',
 'set_relative_P',
 'success',
 'utility'}

### Inspecting results (by run)

In [120]:
# Substring that a run file path must contain in order to be loaded.
pattern = "(r3.0e-01).summary"
runs = read_runfiles(pattern=pattern)
run_results_by_query, run_results = evaluate(qrels, runs, ["ndcg", "map", "recall"])

# One row per run (hence the transpose), best ndcg first.
results_df = infer_columns(pd.DataFrame(run_results).T)
results_df = results_df.sort_values("ndcg", ascending=False)

# Use the fully qualified option name: the bare "max_rows" pattern also matches
# "styler.render.max_rows" on newer pandas and then raises OptionError.
pd.set_option("display.max_rows", 90)
display(results_df)

Unnamed: 0,map,recall_5,recall_10,recall_15,recall_20,recall_30,ndcg,vector_size,ns,w,epochs,ratio,strategy
"run_Doc2Vec(dm-c,d200,n10,w2,mc5,s1e-05,t4,ep20).(r3.0e-01).summary",0.575597,0.551923,0.694368,0.833883,0.879121,0.972436,0.708152,200,10,2,20,0.3,summary
"run_Doc2Vec(dm-c,d150,n20,w3,mc5,s1e-05,t4,ep40).(r3.0e-01).summary",0.560205,0.530403,0.725092,0.847161,0.925092,0.985897,0.704817,150,20,3,40,0.3,summary
"run_Doc2Vec(dm-c,d200,n10,w3,mc5,s1e-05,t4,ep20).(r3.0e-01).summary",0.552853,0.547253,0.687179,0.779075,0.863553,0.958608,0.701604,200,10,3,20,0.3,summary
"run_Doc2Vec(dm-c,d100,n20,w3,mc5,s1e-05,t4,ep40).(r3.0e-01).summary",0.567286,0.542445,0.70783,0.839332,0.905678,0.983333,0.691238,100,20,3,40,0.3,summary
"run_Doc2Vec(dm-c,d300,n10,w2,mc5,s1e-05,t4,ep20).(r3.0e-01).summary",0.549767,0.48924,0.670971,0.782738,0.874771,0.971612,0.687028,300,10,2,20,0.3,summary
"run_Doc2Vec(dm-c,d200,n10,w5,mc5,s1e-05,t4,ep20).(r3.0e-01).summary",0.506629,0.505357,0.665293,0.765522,0.821841,0.932005,0.674523,200,10,5,20,0.3,summary
"run_Doc2Vec(dm-c,d300,n10,w3,mc5,s1e-05,t4,ep20).(r3.0e-01).summary",0.540775,0.500137,0.638095,0.769322,0.869002,0.957051,0.673388,300,10,3,20,0.3,summary
"run_Doc2Vec(dm-c,d200,n10,w4,mc5,s1e-05,t4,ep20).(r3.0e-01).summary",0.511278,0.507189,0.655723,0.774222,0.875733,0.958791,0.664316,200,10,4,20,0.3,summary


### Inspecting results (by query)

In [51]:
# Average every measure per query across all evaluated runs.
res_query = run_results_by_query
res = list(res_query.values())

# Start from the first run's per-query table and add the remaining runs to it
# in place; rows are query ids, columns are measure names.
res_query_df = pd.DataFrame.from_dict(res[0], orient="index")
for per_query_scores in res[1:]:
    res_query_df += pd.DataFrame.from_dict(per_query_scores, orient="index")

# Sum -> mean, then drop the columns filter_irrelevant_columns removes by default.
n_runs = len(res)
res_query_df = filter_irrelevant_columns(res_query_df / n_runs)

Unnamed: 0,map,recall_5,recall_10,recall_15,recall_20,recall_30,ndcg
38,0.069599,0.0,0.052326,0.186047,0.331395,0.872093,0.256007
46,0.082186,0.02907,0.127907,0.27907,0.453488,0.866279,0.285121
36,0.10764,0.069767,0.168605,0.325581,0.523256,0.889535,0.306739
0,0.125731,0.023256,0.124031,0.44186,0.74031,0.984496,0.328778
51,0.15958,0.313953,0.55814,0.72093,0.860465,0.976744,0.337528
48,0.141657,0.065891,0.178295,0.387597,0.732558,1.0,0.339341
45,0.136798,0.116279,0.302326,0.511628,0.633721,0.947674,0.352524
49,0.197868,0.325581,0.697674,0.953488,1.0,1.0,0.37312
42,0.145367,0.319767,0.482558,0.52907,0.552326,0.72093,0.382051
2,0.197324,0.09593,0.311047,0.584302,0.831395,0.994186,0.386936


In [53]:
# Order the queries from lowest to highest average ndcg.
res_query_df = res_query_df.sort_values(by="ndcg", ascending=True)

# First ten rows = hardest queries, last ten rows = easiest queries.
print("Queries with lowest average ndcg")
display(res_query_df.iloc[:10])
print("Queries with highest average ndcg")
display(res_query_df.iloc[-10:])

Queries with lowest average ndcg


Unnamed: 0,map,recall_5,recall_10,recall_15,recall_20,recall_30,ndcg
38,0.069599,0.0,0.052326,0.186047,0.331395,0.872093,0.256007
46,0.082186,0.02907,0.127907,0.27907,0.453488,0.866279,0.285121
36,0.10764,0.069767,0.168605,0.325581,0.523256,0.889535,0.306739
0,0.125731,0.023256,0.124031,0.44186,0.74031,0.984496,0.328778
51,0.15958,0.313953,0.55814,0.72093,0.860465,0.976744,0.337528
48,0.141657,0.065891,0.178295,0.387597,0.732558,1.0,0.339341
45,0.136798,0.116279,0.302326,0.511628,0.633721,0.947674,0.352524
49,0.197868,0.325581,0.697674,0.953488,1.0,1.0,0.37312
42,0.145367,0.319767,0.482558,0.52907,0.552326,0.72093,0.382051
2,0.197324,0.09593,0.311047,0.584302,0.831395,0.994186,0.386936


Queries with highest average ndcg


Unnamed: 0,map,recall_5,recall_10,recall_15,recall_20,recall_30,ndcg
22,0.732231,0.688953,0.755814,0.851744,0.898256,0.985465,0.91684
7,0.755156,0.860465,0.947674,0.976744,0.982558,1.0,0.92304
28,0.98062,1.0,1.0,1.0,1.0,1.0,0.929916
11,0.8194,0.724806,0.802326,0.829457,0.868217,0.963178,0.940138
16,0.805406,0.660853,0.802326,0.891473,0.932171,0.988372,0.945929
43,0.982558,1.0,1.0,1.0,1.0,1.0,0.950137
27,0.930699,0.976744,0.988372,1.0,1.0,1.0,0.977027
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Significance testing

In [5]:
def test_significance(run_1, run_2, measure, test_f):
    query_ids = list(qrels.keys())
    first_scores = [run_results_by_query[run_1][query_id][measure] for query_id in query_ids]
    second_scores = [run_results_by_query[run_2][query_id][measure] for query_id in query_ids]
    return test_f(first_scores, second_scores)

In [None]:
# RETRAIN WITH w4 Doc2Vec(dm-c,d300,n10,w5,mc5,s1e-05,t4,ep20)


In [127]:
# The two run ids compared here differ only in the vector size token (d150 vs d100).
first_run = "run_Doc2Vec(dm-c,d150,n20,w3,mc5,s1e-05,t4,ep40).(r3.0e-01).summary"
second_run = "run_Doc2Vec(dm-c,d100,n20,w3,mc5,s1e-05,t4,ep40).(r3.0e-01).summary"

# Alternative: compare the first and last row of results_df instead.
# first_run = results_df.index[0]
# second_run = results_df.index[-1]

# Show the aggregated row of each run before testing.
print(results_df.loc[first_run], "\n")
print(results_df.loc[second_run])

# Related-samples t-test on the per-query ndcg values of the two runs.
test_significance(first_run, second_run, "ndcg", scipy.stats.ttest_rel)

map            0.560205
recall_5       0.530403
recall_10      0.725092
recall_15      0.847161
recall_20      0.925092
recall_30      0.985897
ndcg           0.704817
vector_size         150
ns                   20
w                     3
epochs               40
ratio               0.3
strategy        summary
Name: run_Doc2Vec(dm-c,d150,n20,w3,mc5,s1e-05,t4,ep40).(r3.0e-01).summary, dtype: object 

map            0.567286
recall_5       0.542445
recall_10       0.70783
recall_15      0.839332
recall_20      0.905678
recall_30      0.983333
ndcg           0.691238
vector_size         100
ns                   20
w                     3
epochs               40
ratio               0.3
strategy        summary
Name: run_Doc2Vec(dm-c,d100,n20,w3,mc5,s1e-05,t4,ep40).(r3.0e-01).summary, dtype: object


Ttest_relResult(statistic=0.6637414888702673, pvalue=0.5098452918811456)

In [115]:
# Test the first row of results_df against every other row and print the
# position and p-value of each run whose ndcg differs at p < 0.05.
# NOTE(review): no multiple-comparison correction is applied here.
reference_run = results_df.index[0]
for position, other_run in enumerate(results_df.index[1:], start=1):
    _, p_value = test_significance(reference_run, other_run, "ndcg", scipy.stats.ttest_rel)
    if p_value < 0.05:
        print(position, p_value)

32 0.04352996313875534
35 0.03150365301944694
38 0.02009345904684654
39 0.014752415384600889
41 0.006077111865774459
43 0.03649957013122876
44 0.019240436028498432
45 0.010363810313886456
46 0.005403975893743976
47 0.012436511079080047
48 0.04421204938420015
49 0.01641324186738046
50 0.02618756525686019
51 0.0029230192476966013
52 0.0029853006407142718
53 0.013333621338185093
54 0.03646257664614458
55 0.0013410531897114213
56 0.004291153237617687
57 0.01029005714656239
58 0.02320805339662533
59 0.027700766465355894
60 0.0005450527209628524
61 0.012465099249766928
62 0.02753188439596516
63 0.004691827503554912
64 0.011459321891172393
65 0.003266701203222538
66 0.007306681310822138
67 0.0009728934853142734
68 0.009473647359099821
69 0.003586243737546933
70 0.007679706050645118
71 0.0009804504200287745
72 0.01836537796454979
73 0.00142019780224696
74 0.010803370226025153
75 0.012682866159047231
76 0.005455519954641297
77 0.013845445305309428
78 0.0062789376642711945
79 0.00034288400671196

In [119]:
# Compare the first row of results_df with the row at position 39.
# NOTE(review): 39 is presumably a position printed by the significance loop — confirm.
first_run = results_df.index[0]
second_run = results_df.index[39]

# Show the aggregated row of each run.
print(results_df.loc[first_run], "\n")
print(results_df.loc[second_run])

map            0.588638
recall_5       0.590888
recall_10      0.739744
recall_15      0.808196
recall_20      0.859982
recall_30      0.969505
ndcg           0.727413
vector_size         150
ns                   20
w                     3
epochs               40
ratio               0.5
strategy           full
Name: run_Doc2Vec(dm-c,d150,n20,w3,mc5,s1e-05,t4,ep40).(r5.0e-01).full, dtype: object 

map            0.530942
recall_5       0.478342
recall_10      0.611172
recall_15      0.747161
recall_20      0.832143
recall_30      0.949359
ndcg           0.678347
vector_size         300
ns                   10
w                     2
epochs               20
ratio               0.2
strategy           full
Name: run_Doc2Vec(dm-c,d300,n10,w2,mc5,s1e-05,t4,ep20).(r2.0e-01).full, dtype: object
