#### Imports

In [94]:
import re
import json
from pathlib import Path
from functools import partial

import numpy as np
import pandas as pd
from scipy.stats import ttest_rel, ttest_1samp
import pytrec_eval

from IPython.display import display


QRELS_PATH = "../../data/test/doc2vec/qrels.txt"
RUN_FILE_PATHS = list(Path("../../data/test/doc2vec/").glob("**/run_files/*.txt"))

#### Functions

In [2]:
def get_vector_size(index):
    """Extract the vector dimensionality from a run identifier.

    Searches ``index`` for the first ``d<digits>`` token (for example the
    ``d100`` in ``run.Doc2Vec(dm-c,d100,n30,...)``) and returns the digits
    as an integer.

    Args:
        index: Run identifier string.

    Returns:
        The integer that follows the first ``d`` directly followed by digits.

    Raises:
        ValueError: If ``index`` contains no ``d<digits>`` token.
    """
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    match = re.search(r"d(\d+)", index)
    if match is None:
        raise ValueError("no vector size token ('d<digits>') found in {!r}".format(index))
    return int(match.group(1))


def get_negative_samples(index):
    """Extract the negative-sampling value from a run identifier.

    Searches ``index`` for the first ``n<digits>`` token (for example the
    ``n30`` in ``run.Doc2Vec(dm-c,d100,n30,...)``) and returns the digits
    as an integer.

    Args:
        index: Run identifier string.

    Returns:
        The integer that follows the first ``n`` directly followed by digits.

    Raises:
        ValueError: If ``index`` contains no ``n<digits>`` token.
    """
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    match = re.search(r"n(\d+)", index)
    if match is None:
        raise ValueError("no negative sampling token ('n<digits>') found in {!r}".format(index))
    return int(match.group(1))


def get_window_size(index):
    """Extract the window size from a run identifier.

    Searches ``index`` for the first ``w<digits>`` token (for example the
    ``w2`` in ``run.Doc2Vec(dm-c,d100,n30,w2,...)``) and returns the digits
    as an integer.

    Args:
        index: Run identifier string.

    Returns:
        The integer that follows the first ``w`` directly followed by digits.

    Raises:
        ValueError: If ``index`` contains no ``w<digits>`` token.
    """
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    match = re.search(r"w(\d+)", index)
    if match is None:
        raise ValueError("no window size token ('w<digits>') found in {!r}".format(index))
    return int(match.group(1))


def get_epochs(index):
    """Extract the number of training epochs from a run identifier.

    Searches ``index`` for the first ``ep<digits>`` token (for example the
    ``ep40`` in ``run.Doc2Vec(...,t16,ep40)``) and returns the digits as an
    integer.

    Args:
        index: Run identifier string.

    Returns:
        The integer that follows the first ``ep`` directly followed by digits.

    Raises:
        ValueError: If ``index`` contains no ``ep<digits>`` token.
    """
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    match = re.search(r"ep(\d+)", index)
    if match is None:
        raise ValueError("no epochs token ('ep<digits>') found in {!r}".format(index))
    return int(match.group(1))


def get_ratio(index):
    """Extract the ratio from the ``r(<value>)`` part of a run identifier.

    Finds the first ``r(`` in ``index`` and converts the text up to the
    next closing parenthesis to a float, e.g. ``0.7`` for
    ``run.Doc2Vec(...).r(0.7).full``.

    Args:
        index: Run identifier string.

    Returns:
        The parenthesised value following ``r`` as a float.

    Raises:
        ValueError: If ``index`` contains no ``r(...)`` group, or if the
            captured text is not a valid float.
    """
    # Raw string avoids invalid escape sequences. "[^)]*" stops at the first
    # closing parenthesis, so a later ")" in the identifier cannot be
    # swallowed into the captured value (the previous greedy ".*" did that).
    match = re.search(r"r\(([^)]*)\)", index)
    if match is None:
        raise ValueError("no ratio token ('r(<value>)') found in {!r}".format(index))
    return float(match.group(1))

    
def get_stategy(index):
    """Return the part of ``index`` after its last ``.``.

    For ``run.Doc2Vec(...).r(0.7).full`` this is ``"full"``. A string
    without any ``.`` is returned unchanged.
    """
    _head, _sep, strategy = index.rpartition(".")
    return strategy


def filter_irrelevant_columns(df, suffixes=["_100", "_200", "_500", "_1000"]):
    """Return ``df`` without the columns whose name ends with a given suffix.

    Args:
        df: DataFrame whose column labels are strings.
        suffixes: Column-name endings that mark a column for removal.

    Returns:
        A new DataFrame without the matching columns; ``df`` is not modified.
    """
    # str.endswith accepts a tuple and checks every candidate suffix at once.
    unwanted_endings = tuple(suffixes)
    doomed = [name for name in df.columns if name.endswith(unwanted_endings)]
    return df.drop(columns=doomed)


def infer_columns(df):
    """Add hyper-parameter columns parsed from the run identifiers in the index.

    Each index label is parsed with the ``get_*`` helpers to produce the
    columns ``vector_size``, ``ns``, ``w``, ``epochs``, ``ratio`` and
    ``strategy``. Columns selected by ``filter_irrelevant_columns`` (with
    its default suffixes) are then removed.

    The input frame is left untouched: the new columns are added to a copy
    (``DataFrame.assign``) rather than written into ``df`` itself, so
    calling this function has no side effect on the caller's data.

    Args:
        df: DataFrame indexed by run identifier strings.

    Returns:
        A new DataFrame with the inferred columns appended and the
        irrelevant columns dropped.
    """
    run_ids = df.index
    enriched = df.assign(
        vector_size=run_ids.map(get_vector_size),
        ns=run_ids.map(get_negative_samples),
        w=run_ids.map(get_window_size),
        epochs=run_ids.map(get_epochs),
        ratio=run_ids.map(get_ratio),
        strategy=run_ids.map(get_stategy),
    )
    return filter_irrelevant_columns(enriched)


def read_qrels(qrels_path=QRELS_PATH):
    """Parse a qrels file with ``pytrec_eval.parse_qrel``.

    Args:
        qrels_path: Path of the qrels file to read. Defaults to the
            notebook-level ``QRELS_PATH``.

    Returns:
        The object returned by ``pytrec_eval.parse_qrel`` for that file.
    """
    # Open the path that was passed in; previously the argument was ignored
    # and the global QRELS_PATH was always read.
    with open(qrels_path, "r") as fp:
        return pytrec_eval.parse_qrel(fp)


def read_runfiles(runfile_paths=RUN_FILE_PATHS, pattern=None):
    """Parse run files into a dict keyed by file stem.

    Args:
        runfile_paths: Iterable of path objects pointing at run files.
        pattern: Optional substring; when truthy, only paths whose string
            form contains it are read. Otherwise every path is read.

    Returns:
        Dict mapping each selected file's stem (its name without the last
        extension) to the result of ``pytrec_eval.parse_run`` for that file.
    """
    selected_paths = runfile_paths
    if pattern:
        selected_paths = [path for path in runfile_paths if pattern in str(path)]

    parsed_runs = {}
    for path in selected_paths:
        run_id = path.stem
        with open(path, "r") as handle:
            parsed_runs[run_id] = pytrec_eval.parse_run(handle)
    return parsed_runs
    
    
def evaluate(qrels, runs, measures):
    """Evaluate every run against ``qrels`` and average each measure over queries.

    Args:
        qrels: Relevance judgments, passed to ``pytrec_eval.RelevanceEvaluator``.
        runs: Dict mapping run id to a run, as accepted by the evaluator's
            ``evaluate`` method.
        measures: Measure names passed to ``pytrec_eval.RelevanceEvaluator``.

    Returns:
        A tuple ``(run_results_by_query, run_results)``:
        ``run_results_by_query[run_id]`` is the evaluator's per-query result
        (query id -> {measure: value}); ``run_results[run_id]`` maps each
        measure to the mean of its per-query values. A run with no evaluated
        queries gets an empty dict of means.
    """
    # The evaluator depends only on qrels and measures, so build it once
    # rather than once per run.
    evaluator = pytrec_eval.RelevanceEvaluator(qrels, measures)

    run_results_by_query = dict()
    run_results = dict()
    for run_id, run in runs.items():
        results = evaluator.evaluate(run)
        # Read the measure names from whichever query comes first rather
        # than assuming a query with id "0" exists.
        measure_names = list(next(iter(results.values()), {}))
        run_results_by_query[run_id] = results
        run_results[run_id] = {
            measure: np.mean([scores[measure] for scores in results.values()])
            for measure in measure_names
        }

    return run_results_by_query, run_results


### Reading qrels and listing supported evaluation measures

In [3]:
# Load the relevance judgments once; `qrels` is reused by the evaluation
# cell and by test_significance further down.
qrels = read_qrels()

### Inspecting results (by run)

In [53]:
# Only run files whose path contains this substring are evaluated;
# switch to `pattern = None` to evaluate every run file.
pattern = "r(0.7).full"
# pattern = None
runs = read_runfiles(pattern=pattern)
run_results_by_query, run_results = evaluate(qrels, runs, ["map", "ndcg"])

# One row per run: mean measures plus the hyper-parameters parsed from the
# run id, ordered from best to worst ndcg.
results_df = infer_columns(pd.DataFrame(run_results).T)
results_df = results_df.sort_values("ndcg", ascending=False)

# Use the fully qualified option name. Abbreviated names are resolved by
# pattern matching, and the bare "max_rows" is ambiguous (raises OptionError)
# in pandas versions that define more than one option containing it.
pd.set_option("display.max_rows", 3000)
display(results_df)

Unnamed: 0,map,ndcg,vector_size,ns,w,epochs,ratio,strategy
"run.Doc2Vec(dm-c,d100,n30,w2,mc2,s0.0001,t16,ep40).r(0.7).full",0.687507,0.815685,100,30,2,40,0.7,full
"run.Doc2Vec(dm-c,d100,n20,w1,mc5,s1e-05,t4,ep40).r(0.7).full",0.682093,0.806227,100,20,1,40,0.7,full
"run.Doc2Vec(dm-c,d100,n20,w2,mc5,s1e-05,t4,ep40).r(0.7).full",0.688281,0.803327,100,20,2,40,0.7,full
"run.Doc2Vec(dm-c,d100,n20,w1,mc5,s1e-05,t4,ep20).r(0.7).full",0.689935,0.801636,100,20,1,20,0.7,full
"run.Doc2Vec(dm-c,d100,n20,w2,mc5,s1e-05,t4,ep20).r(0.7).full",0.677361,0.798055,100,20,2,20,0.7,full
"run.Doc2Vec(dm-c,d50,n20,w1,mc5,s1e-05,t4,ep20).r(0.7).full",0.669805,0.792846,50,20,1,20,0.7,full
"run.Doc2Vec(dm-c,d100,n20,w3,mc5,s1e-05,t4,ep40).r(0.7).full",0.649834,0.790786,100,20,3,40,0.7,full
"run.Doc2Vec(dm-c,d75,n20,w2,mc5,s1e-05,t4,ep20).r(0.7).full",0.676457,0.789988,75,20,2,20,0.7,full
"run.Doc2Vec(dm-c,d150,n20,w1,mc5,s1e-05,t4,ep20).r(0.7).full",0.645018,0.785468,150,20,1,20,0.7,full
"run.Doc2Vec(dm-c,d200,n20,w2,mc5,s1e-05,t8,ep20).r(0.7).full",0.633714,0.780897,200,20,2,20,0.7,full


### Inspecting results (by query)

In [91]:
# One DataFrame per run (rows: query ids, columns: measures).
per_run_frames = [
    pd.DataFrame.from_dict(per_query_scores, orient="index")
    for per_query_scores in run_results_by_query.values()
]

# Sum the frames left to right, then divide by the number of runs to get
# the per-query average of every measure.
summed_scores = sum(per_run_frames[1:], per_run_frames[0])
res_query_df = filter_irrelevant_columns(summed_scores / len(per_run_frames))
res_query_df = res_query_df.sort_values("ndcg")

print("Queries with lowest average ndcg")
display(res_query_df.head(10))
print("Queries with highest average ndcg")
display(res_query_df.tail(10))

# Ids of the 20 queries with the lowest average ndcg (used by later cells).
lowest_ndcg_doc_ids = res_query_df.head(20).index

Queries with lowest average ndcg


Unnamed: 0,map,ndcg
51,0.04599,0.219469
38,0.151769,0.322975
21,0.204899,0.438181
0,0.254702,0.455078
2,0.302968,0.460733
46,0.21716,0.471764
42,0.231804,0.491752
33,0.313877,0.521887
32,0.353702,0.548597
48,0.304213,0.566281


Queries with highest average ndcg


Unnamed: 0,map,ndcg
29,0.776383,0.921241
28,0.915985,0.930737
5,0.938,0.95333
49,0.941667,0.956469
7,0.900205,0.957197
27,0.878772,0.958371
43,0.962857,0.980265
8,0.976667,0.982619
10,0.98,0.985237
9,0.99,0.992619


### Significance testing

In [41]:
def test_significance(run_1=None, run_2=None, measure="ndcg"):
    query_ids = list(qrels.keys())
    first_scores = [run_results_by_query[run_1][query_id][measure] for query_id in query_ids]
    second_scores = [run_results_by_query[run_2][query_id][measure] for query_id in query_ids]
    return ttest_rel(first_scores, second_scores)

**Vector dimensionality (everything else static):**
* `d100` -> `d150` not significant
* `d100` -> `d200`, `d300` significant 

Conclusions so far: the best models should have vector dimensionality `d100` or `d150`

**Negative sampling:**
* (for `d100`) `n20` -> `n10` not significant for `w1`, `w2`, `w3`, significant for `w4`, `w5`
* (for `d150`) `n20` -> `n10` not significant for all window sizes except for `w4`

Conclusions so far: the best models should have vector dimensionality `d100` and a negative sampling value of `n20` with window sizes `w1`, `w2`, `w3`

**Window size:**
* (for `d100`, `n20`) `w1` -> `w2` not significant, `w1` -> `w3`, `w4`, `w5` significant

Conclusions so far: the best models should have vector dimensionality `d100` and a negative sampling value of `n20` with window sizes `w1`, `w2`

**Epochs:**

Only four models were trained with `ep40`:
* `Doc2Vec(dm-c,d100,n20,w1,mc5,s1e-05,t4,ep40)`
* `Doc2Vec(dm-c,d100,n20,w2,mc5,s1e-05,t4,ep40)`
* `Doc2Vec(dm-c,d100,n20,w3,mc5,s1e-05,t4,ep40)`
* `Doc2Vec(dm-c,d150,n20,w3,mc5,s1e-05,t4,ep40)`

Significance tests showed no significant differences in performance between these models and their counterparts trained with 20 epochs.

In [43]:
# NOTE(review): both ids refer to r(0.8) runs, while the "by run" cell builds
# results_df with pattern = "r(0.7).full". With that pattern the lookups
# below would raise KeyError; confirm which pattern this cell expects.
first_run = "run.Doc2Vec(dm-c,d100,n20,w3,mc5,s1e-05,t4,ep40).r(0.8).full"
second_run = "run.Doc2Vec(dm-c,d100,n20,w3,mc5,s1e-05,t4,ep20).r(0.8).full"

# first_run = results_df.index[0]
# second_run = results_df.index[-1]

# Show the summary row of each run, then the paired t-test between them.
for compared_run in (first_run, second_run):
    display(results_df.loc[compared_run])

test_significance(first_run, second_run)

map            0.646164
ndcg           0.792668
vector_size         100
ns                   20
w                     3
epochs               40
ratio               0.8
strategy           full
Name: run.Doc2Vec(dm-c,d100,n20,w3,mc5,s1e-05,t4,ep40).r(0.8).full, dtype: object

map            0.629034
ndcg           0.765107
vector_size         100
ns                   20
w                     3
epochs               20
ratio               0.8
strategy           full
Name: run.Doc2Vec(dm-c,d100,n20,w3,mc5,s1e-05,t4,ep20).r(0.8).full, dtype: object

Ttest_relResult(statistic=1.5276509332384145, pvalue=0.1327780322212468)

In [14]:
# Run id in the first row of results_df (the highest-ndcg run, given the
# descending ndcg sort applied when results_df was built).
results_df.index[0]

'run.Doc2Vec(dm-c,d100,n30,w2,mc2,s0.0001,t16,ep40).r(0.7).full'

In [68]:
# First 13 rows of results_df.
results_df.head(13)

Unnamed: 0,map,ndcg,vector_size,ns,w,epochs,ratio,strategy
"run.Doc2Vec(dm-c,d100,n30,w2,mc2,s0.0001,t16,ep40).r(0.7).full",0.687507,0.815685,100,30,2,40,0.7,full
"run.Doc2Vec(dm-c,d100,n20,w1,mc5,s1e-05,t4,ep40).r(0.7).full",0.682093,0.806227,100,20,1,40,0.7,full
"run.Doc2Vec(dm-c,d100,n20,w2,mc5,s1e-05,t4,ep40).r(0.7).full",0.688281,0.803327,100,20,2,40,0.7,full
"run.Doc2Vec(dm-c,d100,n20,w1,mc5,s1e-05,t4,ep20).r(0.7).full",0.689935,0.801636,100,20,1,20,0.7,full
"run.Doc2Vec(dm-c,d100,n20,w2,mc5,s1e-05,t4,ep20).r(0.7).full",0.677361,0.798055,100,20,2,20,0.7,full
"run.Doc2Vec(dm-c,d50,n20,w1,mc5,s1e-05,t4,ep20).r(0.7).full",0.669805,0.792846,50,20,1,20,0.7,full
"run.Doc2Vec(dm-c,d100,n20,w3,mc5,s1e-05,t4,ep40).r(0.7).full",0.649834,0.790786,100,20,3,40,0.7,full
"run.Doc2Vec(dm-c,d75,n20,w2,mc5,s1e-05,t4,ep20).r(0.7).full",0.676457,0.789988,75,20,2,20,0.7,full
"run.Doc2Vec(dm-c,d150,n20,w1,mc5,s1e-05,t4,ep20).r(0.7).full",0.645018,0.785468,150,20,1,20,0.7,full
"run.Doc2Vec(dm-c,d200,n20,w2,mc5,s1e-05,t8,ep20).r(0.7).full",0.633714,0.780897,200,20,2,20,0.7,full


In [65]:
# Compare the run in the first row of results_df with every run below it and
# print the ones whose ndcg differs significantly (paired t-test, p < 0.05).
# Printed columns: row position, run id, mean ndcg, p-value.
run_len = len(results_df)
reference_run = results_df.index[0]
row_template = "{:2}\t{}\t{:.6f}\t{:.3f}"

for i, candidate_run in enumerate(results_df.index[1:run_len], start=1):
    _, pval = test_significance(reference_run, candidate_run, "ndcg")
    if pval < 0.05:
        candidate_ndcg = results_df.loc[candidate_run, "ndcg"]
        print(row_template.format(i, candidate_run, candidate_ndcg, pval))

13	run.Doc2Vec(dm-c,d200,n10,w1,mc5,s1e-05,t8,ep20).r(0.7).full	0.776087	0.026
14	run.Doc2Vec(dm-c,d150,n10,w1,mc5,s1e-05,t4,ep20).r(0.7).full	0.773705	0.049
15	run.Doc2Vec(dm-c,d150,n20,w3,mc5,s1e-05,t4,ep40).r(0.7).full	0.771663	0.043
16	run.Doc2Vec(dm-c,d100,n20,w4,mc5,s1e-05,t4,ep20).r(0.7).full	0.770709	0.049
17	run.Doc2Vec(dm-c,d150,n20,w3,mc5,s1e-05,t4,ep20).r(0.7).full	0.770454	0.044
18	run.Doc2Vec(dm-c,d100,n10,w2,mc5,s1e-05,t4,ep20).r(0.7).full	0.769760	0.028
19	run.Doc2Vec(dm-c,d200,n20,w1,mc5,s1e-05,t8,ep20).r(0.7).full	0.768711	0.017
20	run.Doc2Vec(dm-c,d150,n10,w2,mc5,s1e-05,t4,ep20).r(0.7).full	0.768529	0.026
21	run.Doc2Vec(dm-c,d150,n20,w2,mc5,s1e-05,t4,ep20).r(0.7).full	0.766998	0.036
22	run.Doc2Vec(dm-c,d300,n20,w1,mc5,s1e-05,t8,ep20).r(0.7).full	0.764467	0.007
23	run.Doc2Vec(dm-c,d200,n10,w2,mc5,s1e-05,t4,ep20).r(0.7).full	0.764233	0.009
24	run.Doc2Vec(dm-c,d100,n20,w3,mc5,s1e-05,t4,ep20).r(0.7).full	0.763863	0.017
25	run.Doc2Vec(dm-c,d200,n10,w3,mc5,s1e-05,t4,ep20).

In [71]:
# Per-query measure values for the run in the first row of results_df.
run_results_by_query[results_df.index[0]]

{'0': {'map': 0.712121212121212, 'ndcg': 0.7930309626934754},
 '1': {'map': 0.6635897435897435, 'ndcg': 0.887867753934023},
 '2': {'map': 0.5404761904761904, 'ndcg': 0.6347311442118588},
 '3': {'map': 0.9428571428571428, 'ndcg': 0.9864454785297619},
 '4': {'map': 0.6934782608695652, 'ndcg': 0.9105262007718226},
 '5': {'map': 1.0, 'ndcg': 1.0},
 '6': {'map': 1.0, 'ndcg': 1.0},
 '7': {'map': 1.0, 'ndcg': 1.0},
 '8': {'map': 1.0, 'ndcg': 1.0},
 '9': {'map': 1.0, 'ndcg': 1.0},
 '10': {'map': 1.0, 'ndcg': 1.0},
 '11': {'map': 0.8333333333333334, 'ndcg': 0.9569112159672715},
 '12': {'map': 0.9444444444444445, 'ndcg': 0.8710289829732358},
 '13': {'map': 0.7138888888888889, 'ndcg': 0.8340144578144808},
 '14': {'map': 0.9006211180124224, 'ndcg': 0.9751554440904016},
 '15': {'map': 0.8809523809523809, 'ndcg': 0.9693441132226912},
 '16': {'map': 0.9583333333333334, 'ndcg': 0.8582780385554273},
 '17': {'map': 0.7291666666666666, 'ndcg': 0.9214099920026009},
 '18': {'map': 0.298989898989899, 'ndcg'

In [76]:
# Per-query measure values for the run at position 12 (the 13th row) of results_df.
run_results_by_query[results_df.index[12]]

{'0': {'map': 0.611111111111111, 'ndcg': 0.7139760675935648},
 '1': {'map': 0.7666666666666666, 'ndcg': 0.8405763632912088},
 '2': {'map': 0.47619047619047616, 'ndcg': 0.5549249671870674},
 '3': {'map': 0.7753968253968253, 'ndcg': 0.9360536678650264},
 '4': {'map': 0.6555555555555556, 'ndcg': 0.8971237117673005},
 '5': {'map': 0.2, 'ndcg': 0.38685280723454163},
 '6': {'map': 1.0, 'ndcg': 1.0},
 '7': {'map': 1.0, 'ndcg': 1.0},
 '8': {'map': 1.0, 'ndcg': 1.0},
 '9': {'map': 1.0, 'ndcg': 1.0},
 '10': {'map': 0.5, 'ndcg': 0.6309297535714575},
 '11': {'map': 0.8722222222222222, 'ndcg': 0.882471016868511},
 '12': {'map': 1.0, 'ndcg': 0.8838469780329911},
 '13': {'map': 0.8176767676767676, 'ndcg': 0.8655686564508995},
 '14': {'map': 0.8893662728249193, 'ndcg': 0.8652805039804249},
 '15': {'map': 0.7857142857142857, 'ndcg': 0.9415282608745102},
 '16': {'map': 0.9047619047619048, 'ndcg': 0.9767115136988043},
 '17': {'map': 0.6782407407407408, 'ndcg': 0.9051460435679907},
 '18': {'map': 0.306062

In [102]:
# Per-query score tables (rows: query ids, columns: measures) for the runs in
# the first and second rows of results_df.
challenger_run_id = results_df.index[1]
top_run_id = results_df.index[0]

challenger_model = pd.DataFrame.from_dict(
    run_results_by_query[challenger_run_id], orient="index"
)
top_model = pd.DataFrame.from_dict(
    run_results_by_query[top_run_id], orient="index"
)

In [103]:
# Per-query difference of every measure: top_model minus challenger_model.
measure_diffs = top_model.sub(challenger_model)

display(measure_diffs)

Unnamed: 0,map,ndcg
0,0.473514,0.303294
1,0.034066,0.202202
2,-0.044048,0.021399
3,0.17619,0.052397
4,-0.045411,-0.014701
5,0.0,0.0
6,0.0,0.140281
7,0.0,0.0
8,0.0,0.0
9,0.0,0.0


In [104]:
# Differences restricted to the query ids stored in lowest_ndcg_doc_ids.
display(measure_diffs.loc[lowest_ndcg_doc_ids])

Unnamed: 0,map,ndcg
51,0.056522,0.070961
38,0.050855,0.073445
21,-0.06666,-0.081752
0,0.473514,0.303294
2,-0.044048,0.021399
46,-0.058533,-0.08483
42,0.235929,0.271113
33,0.083333,0.036798
32,0.108463,0.072739
48,0.163399,0.288229


In [105]:
# Column-wise mean of the differences over the query ids in lowest_ndcg_doc_ids.
measure_diffs.loc[lowest_ndcg_doc_ids].mean()

map     0.055088
ndcg    0.035364
dtype: float64

In [109]:
# One-sample t-test of the "map" differences (for the query ids in
# lowest_ndcg_doc_ids) against a mean of 0.
# Select the column by label: attribute access (`.map`) resolves to the
# DataFrame.map method in pandas versions that define it, not to the column.
ttest_1samp(measure_diffs.loc[lowest_ndcg_doc_ids, "map"], 0)

Ttest_1sampResult(statistic=1.276260062960509, pvalue=0.21724431936884642)