#### Imports

In [1]:
import re
import json
from pathlib import Path

import numpy as np
import pandas as pd
import scipy.stats
import pytrec_eval

from IPython.display import display


# Input locations, relative to this notebook: the qrels file and every *.txt
# file found in a `run_files` directory below the doc2vec data folder.
QRELS_PATH = "../../data/test/doc2vec/qrels.txt"
# Path.glob yields matches in arbitrary, filesystem-dependent order; sorting
# makes the order in which runs are loaded reproducible between re-runs.
RUN_FILE_PATHS = sorted(Path("../../data/test/doc2vec/").glob("**/run_files/*.txt"))

#### Functions

In [2]:
def get_vector_size(index):
    """Extract the vector dimensionality encoded in a run name.

    Finds the first ``d<digits>`` token in ``index`` (e.g. ``d100`` in
    ``run.Doc2Vec(dm-c,d100,n20,...)``) and returns the digits as an int.

    Raises:
        ValueError: if ``index`` contains no ``d<digits>`` token.
    """
    # Raw string: "\d" inside a plain string literal is an invalid escape sequence.
    match = re.search(r"d(\d+)", index)
    if match is None:
        raise ValueError("no vector size (d<N>) found in run name: {!r}".format(index))
    return int(match.group(1))


def get_negative_samples(index):
    """Extract the negative-sampling value encoded in a run name.

    Finds the first ``n<digits>`` token in ``index`` (e.g. ``n20``) and
    returns the digits as an int.

    Raises:
        ValueError: if ``index`` contains no ``n<digits>`` token.
    """
    # Raw string: "\d" inside a plain string literal is an invalid escape sequence.
    match = re.search(r"n(\d+)", index)
    if match is None:
        raise ValueError("no negative sampling value (n<N>) found in run name: {!r}".format(index))
    return int(match.group(1))


def get_window_size(index):
    """Extract the window size encoded in a run name.

    Finds the first ``w<digits>`` token in ``index`` (e.g. ``w1``) and
    returns the digits as an int.

    Raises:
        ValueError: if ``index`` contains no ``w<digits>`` token.
    """
    # Raw string: "\d" inside a plain string literal is an invalid escape sequence.
    match = re.search(r"w(\d+)", index)
    if match is None:
        raise ValueError("no window size (w<N>) found in run name: {!r}".format(index))
    return int(match.group(1))


def get_epochs(index):
    """Extract the number of training epochs encoded in a run name.

    Finds the first ``ep<digits>`` token in ``index`` (e.g. ``ep40``) and
    returns the digits as an int.

    Raises:
        ValueError: if ``index`` contains no ``ep<digits>`` token.
    """
    # Raw string: "\d" inside a plain string literal is an invalid escape sequence.
    match = re.search(r"ep(\d+)", index)
    if match is None:
        raise ValueError("no epoch count (ep<N>) found in run name: {!r}".format(index))
    return int(match.group(1))


def get_ratio(index):
    """Extract the ratio encoded as ``r(<number>)`` in a run name.

    Returns the text between the parentheses of the first ``r(...)`` group in
    ``index`` converted to float, e.g. ``0.8`` for ``...).r(0.8).full``.

    Raises:
        ValueError: if ``index`` has no ``r(...)`` group or its content is
            not a valid float.
    """
    # "[^)]*" stops at the first closing parenthesis; a greedy ".*" would run on
    # to the last ")" in the name and swallow any text that follows the ratio.
    match = re.search(r"r\(([^)]*)\)", index)
    if match is None:
        raise ValueError("no ratio (r(<x>)) found in run name: {!r}".format(index))
    return float(match.group(1))

    
def get_stategy(index):
    """Return the text after the final dot of a run name (e.g. ``full``).

    A name without any dot is returned unchanged.
    """
    _, _, strategy = index.rpartition(".")
    return strategy


def filter_irrelevant_columns(df, suffixes=["_100", "_200", "_500", "_1000"]):
    """Return a copy of ``df`` without the columns whose name ends in a given suffix.

    Args:
        df: DataFrame whose column labels are strings.
        suffixes: Column-name endings that mark a column for removal.

    Returns:
        A new DataFrame; ``df`` itself is left untouched.
    """
    def ends_with_listed_suffix(name):
        for suffix in suffixes:
            if name.endswith(suffix):
                return True
        return False

    unwanted = list(filter(ends_with_listed_suffix, df.columns))
    return df.drop(columns=unwanted)


def infer_columns(df):
    """Add the hyper-parameters encoded in each run name as columns.

    The index labels of ``df`` are run names; each one is parsed into
    ``vector_size``, ``ns``, ``w``, ``epochs``, ``ratio`` and ``strategy``.
    Columns rejected by ``filter_irrelevant_columns`` are then dropped.

    Args:
        df: DataFrame indexed by run name.

    Returns:
        A new DataFrame with the extra columns. The input frame is not
        modified, so re-running a cell that calls this stays idempotent.
    """
    run_names = df.index
    # assign() builds a new frame instead of writing columns into the caller's df.
    enriched = df.assign(
        vector_size=run_names.map(get_vector_size),
        ns=run_names.map(get_negative_samples),
        w=run_names.map(get_window_size),
        epochs=run_names.map(get_epochs),
        ratio=run_names.map(get_ratio),
        strategy=run_names.map(get_stategy),
    )
    return filter_irrelevant_columns(enriched)


def read_qrels(qrels_path=QRELS_PATH):
    """Parse a qrels file with ``pytrec_eval.parse_qrel``.

    Args:
        qrels_path: Path of the qrels file to read; defaults to ``QRELS_PATH``.

    Returns:
        Whatever ``pytrec_eval.parse_qrel`` returns for the file.
    """
    # Open the path that was passed in; the global constant is only the default.
    with open(qrels_path, "r") as fp:
        return pytrec_eval.parse_qrel(fp)


def read_runfiles(runfile_paths=RUN_FILE_PATHS, pattern=None):
    """Parse run files with ``pytrec_eval.parse_run``, keyed by file stem.

    Args:
        runfile_paths: Path objects of the run files to consider.
        pattern: Optional substring; when given (and non-empty), only paths
            whose string form contains it are read.

    Returns:
        dict mapping each file's stem to its parsed run.
    """
    if not pattern:
        selected_paths = runfile_paths
    else:
        selected_paths = [path for path in runfile_paths if pattern in str(path)]

    parsed_runs = {}
    for path in selected_paths:
        run_id = path.stem
        with open(path, "r") as handle:
            parsed_runs[run_id] = pytrec_eval.parse_run(handle)

    return parsed_runs
    
    
def evaluate(qrels, runs, measures):
    """Evaluate every run against ``qrels`` and average each measure over queries.

    Args:
        qrels: Parsed qrels, passed to ``pytrec_eval.RelevanceEvaluator``.
        runs: dict mapping run id to a parsed run.
        measures: Measures passed to ``pytrec_eval.RelevanceEvaluator``.

    Returns:
        Tuple ``(run_results_by_query, run_results)``:
        ``run_results_by_query[run_id]`` is the evaluator's per-query output and
        ``run_results[run_id]`` maps each measure name to its mean over the
        queries in that output (an empty dict when no query was evaluated).
    """
    run_results_by_query = dict()
    run_results = dict()
    for run_id, run in runs.items():
        evaluator = pytrec_eval.RelevanceEvaluator(qrels, measures)
        results = evaluator.evaluate(run)
        # Read the measure names from whichever query comes first rather than
        # from a hard-coded query id "0", which need not exist in the results.
        measure_names = next(iter(results.values()), {}).keys()
        mean_results = {
            measure: np.mean([scores[measure] for scores in results.values()])
            for measure in measure_names
        }
        run_results_by_query[run_id] = results
        run_results[run_id] = mean_results

    return run_results_by_query, run_results


### Reading qrels and listing supported evaluation measures

In [3]:
# Parse the qrels once from the default location; the evaluation and
# significance-testing cells below read this `qrels` variable.
qrels = read_qrels()

### Inspecting results (by run)

In [4]:
# Optional substring filter on the run-file paths, e.g. pattern = "r(0.8).full".
# None loads every run file.
pattern = None
runs = read_runfiles(pattern=pattern)
run_results_by_query, run_results = evaluate(qrels, runs, ["map", "ndcg"])

# One row per run (mean measures plus the parsed hyper-parameters), best ndcg first.
results_df = infer_columns(pd.DataFrame(run_results).T)
results_df = results_df.sort_values("ndcg", ascending=False)

# Use the fully qualified option name: the bare "max_rows" pattern matches more
# than one option in recent pandas releases and raises an OptionError.
pd.set_option("display.max_rows", 3000)
display(results_df)

Unnamed: 0,map,ndcg,vector_size,ns,w,epochs,ratio,strategy
"run.Doc2Vec(dm-c,d100,n20,w1,mc5,s1e-05,t4,ep40).r(0.8).full",0.690265,0.815255,100,20,1,40,0.8,full
"run.Doc2Vec(dm-c,d100,n20,w1,mc5,s1e-05,t4,ep40).r(0.85).full",0.688167,0.814401,100,20,1,40,0.85,full
"run.Doc2Vec(dm-c,d100,n20,w1,mc5,s1e-05,t4,ep40).r(0.95).full",0.682708,0.810531,100,20,1,40,0.95,full
"run.Doc2Vec(dm-c,d100,n20,w1,mc5,s1e-05,t4,ep40).r(1).full",0.682688,0.810502,100,20,1,40,1.0,full
"run.Doc2Vec(dm-c,d100,n20,w1,mc5,s1e-05,t4,ep40).r(0.9).full",0.685134,0.81045,100,20,1,40,0.9,full
"run.Doc2Vec(dm-c,d100,n20,w2,mc5,s1e-05,t4,ep40).r(0.8).full",0.68492,0.806648,100,20,2,40,0.8,full
"run.Doc2Vec(dm-c,d100,n20,w1,mc5,s1e-05,t4,ep40).r(0.7).full",0.682093,0.806227,100,20,1,40,0.7,full
"run.Doc2Vec(dm-c,d100,n20,w1,mc5,s1e-05,t4,ep20).r(0.8).full",0.691488,0.805743,100,20,1,20,0.8,full
"run.Doc2Vec(dm-c,d100,n20,w2,mc5,s1e-05,t4,ep40).r(0.6).full",0.694341,0.805709,100,20,2,40,0.6,full
"run.Doc2Vec(dm-c,d100,n20,w1,mc5,s1e-05,t4,ep20).r(0.85).full",0.690096,0.805269,100,20,1,20,0.85,full


### Inspecting results (by query)

In [None]:
# Average every measure per query across all evaluated runs, then list the
# queries with the lowest and highest mean ndcg.
per_run_frames = [
    pd.DataFrame.from_dict(per_query, orient="index")
    for per_query in run_results_by_query.values()
]

# Element-wise sum of the per-run frames (rows = queries, columns = measures).
score_totals = per_run_frames[0]
for frame in per_run_frames[1:]:
    score_totals = score_totals + frame

res_query_df = filter_irrelevant_columns(score_totals / len(per_run_frames))
res_query_df = res_query_df.sort_values("ndcg")

print("Queries with lowest average ndcg")
display(res_query_df.head(10))
print("Queries with highest average ndcg")
display(res_query_df.tail(10))

### Significance testing

In [8]:
def test_significance(run_1, run_2, measure, test_f):
    query_ids = list(qrels.keys())
    first_scores = [run_results_by_query[run_1][query_id][measure] for query_id in query_ids]
    second_scores = [run_results_by_query[run_2][query_id][measure] for query_id in query_ids]
    return test_f(first_scores, second_scores)

**Vector dimensionality (everything else static):**
* `d100` -> `d150` not significant
* `d100` -> `d200`, `d300` significant 

Conclusions so far: the best models should have vector dimensionality `d100` or `d150`.

**Negative sampling:**
* (for `d100`) `n20` -> `n10` not significant for `w1`, `w2`, `w3`, significant for `w4`, `w5`
* (for `d150`) `n20` -> `n10` not significant for all window sizes except for `w4`

Conclusions so far: the best models should have vector dimensionality `d100` and a negative sampling value of `n20`, with window sizes `w1`, `w2` or `w3`.

**Window size:**
* (for `d100`, `n20`) `w1` -> `w2` not significant, `w1` -> `w3`, `w4`, `w5` significant

Conclusions so far: the best models should have vector dimensionality `d100` and a negative sampling value of `n20`, with window sizes `w1` or `w2`.

**Epochs:**
Only four models were trained with `ep40`:
* `Doc2Vec(dm-c,d100,n20,w1,mc5,s1e-05,t4,ep40)`
* `Doc2Vec(dm-c,d100,n20,w2,mc5,s1e-05,t4,ep40)`
* `Doc2Vec(dm-c,d100,n20,w3,mc5,s1e-05,t4,ep40)`
* `Doc2Vec(dm-c,d150,n20,w3,mc5,s1e-05,t4,ep40)`

Significance tests showed no significant differences in performance between these models and their counterparts trained with 20 epochs.

In [16]:
# The two runs to compare: same configuration, 40 versus 20 training epochs.
first_run, second_run = (
    "run.Doc2Vec(dm-c,d100,n20,w3,mc5,s1e-05,t4,ep40).r(0.8).full",
    "run.Doc2Vec(dm-c,d100,n20,w3,mc5,s1e-05,t4,ep20).r(0.8).full",
)

# Alternative: compare the first and last rows of results_df instead.
# first_run = results_df.index[0]
# second_run = results_df.index[-1]

for selected_run in (first_run, second_run):
    display(results_df.loc[selected_run])

# Last expression of the cell, so the test result is shown as the cell output.
test_significance(first_run, second_run, "ndcg", scipy.stats.ttest_rel)

map            0.646164
ndcg           0.792668
vector_size         100
ns                   20
w                     3
epochs               40
ratio               0.8
strategy           full
Name: run.Doc2Vec(dm-c,d100,n20,w3,mc5,s1e-05,t4,ep40).r(0.8).full, dtype: object

map            0.629034
ndcg           0.765107
vector_size         100
ns                   20
w                     3
epochs               20
ratio               0.8
strategy           full
Name: run.Doc2Vec(dm-c,d100,n20,w3,mc5,s1e-05,t4,ep20).r(0.8).full, dtype: object

Ttest_relResult(statistic=1.5276509332384145, pvalue=0.1327780322212468)

In [10]:
# Test the first row of results_df against every other run and print the runs
# whose ndcg differs from it with p < 0.05 (row position, run id, ndcg, p-value).
for i, other_run in enumerate(results_df.index[1:], start=1):
    _, pval = test_significance(results_df.index[0], other_run, "ndcg", scipy.stats.ttest_rel)
    if pval < 0.05:
        other_ndcg = results_df.loc[other_run, "ndcg"]
        print("{:2} {}\t\t{}\t{:.3f}".format(i, other_run, other_ndcg, pval))

63 run.Doc2Vec(dm-c,d200,n20,w2,mc5,s1e-05,t8,ep20).r(0.7).full		0.7808969650705719	0.045
66 run.Doc2Vec(dm-c,d150,n20,w1,mc5,s1e-05,t4,ep20).r(0.9).full		0.7799926066319288	0.039
68 run.Doc2Vec(dm-c,d150,n20,w1,mc5,s1e-05,t4,ep20).r(0.85).full		0.7797518356328195	0.045
70 run.Doc2Vec(dm-c,d150,n20,w1,mc5,s1e-05,t4,ep20).r(1).full		0.7795099836830109	0.042
71 run.Doc2Vec(dm-c,d150,n20,w1,mc5,s1e-05,t4,ep20).r(0.8).full		0.7794190029650605	0.043
74 run.Doc2Vec(dm-c,d150,n20,w1,mc5,s1e-05,t4,ep20).r(0.95).full		0.778666743385241	0.039
75 run.Doc2Vec(dm-c,d75,n20,w1,mc5,s1e-05,t4,ep20).r(0.7).full		0.7786652484699075	0.045
77 run.Doc2Vec(dm-c,d200,n20,w2,mc5,s1e-05,t8,ep20).r(0.8).full		0.7763434703194092	0.019
78 run.Doc2Vec(dm-c,d200,n10,w1,mc5,s1e-05,t8,ep20).r(0.7).full		0.7760873185221426	0.018
79 run.Doc2Vec(dm-c,d200,n20,w2,mc5,s1e-05,t8,ep20).r(0.6).full		0.7754151835062117	0.026
81 run.Doc2Vec(dm-c,d200,n20,w2,mc5,s1e-05,t8,ep20).r(0.5).full		0.7746441433638865	0.022
82 run.Doc2V

676 run.Doc2Vec(dm-c,d150,n10,w4,mc5,s1e-05,t4,ep20).r(0.4).summary		0.7045437548318054	0.000
677 run.Doc2Vec(dm-c,d75,n20,w2,mc5,s1e-05,t4,ep20).r(0.2).full		0.7043699150673124	0.000
678 run.Doc2Vec(dm-c,d150,n10,w3,mc5,s1e-05,t4,ep20).r(0.2).full		0.7043585538809028	0.000
679 run.Doc2Vec(dm-c,d100,n20,w4,mc5,s1e-05,t4,ep20).r(0.15).summary		0.7041944538628415	0.001
680 run.Doc2Vec(dm-c,d150,n10,w1,mc5,s1e-05,t4,ep20).r(0.6).summary		0.7040984340584127	0.000
681 run.Doc2Vec(dm-c,d300,n20,w1,mc5,s1e-05,t8,ep20).r(0.15).full		0.7039811451719756	0.001
682 run.Doc2Vec(dm-c,d150,n20,w3,mc5,s1e-05,t4,ep40).r(0.6).summary		0.7039332190649268	0.000
683 run.Doc2Vec(dm-c,d100,n10,w2,mc5,s1e-05,t4,ep20).r(0.5).summary		0.7039326702420421	0.000
684 run.Doc2Vec(dm-c,d150,n10,w1,mc5,s1e-05,t4,ep20).r(0.3).summary		0.7039269743570025	0.000
685 run.Doc2Vec(dm-c,d150,n10,w2,mc5,s1e-05,t4,ep20).r(0.2).full		0.7039166866047536	0.000
686 run.Doc2Vec(dm-c,d150,n20,w5,mc5,s1e-05,t4,ep20).r(0.15).full		0.70

1289 run.Doc2Vec(dm-c,d100,n10,w1,mc5,s1e-05,t4,ep20).r(0.1).summary		0.652009086655428	0.000
1290 run.Doc2Vec(dm-c,d150,n20,w3,mc5,s1e-05,t4,ep20).r(0.05).summary		0.6518811958044802	0.000
1291 run.Doc2Vec(dm-c,d200,n20,w4,mc5,s1e-05,t8,ep20).r(0).summary		0.6517445389228703	0.000
1292 run.Doc2Vec(dm-c,d200,n20,w4,mc5,s1e-05,t8,ep20).r(0).full		0.6517445389228703	0.000
1293 run.Doc2Vec(dm-c,d300,n20,w4,mc5,s1e-05,t8,ep20).r(0.95).summary		0.6516880917215764	0.000
1294 run.Doc2Vec(dm-c,d150,n20,w3,mc5,s1e-05,t4,ep40).r(0).summary		0.6514632840889768	0.000
1295 run.Doc2Vec(dm-c,d150,n20,w3,mc5,s1e-05,t4,ep40).r(0).full		0.6514632840889768	0.000
1296 run.Doc2Vec(dm-c,d150,n20,w1,mc5,s1e-05,t4,ep20).r(0.05).full		0.6514539875678224	0.000
1297 run.Doc2Vec(dm-c,d300,n20,w4,mc5,s1e-05,t8,ep20).r(1).summary		0.6514282609141576	0.000
1298 run.Doc2Vec(dm-c,d100,n20,w5,mc5,s1e-05,t4,ep20).r(1).summary		0.6512406514866981	0.000
1299 run.Doc2Vec(dm-c,d75,n20,w2,mc5,s1e-05,t4,ep20).r(0).full		0.651

In [None]:
# Scratch lookup: ndcg of the run at row position `i` of results_df.
# NOTE(review): `i` is not defined in this cell; it is whatever value an
# earlier loop left behind, so this only works after that loop has run — confirm intent.
results_df.loc[results_df.index[i], "ndcg"]

In [None]:
# Show the first row of results_df next to the row at position 11.
# NOTE(review): 11 is a hard-coded row position; presumably picked by reading
# the results table — confirm, or derive it from the data.
first_run = results_df.index[0]
second_run = results_df.index[11]

# The extra "\n" argument leaves a blank line between the two printed rows.
print(results_df.loc[first_run], "\n")
print(results_df.loc[second_run])