# Evaluation Notebook 

In [9]:
# Notebook setup: auto-reload edited project modules, configure pandas/seaborn
# display defaults, route log records to the cell output, and import project code.
%load_ext autoreload
%autoreload 2
import json
import pandas as pd
# Show every column, do not wrap wide frames, and allow up to 1000 characters per cell.
pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', 1000)
import numpy as np
import seaborn as sns
sns.set()

import sys
import logging
# Send INFO-level (and above) log records to stdout so they show up below the cells.
logging.basicConfig(level=logging.INFO, stream=sys.stdout)

from preprocessing import apply_pipeline, Corpus
# NOTE(review): the star import hides where names come from; average_precision_table
# and mean_average_precision used below are presumably defined in `evaluation` -
# confirm and import them explicitly.
from evaluation import *
from tfidf_retrieval import TfidfRetrieval
from word_embedding_retrieval import WordEmbeddingRetrieval

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### Load corpus using different preprocessing pipelines

In [4]:
# Build the paper corpus from the CSV with an empty pipeline list, i.e. without
# requesting any preprocessing steps.
base_file =  "../data/kit_expert_2017_papers.csv"
papers_unprocessed = Corpus(base_file, [])

INFO:preprocessing.pipeline:Start preprocessing pipeline "" for file ../data/kit_expert_2017_papers.csv.
INFO:preprocessing.pipeline:Finished preprocessing pipeline. Saved preprocessed corpus to cache file ../data/kit_expert_2017_papers_


#### Load keywords to use as test data

In [11]:
# Read the keyword entries used as test queries.
# JSON is UTF-8 by specification, so pin the encoding rather than relying on the
# platform's locale default (which is not UTF-8 on every system).
with open("../data/kit_expert_2017_keywords.json", "r", encoding="utf-8") as file:
    keywords = json.load(file)

In [12]:
# Each test set is a (label, keyword entries) pair, split on the entry's "level":
# level <= 1 goes to the general set, level >= 2 to the specific set.
general_keywords = (
    "general keywords",
    list(filter(lambda entry: entry["level"] <= 1, keywords)),
)
# The specific set is capped at its first 5000 matching entries.
specific_keywords = (
    "specific_keywords",
    list(filter(lambda entry: entry["level"] >= 2, keywords))[:5000],
)

In [None]:
def compare_models(model1, model2, test_data):
    """Compare the per-query average precision of two retrieval models.

    Prints and displays the five queries on which each model beats the other
    by the largest margin.

    Args:
        model1: Sequence whose first item is a display name and whose second
            item is the model passed to ``average_precision_table``.
        model2: Same layout as ``model1``, for the second model.
        test_data: Test set forwarded unchanged to ``average_precision_table``.

    Returns:
        The two average-precision tables merged on ``"query"``, with one
        ``"ap <name>"`` column per model and a ``"difference"`` column holding
        model1's score minus model2's score.
    """
    column1 = "ap " + model1[0]
    column2 = "ap " + model2[0]

    # DataFrame.rename returns a new frame and relabels the *index* unless
    # `columns=` is given, so the result must be reassigned. Without this the
    # "ap <name>" columns never exist and the lookups below raise KeyError.
    model1_result = average_precision_table(model1[1], test_data).rename(
        columns={"average precision": column1}
    )
    model2_result = average_precision_table(model2[1], test_data).rename(
        columns={"average precision": column2}
    )

    result = model1_result.merge(model2_result, on="query")
    result["difference"] = result[column1] - result[column2]

    print(f"Queries where {model1[0]} better than {model2[0]}")
    display(result.sort_values(by="difference", ascending=False).head())
    print(f"Queries where {model2[0]} better than {model1[0]}")
    display(result.sort_values(by="difference", ascending=True).head())
    return result

In [None]:
# Load the saved retrieval models from ../data/models: two TF-IDF based BM25
# models and two fastText word-embedding models (a base and a "qe" variant each).
bm25_model, bm25_qe_model = (
    TfidfRetrieval.load(model_path)
    for model_path in (
        "../data/models/tfidf/bm25_lemmatized_bigram.model",
        "../data/models/tfidf/bm25_lemmatized_bigram_qe.model",
    )
)
fasttext_model, fasttext_qe_model = (
    WordEmbeddingRetrieval.load(model_path)
    for model_path in (
        "../data/models/fasttext/fasttext_w29_s400_tfidf.model",
        "../data/models/fasttext/qe_best_fasttext_exponce.model",
    )
)

In [None]:
# Report the mean average precision (mAP) of every model on the general keywords.
# A notebook cell only echoes its last bare expression, so each score is printed
# explicitly; otherwise the first three return values would be silently dropped.
print("---general keywords---")
for model_label, retrieval_model in (
    ("bm25", bm25_model),
    ("bm25 qe", bm25_qe_model),
    ("fasttext", fasttext_model),
    ("fasttext qe", fasttext_qe_model),
):
    print(f"{model_label} mAP score:")
    map_score = mean_average_precision(retrieval_model, general_keywords)
    # Nothing to print if the helper returns no value (e.g. it only logs the score).
    if map_score is not None:
        print(map_score)

In [None]:
# BM25 vs. its "qe" variant on the general keywords; the returned table is the cell output.
compare_models(("bm25 model", bm25_model), ("bm25 qe model", bm25_qe_model), general_keywords)

In [None]:
# fastText vs. its "qe" variant on the general keywords; the returned table is the cell output.
compare_models(("fasttext model", fasttext_model), ("fasttext qe model", fasttext_qe_model), general_keywords)

In [None]:
# The two "qe" variants (BM25 vs. fastText) head to head on the general keywords.
compare_models(("bm25 qe model", bm25_qe_model), ("fasttext qe model", fasttext_qe_model), general_keywords)

In [None]:
# Report the mean average precision (mAP) of every model on the specific keywords.
# A notebook cell only echoes its last bare expression, so each score is printed
# explicitly; otherwise the first three return values would be silently dropped.
print("---specific keywords---")
for model_label, retrieval_model in (
    ("bm25", bm25_model),
    ("bm25 qe", bm25_qe_model),
    ("fasttext", fasttext_model),
    ("fasttext qe", fasttext_qe_model),
):
    print(f"{model_label} mAP score:")
    map_score = mean_average_precision(retrieval_model, specific_keywords)
    # Nothing to print if the helper returns no value (e.g. it only logs the score).
    if map_score is not None:
        print(map_score)