# Evaluation Notebook 

In [13]:
# Enable IPython's autoreload extension (mode 2) so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
import json
import pandas as pd
# Display settings for the result tables rendered in this notebook: show every
# column, do not wrap wide frames, and allow long text in a single cell.
pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', 1000)
import numpy as np
import seaborn as sns
sns.set()

import sys
import logging
# Send INFO-level log records to stdout so they show up in the cell output.
logging.basicConfig(level=logging.INFO, stream=sys.stdout)

# Presumably project-local modules (not visible from this notebook).
from preprocessing import apply_pipeline, Corpus
# NOTE(review): the star import hides the origin of names used further down
# (mean_average_precision, average_precision_table presumably come from here);
# consider importing them explicitly.
from evaluation import *
from tfidf_retrieval import TfidfRetrieval
from word_embedding_retrieval import WordEmbeddingRetrieval

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### Load the unprocessed corpus (empty preprocessing pipeline)

In [7]:
# Build the corpus from the papers CSV with an empty pipeline list
# (presumably meaning no preprocessing steps are applied — verify in Corpus).
base_file =  "../data/kit_expert_2017_papers.csv"
papers_unprocessed = Corpus(base_file, [])

INFO:preprocessing.pipeline:Start preprocessing pipeline "" for file ../data/kit_expert_2017_papers.csv.
INFO:preprocessing.pipeline:Loaded cached preprocessed corpus from ../data/kit_expert_2017_papers_


#### Load keywords to use as test data

In [8]:
# Read the keyword entries that serve as test queries. JSON text is UTF-8 by
# specification, so state the encoding explicitly instead of relying on the
# platform-dependent default of open().
with open("../data/kit_expert_2017_keywords.json", "r", encoding="utf-8") as file:
    keywords = json.load(file)

In [9]:
# Each test set is a (label, keyword entries) pair, split on the "level" field
# of the entries: level <= 1 counts as general, level >= 2 as specific.
general_keywords = (
    "general keywords",
    [entry for entry in keywords if entry["level"] <= 1],
)
# Only the first 5000 specific entries are kept.
specific_keywords = (
    "specific_keywords",
    [entry for entry in keywords if entry["level"] >= 2][:5000],
)

In [10]:
def compare_models(model1, model2, test_data):
    """Compare two retrieval models query by query on their average precision.

    Both models are scored with ``average_precision_table``; the two tables are
    joined on the ``query`` column and the per-query difference is computed.
    The five queries with the largest advantage for each model are displayed.

    Args:
        model1: ``(label, model)`` pair; the label appears in the printed headings.
        model2: ``(label, model)`` pair for the model to compare against.
        test_data: Test queries, passed unchanged to ``average_precision_table``.

    Returns:
        The merged DataFrame, including the columns ``query``,
        ``average precision_x`` (model1), ``average precision_y`` (model2) and
        ``difference`` (model1 minus model2).
    """
    model1_result = average_precision_table(model1[1], test_data)
    model2_result = average_precision_table(model2[1], test_data)
    # Spell out the suffixes: the column names used below must not depend on
    # the pandas defaults.
    result = model1_result.merge(model2_result, on="query", suffixes=("_x", "_y"))
    result["difference"] = result["average precision_x"] - result["average precision_y"]

    print(f"Queries where {model1[0]} better than {model2[0]}")
    display(result[result["difference"] > 0].sort_values(by="difference", ascending=False).head())
    print(f"Queries where {model2[0]} better than {model1[0]}")
    display(result[result["difference"] < 0].sort_values(by="difference", ascending=True).head())
    return result

In [11]:
# Load the four previously saved retrieval models that are evaluated below.
# NOTE(review): "qe" in the names presumably stands for query expansion — confirm.
bm25_model = TfidfRetrieval.load("../data/models/tfidf/bm25_lemmatized_bigram.model")
bm25_qe_model = TfidfRetrieval.load("../data/models/tfidf/bm25_lemmatized_bigram_qe.model")
fasttext_model = WordEmbeddingRetrieval.load("../data/models/fasttext/fasttext_w29_s400_tfidf.model")
fasttext_qe_model = WordEmbeddingRetrieval.load("../data/models/fasttext/qe_best_fasttext_exponce.model")

In [41]:
# Mean average precision of both BM25 variants on the general keywords.
print("---general keywords---")
for model_label, retrieval_model in (("bm25", bm25_model), ("bm25 qe", bm25_qe_model)):
    print(f"{model_label} mAP score:")
    print(mean_average_precision(retrieval_model, general_keywords[1]))

---general keywords---
bm25 mAP score:
0.05674491285486938
bm25 qe mAP score:
0.1472279000440685


In [49]:
# Mean average precision of both fastText variants on the general keywords.
for model_label, retrieval_model in (("fasttext", fasttext_model), ("fasttext qe", fasttext_qe_model)):
    print(f"{model_label} mAP score:")
    print(mean_average_precision(retrieval_model, general_keywords[1]))

fasttext mAP score:
INFO:gensim.models.keyedvectors:precomputing L2-norms of word weight vectors
0.12721676819176342
fasttext qe mAP score:
INFO:gensim.models.keyedvectors:precomputing L2-norms of word weight vectors
0.16422939214650667


In [35]:
# Per-query comparison of the two BM25 variants; the trailing semicolon keeps
# the returned frame from being echoed as the cell result.
compare_models(
    model1=("bm25 model", bm25_model),
    model2=("bm25 qe model", bm25_qe_model),
    test_data=general_keywords[1],
);

Index(['query', 'average precision_x', 'average precision_y'], dtype='object')
Queries where bm25 model better than bm25 qe model


Unnamed: 0,query,average precision_x,average precision_y,difference
96,law,0.560085,0.002284,0.557801
198,advertising,0.666667,0.126907,0.53976
286,accounting,0.322548,0.085283,0.237266
236,traditional medicine,0.5,0.277778,0.222222
241,operating system,0.23599,0.014624,0.221366


Queries where bm25 qe model better than bm25 model


Unnamed: 0,query,average precision_x,average precision_y,difference
77,particle physics,0.080656,0.684261,-0.603605
118,seismology,0.026706,0.599036,-0.57233
61,computer science,0.026823,0.576932,-0.550109
49,orthodontics,0.0,0.500604,-0.500604
18,optometry,0.0,0.5,-0.5


In [51]:
# Per-query comparison of the two fastText variants; the trailing semicolon
# keeps the returned frame from being echoed as the cell result.
compare_models(
    model1=("fasttext model", fasttext_model),
    model2=("fasttext qe model", fasttext_qe_model),
    test_data=general_keywords[1],
);

Index(['query', 'average precision_x', 'average precision_y'], dtype='object')
Queries where fasttext model better than fasttext qe model


Unnamed: 0,query,average precision_x,average precision_y,difference
104,emergency medicine,1.0,0.058824,0.941176
198,advertising,0.958333,0.025799,0.932534
106,combinatorics,0.589147,0.121906,0.467242
159,humanities,0.507576,0.091429,0.416147
174,paleontology,0.507719,0.224305,0.283414


Queries where fasttext qe model better than fasttext model


Unnamed: 0,query,average precision_x,average precision_y,difference
197,ophthalmology,0.0,0.608187,-0.608187
57,quantum electrodynamics,0.015693,0.502188,-0.486496
122,information retrieval,0.112949,0.57722,-0.464271
75,condensed matter physics,0.013005,0.44095,-0.427945
157,zoology,0.170317,0.571755,-0.401438


Unnamed: 0,query,average precision_x,average precision_y,difference
0,pedagogy,0.190256,0.204198,-0.013943
1,discrete mathematics,0.054327,0.085763,-0.031435
2,computational chemistry,0.040560,0.326750,-0.286190
3,natural language processing,0.405128,0.552995,-0.147867
4,acoustics,0.029056,0.106793,-0.077736
...,...,...,...,...
288,immunology,0.347287,0.328205,0.019082
289,internal medicine,0.001493,0.100666,-0.099173
290,management,0.000363,0.000363,0.000000
291,architectural engineering,0.000029,0.179734,-0.179706


In [12]:
# Per-query comparison of the two "qe" models across retrieval families; the
# trailing semicolon keeps the returned frame from being echoed.
compare_models(
    model1=("bm25 qe model", bm25_qe_model),
    model2=("fasttext qe model", fasttext_qe_model),
    test_data=general_keywords[1],
);

INFO:gensim.models.keyedvectors:precomputing L2-norms of word weight vectors
Index(['query', 'average precision_x', 'average precision_y'], dtype='object')
INFO:numexpr.utils:Note: NumExpr detected 32 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
Queries where bm25 qe model better than fasttext qe model


Unnamed: 0,query,average precision_x,average precision_y,difference
61,computer science,0.571841,0.059548,0.512293
18,optometry,0.5,0.041667,0.458333
164,macroeconomics,0.335598,0.001976,0.333622
86,international trade,0.365993,0.041963,0.32403
49,orthodontics,0.500604,0.180751,0.319853


Queries where fasttext qe model better than bm25 qe model


Unnamed: 0,query,average precision_x,average precision_y,difference
247,statistics,0.062028,0.535666,-0.473638
103,water resource management,0.057382,0.439843,-0.382461
47,mining engineering,0.007003,0.389085,-0.382081
13,general surgery,0.071667,0.401108,-0.329442
249,cardiology,0.389133,0.679105,-0.289972


In [None]:
# Mean average precision of every model on the specific keywords.
# specific_keywords is a (label, entries) pair, so the entries at index 1 are
# passed, matching how general_keywords is used. Each score is printed
# explicitly because a bare expression in the middle of a cell is not displayed.
print("---specific keywords---")
print("bm25 mAP score:")
print(mean_average_precision(bm25_model, specific_keywords[1]))
print("bm25 qe mAP score:")
print(mean_average_precision(bm25_qe_model, specific_keywords[1]))
print("fasttext mAP score:")
print(mean_average_precision(fasttext_model, specific_keywords[1]))
print("fasttext qe mAP score:")
print(mean_average_precision(fasttext_qe_model, specific_keywords[1]))