# Evaluation of best model on test datasets

This notebook evaluates the final ensemble model on the test dataset to check whether the model generalizes to unseen data or has overfitted to the data used for tuning the parameters.

In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
import os
import sys
import pickle
import json
import logging
# Log INFO and above to both the file "ensemble.log" and the notebook's stdout.
logging.basicConfig(level=logging.INFO, handlers=[logging.FileHandler("ensemble.log"), logging.StreamHandler(sys.stdout)])

import pandas as pd
# Display settings: no column limit, no wrapping of wide frames, and long cell
# contents shown up to 1000 characters.
pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', 1000)
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

# Put the parent directory on sys.path (once) so the project-local imports
# below can be resolved from this notebook's location.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
# NOTE(review): the wildcard import hides which names come from `evaluation`;
# `evaluate_models` (used further down) is presumably one of them. Prefer
# explicit imports once the full set of used names is confirmed.
from evaluation import *
# NOTE(review): these preprocessing classes are not referenced directly in the
# visible cells; they may be needed so the pickled models can be unpickled —
# confirm before removing.
from preprocessing import Corpus, BasicPreprocessing, BigramPreprocessor, SpacyPreprocessor, StopWordPreprocessor
from retrieval_algorithms.ensemble_retrieval_algorithm import EnsembleRetrievalAlgorithm

In [2]:
# Load the keyword records from the JSON export.
# The encoding is pinned to UTF-8 (the standard encoding for JSON) instead of
# relying on open()'s locale-dependent default, which differs between systems.
with open("../../data/kit_expert_2019_all_keywords.json", "r", encoding="utf-8") as keywords_file:
    keywords = json.load(keywords_file)

In [3]:
# Partition the keyword records by their "level" field:
#   - level <= 1                          -> general keywords
#   - level >= 2 with >= 10 paper ids     -> specific keywords
general_keywords = [entry for entry in keywords if entry["level"] <= 1]
specific_keywords = [
    entry
    for entry in keywords
    if entry["level"] >= 2 and len(entry["paper_ids"]) >= 10
]

# The first 80% of each list (in file order) forms the validation part, the
# remaining 20% the test part.
validation_fraction = 0.8
general_cut = int(len(general_keywords) * validation_fraction)
specific_cut = int(len(specific_keywords) * validation_fraction)

# Each dataset is a (label, keyword list) pair.
general_keywords_val = ("general keywords validation", general_keywords[:general_cut])
specific_keywords_val = ("specific keywords validation", specific_keywords[:specific_cut])
general_keywords_test = ("general keywords test", general_keywords[general_cut:])
specific_keywords_test = ("specific keywords test", specific_keywords[specific_cut:])

In [4]:
# Locations of the pickled models, relative to this notebook's working directory.
# The ensemble model file is listed first, followed by the two model files that
# are loaded alongside it.
ensemble_file_path = "../../data/models/ensemble_model.model"
bm25_file_path = "../../data/models/tfidf/bm25_oqe.model"
sent2vec_file_path = "../../data/models/sent2vec/sent2vec_oqe.model"

In [20]:
def _load_pickled(path):
    """Return the object stored in the pickle file at ``path``.

    pickle can execute arbitrary code while loading, so this must only be
    pointed at trusted, locally produced model files.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


best_bm25_model = _load_pickled(bm25_file_path)
best_sent2vec_model = _load_pickled(sent2vec_file_path)
best_ensemble_model = _load_pickled(ensemble_file_path)

# Override the attributes of the unpickled ensemble: plug in the two freshly
# loaded models and set the weight explicitly.
# NOTE(review): 0.04 is presumably the tuned mixing weight between model1 and
# model2 — confirm against EnsembleRetrievalAlgorithm and the tuning notebook.
best_ensemble_model.weight = 0.04
best_ensemble_model.model1 = best_bm25_model
best_ensemble_model.model2 = best_sent2vec_model

In [None]:
# Models to evaluate, one tuple per model: (display name, model object, third field).
# NOTE(review): the meaning of the third tuple element (None here) is defined by
# evaluate_models — confirm there.
best_models = [("Best ensemble model", best_ensemble_model, None)]

# Run the same evaluation twice: once on the validation splits and once on the
# test splits, so the two result tables can be compared.
validation_datasets = [general_keywords_val, specific_keywords_val]
test_datasets = [general_keywords_test, specific_keywords_test]

best_models_val_results = evaluate_models(best_models, validation_datasets, n_jobs=1)
best_models_test_results = evaluate_models(best_models, test_datasets, n_jobs=1)

In [24]:
# Display the metrics table for the validation splits (rendered as the cell output).
best_models_val_results

Unnamed: 0_level_0,general keywords validation,general keywords validation,general keywords validation,general keywords validation,general keywords validation,general keywords validation,general keywords validation,general keywords validation,general keywords validation,general keywords validation,general keywords validation,general keywords validation,specific keywords validation,specific keywords validation,specific keywords validation,specific keywords validation,specific keywords validation,specific keywords validation,specific keywords validation,specific keywords validation,specific keywords validation,specific keywords validation,specific keywords validation,specific keywords validation
Unnamed: 0_level_1,p@5,p@5,p@10,p@10,p@20,p@20,R-prec,R-prec,mAP,mAP,bpref,bpref,p@5,p@5,p@10,p@10,p@20,p@20,R-prec,R-prec,mAP,mAP,bpref,bpref
Unnamed: 0_level_2,avg,err,avg,err,avg,err,avg,err,avg,err,avg,err,avg,err,avg,err,avg,err,avg,err,avg,err,avg,err
Best ensemble model,0.383,0.04,0.364,0.036,0.333,0.033,0.226,0.02,0.183,0.019,0.177,0.018,0.737,0.007,0.682,0.007,0.578,0.006,0.569,0.006,0.583,0.006,0.564,0.007


In [37]:
# Display the metrics table for the test splits (rendered as the cell output).
best_models_test_results


Unnamed: 0_level_0,general keywords test,general keywords test,general keywords test,general keywords test,general keywords test,general keywords test,general keywords test,general keywords test,general keywords test,general keywords test,general keywords test,general keywords test,specific keywords test,specific keywords test,specific keywords test,specific keywords test,specific keywords test,specific keywords test,specific keywords test,specific keywords test,specific keywords test,specific keywords test,specific keywords test,specific keywords test
Unnamed: 0_level_1,p@5,p@5,p@10,p@10,p@20,p@20,R-prec,R-prec,mAP,mAP,bpref,bpref,p@5,p@5,p@10,p@10,p@20,p@20,R-prec,R-prec,mAP,mAP,bpref,bpref
Unnamed: 0_level_2,avg,err,avg,err,avg,err,avg,err,avg,err,avg,err,avg,err,avg,err,avg,err,avg,err,avg,err,avg,err
Best ensemble model,0.407,0.084,0.39,0.073,0.366,0.07,0.26,0.045,0.23,0.048,0.218,0.046,0.737,0.014,0.687,0.013,0.587,0.013,0.575,0.011,0.587,0.012,0.568,0.013


In [26]:
# Persist both result tables as CSV files.
# to_csv does not create missing parent directories, so make sure the results
# folder exists first; otherwise a fresh checkout would fail here only after
# the evaluation above has already been run.
results_dir = "../../data/results"
os.makedirs(results_dir, exist_ok=True)
best_models_val_results.to_csv(f"{results_dir}/best_models_val_results.csv")
best_models_test_results.to_csv(f"{results_dir}/best_models_test_results.csv")
