# TF-IDF Evaluation

In [1]:
# Reload edited project modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2
import json
import sys
import os
import pickle
import logging
# Send INFO-level log records to stdout so they appear in the cell output.
logging.basicConfig(level=logging.INFO, stream=sys.stdout)

import pandas as pd
# Display settings: show every column, do not wrap wide frames, allow long cell text.
pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', 1000)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()


from tqdm.notebook import tqdm
import warnings
# Register tqdm's pandas integration; warnings raised while doing so are silenced.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    tqdm.pandas()

# Put the parent directory on sys.path so the project modules below can be imported.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
# NOTE(review): star import hides where names come from; train_evaluate_models and
# evaluate_model used further down presumably come from this module — confirm and
# consider importing them explicitly.
from evaluation import *
from preprocessing import Corpus, BasicPreprocessing, BigramPreprocessor, SpacyPreprocessor, StopWordPreprocessor
from retrieval_algorithms import TfIdfRetrievalAlgorithm

## Load corpus using different preprocessing pipelines

In [3]:
# Raw paper corpus shared by every preprocessing pipeline below.
base_file = "../../data/kit_expert_2019_all_papers.csv"

# Baseline pipeline: basic preprocessing plus stop-word handling.
p = [BasicPreprocessing(), StopWordPreprocessor()]
papers_basic = Corpus(base_file, p)

# Lemmatized variants of the corpus. They have to be defined here: the
# lemmatization, export and pseudo-relevance-feedback cells further down read
# these names, and a fresh-kernel run fails with NameError without them.
# NOTE(review): load_from_cache=True presumably reuses a previously preprocessed
# corpus and n_jobs=16 presumably sets the worker count — confirm against Corpus.
p = [BasicPreprocessing(), StopWordPreprocessor(), SpacyPreprocessor(lemmatization="all")]
papers_basic_lemmatization_all = Corpus(base_file, p, load_from_cache=True, n_jobs=16)

p = [BasicPreprocessing(), StopWordPreprocessor(), SpacyPreprocessor(lemmatization="nouns")]
papers_basic_lemmatization_nouns = Corpus(base_file, p, load_from_cache=True, n_jobs=16)

INFO:preprocessing.pipeline:Start preprocessing pipeline "basic_NoStopWords" for file ../../data/kit_expert_2019_all_papers.csv.
INFO:preprocessing.pipeline:Loaded cached preprocessed corpus from ../../data/kit_expert_2019_all_papers_basic_NoStopWords


## Load keywords to use as test data

In [4]:
# Load the keyword records (each carries at least a "level" and a "paper_ids"
# entry, as used by the split below). The encoding is stated explicitly so the
# JSON file is decoded as UTF-8 regardless of the platform's locale default.
with open("../../data/kit_expert_2019_all_keywords.json", "r", encoding="utf-8") as file:
    keywords = json.load(file)

In [5]:
# "General" keywords have level <= 1; "specific" keywords have level >= 2 and
# at least 10 associated papers.
general_keywords = [kw for kw in keywords if kw["level"] <= 1]
specific_keywords = [
    kw for kw in keywords
    if kw["level"] >= 2 and len(kw["paper_ids"]) >= 10
]

# The first 80% of each list (in file order) is the validation split,
# the remainder is the test split.
general_split = int(len(general_keywords) * 0.8)
specific_split = int(len(specific_keywords) * 0.8)

# Each dataset is a (display name, keyword list) pair.
general_keywords_val = ("general keywords validation", general_keywords[:general_split])
specific_keywords_val = ("specific keywords validation", specific_keywords[:specific_split])
general_keywords_test = ("general keywords test", general_keywords[general_split:])
specific_keywords_test = ("specific keywords test", specific_keywords[specific_split:])

In [6]:
# Show how many keywords ended up in the general and the specific group.
len(general_keywords), len(specific_keywords)

(300, 7972)

## Test simple tf-idf models on unigrams

In [None]:
# All four combinations of idf weighting and sublinear tf scaling on unigrams.
# Columns: display name, use_idf, sublinear_tf.
unigram_tfidf_variants = [
    ("tf linear", False, False),
    ("tf sublinear", False, True),
    ("tf-idf linear", True, False),
    ("tf-idf sublinear", True, True),
]

# Each entry is a (display name, retrieval algorithm, corpus) triple.
unigram_tfidf_models = [
    (
        label,
        TfIdfRetrievalAlgorithm(max_ngram=1, use_idf=with_idf, sublinear_tf=sublinear, min_df=2),
        papers_basic,
    )
    for label, with_idf, sublinear in unigram_tfidf_variants
]

In [None]:
# Train and evaluate every unigram model on both validation keyword sets.
unigram_tfidf_results = train_evaluate_models(
    unigram_tfidf_models,
    [general_keywords_val, specific_keywords_val],
    n_jobs=4,
)

In [67]:
# Persist the unigram scores, then render the table as the cell output.
unigram_results_csv = "../../data/results/tfidf_unigram_results.csv"
unigram_tfidf_results.to_csv(unigram_results_csv)
print("Scores for unigram tfidf models:")
unigram_tfidf_results

Scores for unigram tfidf models:


Unnamed: 0_level_0,general keywords validation,general keywords validation,general keywords validation,general keywords validation,general keywords validation,general keywords validation,general keywords validation,general keywords validation,general keywords validation,general keywords validation,general keywords validation,general keywords validation,specific keywords validation,specific keywords validation,specific keywords validation,specific keywords validation,specific keywords validation,specific keywords validation,specific keywords validation,specific keywords validation,specific keywords validation,specific keywords validation,specific keywords validation,specific keywords validation
Unnamed: 0_level_1,p@5,p@5,p@10,p@10,p@20,p@20,R-prec,R-prec,mAP,mAP,bpref,bpref,p@5,p@5,p@10,p@10,p@20,p@20,R-prec,R-prec,mAP,mAP,bpref,bpref
Unnamed: 0_level_2,avg,err,avg,err,avg,err,avg,err,avg,err,avg,err,avg,err,avg,err,avg,err,avg,err,avg,err,avg,err
tf linear,0.253,0.04,0.22,0.034,0.19,0.03,0.07,0.011,0.038,0.008,0.054,0.009,0.482,0.009,0.423,0.008,0.35,0.007,0.314,0.007,0.306,0.007,0.292,0.007
tf sublinear,0.277,0.041,0.236,0.035,0.2,0.031,0.072,0.012,0.04,0.008,0.056,0.01,0.588,0.009,0.518,0.008,0.426,0.007,0.386,0.006,0.375,0.007,0.362,0.007
tf-idf linear,0.248,0.039,0.215,0.033,0.185,0.029,0.071,0.011,0.037,0.008,0.054,0.009,0.529,0.009,0.471,0.008,0.392,0.007,0.356,0.006,0.348,0.007,0.331,0.007
tf-idf sublinear,0.267,0.04,0.231,0.034,0.194,0.03,0.071,0.012,0.039,0.008,0.055,0.009,0.601,0.009,0.538,0.008,0.451,0.007,0.416,0.006,0.411,0.007,0.393,0.007


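Results:
- Performs very poorly on general keywords
- Performs acceptably on specific keywords
- Using inverse document frequency improves results on specific keywords (no significant change on general keywords)
- Using sublinear scaling of term frequency improves results, most clearly on specific keywords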

## Test tf-idf models on n-grams

In [None]:
# One model per maximum n-gram length, all on the basic corpus.
# NOTE(review): the labels say "tf-idf sublinear" but use_idf / sublinear_tf are
# not passed, so this relies on TfIdfRetrievalAlgorithm's defaults — confirm
# they match the label.
ngram_tfidf_models = [
    (f"tf-idf sublinear {n}-gram", TfIdfRetrievalAlgorithm(max_ngram=n, min_df=2), papers_basic)
    for n in (2, 3, 4)
]

# Train and evaluate the n-gram models on both validation keyword sets.
ngram_tfidf_results = train_evaluate_models(
    ngram_tfidf_models,
    [general_keywords_val, specific_keywords_val],
    n_jobs=3,
)

In [68]:
# Persist the n-gram scores, then render the table as the cell output.
ngram_results_csv = "../../data/results/ngram_tfidf_results.csv"
ngram_tfidf_results.to_csv(ngram_results_csv)
print("Scores for ngram tfidf models:")
ngram_tfidf_results

Scores for ngram tfidf models:


Unnamed: 0_level_0,general keywords validation,general keywords validation,general keywords validation,general keywords validation,general keywords validation,general keywords validation,general keywords validation,general keywords validation,general keywords validation,general keywords validation,general keywords validation,general keywords validation,specific keywords validation,specific keywords validation,specific keywords validation,specific keywords validation,specific keywords validation,specific keywords validation,specific keywords validation,specific keywords validation,specific keywords validation,specific keywords validation,specific keywords validation,specific keywords validation
Unnamed: 0_level_1,p@5,p@5,p@10,p@10,p@20,p@20,R-prec,R-prec,mAP,mAP,bpref,bpref,p@5,p@5,p@10,p@10,p@20,p@20,R-prec,R-prec,mAP,mAP,bpref,bpref
Unnamed: 0_level_2,avg,err,avg,err,avg,err,avg,err,avg,err,avg,err,avg,err,avg,err,avg,err,avg,err,avg,err,avg,err
tf-idf sublinear 2-gram,0.299,0.04,0.265,0.036,0.213,0.031,0.076,0.012,0.043,0.008,0.06,0.01,0.74,0.008,0.674,0.007,0.56,0.007,0.534,0.006,0.541,0.007,0.527,0.007
tf-idf sublinear 3-gram,0.299,0.04,0.26,0.035,0.212,0.031,0.076,0.012,0.043,0.008,0.06,0.01,0.742,0.008,0.673,0.007,0.559,0.007,0.534,0.006,0.54,0.007,0.527,0.007
tf-idf sublinear 4-gram,0.298,0.04,0.258,0.035,0.211,0.031,0.076,0.012,0.042,0.008,0.059,0.01,0.741,0.008,0.671,0.007,0.556,0.007,0.53,0.006,0.537,0.007,0.523,0.007


Results:
- No significant change for general keywords
- Bigrams provide great improvement for specific keywords
- 3 and 4-grams do not lead to significant further improvements

## Test lemmatization

In [None]:
# Same bigram configuration on each lemmatized corpus. Both corpus variables
# must have been created by the corpus-loading cell before this cell runs.
# The display names (including their spelling) are kept as they appear in the
# stored result tables.
lemmatization_corpora = [
    ("tf-idf 2-gram lematization all", papers_basic_lemmatization_all),
    ("tf-idf 2-gram lematization nouns", papers_basic_lemmatization_nouns),
]

# Each entry is a (display name, retrieval algorithm, corpus) triple.
lemmatization_tfidf_models = [
    (label, TfIdfRetrievalAlgorithm(max_ngram=2, min_df=2), corpus)
    for label, corpus in lemmatization_corpora
]

In [None]:
# Train and evaluate both lemmatization models on the validation keyword sets.
lemmatization_tfidf_results = train_evaluate_models(
    lemmatization_tfidf_models,
    [general_keywords_val, specific_keywords_val],
    n_jobs=2,
)

In [69]:
# Persist the lemmatization scores, then render the table as the cell output.
# Fix: the frame was previously referenced under a garbled, undefined name,
# which raised NameError before the CSV could be written.
lemmatization_tfidf_results.to_csv("../../data/results/lemmatization_tfidf_results.csv")
print("Scores for lemmatization tfidf models:")
lemmatization_tfidf_results

Scores for lemmatization tfidf models:


Unnamed: 0_level_0,general keywords validation,general keywords validation,general keywords validation,general keywords validation,general keywords validation,general keywords validation,general keywords validation,general keywords validation,general keywords validation,general keywords validation,general keywords validation,general keywords validation,specific keywords validation,specific keywords validation,specific keywords validation,specific keywords validation,specific keywords validation,specific keywords validation,specific keywords validation,specific keywords validation,specific keywords validation,specific keywords validation,specific keywords validation,specific keywords validation
Unnamed: 0_level_1,p@5,p@5,p@10,p@10,p@20,p@20,R-prec,R-prec,mAP,mAP,bpref,bpref,p@5,p@5,p@10,p@10,p@20,p@20,R-prec,R-prec,mAP,mAP,bpref,bpref
Unnamed: 0_level_2,avg,err,avg,err,avg,err,avg,err,avg,err,avg,err,avg,err,avg,err,avg,err,avg,err,avg,err,avg,err
tf-idf 2-gram lematization all,0.295,0.039,0.253,0.034,0.206,0.03,0.079,0.012,0.044,0.008,0.061,0.01,0.716,0.008,0.656,0.007,0.551,0.007,0.528,0.006,0.537,0.007,0.521,0.007
tf-idf 2-gram lematization nouns,0.298,0.039,0.256,0.034,0.213,0.031,0.081,0.013,0.045,0.008,0.062,0.01,0.731,0.008,0.669,0.007,0.563,0.007,0.54,0.006,0.551,0.006,0.534,0.007


## Export best model

In [5]:
# Selected configuration: bigrams, idf weighting and sublinear tf, prepared on
# the noun-lemmatized corpus (which must already be loaded).
best_tfidf_model = TfIdfRetrievalAlgorithm(
    max_ngram=2,
    use_idf=True,
    sublinear_tf=True,
    min_df=2,
)
best_tfidf_model.prepare(papers_basic_lemmatization_nouns)

In [8]:
# Serialize the prepared model with pickle so it can be loaded elsewhere.
file_path = "../../data/models/tfidf/tfidf_lemmatized_bigram.model"
with open(file_path, "wb") as model_file:
    pickle.dump(best_tfidf_model, model_file)

## Pseudo relevance feedback

In [6]:
from retrieval_algorithms.prf_wrapper import PRFWrapper

In [8]:
# Wrap the best tf-idf model for pseudo relevance feedback.
# NOTE(review): the positional values 10, 10 and 0.8 are PRFWrapper
# hyperparameters whose meaning is not visible here — confirm against
# PRFWrapper's signature and consider passing them by keyword.
prf = PRFWrapper(best_tfidf_model, 10, 10, 0.8)

In [9]:
# Prepare the PRF wrapper on the same noun-lemmatized corpus used for the best tf-idf model.
prf.prepare(papers_basic_lemmatization_nouns)

In [13]:
# Score the PRF-wrapped model on both validation keyword sets.
prf_result = evaluate_model(
    prf,
    [general_keywords_val, specific_keywords_val],
)

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=6617.0), HTML(value='')), layout=Layout(d…




In [None]:
# Score the plain best tf-idf model on the same validation sets as the PRF
# wrapper, presumably so the two results can be compared.
tfidf_result = evaluate_model(best_tfidf_model, [general_keywords_val, specific_keywords_val])
# Keep the original, misspelled name as an alias so any later use still works.
tfif_result = tfidf_result