# TF-IDF Evaluation

In [5]:
%load_ext autoreload
%autoreload 2
import json
import sys
import os
import pickle
import logging
logging.basicConfig(level=logging.INFO, stream=sys.stdout)

import pandas as pd
pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', 1000)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()


from tqdm.notebook import tqdm
import warnings
# Register tqdm's pandas integration. Every warning raised while doing so is
# ignored; the "ignore" filter only applies inside this `with` block because
# catch_warnings() restores the previous filters on exit.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    tqdm.pandas()

# Put the parent of the current working directory on the import path
# (presumably so the project-local modules imported right after can be found).
# Guarded so that re-running the cell does not add the entry twice.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
from evaluation import *
from preprocessing import Corpus, BasicPreprocessing, BigramPreprocessor, SpacyPreprocessor, StopWordPreprocessor
from retrieval_algorithms import TfIdfRetrievalAlgorithm

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### Load corpus using different preprocessing pipelines

In [6]:
# Raw paper file shared by all three preprocessing pipelines below.
base_file =  "../../data/kit_expert_2019_all_papers.csv"

# Pipeline 1: basic preprocessing followed by the stop-word preprocessor.
# NOTE(review): unlike the two calls below, load_from_cache and n_jobs are not
# passed here, so Corpus' defaults apply — confirm that is intended.
p = [BasicPreprocessing(), StopWordPreprocessor()]
papers_basic = Corpus(base_file, p)

# Pipeline 2: pipeline 1 plus the spaCy preprocessor with lemmatization="all".
# `p` is a scratch name that is rebound for each pipeline.
p = [BasicPreprocessing(), StopWordPreprocessor(), SpacyPreprocessor(lemmatization="all")]
papers_basic_lemmatization_all = Corpus(base_file, p, load_from_cache=True, n_jobs=16)

# Pipeline 3: pipeline 1 plus the spaCy preprocessor with lemmatization="nouns".
# NOTE(review): n_jobs=16 is hard-coded — presumably tuned to the original machine.
p = [BasicPreprocessing(), StopWordPreprocessor(), SpacyPreprocessor(lemmatization="nouns")]
papers_basic_lemmatization_nouns = Corpus(base_file, p, load_from_cache=True, n_jobs=16)

INFO:preprocessing.pipeline:Start preprocessing pipeline "basic_NoStopWords" for file ../../data/kit_expert_2019_all_papers.csv.
INFO:preprocessing.pipeline:Loaded cached preprocessed corpus from ../../data/kit_expert_2019_all_papers_basic_NoStopWords
INFO:preprocessing.pipeline:Start preprocessing pipeline "basic_NoStopWords_spacy_lemmatization_all" for file ../../data/kit_expert_2019_all_papers.csv.
INFO:preprocessing.pipeline:Loaded cached preprocessed corpus from ../../data/kit_expert_2019_all_papers_basic_NoStopWords_spacy_lemmatization_all
INFO:preprocessing.pipeline:Start preprocessing pipeline "basic_NoStopWords_spacy_lemmatization_nouns" for file ../../data/kit_expert_2019_all_papers.csv.
INFO:preprocessing.pipeline:Loaded cached preprocessed corpus from ../../data/kit_expert_2019_all_papers_basic_NoStopWords_spacy_lemmatization_nouns


#### Load keywords to use as test data

In [9]:
# Load the keyword records that serve as test data for the evaluation.
# The encoding is given explicitly: JSON text is UTF-8, whereas open() without
# an encoding falls back to the platform's locale encoding and could mis-decode
# non-ASCII keywords on some systems.
with open("../../data/kit_expert_2019_all_keywords.json", "r", encoding="utf-8") as file:
    keywords = json.load(file)

In [10]:
def _is_general(keyword):
    """Return True when the keyword's "level" is at most 1."""
    return keyword["level"] <= 1


def _is_specific(keyword):
    """Return True when the keyword's "level" is at least 2 and it has 10 or more paper ids."""
    return keyword["level"] >= 2 and len(keyword["paper_ids"]) >= 10


# Each test set is a (name, keyword records) pair. The names are kept exactly
# as written because they appear as column groups in the result tables.
general_keywords = ("general keywords", [kw for kw in keywords if _is_general(kw)])
# Only the first 5000 matching keywords (in file order) are kept.
specific_keywords = ("specific_keywords", [kw for kw in keywords if _is_specific(kw)][:5000])

#### Test simple tf-idf models on unigrams

In [11]:
# Unigram models covering every combination of idf weighting (on/off) and
# sublinear term-frequency scaling (on/off), all evaluated on the basic corpus.
# Each entry is a (label, algorithm, corpus) triple.
unigram_tfidf_models = [
    (
        label,
        TfIdfRetrievalAlgorithm(max_ngram=1, use_idf=use_idf, sublinear_tf=sublinear_tf, min_df=2),
        papers_basic,
    )
    for label, use_idf, sublinear_tf in [
        ("tf linear", False, False),
        ("tf sublinear", False, True),
        ("tf-idf linear", True, False),
        ("tf-idf sublinear", True, True),
    ]
]

In [None]:
# Evaluate every unigram model against both keyword test sets.
unigram_tfidf_results = train_evaluate_models(
    unigram_tfidf_models,
    [general_keywords, specific_keywords],
    n_jobs=4,
)

In [21]:
# Print a header, then show the results table via the notebook's display of the
# cell's last expression.
# NOTE(review): the table reports p@5, p@10, p@20, R-prec, mAP and bpref, so the
# "mAP scores" header undersells its content — consider rewording.
print("mAP scores for unigram tfidf models:")
unigram_tfidf_results

mAP scores for unigram tfidf models:


Unnamed: 0_level_0,general keywords,general keywords,general keywords,general keywords,general keywords,general keywords,specific_keywords,specific_keywords,specific_keywords,specific_keywords,specific_keywords,specific_keywords
Unnamed: 0_level_1,p@5,p@10,p@20,R-prec,mAP,bpref,p@5,p@10,p@20,R-prec,mAP,bpref
tf linear,0.261,0.221,0.191,0.076,0.042,0.058,0.482,0.423,0.351,0.313,0.304,0.29
tf sublinear,0.283,0.241,0.203,0.077,0.044,0.06,0.586,0.517,0.425,0.384,0.372,0.359
tf-idf linear,0.258,0.224,0.191,0.077,0.042,0.059,0.533,0.473,0.394,0.355,0.348,0.33
tf-idf sublinear,0.275,0.236,0.199,0.077,0.043,0.06,0.601,0.54,0.453,0.416,0.411,0.393


Results:
- Performs very poorly on general keywords (mAP ≈ 0.04 for all variants)
- Performs acceptably on specific keywords (mAP between 0.30 and 0.41)
- Use of inverse document frequency improves results on specific keywords
- Use of sublinear scaling of term frequency improves results

#### Test tf-idf models on n-grams

In [None]:
# Same model with the maximum n-gram length raised from 2 to 4, on the basic corpus.
# NOTE(review): use_idf / sublinear_tf are not passed, so these models rely on
# TfIdfRetrievalAlgorithm's defaults; the labels presume idf + sublinear tf — confirm.
ngram_tfidf_models = [
    (label, TfIdfRetrievalAlgorithm(max_ngram=max_ngram, min_df=2), papers_basic)
    for label, max_ngram in [
        ("tf-idf sublinear 2-gram", 2),
        ("tf-idf sublinear 3-gram", 3),
        ("tf-idf sublinear 4-gram", 4),
    ]
]
# Evaluate the n-gram models against both keyword test sets.
ngram_tfidf_results = train_evaluate_models(
    ngram_tfidf_models,
    [general_keywords, specific_keywords],
    n_jobs=3,
)

In [22]:
# Print a header, then show the results table via the notebook's display of the
# cell's last expression.
# NOTE(review): the table holds six metrics per test set, not only mAP.
print("mAP scores for ngram tfidf models:")
ngram_tfidf_results

mAP scores for ngram tfidf models:


Unnamed: 0_level_0,general keywords,general keywords,general keywords,general keywords,general keywords,general keywords,specific_keywords,specific_keywords,specific_keywords,specific_keywords,specific_keywords,specific_keywords
Unnamed: 0_level_1,p@5,p@10,p@20,R-prec,mAP,bpref,p@5,p@10,p@20,R-prec,mAP,bpref
tf-idf sublinear 2-gram,0.304,0.27,0.22,0.082,0.048,0.065,0.74,0.672,0.559,0.532,0.538,0.524
tf-idf sublinear 3-gram,0.304,0.264,0.219,0.083,0.048,0.065,0.742,0.672,0.558,0.531,0.537,0.524
tf-idf sublinear 4-gram,0.302,0.264,0.217,0.083,0.048,0.065,0.741,0.67,0.555,0.528,0.534,0.52


Results:
- No significant change for general keywords
- Bigrams provide a large improvement for specific keywords (mAP 0.411 → 0.538)
- 3-grams and 4-grams do not lead to significant further improvements

In [None]:
# Identical bigram configuration applied to the two lemmatized corpora, so the
# corpus is the only thing that differs between the entries.
# NOTE(review): the labels spell "lematization"; they are kept verbatim because
# they are used as row names in the results table.
lemmatization_tfidf_models = [
    (label, TfIdfRetrievalAlgorithm(max_ngram=2, min_df=2), corpus)
    for label, corpus in [
        ("tf-idf 2-gram lematization all", papers_basic_lemmatization_all),
        ("tf-idf 2-gram lematization nouns", papers_basic_lemmatization_nouns),
    ]
]

In [None]:
# Evaluate both lemmatization variants against both keyword test sets.
lemmatization_tfidf_results = train_evaluate_models(
    lemmatization_tfidf_models,
    [general_keywords, specific_keywords],
    n_jobs=2,
)

In [23]:
# Print a header, then show the results table via the notebook's display of the
# cell's last expression.
# NOTE(review): the table holds six metrics per test set, not only mAP.
print("mAP scores for lemmatization tfidf models:")
lemmatization_tfidf_results

mAP scores for lemmatization tfidf models:


Unnamed: 0_level_0,general keywords,general keywords,general keywords,general keywords,general keywords,general keywords,specific_keywords,specific_keywords,specific_keywords,specific_keywords,specific_keywords,specific_keywords
Unnamed: 0_level_1,p@5,p@10,p@20,R-prec,mAP,bpref,p@5,p@10,p@20,R-prec,mAP,bpref
tf-idf 2-gram lematization all,0.3,0.258,0.211,0.085,0.048,0.066,0.714,0.653,0.55,0.526,0.534,0.518
tf-idf 2-gram lematization nouns,0.304,0.264,0.218,0.087,0.05,0.067,0.73,0.667,0.562,0.539,0.549,0.532


#### Export best model

In [None]:
# Rebuild the configuration chosen for export: bigrams with idf weighting and
# sublinear tf, all spelled out explicitly rather than left to defaults.
best_tfidf_model = TfIdfRetrievalAlgorithm(
    max_ngram=2,
    use_idf=True,
    sublinear_tf=True,
    min_df=2,
)
# Run prepare() on the noun-lemmatized corpus (presumably this fits the model — confirm).
best_tfidf_model.prepare(papers_basic_lemmatization_nouns)

In [None]:
# Persist the prepared model with pickle.
# NOTE(review): the input data is read from "../../data" while the model is
# written below "../data" — confirm the target directory is the intended one.
file_path = "../data/models/tfidf/tfidf_lemmatized_bigram.model"
# Create the target directory first; otherwise open() raises FileNotFoundError
# when it does not exist yet (e.g. on a fresh checkout).
os.makedirs(os.path.dirname(file_path), exist_ok=True)
with open(file_path, "wb") as file:
    pickle.dump(best_tfidf_model, file)