In [1]:
# Reload imported modules automatically before each cell runs, so edits to the
# project modules imported at the bottom of this cell are picked up live.
%load_ext autoreload
%autoreload 2
import json
from gensim.summarization.bm25 import BM25
from gensim.models.phrases import Phrases, Phraser
import pandas as pd
# Display settings: show every column, keep wide frames on one line, and allow
# up to 1000 characters per cell before truncating.
pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', 1000)
import numpy as np
from multiprocessing import Pool
from functools import partial
import matplotlib.pyplot as plt
import spacy
import seaborn as sns
# Apply seaborn's default theme to all matplotlib figures.
sns.set()


from tqdm.notebook import tqdm
import warnings
# Register tqdm's progress_* helpers on pandas objects; any warning raised
# while doing so is suppressed.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    tqdm.pandas()

import sys
import logging
# Send INFO-level log records to stdout so they appear in the cell output.
logging.basicConfig(level=logging.INFO, stream=sys.stdout)


from tfidf_retrieval import TfidfRetrieval
# NOTE(review): wildcard import. `train_evaluate_models`, used by later cells,
# is not imported by name anywhere, so it presumably comes from here - confirm
# and consider importing it explicitly.
from evaluation import *
from preprocessing import apply_pipeline, Corpus, BasicPreprocessing, BigramPreprocessor, SpacyPreprocessor

In [3]:
# One CSV of papers, preprocessed four different ways. Each Corpus receives its
# own freshly constructed list of pipeline steps.
base_file = "../data/kit_expert_2017_papers.csv"

# 1) basic preprocessing only
p = [
    BasicPreprocessing(),
]
papers_basic = Corpus(base_file, p)

# 2) basic preprocessing followed by the bigram step
p = [
    BasicPreprocessing(),
    BigramPreprocessor(),
]
papers_basic_bigram = Corpus(base_file, p)

# 3) basic preprocessing followed by spaCy lemmatization
# NOTE(review): only this corpus passes load_from_cache explicitly - confirm
# whether the flag is needed here or should be applied uniformly.
p = [
    BasicPreprocessing(),
    SpacyPreprocessor(lemmatization=True),
]
papers_basic_lemmatization = Corpus(base_file, p, load_from_cache=True)

# 4) basic preprocessing, spaCy lemmatization, then the bigram step
p = [
    BasicPreprocessing(),
    SpacyPreprocessor(lemmatization=True),
    BigramPreprocessor(),
]
papers_basic_lemmatization_bigram = Corpus(base_file, p)


INFO:preprocessing.pipeline:Start preprocessing pipeline "basic" for file ../data/kit_expert_2017_papers.csv.
INFO:preprocessing.pipeline:Loaded cached preprocessed corpus from ../data/kit_expert_2017_papers_basic
INFO:preprocessing.pipeline:Start preprocessing pipeline "basic_bigrams" for file ../data/kit_expert_2017_papers.csv.
INFO:preprocessing.pipeline:Loaded cached preprocessed corpus from ../data/kit_expert_2017_papers_basic_bigrams
INFO:preprocessing.pipeline:Start preprocessing pipeline "basic_spacy_lemmatization" for file ../data/kit_expert_2017_papers.csv.
INFO:preprocessing.pipeline:Loaded cached preprocessed corpus from ../data/kit_expert_2017_papers_basic_spacy_lemmatization
INFO:preprocessing.pipeline:Start preprocessing pipeline "basic_spacy_lemmatization_bigrams" for file ../data/kit_expert_2017_papers.csv.
INFO:preprocessing.pipeline:Loaded cached preprocessed corpus from ../data/kit_expert_2017_papers_basic_spacy_lemmatization_bigrams


In [4]:
# Load the keyword entries from JSON. The encoding is pinned to UTF-8 (the
# standard JSON encoding) so parsing does not depend on the platform's
# locale-specific default text encoding.
with open("../data/kit_expert_2017_keywords.json", "r", encoding="utf-8") as file:
    keywords = json.load(file)

In [7]:
# Each keyword set is a (label, entries) pair, split on the entry's "level":
# level <= 1 goes to the general set, level >= 2 to the specific set.
general_keywords = (
    "general keywords",
    [entry for entry in keywords if entry["level"] <= 1],
)
# Only the first 1000 matching entries are kept for the specific set.
# NOTE(review): this label uses an underscore, unlike "general keywords" -
# confirm that is intended.
specific_keywords = (
    "specific_keywords",
    [entry for entry in keywords if entry["level"] >= 2][:1000],
)

In [None]:
# Unigram configurations as (label, model factory) pairs: every combination of
# use_idf and sublinear_tf on the basic corpus.
unigram_tfidf_models = [
    (
        label,
        partial(
            TfidfRetrieval,
            corpus=papers_basic,
            max_ngrams=1,
            use_idf=with_idf,
            sublinear_tf=with_sublinear_tf,
        ),
    )
    for label, with_idf, with_sublinear_tf in (
        ("tf linear", False, False),
        ("tf sublinear", False, True),
        ("tf-idf linear", True, False),
        ("tf-idf sublinear", True, True),
    )
]

In [None]:
%%time
unigram_tfidf_results = train_evaluate_models(unigram_tfidf_models, [general_keywords, specific_keywords])

In [None]:
# Print a caption, then show the results as the cell's output. The bare
# expression must remain the last statement for it to be displayed.
print("mAP scores for unigram tfidf models:")
unigram_tfidf_results

In [None]:
# Sublinear tf-idf on the basic corpus with max_ngrams swept from 2 to 6,
# as (label, model factory) pairs.
ngram_tfidf_models = [
    (
        f"tf-idf sublinear {max_n}-gram",
        partial(
            TfidfRetrieval,
            corpus=papers_basic,
            max_ngrams=max_n,
            use_idf=True,
            sublinear_tf=True,
        ),
    )
    for max_n in range(2, 7)
]

In [None]:
%%time
ngram_tfidf_results = train_evaluate_models(ngram_tfidf_models, [general_keywords, specific_keywords])

In [None]:
# Print a caption, then show the results as the cell's output. The bare
# expression must remain the last statement for it to be displayed.
print("mAP scores for ngram tfidf models:")
ngram_tfidf_results

In [None]:
# Candidate BM25 k1 values: start at 0.1 and advance in steps of 0.1; the stop
# value 1.5 itself is excluded by np.arange.
k1_grid = np.arange(0.1, 1.5, 0.1)
# One (label, model factory) pair per candidate; b is left unspecified here.
search_k1_bm25_models = [
    (
        f"BM25 k1={k1_candidate:.2f}",
        partial(TfidfRetrieval, corpus=papers_basic, use_bm25=True, k1=k1_candidate),
    )
    for k1_candidate in k1_grid
]

In [None]:
%%time
search_k1_bm25_results = train_evaluate_models(search_k1_bm25_models, [general_keywords, specific_keywords])

In [None]:
# Print a caption, then show the results as the cell's output. The bare
# expression must remain the last statement for it to be displayed.
print("mAP scores for bm25 k1 search models:")
search_k1_bm25_results

In [None]:
# Candidate BM25 b values: start at 0.1 and advance in steps of 0.1; the stop
# value 1.1 itself is excluded by np.arange.
b_grid = np.arange(0.1, 1.1, 0.1)
# One (label, model factory) pair per candidate, with k1 held at 0.3.
# NOTE(review): 0.3 is presumably the value picked from the k1 search - confirm.
search_b_bm25_models = [
    (
        f"BM25 b={b_candidate:.2f}",
        partial(TfidfRetrieval, corpus=papers_basic, use_bm25=True, b=b_candidate, k1=0.3),
    )
    for b_candidate in b_grid
]

In [None]:
%%time
search_b_bm25_results = train_evaluate_models(search_b_bm25_models, [general_keywords, specific_keywords])

In [None]:
# Print a caption, then show the results as the cell's output. The bare
# expression must remain the last statement for it to be displayed.
print("mAP scores for bm25 b search models:")
search_b_bm25_results

In [None]:
# BM25 on the basic corpus with k1=0.3 and b=0.7 fixed, sweeping max_ngrams
# from 2 to 6, as (label, model factory) pairs.
ngram_bm25_models = [
    (
        f"bm25 {max_n}-gram",
        partial(
            TfidfRetrieval,
            corpus=papers_basic,
            max_ngrams=max_n,
            use_bm25=True,
            k1=0.3,
            b=0.7,
        ),
    )
    for max_n in range(2, 7)
]

In [None]:
%%time
ngram_bm25_results = train_evaluate_models(ngram_bm25_results, [general_keywords, specific_keywords])

In [None]:
# Print a caption, then show the results as the cell's output. The bare
# expression must remain the last statement for it to be displayed.
print("mAP scores for ngram bm25 models:")
ngram_bm25_results