In [72]:
# --- Notebook setup: magics, imports, display options and logging ---
# autoreload mode 2 re-imports modules before each cell runs, so edits to the
# local project modules imported below are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2
import json
import pandas as pd
pd.set_option('display.max_columns', None)  # never elide columns when displaying a frame
pd.set_option('display.expand_frame_repr', False)  # keep wide frames on one line instead of wrapping
pd.set_option('max_colwidth', 1000)  # show up to 1000 characters per cell
import numpy as np
from functools import partial
import spacy
import seaborn as sns
sns.set()  # apply seaborn's default plot theme
import os
import pyLDAvis.gensim
pyLDAvis.enable_notebook()  # render pyLDAvis visualisations inline in the notebook


from tqdm.notebook import tqdm
tqdm.pandas()  # registers the progress_* methods (e.g. progress_apply) on pandas objects

import sys


# Project-local modules (picked up live thanks to autoreload above).
from lda_retrieval import LDARetrieval
from evaluation import average_precision, mean_average_precision, mean_average_precision_parallel
from preprocessing import apply_pipeline, Corpus, BasicPreprocessing, BigramPreprocessor, SpacyPreprocessor, StopWordPreprocessor

import logging
# Root logger: warnings and above, written to stdout so they appear in cell output.
logging.basicConfig(level=logging.WARN, stream=sys.stdout)
# The two project loggers are raised to INFO so pipeline/model progress is visible.
logging.getLogger("preprocessing.pipeline").setLevel(logging.INFO)
logging.getLogger("lda_retrieval").setLevel(logging.INFO)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


  from pandas import Panel


In [75]:
# Build one preprocessed corpus per pipeline variant, all from the same raw CSV.
# Each corpus is constructed exactly once; the keyword arguments of every call
# are the ones this cell has always used for that variant.
base_file =  "../data/kit_expert_2017_papers.csv"
N_JOBS = 16  # value passed as n_jobs to every Corpus build

# Baseline: basic preprocessing only.
papers_basic = Corpus(base_file, [BasicPreprocessing()], n_jobs=N_JOBS)

# Basic preprocessing followed by stop-word handling.
papers_basic_nostopwords = Corpus(
    base_file,
    [BasicPreprocessing(), StopWordPreprocessor()],
    load_from_cache=True,
    n_jobs=N_JOBS,
)

# Basic preprocessing followed by spaCy lemmatization.
papers_basic_lemmatization = Corpus(
    base_file,
    [BasicPreprocessing(), SpacyPreprocessor(lemmatization=True)],
    load_from_cache=True,
    n_jobs=N_JOBS,
)

INFO:preprocessing.pipeline:Start preprocessing pipeline "basic" for file ../data/kit_expert_2017_papers.csv.
INFO:preprocessing.pipeline:Loaded cached preprocessed corpus from ../data/kit_expert_2017_papers_basic
INFO:preprocessing.pipeline:Start preprocessing pipeline "basic_NoStopWords" for file ../data/kit_expert_2017_papers.csv.
INFO:preprocessing.pipeline:Loaded cached preprocessed corpus from ../data/kit_expert_2017_papers_basic_NoStopWords
INFO:preprocessing.pipeline:Start preprocessing pipeline "basic" for file ../data/kit_expert_2017_papers.csv.
INFO:preprocessing.pipeline:Loaded cached preprocessed corpus from ../data/kit_expert_2017_papers_basic
INFO:preprocessing.pipeline:Start preprocessing pipeline "basic_spacy_lemmatization" for file ../data/kit_expert_2017_papers.csv.
INFO:preprocessing.pipeline:Loaded cached preprocessed corpus from ../data/kit_expert_2017_papers_basic_spacy_lemmatization


In [11]:
# Load the keyword annotations and split them into two evaluation sets by their
# "level" field. Each set is a (display name, keyword entries) pair, which is
# the shape evaluate_model unpacks.
# The encoding is given explicitly so decoding does not depend on the platform
# default locale (JSON interchange files are UTF-8).
with open("../data/kit_expert_2017_keywords.json", "r", encoding="utf-8") as keyword_file:
    keywords = json.load(keyword_file)
# NOTE(review): an entry with 1 < level < 2 would land in neither set;
# presumably levels are integers -- confirm against the data.
general_keywords = ("general keywords", [k for k in keywords if k["level"]<=1])
specific_keywords = ("specific keywords", [k for k in keywords if k["level"]>=2])

In [12]:
def evaluate_model(model, test_sets, max_queries=3000):
    """Compute the mean average precision of ``model`` on each test set.

    Parameters
    ----------
    model
        Retrieval model handed unchanged to ``mean_average_precision``.
    test_sets
        Iterable of ``(name, entries)`` pairs. Every entry is a mapping with a
        ``"keyword"`` item (used as the query) and a ``"paper_ids"`` item
        (used as the relevant documents).
    max_queries : int or None, default 3000
        Only the first ``max_queries`` entries of each test set are evaluated.
        ``None`` disables the cap.

    Returns
    -------
    dict
        Maps ``"<name> mAP"`` to the value returned by
        ``mean_average_precision`` for that test set.
    """
    # Truncate each set once so the progress-bar total and the evaluated
    # queries are guaranteed to agree.
    capped_sets = [(name, entries[:max_queries]) for name, entries in test_sets]
    total = sum(len(entries) for _, entries in capped_sets)

    results = {}
    # A single progress bar spans all test sets; ``progress.update`` is passed
    # to mean_average_precision as its progress callback.
    with tqdm(total=total, ncols='50%') as progress:
        for test_set_name, entries in capped_sets:
            data = [{
                "query": keyword_info["keyword"],
                "documents": keyword_info["paper_ids"]
            } for keyword_info in entries]
            results[test_set_name + " mAP"] = mean_average_precision(model, data, progress.update)
    return results

In [6]:
# Grid search over the number of LDA topics. For every grid point the model is
# loaded from disk when a saved copy exists, otherwise it is built on the
# stop-word-filtered corpus and saved. Each model is then scored with mAP on
# both keyword sets plus its coherence score.
topic_grid = [500,1000,1500,2000,2500,3000]
topic_gird = topic_grid  # misspelled legacy name kept as an alias in case other cells reference it
evaluation_results = []  # reset on every run so re-executing the cell does not duplicate rows
for num_topics in tqdm(topic_grid):
    model_path = f"../data/models/lda_{num_topics}topics.model"
    if os.path.isfile(model_path):
        model = LDARetrieval.load(model_path)
    else:
        model = LDARetrieval(papers_basic_nostopwords, num_topics)
        # Save to the very path that was checked above so the two cannot drift apart.
        model.save(model_path)
    evaluation = evaluate_model(model, [general_keywords, specific_keywords])
    evaluation["coherence"] = model.get_coherence_score(papers_basic_nostopwords)
    evaluation["topics"] = num_topics
    evaluation_results.append(evaluation)


HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=3293), HTML(value='')), layout=Layout(display…




HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=3293), HTML(value='')), layout=Layout(display…




HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=3293), HTML(value='')), layout=Layout(display…




HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=3293), HTML(value='')), layout=Layout(display…




HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=3293), HTML(value='')), layout=Layout(display…




HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=3293), HTML(value='')), layout=Layout(display…





In [8]:
# Tabulate the collected metrics: one row per evaluated topic count.
pd.DataFrame(data=evaluation_results)

Unnamed: 0,general keywords mAP,specific keywords mAP,coherence,topics
0,0.030142,0.045634,-8.07786,500
1,0.032707,0.058135,-4.688202,1000
2,0.027411,0.037396,-9.693069,1500
3,0.018626,0.015314,-15.05793,2000
4,0.018925,0.004782,-16.459674,2500
5,0.01004,0.000626,-16.736424,3000
