In [5]:
%load_ext autoreload
%autoreload 2
import json
import pandas as pd
# Notebook-wide pandas display settings: show every column, keep wide frames
# on one line instead of wrapping, and allow long cell text (up to 1000 chars).
# Option names are fully qualified; pandas' shorthand matching (e.g. plain
# 'max_colwidth') can break if a similarly named option is added later.
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_colwidth', 1000)
import numpy as np
from functools import partial
import spacy
import seaborn as sns
sns.set()


from tqdm.notebook import tqdm
tqdm.pandas()

import sys


from lda_retrieval import LDARetrieval
from evaluation import average_precision, mean_average_precision, mean_average_precision_parallel
from preprocessing import apply_pipeline, Corpus, BasicPreprocessing, BigramPreprocessor, SpacyPreprocessor, StopWordPreprocessor

import logging
# Root logger: warnings and above, written to stdout.
logging.basicConfig(stream=sys.stdout, level=logging.WARNING)
# These project loggers are lowered to INFO so their progress messages are shown.
for verbose_logger in ("preprocessing.pipeline", "lda_retrieval"):
    logging.getLogger(verbose_logger).setLevel(logging.INFO)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Source CSV shared by every corpus variant below.
base_file = "../data/kit_expert_2017_papers.csv"

# Variant 1: basic preprocessing only.
# (This corpus used to be built twice in this cell with an identical pipeline;
# the redundant second construction has been removed.)
p = [BasicPreprocessing()]
papers_basic = Corpus(base_file, p)

# Variant 2: basic preprocessing followed by the stop-word preprocessor.
p = [BasicPreprocessing(), StopWordPreprocessor()]
papers_basic_nostopwords = Corpus(base_file, p, load_from_cache=True)

# Variant 3: basic preprocessing followed by spaCy with lemmatization enabled.
p = [BasicPreprocessing(), SpacyPreprocessor(lemmatization=True)]
papers_basic_lemmatization = Corpus(base_file, p, load_from_cache=True)

INFO:preprocessing.pipeline:Start preprocessing pipeline "basic" for file ../data/kit_expert_2017_papers.csv.
INFO:preprocessing.pipeline:Loaded cached preprocessed corpus from ../data/kit_expert_2017_papers_basic
INFO:preprocessing.pipeline:Start preprocessing pipeline "basic_NoStopWords" for file ../data/kit_expert_2017_papers.csv.
INFO:preprocessing.pipeline:Loaded cached preprocessed corpus from ../data/kit_expert_2017_papers_basic_NoStopWords
INFO:preprocessing.pipeline:Start preprocessing pipeline "basic" for file ../data/kit_expert_2017_papers.csv.
INFO:preprocessing.pipeline:Loaded cached preprocessed corpus from ../data/kit_expert_2017_papers_basic
INFO:preprocessing.pipeline:Start preprocessing pipeline "basic_spacy_lemmatization" for file ../data/kit_expert_2017_papers.csv.
INFO:preprocessing.pipeline:Loaded cached preprocessed corpus from ../data/kit_expert_2017_papers_basic_spacy_lemmatization


In [7]:
# Load the keyword test data. The encoding is given explicitly so the file is
# decoded the same way on every platform instead of using the locale default.
with open("../data/kit_expert_2017_keywords.json", "r", encoding="utf-8") as file:
    keywords = json.load(file)
# Each test set is a (name, entries) pair, split on the entries' "level" field:
# level <= 1 goes to the general set, level >= 2 to the specific set.
general_keywords = ("general keywords", [k for k in keywords if k["level"]<=1])
specific_keywords = ("specific_keywords", [k for k in keywords if k["level"]>=2])

In [7]:
# Build the LDA retrieval model on the stop-word-filtered corpus.
# NOTE(review): the variable name says "500 topics", but 50 is passed here and
# the training log reports "50 topics" — confirm which value is intended.
lda_model_500_topics = LDARetrieval(papers_basic_nostopwords, 50)

INFO:lda_retrieval:Create Dictionary for corpus vocabulary
INFO:lda_retrieval:Start training lda model with 50 topics
INFO:gensim.models.ldamodel:PROGRESS: pass 0, at document #10000/31240
DEBUG:gensim.models.ldamodel:450/10000 documents converged within 50 iterations
INFO:gensim.models.ldamodel:PROGRESS: pass 0, at document #20000/31240
DEBUG:gensim.models.ldamodel:2872/10000 documents converged within 50 iterations
INFO:gensim.models.ldamodel:PROGRESS: pass 0, at document #30000/31240


KeyboardInterrupt: 

In [22]:
# Convert the two query tokens to a bag-of-words using the model's own
# dictionary, then ask the underlying LDA model for that query's topics.
query_bow = lda_model_500_topics.id2word.doc2bow(["neural", "network"])
lda_model_500_topics.lda_model.get_document_topics(query_bow)

[]

In [16]:
# Rank documents for the free-text query "neural network".
# NOTE(review): the recorded output shows only the column header (id, score)
# and no rows — verify that the model was fully trained before relying on this.
lda_model_500_topics.get_ranked_documents("neural network")

Unnamed: 0,id,score


In [48]:
# Coherence score of the LDA model, computed against the same corpus object
# the model was constructed from (papers_basic_nostopwords).
lda_model_500_topics.get_coherence_score(papers_basic_nostopwords)

INFO:gensim.topic_coherence.text_analysis:CorpusAccumulator accumulated stats from 1000 documents
INFO:gensim.topic_coherence.text_analysis:CorpusAccumulator accumulated stats from 2000 documents
INFO:gensim.topic_coherence.text_analysis:CorpusAccumulator accumulated stats from 3000 documents
INFO:gensim.topic_coherence.text_analysis:CorpusAccumulator accumulated stats from 4000 documents
INFO:gensim.topic_coherence.text_analysis:CorpusAccumulator accumulated stats from 5000 documents
INFO:gensim.topic_coherence.text_analysis:CorpusAccumulator accumulated stats from 6000 documents
INFO:gensim.topic_coherence.text_analysis:CorpusAccumulator accumulated stats from 7000 documents
INFO:gensim.topic_coherence.text_analysis:CorpusAccumulator accumulated stats from 8000 documents
INFO:gensim.topic_coherence.text_analysis:CorpusAccumulator accumulated stats from 9000 documents
INFO:gensim.topic_coherence.text_analysis:CorpusAccumulator accumulated stats from 10000 documents
INFO:gensim.topic_c

-4.661296250678928

In [50]:
def evaluate_model(model, test_sets, max_queries=3000):
    """Compute the mean average precision of ``model`` on each test set.

    Args:
        model: Retrieval model, handed unchanged to ``mean_average_precision``.
        test_sets: Iterable of ``(name, keyword_infos)`` pairs. Every keyword
            info is a mapping with a ``"keyword"`` entry (used as the query)
            and a ``"paper_ids"`` entry (used as the relevant documents).
        max_queries: Per test set, only the first ``max_queries`` keyword infos
            are evaluated. ``None`` evaluates all of them. Defaults to 3000.

    Returns:
        dict mapping each test-set name to the value returned by
        ``mean_average_precision`` for that set.
    """
    # Truncate once up front: this keeps the progress-bar total and the
    # evaluated queries in sync, and materialising the pairs lets one-shot
    # iterables (e.g. generators) survive the two passes below.
    limited_sets = [(name, test_set[:max_queries]) for name, test_set in test_sets]
    total = sum(len(subset) for _, subset in limited_sets)

    results = {}
    with tqdm(total=total, ncols='50%') as progress:
        for test_set_name, subset in limited_sets:
            data = [{
                "query": keyword_info["keyword"],
                "documents": keyword_info["paper_ids"]
            } for keyword_info in subset]
            # progress.update is passed as the callback so the shared bar
            # advances while the queries of this test set are processed.
            results[test_set_name] = mean_average_precision(model, data, progress.update)

    return results

In [56]:
# Score the LDA model on both keyword test sets (general first, then specific);
# the result is a dict keyed by test-set name.
evaluate_model(lda_model_500_topics, [general_keywords, specific_keywords])

HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=3293), HTML(value='')), layout=Layout(display…




{'general keywords': 0.029647059958427157,
 'specific_keywords': 0.04303393711029848}