In [17]:
import logging
import pathlib
import pickle
from pprint import pprint

from gensim import corpora, similarities
from gensim.models import LdaModel
from wordcloud import WordCloud

from src.features.build_features import  vocabulary, X, X_tfidf, corpus, corpus_tfidf

# Configure the root logger to emit INFO-and-above records with a timestamped
# format, so library progress messages show up in the cell output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [12]:
# Invert `vocabulary` (key -> value) into a value -> key mapping for use as
# the model's `id2word`. Pairs are sorted by (value, key) before the dict is
# built, so entries are inserted in ascending value order.
# NOTE(review): presumably `vocabulary` maps term -> integer column index;
# confirm against src.features.build_features.
id2word = dict(sorted((index, term) for term, index in vocabulary.items()))

In [13]:
# Train LDA model.
#
# LDA training is stochastic: without a fixed seed every Restart & Run All
# yields different topics (and different topic numbering), which makes the
# saved model and any downstream interpretation impossible to reproduce.

# Set training parameters.
num_topics = 25      # number of latent topics to fit
chunksize = 2000     # documents processed per training chunk
passes = 20          # full passes over the corpus
iterations = 400     # max inference iterations per document
eval_every = None  # Don't evaluate model perplexity, takes too much time.
random_state = 42    # fixed seed so repeated runs produce the same topics

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',    # learn an asymmetric document-topic prior from the data
    eta='auto',      # learn the topic-word prior from the data
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    random_state=random_state,
)

1:32,454 : INFO : merging changes from 2000 documents into a model of 33734 documents
2020-05-27 15:31:32,493 : INFO : topic #21 (0.040): 0.057*"landbrug" + 0.037*"grønland" + 0.029*"natur" + 0.025*"færø" + 0.017*"hr" + 0.015*"grønlandsk" + 0.014*"erhverv" + 0.012*"miljø" + 0.012*"landmænd" + 0.010*"rigsfællesskab"
2020-05-27 15:31:32,495 : INFO : topic #15 (0.050): 0.082*"kvind" + 0.028*"mænd" + 0.024*"samfund" + 0.023*"ligestilling" + 0.021*"ytringsfri" + 0.018*"frihed" + 0.015*"ægteskab" + 0.013*"vold" + 0.013*"køn" + 0.012*"israel"
2020-05-27 15:31:32,496 : INFO : topic #2 (0.237): 0.042*"hr" + 0.026*"folketing" + 0.021*"folkeparti" + 0.019*"venstr" + 0.018*"parti" + 0.015*"debat" + 0.013*"fru" + 0.012*"stem" + 0.011*"flertal" + 0.010*"enhedslist"
2020-05-27 15:31:32,498 : INFO : topic #9 (0.672): 0.047*"lovforslag" + 0.011*"regl" + 0.011*"lov" + 0.010*"venstr" + 0.008*"ændring" + 0.008*"forbind" + 0.008*"gæld" + 0.007*"lovgivning" + 0.007*"ræk" + 0.007*"beslutningsforslag"
2020-05

In [18]:
# Persist the trained model. gensim writes the main pickle plus companion
# files (the LdaState and large numpy arrays) alongside it, using this path
# as the common prefix.
model.save(fname="gensim_lda_model.model")

2020-05-27 15:40:57,977 : INFO : saving LdaState object under gensim_lda_model.model.state, separately None
2020-05-27 15:40:58,016 : INFO : saved gensim_lda_model.model.state
2020-05-27 15:40:58,038 : INFO : saving LdaModel object under gensim_lda_model.model, separately ['expElogbeta', 'sstats']
2020-05-27 15:40:58,039 : INFO : storing np array 'expElogbeta' to gensim_lda_model.model.expElogbeta.npy
2020-05-27 15:40:58,084 : INFO : not storing attribute dispatcher
2020-05-27 15:40:58,086 : INFO : not storing attribute id2word
2020-05-27 15:40:58,087 : INFO : not storing attribute state
2020-05-27 15:40:58,100 : INFO : saved gensim_lda_model.model


In [19]:
# Reload the persisted model from disk, rebinding `model` to the loaded copy.
model = LdaModel.load(fname="gensim_lda_model.model")

2020-05-27 15:41:52,486 : INFO : loading LdaModel object from gensim_lda_model.model
2020-05-27 15:41:52,513 : INFO : loading expElogbeta from gensim_lda_model.model.expElogbeta.npy with mmap=None
2020-05-27 15:41:52,526 : INFO : setting ignored attribute dispatcher to None
2020-05-27 15:41:52,527 : INFO : setting ignored attribute id2word to None
2020-05-27 15:41:52,527 : INFO : setting ignored attribute state to None
2020-05-27 15:41:52,528 : INFO : loaded gensim_lda_model.model
2020-05-27 15:41:52,529 : INFO : loading LdaState object from gensim_lda_model.model.state
2020-05-27 15:41:52,554 : INFO : loaded gensim_lda_model.model.state


In [25]:
# Compute per-topic coherence over the same corpus the model was trained on.
top_topics = model.top_topics(corpus=corpus)

2020-05-27 15:44:15,976 : INFO : CorpusAccumulator accumulated stats from 1000 documents
2020-05-27 15:44:16,106 : INFO : CorpusAccumulator accumulated stats from 2000 documents
2020-05-27 15:44:16,227 : INFO : CorpusAccumulator accumulated stats from 3000 documents
2020-05-27 15:44:16,337 : INFO : CorpusAccumulator accumulated stats from 4000 documents
2020-05-27 15:44:16,444 : INFO : CorpusAccumulator accumulated stats from 5000 documents
2020-05-27 15:44:16,573 : INFO : CorpusAccumulator accumulated stats from 6000 documents
2020-05-27 15:44:16,678 : INFO : CorpusAccumulator accumulated stats from 7000 documents
2020-05-27 15:44:16,790 : INFO : CorpusAccumulator accumulated stats from 8000 documents
2020-05-27 15:44:16,905 : INFO : CorpusAccumulator accumulated stats from 9000 documents
2020-05-27 15:44:17,027 : INFO : CorpusAccumulator accumulated stats from 10000 documents
2020-05-27 15:44:17,142 : INFO : CorpusAccumulator accumulated stats from 11000 documents
2020-05-27 15:44:17

In [10]:
# Score every topic of the model against the corpus. Each entry of
# `top_topics` is indexed below as (topic_representation, coherence_score).
top_topics = model.top_topics(corpus)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
# The divisor is taken from the result itself rather than from the training
# cell's `num_topics`, so the figure stays correct when the model was loaded
# from disk and the training cell was not run in this kernel.
avg_topic_coherence = sum(score for _, score in top_topics) / len(top_topics)
print('Average topic coherence: %.4f.' % avg_topic_coherence)

# `pprint` is imported with the other imports at the top of the notebook.
pprint(top_topics)

2020-05-27 15:12:34,988 : INFO : CorpusAccumulator accumulated stats from 1000 documents
2020-05-27 15:12:35,132 : INFO : CorpusAccumulator accumulated stats from 2000 documents
2020-05-27 15:12:35,248 : INFO : CorpusAccumulator accumulated stats from 3000 documents
2020-05-27 15:12:35,386 : INFO : CorpusAccumulator accumulated stats from 4000 documents
2020-05-27 15:12:35,523 : INFO : CorpusAccumulator accumulated stats from 5000 documents
2020-05-27 15:12:35,692 : INFO : CorpusAccumulator accumulated stats from 6000 documents
2020-05-27 15:12:35,839 : INFO : CorpusAccumulator accumulated stats from 7000 documents
2020-05-27 15:12:36,005 : INFO : CorpusAccumulator accumulated stats from 8000 documents
2020-05-27 15:12:36,156 : INFO : CorpusAccumulator accumulated stats from 9000 documents
2020-05-27 15:12:36,293 : INFO : CorpusAccumulator accumulated stats from 10000 documents
2020-05-27 15:12:36,443 : INFO : CorpusAccumulator accumulated stats from 11000 documents
2020-05-27 15:12:36

In [None]:
# Palette names, one per entry.
# NOTE(review): these look like matplotlib sequential colormap names,
# presumably for the word clouds — confirm against the cells that use them.
colors = [
    'Blues',
    'Oranges',
    'Greens',
    'Reds',
]