## 4. Topic Modeling
### 1. Latent Dirichlet Allocation
#### 1. Create the model

In [None]:
import pickle

# Read the pickled lyrics back from disk. Later cells hand `lyrics` to gensim
# as the tokenised documents -- presumably one list of lemmas per song;
# TODO confirm against the cell that wrote the pickle.
# NOTE(review): pickle.load can execute arbitrary code, so only load files
# produced by this project.
lyrics = []
with open('../dataset/lemma_lyrics', 'rb') as lyrics_file:
    lyrics = pickle.load(lyrics_file)

In [None]:
import gensim.corpora as corpora
from gensim.models import TfidfModel

# Build the token <-> integer id mapping from the lyrics and persist it.
id2word = corpora.Dictionary(lyrics)
id2word.save("../dataset/lemma_lyrics_dict")

# Bag-of-words representation: one doc2bow result per lyric.
bow_corpus = [id2word.doc2bow(tokens) for tokens in lyrics]

# Fit TF-IDF on the bag-of-words corpus and wrap the corpus with the fitted
# model so documents come out TF-IDF weighted.
tfidf = TfidfModel(bow_corpus)
tfidf_corpus = tfidf[bow_corpus]

# Show the first document in both representations as a sanity check.
print(bow_corpus[0])
print(tfidf_corpus[0])

In [None]:
from gensim.models.ldamulticore import LdaMulticore

# Train a 6-topic LDA model on the bag-of-words corpus with 4 workers.
# Every other hyperparameter is left at the library default; no random_state
# is passed, so results may differ from run to run.
lda_settings = dict(
    workers=4,
    corpus=bow_corpus,
    id2word=id2word,
    num_topics=6,
    per_word_topics=False,
)
lda_model = LdaMulticore(**lda_settings)

# Persist the trained model.
lda_model.save("../dataset/bow_lda/lda")

#### 2. Measure the model

In [None]:
from gensim.models.coherencemodel import CoherenceModel

# Score the current LDA model with the 'c_v' coherence measure, using the
# tokenised lyrics as reference texts.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=lyrics,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

#### 3. Plot the model

In [None]:
import pyLDAvis
import pyLDAvis.gensim

# Switch pyLDAvis to inline notebook rendering.
pyLDAvis.enable_notebook()

# Prepare the interactive view of the bag-of-words LDA model. The bare `vis`
# on the last line is what makes the notebook display it.
vis = pyLDAvis.gensim.prepare(
    topic_model=lda_model,
    corpus=bow_corpus,
    dictionary=id2word,
)
vis

#### 4. One more time, except this time use the TF-IDF corpus

In [None]:
import warnings

from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamulticore import LdaMulticore

# Silence library warnings during the sweep. This installs a process-wide
# filter, so it also affects any code that runs afterwards.
warnings.filterwarnings("ignore")

# Sweep candidate topic counts on the TF-IDF corpus. Each candidate is scored
# with 'c_v' coherence, and whichever model scores best so far is written to
# disk (overwriting the previous best) so a later cell can reload it.
best_coherence = 0.0
best_number_topics = 0
for i in range(2, 30, 4):
    # num_topics must follow the loop variable; with a fixed value every
    # iteration would train the same-sized model and the sweep would only
    # measure run-to-run noise.
    lda_model = LdaMulticore(workers=4,
                             corpus=tfidf_corpus,
                             id2word=id2word,
                             num_topics=i,
                             per_word_topics=False)

    coherence_model_lda = CoherenceModel(model=lda_model, texts=lyrics, dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    # Leading space before "Topics:" keeps the score and the label apart.
    print('Coherence Score: ' + str(coherence_lda) + ' Topics: ' + str(i))

    if coherence_lda > best_coherence:
        best_coherence = coherence_lda
        best_number_topics = i
        lda_model.save("../dataset/tfidf_lda/lda")

print("Biggest coherence score: "+str(best_coherence)+" Number of topics: "+str(best_number_topics))

In [None]:
import pyLDAvis
import pyLDAvis.gensim

# Reload the model that the TF-IDF cell saved to disk.
# LdaMulticore is not imported here; it relies on an earlier cell's import.
tfidf_lda_path = "../dataset/tfidf_lda/lda"
lda_model = LdaMulticore.load(tfidf_lda_path)

# Render inline and build the interactive view against the TF-IDF corpus.
# The trailing bare `vis` makes the notebook display it.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(
    topic_model=lda_model,
    corpus=tfidf_corpus,
    dictionary=id2word,
)
vis

### 2. Hierarchical Dirichlet Process
#### 1. Create the model

In [None]:
from gensim.models import HdpModel

# Fit a Hierarchical Dirichlet Process model on the TF-IDF corpus with
# default settings, then persist it.
hdp_path = "../dataset/hdp/hdp"
hdp = HdpModel(corpus=tfidf_corpus, id2word=id2word)
hdp.save(hdp_path)

In [None]:
from gensim.models import HdpModel

# Reload the saved HDP model, ask it for its suggested LDA model, and persist
# that model. Note that this rebinds the notebook-wide `lda_model`.
hdp_model_path = "../dataset/hdp/hdp"
hdp_lda_path = "../dataset/hdp_lda/lda"

hdp = HdpModel.load(hdp_model_path)
lda_model = hdp.suggested_lda_model()
lda_model.save(hdp_lda_path)

In [None]:
# Ask the HDP model for 6 topics in unformatted form and keep only the words
# from each (word, weight) pair, giving one word list per topic.
hdp_topics = hdp.show_topics(num_topics=6, formatted=False)
topics = [
    [word for word, _weight in word_weights]
    for _topic_id, word_weights in hdp_topics
]
print(topics)

In [None]:
from gensim.models.coherencemodel import CoherenceModel

# 'c_v' coherence of the HDP topic word lists, scored against the tokenised
# lyrics. The bare call on the last line is shown as the cell output.
cm = CoherenceModel(
    topics=topics,
    texts=lyrics,
    dictionary=id2word,
    coherence='c_v',
)
cm.get_coherence()