## 4. Topic Modeling
### 1. Latent Dirichlet Allocation
#### 1. Create the model

In [9]:
import pickle

#LOAD LYRICS from disk
lyrics = list()
with open ('../dataset/lemma_lyrics', 'rb') as fp:
    lyrics = pickle.load(fp)

In [10]:
import gensim.corpora as corpora
from gensim.models import TfidfModel    

id2word = corpora.Dictionary(lyrics)
id2word.save("../dataset/lemma_lyrics_dict")

bow_corpus = list()

for lyric in lyrics:
    bow_corpus.append(id2word.doc2bow(lyric))

tfidf = TfidfModel(bow_corpus)
tfidf_corpus = tfidf[bow_corpus]
print(bow_corpus[0])
print(tfidf_corpus[0])

[(0, 2), (1, 1), (2, 1), (3, 13), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 2), (11, 1), (12, 1), (13, 1), (14, 3), (15, 3), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 45), (23, 1), (24, 3), (25, 3), (26, 1), (27, 5), (28, 1), (29, 6), (30, 1), (31, 1), (32, 1), (33, 4), (34, 3), (35, 1), (36, 1), (37, 1), (38, 4), (39, 1), (40, 2), (41, 1), (42, 2), (43, 2), (44, 6), (45, 1), (46, 1), (47, 2), (48, 7), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 3), (56, 3), (57, 1), (58, 5), (59, 2), (60, 2), (61, 3), (62, 1), (63, 1), (64, 1), (65, 3), (66, 3), (67, 2), (68, 1), (69, 1), (70, 1), (71, 2), (72, 3), (73, 2), (74, 3), (75, 1), (76, 1), (77, 1), (78, 1), (79, 1)]
[(0, 0.05693230226707338), (1, 0.027704914478609205), (2, 0.04549673720943144), (3, 0.0289448229988323), (4, 0.015806998693867935), (5, 0.024025594355196165), (6, 0.023098009819700793), (7, 0.025811526273479265), (8, 0.010569261638354028), (9, 0.006612118130594775), (10, 0.0244344778227

In [11]:
from gensim.models.ldamulticore import LdaMulticore

lda_model = LdaMulticore(workers=4,
                   corpus=bow_corpus,
                   id2word=id2word,
                   num_topics=6, 
                   #random_state=100,
                   #update_every=1,
                   #chunksize=100,
                   #passes=10,
                   per_word_topics=False)
lda_model.save("../dataset/bow_lda/lda")

#### 2. Measure the model

In [12]:
from gensim.models.coherencemodel import CoherenceModel
coherence_model_lda = CoherenceModel(model=lda_model, texts=lyrics, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.3603827261238352


#### 3. Plot the model

In [13]:
import pyLDAvis
import pyLDAvis.gensim  

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, bow_corpus, id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


#### 4. One more time, except this time use tfidf corpus

In [14]:
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
import warnings

warnings.filterwarnings("ignore")

best_coherence = 0.0
best_number_topics = 0
for i in range(2,30,4):
    lda_model = LdaMulticore(workers=4,
                       corpus=tfidf_corpus,
                       id2word=id2word,
                       num_topics=5, 
                       #random_state=100,
                       #update_every=1,
                       #chunksize=100,
                       #passes=10,
                       per_word_topics=False)

    coherence_model_lda = CoherenceModel(model=lda_model, texts=lyrics, dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('Coherence Score: '+ str(coherence_lda) + 'Topics: '+str(i))
    
    if(coherence_lda > best_coherence):
        best_coherence = coherence_lda
        best_number_topics = i
        lda_model.save("../dataset/tfidf_lda/lda")
        
print("Biggest coherence score: "+str(best_coherence)+" Number of topics: "+str(best_number_topics))

Coherence Score: 0.3434801762608804Topics: 2
Coherence Score: 0.3672888336282116Topics: 6
Coherence Score: 0.36309391490353066Topics: 10
Coherence Score: 0.3650705526295381Topics: 14
Coherence Score: 0.3458492191408498Topics: 18
Coherence Score: 0.3585193452481235Topics: 22
Coherence Score: 0.38433909278983525Topics: 26
Biggest coherence score: 0.38433909278983525 Number of topics: 26


In [15]:
import pyLDAvis
import pyLDAvis.gensim  

lda_model = LdaMulticore.load("../dataset/tfidf_lda/lda")

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, tfidf_corpus, id2word)
vis

### 2. Hierarchical Dirichlet Process
#### 1. Create the model

In [16]:
from gensim.models import HdpModel

hdp = HdpModel(tfidf_corpus, id2word)
hdp.save("../dataset/hdp/hdp")

In [17]:
from gensim.models import HdpModel
hdp = HdpModel.load("../dataset/hdp/hdp")
lda_model = hdp.suggested_lda_model()
lda_model.save("../dataset/hdp_lda/lda")

In [18]:
topics = []
for topic_id, topic in hdp.show_topics(num_topics=6, formatted=False):
    topic = [word for word, _ in topic]
    topics.append(topic)
print(topics)

[['love', 'be', 'get', 'not', 'do', 'go', 'know', 'baby', 'have', 'want', 'time', 'would', 'let', 'never', 'ill', 'say', 'can', 'come', 's', 'make'], ['love', 'get', 'be', 'not', 'do', 'go', 'baby', 'know', 'want', 'have', 'let', 'come', 'time', 'say', 'never', 'ill', 'would', 's', 'can', 'feel'], ['love', 'be', 'get', 'not', 'do', 'go', 'baby', 'know', 'want', 'have', 'let', 'time', 'come', 'never', 'would', 'ill', 'say', 'can', 'feel', 's'], ['love', 'be', 'get', 'not', 'do', 'go', 'baby', 'know', 'want', 'have', 'time', 'let', 'come', 'never', 'would', 'ill', 'say', 's', 'can', 'feel'], ['love', 'be', 'get', 'not', 'do', 'go', 'baby', 'know', 'want', 'have', 'let', 'time', 'come', 'never', 'say', 'would', 'ill', 'feel', 'can', 's'], ['love', 'be', 'get', 'not', 'do', 'go', 'baby', 'know', 'want', 'have', 'let', 'time', 'never', 'come', 'ill', 'would', 'say', 'feel', 'can', 's']]


In [19]:
from gensim.models.coherencemodel import CoherenceModel
cm = CoherenceModel(texts=lyrics, topics=topics, dictionary=id2word, coherence='c_v')
cm.get_coherence()

0.3656684283248796