## 3. Topic Modeling
### 1. Latent Dirichlet Allocation

In [1]:
import numpy as np
from pprint import pprint

import matplotlib.pyplot as plt
%matplotlib inline


#### 1. Clean the data 
##### 1. Delete line breaks

In [58]:
import re
import pandas as pd

df = pd.read_csv('../dataset/Lyrics_en.csv')
#df = df.iloc[0:100]
df.dropna(axis='index', how='any', inplace=True)


for row in df.itertuples(index=True, name='Pandas'):
    lyric = getattr(row, "Lyrics")
    index = getattr(row, "Index")
    lyric = re.sub('\s+', ' ', lyric)
    lyric = re.sub("\'", "", lyric)
    lyric = re.sub("\"", "", lyric)
    df.at[index, 'Lyrics'] = lyric
df    

Unnamed: 0,Band,Lyrics,Song
0,Elijah Blake,"No, no I aint ever trapped out the bando But o...",Everyday
1,Elijah Blake,"The drinks go down and smoke goes up, I feel m...",Live Till We Die
2,Elijah Blake,She dont live on planet Earth no more She foun...,The Otherside
3,Elijah Blake,"Trippin off that Grigio, mobbin, lights low Tr...",Pinot
4,Elijah Blake,"I see a midnight panther, so gallant and so br...",Shadows & Diamonds
5,Elijah Blake,I just want to ready your mind Cause Ill still...,Uno
6,Elijah Harris,To believe Or not to believe That is the quest...,Girlfriend (Main)
7,Elijah Levi,"No one here can love or understand me Oh, what...",Bye Bye Blackbird
8,Elijah Levi,"Lullaby of Birdland, thats what I Always hear ...",Lullaby of Birdland
9,Elijah Levi,I hate to see that evening sun go down I hate ...,St. Louis Blues


#### 2. Prepare de dictionary
##### 1. Tokenize

In [59]:
import gensim
from gensim.utils import simple_preprocess

for row in df.itertuples(index=True, name='Pandas'):
    lyric = getattr(row, "Lyrics")
    index = getattr(row, "Index")    
    df.at[index, 'Lyrics'] = gensim.utils.simple_preprocess(str(lyric), deacc=True)
df    

Unnamed: 0,Band,Lyrics,Song
0,Elijah Blake,"[no, no, aint, ever, trapped, out, the, bando,...",Everyday
1,Elijah Blake,"[the, drinks, go, down, and, smoke, goes, up, ...",Live Till We Die
2,Elijah Blake,"[she, dont, live, on, planet, earth, no, more,...",The Otherside
3,Elijah Blake,"[trippin, off, that, grigio, mobbin, lights, l...",Pinot
4,Elijah Blake,"[see, midnight, panther, so, gallant, and, so,...",Shadows & Diamonds
5,Elijah Blake,"[just, want, to, ready, your, mind, cause, ill...",Uno
6,Elijah Harris,"[to, believe, or, not, to, believe, that, is, ...",Girlfriend (Main)
7,Elijah Levi,"[no, one, here, can, love, or, understand, me,...",Bye Bye Blackbird
8,Elijah Levi,"[lullaby, of, birdland, thats, what, always, h...",Lullaby of Birdland
9,Elijah Levi,"[hate, to, see, that, evening, sun, go, down, ...",St. Louis Blues


##### 2. Create bigrams and trigrams models

In [60]:
from gensim.models.phrases import Phrases, Phraser

lyrics = list()
for row in df.itertuples(index=True, name='Pandas'):
    lyric = getattr(row, "Lyrics")
    lyrics.append(lyric)
    
bi_prhases = Phrases(lyrics, min_count=5, threshold=30)
tri_prhases = Phrases(bi_prhases[lyrics], threshold=30)  

bigram = Phraser(bi_prhases)
trigram = Phraser(tri_prhases)





##### 3. Remove stop words

In [61]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = stopwords.words('english')

new_lyrics = list()
for lyric in lyrics:
    new_lyric = list()
    for word in lyric:
        if(word not in stop_words):
            new_lyric.append(word)
    new_lyrics.append(new_lyric)
    
#print(lyrics[0])
#print(new_lyrics[0])
lyrics = new_lyrics
del new_lyrics

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\juanp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


##### 4. Lemmatize words

In [62]:
bigram_lyrics = list()
for lyric in lyrics:
    bigram_lyrics.append(bigram[lyric])
lyrics = bigram_lyrics
del bigram_lyrics

In [63]:
import spacy
import en_core_web_sm

# python -m spacy download en
nlp = en_core_web_sm.load(disable=['parser', 'ner'])

allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']
lemma_lyrics = list()
for lyric in lyrics:
    doc = nlp(" ".join(lyric)) 
    lemma_lyric = list()
    for token in doc:
        if(token.pos_ in allowed_postags):
            lemma_lyric.append(token.lemma_)
    lemma_lyrics.append(lemma_lyric)


print(lemma_lyrics[:1])
lyrics = lemma_lyrics

[['be', 'not', 'ever', 'trap', 'bando', 'lord', 'do', 'not', 'get', 'wrong', 'know', 'couple', 'niggas', 'be', 'place', 'everybody', 'know', 'name', 'say', 'get', 'watch', 'attitude', 'see', 'money', 'man', 'start', 'actin', 'strange', 'fuck', 'one', 'fuck', 'never', 'say', 'be', 'brand_new', 'everyday', 'everyday', 'everyday', 'everyday', 'everyday', 'everyday', 'everyday', 'everyday', 'everyday', 'have', 'talkin', 'shit', 'nigga', 's', 'regular', 'have', 'love', 'thick', 'life', 'spectacular', 'spend', 'be', 'die', 'rich', 'nigga', 'be', 'flexin', 'everyday', 's', 'everyday', 's', 'everyday', 's', 'everyday', 's', 'everyday', 'everyday', 'see', 'wanna', 'hot', 'singer', 'swear', 'sound', 'start', 'bottom', 'far', 'motto', 'niggasll', 'never', 'drake', 'shout', 'ovo', 'prolly', 'do', 'not', 'know', 'stay', 'cut', 'do', 'not', 'fuck', 'body', 'that', 's', 'pun', 'nobody', 'know', 'name', 'runnin', 'dream', 'be', 'not', 'hard', 'break', 'bread', 'swear', 'pull', 'plate', 'eat', 'one', '

In [6]:
import pickle

#SAVE LYRICS to disk
#with open('../dataset/lemma_lyrics', 'wb') as fp:
#    pickle.dump(lemma_lyrics, fp)
#del lemma_lyrics

#LOAD LYRICS from disk
lyrics = list()
with open ('../dataset/lemma_lyrics', 'rb') as fp:
    lyrics = pickle.load(fp)

##### 5. Create corpus

In [7]:
import gensim.corpora as corpora
from gensim.models import TfidfModel

id2word = corpora.Dictionary(lyrics)
id2word.save("../dataset/lemma_lyrics_dict")

bow_corpus = list()

for lyric in lyrics:
    bow_corpus.append(id2word.doc2bow(lyric))

tfidf = TfidfModel(bow_corpus)
tfidf_corpus = tfidf[bow_corpus]
print(bow_corpus[0])
print(tfidf_corpus[0])

[(0, 2), (1, 1), (2, 1), (3, 13), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 2), (11, 1), (12, 1), (13, 1), (14, 3), (15, 3), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 45), (23, 1), (24, 3), (25, 3), (26, 1), (27, 5), (28, 1), (29, 6), (30, 1), (31, 1), (32, 1), (33, 4), (34, 3), (35, 1), (36, 1), (37, 1), (38, 4), (39, 1), (40, 2), (41, 1), (42, 2), (43, 2), (44, 6), (45, 1), (46, 1), (47, 2), (48, 7), (49, 2), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 3), (57, 3), (58, 1), (59, 17), (60, 2), (61, 2), (62, 3), (63, 1), (64, 1), (65, 1), (66, 3), (67, 3), (68, 2), (69, 1), (70, 1), (71, 1), (72, 2), (73, 3), (74, 2), (75, 3), (76, 1), (77, 1), (78, 1), (79, 1), (80, 1)]
[(0, 0.056697179813901745), (1, 0.028042242909374952), (2, 0.04581271950544631), (3, 0.029265705566199236), (4, 0.01583974215538755), (5, 0.024022863979568185), (6, 0.023007308350174782), (7, 0.02590899563691738), (8, 0.01086219161876116), (9, 0.006639665551084939), (10, 0.024

#### 3. Create the model

In [6]:
from gensim.models.ldamulticore import LdaMulticore

lda_model = LdaMulticore(workers=4,
                   corpus=bow_corpus,
                   id2word=id2word,
                   num_topics=6, 
                   #random_state=100,
                   #update_every=1,
                   #chunksize=100,
                   #passes=10,
                   per_word_topics=False)
lda_model.save("../dataset/bow_lda/lda")


#### 4. Measure the model

In [7]:
from gensim.models.coherencemodel import CoherenceModel
coherence_model_lda = CoherenceModel(model=lda_model, texts=lyrics, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.35616974139409235


#### 5. Plot the model

In [8]:
import pyLDAvis
import pyLDAvis.gensim  

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, bow_corpus, id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


#### 5. One more time, except this time use tfidf corpus

In [103]:
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
import warnings

warnings.filterwarnings("ignore")

best_coherence = 0.0
best_number_topics = 0
for i in range(2,30,4):
    lda_model = LdaMulticore(workers=4,
                       corpus=tfidf_corpus,
                       id2word=id2word,
                       num_topics=5, 
                       #random_state=100,
                       #update_every=1,
                       #chunksize=100,
                       #passes=10,
                       per_word_topics=False)

    coherence_model_lda = CoherenceModel(model=lda_model, texts=lyrics, dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('Coherence Score: '+ str(coherence_lda) + 'Topics: '+str(i))
    
    if(coherence_lda > best_coherence):
        best_coherence = coherence_lda
        best_number_topics = i
        lda_model.save("../dataset/tfidf_lda/lda")
        
print("Biggest coherence score: "+str(best_coherence)+" Number of topics: "+str(best_number_topics))

Coherence Score: 0.35828558401013155Topics: 2
Coherence Score: 0.3805929279940984Topics: 6
Coherence Score: 0.3624986150684909Topics: 10
Coherence Score: 0.3598868445491017Topics: 14
Coherence Score: 0.3766459402323628Topics: 18
Coherence Score: 0.3591502075527237Topics: 22
Coherence Score: 0.37701523415747806Topics: 26
Biggest coherence score: 0.3805929279940984 Number of topics: 6


In [105]:
import pyLDAvis
import pyLDAvis.gensim  

lda_model = LdaMulticore.load("../dataset/tfidf_lda/lda")

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, tfidf_corpus, id2word)
vis

### 2. Hierarchical Dirichlet Process

#### 1. Create the model

In [9]:
from gensim.models import HdpModel

hdp = HdpModel(tfidf_corpus, id2word)
hdp.save("../dataset/hdp/hdp")

In [1]:
from gensim.models import HdpModel
hdp = HdpModel.load("../dataset/hdp/hdp")
lda_model = hdp.suggested_lda_model()
lda_model.save("../dataset/hdp_lda/lda")



In [3]:
topics = []
for topic_id, topic in hdp.show_topics(num_topics=6, formatted=False):
    topic = [word for word, _ in topic]
    topics.append(topic)
print(topics)

[['love', 'get', 'be', 'not', 'do', 'go', 'know', 'baby', 's', 'have', 'want', 'time', 'never', 'say', 'let', 'would', 'ill', 'come', 'can', 'make'], ['love', 'be', 'get', 'not', 'do', 'go', 'baby', 'know', 'want', 's', 'have', 'time', 'let', 'come', 'say', 'would', 'never', 'ill', 'can', 'heart'], ['love', 'be', 'get', 'not', 'do', 'go', 'baby', 'know', 'want', 's', 'have', 'let', 'time', 'come', 'say', 'would', 'never', 'ill', 'can', 'heart'], ['love', 'be', 'get', 'not', 'do', 'go', 'baby', 'know', 'want', 's', 'have', 'let', 'time', 'come', 'would', 'ill', 'say', 'never', 'can', 'heart'], ['love', 'be', 'get', 'not', 'do', 'go', 'know', 'baby', 'good', 's', 'want', 'have', 'come', 'time', 'santa_claus', 'let', 'would', 'say', 'ill', 'never'], ['love', 'be', 'get', 'not', 'do', 'go', 'baby', 'know', 'come', 'let', 'want', 'have', 's', 'time', 'never', 'would', 'say', 'ill', 'can', 'heart']]


In [9]:
from gensim.models.coherencemodel import CoherenceModel
cm = CoherenceModel(texts=lyrics, topics=topics, dictionary=id2word, coherence='c_v')
cm.get_coherence()

0.3628959737720971