In [1]:
import pandas as pd
import pickle
import scipy.sparse
from gensim import matutils, models
from nltk import word_tokenize, pos_tag

In [2]:
data = pd.read_pickle('dtm_stop.pkl')
data_cleaned = pd.read_pickle('data_clean.pkl')
term_doc_matrix = data.transpose()
term_doc_matrix.head()

Unnamed: 0,2Pac,Cardi B,Eminem,J. Cole,Joyner Lucas,Juice WRLD,Kanye West,Lil Pump,Logic,Mac Miller,Nas,Nicki Minaj,Notorious B.I.G.
aa,0,0,1,0,0,0,1,0,1,0,0,0,0
aaaaaaaaaa,0,0,0,0,0,0,0,0,0,0,0,1,0
aaaaaaack,0,0,1,0,0,0,0,0,0,0,0,0,0
aaaaah,0,0,0,0,0,0,0,0,0,0,0,1,0
aaaaahhh,0,0,0,0,0,0,1,0,0,0,0,0,0


In [3]:
def nouns_adj_verbs(text):
    is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ' or pos[:2] == 'VV'
    tokenized = word_tokenize(text)
    nouns_adj = [word for (word, pos) in pos_tag(tokenized) if is_noun_adj(pos)]
    return ' '.join(nouns_adj)

In [4]:
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

add_stop_words = ['im', 'got', 'like',
                 'dont', 'know', 'just',
                 'fuck', 'shit', 'yeah',
                 'aint', 'thats', 'make',
                 'bitch', 'love', 'wanna', 
                 'cause', 'niggas', 'nigga', 
                 'time', 'em', 'man', 
                  'want', 'let', 'come']

stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

cv = CountVectorizer(stop_words=stop_words)

In [5]:
data_nouns_adj_verbs = pd.DataFrame(data_cleaned.lyrics.apply(nouns_adj_verbs))
data_nouns_adj_verbs

Unnamed: 0,lyrics
2Pac,aint nothin gangsta party eh light ahh nothin ...
Cardi B,whores house whores house whores house whores ...
Eminem,yeah i guess huh obvious eye eye funny much i ...
J. Cole,work growth famous important anything anything...
Joyner Lucas,fall fall i more i i werent cant picture someo...
Juice WRLD,nahnahnahnahnahnah smoke cigarettes cancer che...
Kanye West,hour hour power minute minute lord second seco...
Lil Pump,lyrics first snippet elliot dinner brr man ben...
Logic,lyrics song please song welcome pressure progr...
Mac Miller,youre young much matters something night dream...


In [6]:
cv = CountVectorizer(stop_words=stop_words, max_df=.8)
data_cv = cv.fit_transform(data_nouns_adj_verbs.lyrics)
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = data_nouns_adj_verbs.index
data_dtm

Unnamed: 0,aa,aaaaaaack,aaaaahhh,aaaaayyyyooooo,aaaah,aaaahh,aaaand,aaahhh,aaand,aaass,...,世界中で聴いてる,帰っていただいて結構,彼の行動が気になって仕方ないはず,感謝しています,最高だったでしょう,本当はロジックを愛してやまないんでしょう,楽しんでいただけたことを願っています,毎日,私たちは共に歴史を刻んできた,耳を塞ぐか
2Pac,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Cardi B,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Eminem,1,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
J. Cole,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Joyner Lucas,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Juice WRLD,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Kanye West,0,0,1,0,3,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Lil Pump,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Logic,1,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
Mac Miller,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
corpus = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtm.transpose()))
id2word = dict((v, k) for k, v in cv.vocabulary_.items())

In [8]:
lda = models.LdaModel(corpus=corpus, num_topics=5, id2word=id2word, passes=80)
lda.print_topics()

[(0,
  '0.002*"cmon" + 0.002*"logic" + 0.002*"outlaw" + 0.002*"cole" + 0.002*"pac" + 0.002*"heaven" + 0.002*"bam" + 0.001*"holla" + 0.001*"thoughts" + 0.001*"row"'),
 (1,
  '0.013*"nas" + 0.004*"buck" + 0.003*"queensbridge" + 0.002*"joyner" + 0.002*"bridge" + 0.002*"summer" + 0.002*"hip" + 0.002*"di" + 0.002*"queens" + 0.002*"represent"'),
 (2,
  '0.014*"woah" + 0.005*"racks" + 0.005*"codeine" + 0.005*"demons" + 0.005*"brr" + 0.004*"yuh" + 0.004*"dig" + 0.004*"percs" + 0.004*"choppa" + 0.003*"skrrt"'),
 (3,
  '0.011*"je" + 0.010*"nicki" + 0.006*"cardi" + 0.006*"que" + 0.005*"dem" + 0.005*"buck" + 0.004*"barbie" + 0.004*"minaj" + 0.004*"tu" + 0.003*"blazin"'),
 (4,
  '0.008*"shady" + 0.007*"biggie" + 0.005*"funk" + 0.003*"combs" + 0.003*"cmon" + 0.003*"dre" + 0.002*"poppa" + 0.002*"superman" + 0.002*"duh" + 0.002*"nuh"')]

In [9]:
corpus_transformed = lda[corpus]
list(zip([a[0][0] for a in corpus_transformed], data_dtm.index))

[(0, '2Pac'),
 (2, 'Cardi B'),
 (4, 'Eminem'),
 (0, 'J. Cole'),
 (1, 'Joyner Lucas'),
 (2, 'Juice WRLD'),
 (0, 'Kanye West'),
 (2, 'Lil Pump'),
 (0, 'Logic'),
 (0, 'Mac Miller'),
 (1, 'Nas'),
 (3, 'Nicki Minaj'),
 (4, 'Notorious B.I.G.')]

In [18]:
from sklearn.decomposition import LatentDirichletAllocation as LDA
lda_sk = LDA(n_components=5, n_jobs=-1)
lda_sk.fit(data_dtm)

  and should_run_async(code)


LatentDirichletAllocation(n_components=5, n_jobs=-1)

In [19]:
words = cv.get_feature_names()
for topic_idx, topic in enumerate(lda_sk.components_):
    print("\nTopic #%d:" % topic_idx)
    print(" ".join([words[i] for i in topic.argsort()[:-11:-1]]))


Topic #0:
bam kanye monster yeezy roc hoo cmon hype williams ridiculous

Topic #1:
nas je nicki buck dem que queensbridge barbie queens minaj

Topic #2:
woah shady logic demons codeine dig smokin percs insane rhyme

Topic #3:
brr racks chyeah yuh vroom slatt goyard esskeetit pinky fasho

Topic #4:
biggie funk cmon outlaw pac cole cardi combs row dogg


  and should_run_async(code)


In [20]:
from pyLDAvis import sklearn as sklearn_lda
import os
import pyLDAvis

LDAvis_data_filepath = os.path.join('./ldavis')

LDAvis_prepared = sklearn_lda.prepare(lda_sk, data_cv, cv)
with open(LDAvis_data_filepath, 'wb') as f:
    pickle.dump(LDAvis_prepared, f)
        
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)
    
pyLDAvis.save_html(LDAvis_prepared, './ldavis.html')

  and should_run_async(code)


In [23]:
from IPython.display import IFrame

IFrame(src='./ldavis.html', width=1080, height=720)

  and should_run_async(code)
