# Topic Modelling 

In [1]:
import pandas as pd
import pickle 

data = pd.read_pickle('dtm_stop.pkl')

In [5]:
from gensim import matutils, models
import scipy.sparse

In [6]:
tdm = data.transpose()

In [7]:
#put term document matrix into a new gensim format, df -> sparse matrix -> gensim corpus
sparse_counts = scipy.sparse.csr_matrix(tdm)
corpus = matutils.Sparse2Corpus(sparse_counts)

In [9]:
#gensim requires dictionary of all terms and their respective location in the tdm
cv = pickle.load(open("cv_stop.pkl", "rb"))
id2word = dict((v, k) for k, v in cv.vocabulary_.items())

### Now need to specify two other parameters: number of topics and number of passes. Let's try 3 topics

In [10]:
#3 topics
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=3, passes=10)
lda.print_topics()

[(0,
  '0.014*"nigga" + 0.013*"fuck" + 0.009*"know" + 0.009*"niggas" + 0.009*"cause" + 0.008*"bitch" + 0.008*"shit" + 0.007*"love" + 0.005*"life" + 0.005*"man"'),
 (1,
  '0.012*"know" + 0.009*"love" + 0.008*"cause" + 0.007*"shit" + 0.006*"world" + 0.006*"fuck" + 0.006*"man" + 0.005*"niggas" + 0.005*"nigga" + 0.005*"life"'),
 (2,
  '0.008*"yo" + 0.008*"man" + 0.007*"make" + 0.006*"clean" + 0.006*"know" + 0.006*"buggin" + 0.005*"shit" + 0.005*"niggas" + 0.005*"let" + 0.004*"nigga"')]

In [11]:
#4 topics
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=4, passes=10)
lda.print_topics()

[(0,
  '0.018*"fuck" + 0.012*"man" + 0.010*"yo" + 0.010*"shit" + 0.010*"know" + 0.009*"nigga" + 0.009*"cause" + 0.007*"niggas" + 0.006*"fuckin" + 0.006*"bitch"'),
 (1,
  '0.010*"nigga" + 0.010*"know" + 0.010*"love" + 0.008*"cause" + 0.007*"shit" + 0.007*"niggas" + 0.007*"bitch" + 0.006*"fuck" + 0.005*"life" + 0.005*"way"'),
 (2,
  '0.014*"buggin" + 0.010*"make" + 0.008*"girl" + 0.008*"relax" + 0.008*"settle" + 0.007*"bonita" + 0.007*"let" + 0.007*"check" + 0.006*"yo" + 0.005*"happen"'),
 (3,
  '0.009*"niggas" + 0.008*"know" + 0.008*"black" + 0.008*"murder" + 0.007*"need" + 0.007*"nigga" + 0.007*"love" + 0.006*"stop" + 0.005*"shit" + 0.005*"hard"')]

### Doesn't make too much sense at the moment, so play around with terms list
### Can also look at terms that are from one part of speech (nouns, adjectives). Try nouns first

In [12]:
#create a function to pull out nouns from a string of text
from nltk import word_tokenize, pos_tag

def nouns(text):
    '''Given a string of text, tokenize the text and pull out only the nouns.'''
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized = word_tokenize(text)
    all_nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)] 
    return ' '.join(all_nouns)

In [13]:
# Read in the cleaned data, before the CountVectorizer step
data_clean = pd.read_pickle('data_clean.pkl')
data_clean

Unnamed: 0,Lyrics
2pac,hit ’em up suckaass i aint got no motherfuc...
A Tribe Called Quest,we the people qtip we dont believe you caus...
Immortal Technique,dance with the devil part i dance with the de...
Jay-z and Kanye West,niggas in paris were gonna skate to one song...
Kendrick Lamar,humble nobody pray for me it been that day f...
Logic,pre logic ive been on the low i been taking ...
N.W.A.,fuck tha police dr dre dj yella the doc d...
Nas,ny state of mind dj premier yeah yeah ayo b...
Outkast,hey ya andré one two three uh andré my b...
The Notorious B.I.G.,juicy the notorious big fuck all you hoes ge...


In [21]:
# Apply the nouns function to the transcripts to filter only on nouns
data_nouns = pd.DataFrame(data_clean.Lyrics.apply(nouns))

In [25]:
# Create a new document-term matrix using only nouns
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

# Re-add the additional stop words since we are recreating the document-term matrix
add_stop_words = ['im','ive','dont','rza','aint','youre','ther','thats','ill','dr','ice','cube',
                 'nas','2pac','andré','gotta','tyler','creator','wanna','like','just','jayz','kanye',
                 'em','yall','cmon','uh','oh','la','ya','aa','raekwon','eazye','ren','ima','west',
                 'whats','da','ja','gon','got','gotta','imma','jay','logic','dre','inspectah','qtip',
                 'phife','hes','shes','aaaahhh','aaaahah','aah','technique','immortal','boi','biggie',
                 'notorious','uhoh','ooh', 'bastard', 'ol', 'gimme', 'kendrick', 'lamar', 'ayy','yeah']
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

# Recreate a document-term matrix with only nouns
cvn = CountVectorizer(stop_words=stop_words)
data_cvn = cvn.fit_transform(data_nouns.Lyrics)
data_dtmn = pd.DataFrame(data_cvn.toarray(), columns=cvn.get_feature_names_out())
data_dtmn.index = data_nouns.index
data_dtmn



Unnamed: 0,aames,aap,abbot,abomination,abortion,abraham,abstract,absurdity,abundance,ac,...,zero,zigzagzig,zipper,zone,zoo,zoom,zs,zulu,zyklonb,zé
2pac,0,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
A Tribe Called Quest,0,0,0,0,0,0,3,1,0,0,...,0,0,0,0,0,0,1,3,0,0
Immortal Technique,0,0,0,0,1,1,0,0,0,0,...,0,0,0,1,1,0,0,0,1,0
Jay-z and Kanye West,0,0,0,0,0,0,0,0,0,0,...,2,0,0,8,0,0,0,0,0,0
Kendrick Lamar,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Logic,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
N.W.A.,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Nas,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
Outkast,0,0,0,0,0,0,0,0,0,0,...,0,0,1,2,0,0,0,0,0,0
The Notorious B.I.G.,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# Create the gensim corpus
corpusn = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmn.transpose()))

# Create the vocabulary dictionary
id2wordn = dict((v, k) for k, v in cvn.vocabulary_.items())


In [27]:
# Let's try topics = 3
ldan = models.LdaModel(corpus=corpusn, num_topics=3, id2word=id2wordn, passes=10)
ldan.print_topics()

[(0,
  '0.015*"bitch" + 0.009*"buggin" + 0.008*"way" + 0.007*"hol" + 0.006*"cause" + 0.006*"niggas" + 0.006*"shit" + 0.006*"nigga" + 0.005*"time" + 0.005*"man"'),
 (1,
  '0.020*"man" + 0.017*"fuck" + 0.011*"cause" + 0.011*"shit" + 0.010*"yo" + 0.009*"nigga" + 0.008*"niggas" + 0.008*"bitch" + 0.006*"ass" + 0.006*"life"'),
 (2,
  '0.013*"cause" + 0.010*"life" + 0.009*"love" + 0.009*"nigga" + 0.008*"world" + 0.008*"time" + 0.008*"niggas" + 0.008*"man" + 0.007*"way" + 0.007*"shit"')]

In [28]:
# Let's try 4 topics
ldan = models.LdaModel(corpus=corpusn, num_topics=4, id2word=id2wordn, passes=10)
ldan.print_topics()

[(0,
  '0.017*"bitch" + 0.011*"hol" + 0.011*"bone" + 0.009*"motherfuckers" + 0.009*"money" + 0.008*"nigga" + 0.008*"man" + 0.008*"lets" + 0.007*"way" + 0.007*"loot"'),
 (1,
  '0.016*"cause" + 0.013*"niggas" + 0.012*"fuck" + 0.012*"nigga" + 0.011*"life" + 0.010*"love" + 0.009*"shit" + 0.009*"man" + 0.009*"bitch" + 0.009*"world"'),
 (2,
  '0.015*"man" + 0.011*"buggin" + 0.008*"yo" + 0.008*"cause" + 0.007*"time" + 0.007*"fuck" + 0.006*"method" + 0.006*"niggas" + 0.006*"wutang" + 0.005*"shit"'),
 (3,
  '0.011*"shit" + 0.011*"man" + 0.010*"bitch" + 0.009*"everybody" + 0.008*"hey" + 0.007*"life" + 0.007*"mind" + 0.007*"way" + 0.006*"cause" + 0.006*"poopooooh"')]

## Nouns and Adjectives

In [29]:
# Let's create a function to pull out nouns and adjectives from a string of text
def nouns_adj(text):
    '''Given a string of text, tokenize the text and pull out only the nouns and adjectives.'''
    is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    tokenized = word_tokenize(text)
    nouns_adj = [word for (word, pos) in pos_tag(tokenized) if is_noun_adj(pos)] 
    return ' '.join(nouns_adj)

In [30]:
# Apply the nouns function to the transcripts to filter only on nouns and adjectives
data_nouns_adj = pd.DataFrame(data_clean.Lyrics.apply(nouns_adj))
data_nouns_adj

Unnamed: 0,Lyrics
2pac,hit ’ suckaass i motherfuckin friends thats i ...
A Tribe Called Quest,people people rear ayo killingoffgoodyoungnigg...
Immortal Technique,dance devil part i dance devil i nigga real na...
Jay-z and Kanye West,niggas paris song song hard motherfuckers i ha...
Kendrick Lamar,humble nobody pray day way ayy i syrup sandwic...
Logic,pre logic ive low i time i im mind life aint m...
N.W.A.,fuck tha police dr dre dj yella doc dr dre ice...
Nas,ny state mind dj premier yeah yeah black—its t...
Outkast,hey andré baby dont mess i sure uh na cant sta...
The Notorious B.I.G.,juicy notorious big fuck grip motherfucker yea...


In [33]:
# Create a new document-term matrix using only nouns and adjectives, also remove common words with max_df
cvna = CountVectorizer(stop_words=stop_words, max_df=.8)
data_cvna = cvna.fit_transform(data_nouns_adj.Lyrics)
data_dtmna = pd.DataFrame(data_cvna.toarray(), columns=cvna.get_feature_names_out())
data_dtmna.index = data_nouns_adj.index
data_dtmna

Unnamed: 0,aames,aap,abbot,able,abomination,aborted,abortion,abraham,absolute,abstract,...,zero,zigzagzig,zipper,zone,zoo,zoom,zs,zulu,zyklonb,zé
2pac,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
A Tribe Called Quest,0,0,0,0,0,0,0,0,1,6,...,0,0,0,0,0,0,1,3,0,0
Immortal Technique,0,0,0,0,0,1,1,1,0,0,...,0,0,0,1,1,0,0,0,1,0
Jay-z and Kanye West,0,0,0,0,0,0,0,0,0,0,...,2,0,0,8,0,0,0,0,0,0
Kendrick Lamar,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Logic,1,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
N.W.A.,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Nas,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
Outkast,0,0,0,0,0,0,0,0,0,0,...,0,0,1,2,0,0,0,0,0,0
The Notorious B.I.G.,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
# Create the gensim corpus
corpusna = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmna.transpose()))

# Create the vocabulary dictionary
id2wordna = dict((v, k) for k, v in cvna.vocabulary_.items())

In [35]:
# Let's try 3 topics
ldana = models.LdaModel(corpus=corpusna, num_topics=3, id2word=id2wordna, passes=10)
ldana.print_topics()

[(0,
  '0.007*"motherfuckers" + 0.004*"everybody" + 0.004*"bone" + 0.004*"motherfucker" + 0.003*"wutang" + 0.003*"police" + 0.003*"method" + 0.003*"murder" + 0.003*"style" + 0.003*"loot"'),
 (1,
  '0.009*"clean" + 0.008*"buggin" + 0.003*"magic" + 0.003*"applebum" + 0.003*"bonita" + 0.003*"poopooooh" + 0.003*"hook" + 0.003*"point" + 0.003*"somethin" + 0.003*"jazz"'),
 (2,
  '0.014*"hol" + 0.009*"drank" + 0.007*"funk" + 0.007*"lil" + 0.007*"vibe" + 0.005*"bish" + 0.005*"dna" + 0.005*"humble" + 0.004*"sit" + 0.003*"liquor"')]

In [36]:
# Let's try 4 topics
ldana = models.LdaModel(corpus=corpusna, num_topics=4, id2word=id2wordna, passes=10)
ldana.print_topics()

[(0,
  '0.010*"clean" + 0.006*"police" + 0.004*"dope" + 0.004*"dopeman" + 0.004*"compton" + 0.004*"represent" + 0.004*"gangsta" + 0.003*"streets" + 0.003*"motherfucker" + 0.003*"poopooooh"'),
 (1,
  '0.022*"buggin" + 0.008*"applebum" + 0.008*"bonita" + 0.007*"jazz" + 0.007*"somethin" + 0.006*"rhyme" + 0.006*"dawg" + 0.005*"brothers" + 0.004*"stop" + 0.004*"space"'),
 (2,
  '0.006*"hol" + 0.005*"everybody" + 0.005*"murder" + 0.004*"drank" + 0.004*"alive" + 0.003*"vibe" + 0.003*"lil" + 0.003*"funk" + 0.003*"motherfuckers" + 0.003*"huh"'),
 (3,
  '0.009*"motherfuckers" + 0.008*"bone" + 0.007*"wutang" + 0.007*"method" + 0.006*"loot" + 0.005*"gim" + 0.005*"style" + 0.004*"word" + 0.004*"dollar" + 0.004*"party"')]

## Identify topics in each document

In [37]:
# Final LDA model (for now)
ldana = models.LdaModel(corpus=corpusna, num_topics=4, id2word=id2wordna, passes=80)
ldana.print_topics()

[(0,
  '0.008*"clean" + 0.007*"motherfuckers" + 0.005*"bone" + 0.005*"police" + 0.004*"loot" + 0.004*"motherfucker" + 0.004*"gim" + 0.004*"everybody" + 0.003*"party" + 0.003*"dope"'),
 (1,
  '0.009*"wutang" + 0.009*"method" + 0.007*"murder" + 0.006*"style" + 0.005*"ruckus" + 0.005*"dollar" + 0.005*"clan" + 0.004*"slow" + 0.004*"motherfuckin" + 0.004*"dirty"'),
 (2,
  '0.013*"buggin" + 0.005*"bonita" + 0.005*"applebum" + 0.004*"point" + 0.004*"somethin" + 0.004*"jazz" + 0.004*"brothers" + 0.003*"rhyme" + 0.003*"dawg" + 0.003*"stop"'),
 (3,
  '0.009*"hol" + 0.008*"everybody" + 0.006*"drank" + 0.005*"vibe" + 0.005*"funk" + 0.005*"lil" + 0.005*"magic" + 0.004*"na" + 0.004*"alive" + 0.004*"okay"')]

### Say that Topic 0:  Crime, Topic 1: Violence, Topic 2: Calmer, Topic 3: Party

In [38]:
# Let's take a look at which topics each transcript contains
corpus_transformed = ldana[corpusna]
list(zip([a for [(a,b)] in corpus_transformed], data_dtmna.index))

[(0, '2pac'),
 (2, 'A Tribe Called Quest'),
 (2, 'Immortal Technique'),
 (1, 'Jay-z and Kanye West'),
 (3, 'Kendrick Lamar'),
 (3, 'Logic'),
 (0, 'N.W.A.'),
 (0, 'Nas'),
 (0, 'Outkast'),
 (0, 'The Notorious B.I.G.'),
 (3, 'Tyler the Creator'),
 (1, 'Wu-Tang Clan')]

### So, Crime: 2pac, N.W.A., Nas, Outkast, Biggie
### Violence: Wu-Tang Clan, Jay-z and Kanye
### Calmer (for rap): A Tribe Called Quest, Immortal Technique
### Party: Kendrick, Tyler, The Creator, Logic