# This notebook performs topic modeling on lyrics to that we can investigate questions including : The differences between East Coast and West Coast rap

In [135]:
# common Python imports
import sys
import os
from collections import defaultdict
import pickle

In [136]:
# these are used for NLP, Data Manipulation, etc
import gensim
import nltk
import numpy as np
import pandas as pd
import seaborn as sns

# Load some NLTK data before we get started

In [137]:
%time nltk.download('punkt')
%time nltk.download('stopwords')
%time nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\slick\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Wall time: 996 µs
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\slick\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Wall time: 0 ns
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\slick\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Wall time: 0 ns


True

# And load a part-of-speech tagging model that was already trained

In [138]:
# helper function to translate POS tags from treebank to wordnet
def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return nltk.corpus.wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return nltk.corpus.wordnet.VERB
    elif treebank_tag.startswith('N'):
        return nltk.corpus.wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return nltk.corpus.wordnet.ADV
    else:
        return nltk.corpus.wordnet.NOUN

In [139]:
# set up our lemmatizer in case we enable it
lemma = nltk.wordnet.WordNetLemmatizer()

# and load a POS tagger
# let's also load and test a Brill Part of Speech tagger which was trained on the Penn Treebank:
BRILL_TAGGER_FILE_PATH = 'resources/treebank_brill_aubt.pickle'
brill_tagger = pickle.load(open(BRILL_TAGGER_FILE_PATH, 'rb'))
print(brill_tagger)

# now let's kick the tires on this tagger
test_tag_tokens = 'The cat walked onto an airplane'.split()
print(brill_tagger.tag(test_tag_tokens))

print([lemma.lemmatize(x[0], get_wordnet_pos(x[1])) for x in brill_tagger.tag(test_tag_tokens)])

<nltk.tag.brill.BrillTagger object at 0x000001F11643DD30>
[('The', 'DT'), ('cat', '-None-'), ('walked', 'VBD'), ('onto', 'IN'), ('an', 'DT'), ('airplane', 'NN')]
['The', 'cat', 'walk', 'onto', 'an', 'airplane']


In [140]:
test_sentence_2 = 'Get me on the court and I\'m trouble Last week fucked around and got a triple double'
print(brill_tagger.tag(test_sentence_2.split()))

[('Get', '-None-'), ('me', 'PRP'), ('on', 'IN'), ('the', 'DT'), ('court', 'NN'), ('and', 'CC'), ("I'm", '-None-'), ('trouble', 'NN'), ('Last', 'JJ'), ('week', 'NN'), ('fucked', 'VBD'), ('around', 'IN'), ('and', 'CC'), ('got', 'VBD'), ('a', 'DT'), ('triple', 'RB'), ('double', 'VB')]


# Loading the dataset
## This dataset comes from the Kaggle website at this URL: https://www.kaggle.com/artimous/every-song-you-have-heard-almost

In [141]:
%%time

print('Loading dataframes from CSV.  This might take some time...')

# NOTE : Without setting the engine here, we might hit the exception : "C error: EOF inside string ..."

# This dataset is comprised of two separate files possibly for size and download limitations
# so we'll put them together in a moment...
lyrics_1_df = pd.read_csv('c:/datasets/lyrics/lyrics1.csv',
                       engine = 'python')
lyrics_2_df = pd.read_csv('c:/datasets/lyrics/lyrics2.csv',
                       engine = 'python')
# now we can put them together into a single frame
lyrics_df = pd.concat([lyrics_1_df, lyrics_2_df])

print('Length of Set #1 : {}'.format(len(lyrics_1_df)))
print('Length of Set #2 : {}'.format(len(lyrics_2_df)))
print('Length of Both Set combined : {}'.format(len(lyrics_df)))

Loading dataframes from CSV.  This might take some time...
Length of Set #1 : 250000
Length of Set #2 : 266174
Length of Both Set combined : 516174
Wall time: 30.8 s


In [142]:
#replace carriage returns with periods to see if we can split lyrics as if they are sentences
lyrics_df = lyrics_df.replace({'\n': ' . '}, regex=True)

In [143]:
lyrics_df.head(10)

Unnamed: 0,Band,Lyrics,Song
0,Elijah Blake,"No, no . I ain't ever trapped out the bando . ...",Everyday
1,Elijah Blake,"The drinks go down and smoke goes up, I feel m...",Live Till We Die
2,Elijah Blake,She don't live on planet Earth no more . She f...,The Otherside
3,Elijah Blake,"Trippin' off that Grigio, mobbin', lights low ...",Pinot
4,Elijah Blake,"I see a midnight panther, so gallant and so br...",Shadows & Diamonds
5,Elijah Blake,I just want to ready your mind . 'Cause I'll s...,Uno
6,Elijah Harris,To believe . Or not to believe . That is the q...,Girlfriend (Main)
7,Elijah Levi,"No one here can love or understand me . Oh, wh...",Bye Bye Blackbird
8,Elijah Levi,"Lullaby of Birdland, that's what I . Always h...",Lullaby of Birdland
9,Elijah Levi,I hate to see that evening sun go down . I hat...,St. Louis Blues


# Before we start do do any text analysis, let's figure out the Hip-Hop artists we have

In [144]:
# NOTE : Could not find the following in this set : 
# EAST COAST : Nas
# WEST COAST : Warren G, Tha Dogg Pound
east_coast_artists = ['The Notorious B.I.G.', 'Diddy', 'Wu-Tang Clan', 'Craig Mack', 'Tim Dog']
west_coast_artists = ['N.W.A', 'Dr. Dre', '2Pac', 'Eazy-E', 'Ice Cube', 'Snoop Dogg', 'Nate Dogg', 'Daz Dillinger', ]

In [145]:
artist_check_df = lyrics_df[lyrics_df['Band'].str.contains("D.P.")].groupby(['Band']).size()
print(artist_check_df)

Band
3D Picnic                          1
Benito DiPaula                     1
DMP Big Band                       7
DePaul University Jazz Ensemble    2
Jacki DePiro                       3
Mario DePriest                     1
Nicky DePaola                      4
Sidney DeParis                     3
dtype: int64


In [146]:
artist_song_check_df = lyrics_df[lyrics_df['Band'] == 'Craig Mack']['Song']
print(artist_song_check_df)

84884           Flava in Ya Ear
84885         Funk Wit da Style
84886                  Get Down
84887             Judgement Day
84888                  Mainline
84889    Making Moves With Puff
84890    Project: Funk da World
84891                  Real Raw
84892            When God Comes
Name: Song, dtype: object


In [147]:
conditions = [
    (lyrics_df['Band'].isin(east_coast_artists)),
    (lyrics_df['Band'].isin(west_coast_artists))]
choices = ['East', 'West']
lyrics_df['RapCoast'] = np.select(conditions, choices, default='')

In [148]:
rap_df = lyrics_df[lyrics_df['RapCoast'].str.len() > 0]

In [149]:
print(rap_df.groupby(['Band', 'RapCoast']).size())

Band                  RapCoast
2Pac                  West        252
Craig Mack            East          9
Daz Dillinger         West         22
Diddy                 East        104
Dr. Dre               West         80
Eazy-E                West         24
Ice Cube              West        178
N.W.A                 West         22
Nate Dogg             West         40
Snoop Dogg            West        344
The Notorious B.I.G.  East         90
Tim Dog               East          7
Wu-Tang Clan          East        125
dtype: int64


In [150]:
print(rap_df.groupby(['RapCoast']).size())

RapCoast
East    335
West    962
dtype: int64


In [151]:
# let's write this to a file
rap_artist_filename = 'rap_artists.csv'
rap_df.to_csv(rap_artist_filename)
print('Wrote Rap artist file to CSV : {}'.format(rap_artist_filename))

Wrote Rap artist file to CSV : rap_artists.csv


In [152]:
text_list = rap_df['Lyrics'].tolist()
lyrics_index_list = rap_df.index.values

In [153]:
%%time

text_tokenized_list = []
token_count_list = []
unique_token_set = set()
for i, text in enumerate(text_list):
    if i % 10000 == 0:
        print('Tokenization process : [{0}/{1}]'.format(i, len(text_list)))
        
    # get the index into the original text
    index = text_list[i]
        
    # there are lots and lots of rows which have no lyrics at all, so let's skip them
    if not isinstance(text, str):
        #print('Skipping column type : {0} at index {1}'.format(type(text), index))  
        continue
        
    # this is a better way to tokenize, but for the interest of time, we will tokenize with
    # whitespace using python's split() function
    #tokens = nltk.word_tokenize(text)
    tokens = text.lower().split()
    text_tokenized_list.append(tokens)
    token_count_list.append(len(tokens))
    unique_token_set |= set(tokens)
    
print('Total size of tokenized list : {}'.format(len(text_tokenized_list)))
print('Total size of unique tokens : {}'.format(len(unique_token_set)))
print('DONE reading, tokenizing and counting')

Tokenization process : [0/1297]
Total size of tokenized list : 1297
Total size of unique tokens : 38586
DONE reading, tokenizing and counting
Wall time: 490 ms


# Now we'll process and clean the texts before we train a topic model...

In [154]:
# load a stopword set we want to use...
stoplist = nltk.corpus.stopwords.words('english')
stop_word_set = set(stoplist)
print(stop_word_set)

{'we', 'its', 'been', 'there', 'having', 'nor', 'be', 's', 'up', 'about', 'himself', 'have', 'by', 'of', 'no', 'should', 'ma', 'during', 'this', 'so', 'd', 're', 'couldn', 'they', 'do', 'doesn', 'such', 'y', 'whom', "she's", 'she', 'for', 'because', 'into', 'after', 'these', 'both', "hasn't", "shan't", 'above', 'you', "hadn't", 'needn', 'does', 'themselves', 'if', 'ours', 'in', 'can', 'further', 'more', 'm', 'him', 'yourself', 'ourselves', 'out', 'any', 'where', 'at', 'an', 'own', 'don', 'until', 'i', 'through', 'under', 'am', 'as', 've', 'while', 'had', 'herself', 'why', "you've", 'between', 'doing', 'again', "weren't", 'won', 'and', 'the', 'but', 'below', 'here', 'most', "won't", 'down', 'to', "didn't", 'wasn', 'which', "couldn't", 'what', 'did', 'when', 'now', 'are', 'weren', 'those', 'aren', 'hers', 'theirs', 'not', 'with', 'mustn', 'isn', 't', 'shan', 'few', 'his', 'o', "should've", "you're", 'some', 'hasn', "aren't", 'her', 'will', 'is', 'hadn', 'only', 'on', 'were', 'yourselves'

In [155]:
# we'll remove a few more from our dataset
stop_word_set.add('get')
stop_word_set.add('got')
stop_word_set.add('nigga')
stop_word_set.add('niggas')
stop_word_set.add('bitch')
stop_word_set.add('fuck')
stop_word_set.add('ain\t')
stop_word_set.add('aint')

In [156]:
class Token:
    def __init__(self, token, pos):
        self.token = token
        self.pos = pos
        
    def __repr__(self):
        return '{0}/{1}'.format(self.token, self.pos)

In [157]:
MIN_TERM_FREQUENCY = 3
MIN_TOKEN_LENGTH = 3
CULL_STOP_WORDS = True
CULL_BY_MIN_FREQUENCY = True
CULL_NON_ALPHA = False
CULL_TOKENS_WITH_NO_ALPHA = True
CULL_SHORT_TOKENS = True
CULL_PHRASES_WITH_STOPWORD_EDGE = False
CULL_PHRASE_NO_ALPHA_EDGE = False
CULL_PHRASE_DISALLOWED_START = False
CULL_NON_NOUNS = True
FILTER_DICTIONARY_BY_GLOBAL_IDF = False
MIN_GLOBAL_IDF_VALUE = 4.03

In [158]:
# but for now we'll use the original text verbatim
filtered_tokenized_texts = text_tokenized_list

print(filtered_tokenized_texts[0][:50])

['[pac]', '.', 'live', 'in', 'this', 'motherfucker', 'thug', 'pound', 'bitch', '.', 'kurupt', 'daz', 'makaveli', 'idi', 'kastro', 'hussein', '.', 'you', 'know', 'bitch', 'the', 'whole', "muthafuckin'", 'clique.', '.', 'we', "gon'", 'hit', "yo'", 'ass', 'up.', '.', 'you', 'know', 'where', 'we', "comin'", 'from', '.', 'death', 'row', 'bitch', 'ay', 'yo', 'kurupt', '.', 'you', 'first', 'to', 'blast']


In [159]:
# convert everything into this class even if we do not do POS tagging
if CULL_NON_NOUNS:
    print('Preparing Token and POS information for later culling...')
    for i in range(len(filtered_tokenized_texts)):
        document_tokens = filtered_tokenized_texts[i]
        token_objects = [Token(pos[0], pos[1]) for pos in brill_tagger.tag(document_tokens)]
        filtered_tokenized_texts[i] = token_objects
        
        #print(document_tokens)
        #break
    
    print('DONE with part of speech tagging')
    
else:
    print('Preparing Token information even without culling later')
    for i in range(len(filtered_tokenized_texts)):
        document_tokens = filtered_tokenized_texts[i]
        token_objects = [Token(token, '') for token in document_tokens if len(token) > 0]
        # store this back in 
        filtered_tokenized_texts[i] = token_objects
    
#print(filtered_tokenized_texts[:10])
print(filtered_tokenized_texts[0][:50])

Preparing Token and POS information for later culling...
DONE with part of speech tagging
[[pac]/-None-, ./., live/VB, in/IN, this/DT, motherfucker/NN, thug/-None-, pound/NN, bitch/NN, ./., kurupt/JJ, daz/-None-, makaveli/-None-, idi/-None-, kastro/-None-, hussein/NNP, ./., you/PRP, know/VBP, bitch/NN, the/DT, whole/JJ, muthafuckin'/NNP, clique./-None-, ./., we/PRP, gon'/-None-, hit/VBN, yo'/-None-, ass/NN, up./-None-, ./., you/PRP, know/VBP, where/WRB, we/PRP, comin'/NNP, from/IN, ./., death/NN, row/NN, bitch/NN, ay/-None-, yo/-None-, kurupt/JJ, ./., you/PRP, first/JJ, to/TO, blast/JJ]


In [160]:
%%time

# prepare to remove words that appear only once
frequency = defaultdict(int)
for text in filtered_tokenized_texts:
    for token in text:
        frequency[token.token] += 1
        
if CULL_STOP_WORDS:
    print('Culling stopwords...')
    # then CULL by alpha
    filtered_tokenized_texts = [[token for token in text if token.token not in stop_word_set]
             for text in filtered_tokenized_texts]
    
if CULL_NON_NOUNS:
    # keep this for any noun or any phrase ('_') since phrases may not be properly labeled for part of speech
    filtered_tokenized_texts = [[token for token in text if ('_' in token.token or get_wordnet_pos(token.pos) == nltk.corpus.wordnet.NOUN)]
             for text in filtered_tokenized_texts]
    
if CULL_PHRASES_WITH_STOPWORD_EDGE:
    print('Culling phrases with a stopword on their EDGE')
    filtered_tokenized_texts = [[token for token in text if not phrase_has_stopword_edge(token.token, stop_word_set)]
             for text in filtered_tokenized_texts]
    
if CULL_PHRASE_NO_ALPHA_EDGE:
    print('Culling phrases with a NO ALPHA term on their EDGE')
    filtered_tokenized_texts = [[token for token in text if not phrase_has_no_alpha_edge(token.token)]
             for text in filtered_tokenized_texts]
    
if CULL_PHRASE_DISALLOWED_START:
    print('Culling phrases starting with a DISALLOWED term (e.g. "p", "n")')
    filtered_tokenized_texts = [[token for token in text if not phrase_has_disallowed_start(token.token)]
             for text in filtered_tokenized_texts]

if CULL_BY_MIN_FREQUENCY:
    print('Culling by min frequency...')
    # CULL by frequency
    filtered_tokenized_texts = [[token for token in text if frequency[token.token] > MIN_TERM_FREQUENCY]
             for text in filtered_tokenized_texts]
    
# NOTE that this culling only culls tokens which have NO alpha tokens at all
# which is very different that the processing below
# this will enable this:
# 'TP53' whereas the method below would cull it
if CULL_TOKENS_WITH_NO_ALPHA:
    filtered_tokenized_texts = [[token for token in text if any(c.isalpha() for c in token.token)]
             for text in filtered_tokenized_texts]

if CULL_NON_ALPHA:
    print('Culling all non-alpha tokens...')
    # then CULL by alpha
    filtered_tokenized_texts = [[token for token in text if token.token.isalpha()]
             for text in filtered_tokenized_texts]
    
if CULL_SHORT_TOKENS:
    filtered_tokenized_texts = [[token for token in text if len(token.token) >= MIN_TOKEN_LENGTH]
             for text in filtered_tokenized_texts]
    
# finally convert back from the object into simple token strings
filtered_tokenized_texts = [[token.token for token in text] for text in filtered_tokenized_texts]

DEMO_WORDS_TO_PRINT = 50

print('Demo document tokens:')
print(filtered_tokenized_texts[0][:DEMO_WORDS_TO_PRINT])

Culling stopwords...
Culling by min frequency...
Demo document tokens:
['[pac]', 'motherfucker', 'thug', 'pound', 'daz', 'makaveli', 'kastro', 'hussein', "muthafuckin'", "gon'", "yo'", 'ass', 'up.', "comin'", 'death', 'row', 'westside', 'ballers', '[kurupt]', 'steel', 'cowards', 'panic', 'button', "thinkin'", "shit's", 'bomb', 'vietnam', 'rhyme', 'spine', 'back', 'cracker', 'smack', 'grammar', 'murder', 'alabama', 'arm', 'hammer', 'across', 'like', 'cause', 'fit', 'show', 'hoe', 'proposal', '(bitch)', "mashin'", 'ford', 'hell', 'host', 'horror']
Wall time: 2.62 s


In [161]:
dictionary = gensim.corpora.Dictionary(filtered_tokenized_texts)
print('Total dictionary size : [{}]'.format(len(dictionary.keys())))

Total dictionary size : [8185]


In [162]:
corpus = [dictionary.doc2bow(text) for text in filtered_tokenized_texts]
print('Corpus length : {}'.format(len(corpus)))

Corpus length : 1297


In [163]:
%%time 

NUM_TOPICS = 10
NUM_WORKERS = 6
VANILLA_LDA_PASSES = 1

# train model
lda = gensim.models.LdaMulticore(corpus, 
                                 id2word = dictionary, 
                                 num_topics = NUM_TOPICS, 
                                 workers = NUM_WORKERS, 
                                 passes = VANILLA_LDA_PASSES)  

Wall time: 21.5 s


In [164]:
lda.print_topics(-1)

[(0,
  '0.033*"i\'m" + 0.027*"like" + 0.007*"cause" + 0.006*"that\'s" + 0.005*"man" + 0.005*"time" + 0.005*"\'cause" + 0.005*"world" + 0.005*"ass" + 0.005*"money"'),
 (1,
  '0.039*"i\'m" + 0.021*"like" + 0.011*"one" + 0.008*"me," + 0.008*"that\'s" + 0.007*"cause" + 0.007*"wanna" + 0.006*"\'cause" + 0.005*"it," + 0.005*"y\'all"'),
 (2,
  '0.034*"i\'m" + 0.021*"like" + 0.010*"life" + 0.009*"\'cause" + 0.008*"cause" + 0.007*"time" + 0.007*"girl" + 0.007*"way" + 0.006*"that\'s" + 0.006*"baby"'),
 (3,
  '0.042*"i\'m" + 0.013*"like" + 0.008*"that\'s" + 0.007*"\'cause" + 0.007*"life" + 0.006*"wanna" + 0.006*"time" + 0.006*"yeah" + 0.005*"one" + 0.005*"me,"'),
 (4,
  '0.017*"i\'m" + 0.010*"that\'s" + 0.009*"wanna" + 0.008*"like" + 0.007*"ass" + 0.006*"me," + 0.006*"time" + 0.006*"snoop" + 0.005*"niggaz" + 0.005*"one"'),
 (5,
  '0.022*"i\'m" + 0.016*"cause" + 0.015*"like" + 0.009*"one" + 0.007*"man" + 0.006*"gangsta" + 0.006*"ass" + 0.006*"life" + 0.005*"wanna" + 0.005*"nah"'),
 (6,
  '0.033*"i