# This notebook performs topic modeling on lyrics to that we can investigate questions including : The differences between East Coast and West Coast rap

In [1]:
import sys
import os

In [2]:
import gensim
import nltk
import numpy as np
import pandas as pd
import seaborn as sns



# Loading the dataset
## This dataset comes from the Kaggle website at this URL: https://www.kaggle.com/artimous/every-song-you-have-heard-almost

In [3]:
%%time

print('Loading dataframes from CSV.  This might take some time...')

# NOTE : Without setting the engine here, we might hit the exception : "C error: EOF inside string ..."

# This dataset is comprised of two separate files possibly for size and download limitations
# so we'll put them together in a moment...
lyrics_1_df = pd.read_csv('c:/datasets/lyrics/lyrics1.csv',
                       engine = 'python')
lyrics_2_df = pd.read_csv('c:/datasets/lyrics/lyrics2.csv',
                       engine = 'python')
# now we can put them together into a single frame
lyrics_df = pd.concat([lyrics_1_df, lyrics_2_df])

print('Length of Set #1 : {}'.format(len(lyrics_1_df)))
print('Length of Set #2 : {}'.format(len(lyrics_2_df)))
print('Length of Both Set combined : {}'.format(len(lyrics_df)))

Loading dataframes from CSV.  This might take some time...
Length of Set #1 : 250000
Length of Set #2 : 266174
Length of Both Set combined : 516174
Wall time: 32.9 s


In [4]:
#replace carriage returns
lyrics_df = lyrics_df.replace({'\n': ' '}, regex=True)

In [5]:
lyrics_df.head(10)

Unnamed: 0,Band,Lyrics,Song
0,Elijah Blake,"No, no I ain't ever trapped out the bando But ...",Everyday
1,Elijah Blake,"The drinks go down and smoke goes up, I feel m...",Live Till We Die
2,Elijah Blake,She don't live on planet Earth no more She fou...,The Otherside
3,Elijah Blake,"Trippin' off that Grigio, mobbin', lights low ...",Pinot
4,Elijah Blake,"I see a midnight panther, so gallant and so br...",Shadows & Diamonds
5,Elijah Blake,I just want to ready your mind 'Cause I'll sti...,Uno
6,Elijah Harris,To believe Or not to believe That is the quest...,Girlfriend (Main)
7,Elijah Levi,"No one here can love or understand me Oh, what...",Bye Bye Blackbird
8,Elijah Levi,"Lullaby of Birdland, that's what I Always hea...",Lullaby of Birdland
9,Elijah Levi,I hate to see that evening sun go down I hate ...,St. Louis Blues


# Before we start do do any text analysis, let's figure out the Hip-Hop artists we have

In [6]:
# NOTE : Could not find the following in this set : 
# EAST COAST : Nas
# WEST COAST : Warren G, Tha Dogg Pound
east_coast_artists = ['The Notorious B.I.G.', 'Diddy', 'Wu-Tang Clan', 'Craig Mack', 'Tim Dog']
west_coast_artists = ['N.W.A', 'Dr. Dre', '2Pac', 'Eazy-E', 'Ice Cube', 'Snoop Dogg', 'Nate Dogg', 'Daz Dillinger', ]

In [7]:
artist_check_df = lyrics_df[lyrics_df['Band'].str.contains("D.P.")].groupby(['Band']).size()
print(artist_check_df)

Band
3D Picnic                          1
Benito DiPaula                     1
DMP Big Band                       7
DePaul University Jazz Ensemble    2
Jacki DePiro                       3
Mario DePriest                     1
Nicky DePaola                      4
Sidney DeParis                     3
dtype: int64


In [8]:
artist_song_check_df = lyrics_df[lyrics_df['Band'] == 'Craig Mack']['Song']
print(artist_song_check_df)

84884           Flava in Ya Ear
84885         Funk Wit da Style
84886                  Get Down
84887             Judgement Day
84888                  Mainline
84889    Making Moves With Puff
84890    Project: Funk da World
84891                  Real Raw
84892            When God Comes
Name: Song, dtype: object


In [9]:
conditions = [
    (lyrics_df['Band'].isin(east_coast_artists)),
    (lyrics_df['Band'].isin(west_coast_artists))]
choices = ['East', 'West']
lyrics_df['RapCoast'] = np.select(conditions, choices, default='')

In [10]:
rap_df = lyrics_df[lyrics_df['RapCoast'].str.len() > 0]

In [11]:
print(rap_df.groupby(['Band', 'RapCoast']).size())

Band                  RapCoast
2Pac                  West        252
Craig Mack            East          9
Daz Dillinger         West         22
Diddy                 East        104
Dr. Dre               West         80
Eazy-E                West         24
Ice Cube              West        178
N.W.A                 West         22
Nate Dogg             West         40
Snoop Dogg            West        344
The Notorious B.I.G.  East         90
Tim Dog               East          7
Wu-Tang Clan          East        125
dtype: int64


In [12]:
print(rap_df.groupby(['RapCoast']).size())

RapCoast
East    335
West    962
dtype: int64


In [13]:
# let's write this to a file
rap_artist_filename = 'rap_artists.csv'
rap_df.to_csv(rap_artist_filename)
print('Wrote Rap artist file to CSV : {}'.format(rap_artist_filename))

Wrote Rap artist file to CSV : rap_artists.csv


In [14]:
text_list = rap_df['Lyrics'].tolist()
lyrics_index_list = rap_df.index.values

In [17]:
%%time

text_tokenized_list = []
token_count_list = []
unique_token_set = set()
for i, text in enumerate(text_list):
    if i % 10000 == 0:
        print('Tokenization process : [{0}/{1}]'.format(i, len(text_list)))
        
    # get the index into the original text
    index = text_list[i]
        
    # there are lots and lots of rows which have no lyrics at all, so let's skip them
    if not isinstance(text, str):
        #print('Skipping column type : {0} at index {1}'.format(type(text), index))  
        continue
        
    # this is a better way to tokenize, but for the interest of time, we will tokenize with
    # whitespace using python's split() function
    #tokens = nltk.word_tokenize(text)
    tokens = text.split()
    text_tokenized_list.append(tokens)
    token_count_list.append(len(tokens))
    unique_token_set |= set(tokens)
    
print('Total size of tokenized list : {}'.format(len(text_tokenized_list)))
print('Total size of unique tokens : {}'.format(len(unique_token_set)))
print('DONE reading, tokenizing and counting')

Tokenization process : [0/1297]
Total size of tokenized list : 1297
Total size of unique tokens : 45378
DONE reading, tokenizing and counting
Wall time: 323 ms


# Now it's time to train a topic model...

In [18]:
# TODO : let's do some additional filtering here later...

# but for now we'll use the text list verbatim
filtered_text_tokenized_list = text_tokenized_list

In [20]:
dictionary = gensim.corpora.Dictionary(filtered_text_tokenized_list)
print('Total dictionary size : [{}]'.format(len(dictionary.keys())))

Total dictionary size : [45378]


In [25]:
corpus = [dictionary.doc2bow(text) for text in filtered_text_tokenized_list]
print('Corpus length : {}'.format(len(corpus)))

Corpus length : 1297


In [29]:
%%time 

NUM_TOPICS = 10
NUM_WORKERS = 6
VANILLA_LDA_PASSES = 1

# train model
lda = gensim.models.LdaMulticore(corpus, 
                                 id2word=dictionary, 
                                 num_topics=NUM_TOPICS, 
                                 workers = NUM_WORKERS, 
                                 passes = VANILLA_LDA_PASSES)  

Wall time: 37.8 s


In [30]:
lda.print_topics(-1)

[(0,
  '0.038*"the" + 0.026*"I" + 0.021*"you" + 0.018*"to" + 0.016*"my" + 0.013*"it" + 0.011*"a" + 0.011*"and" + 0.011*"me" + 0.010*"in"'),
 (1,
  '0.039*"the" + 0.023*"I" + 0.017*"to" + 0.015*"a" + 0.015*"it" + 0.013*"you" + 0.012*"my" + 0.011*"me" + 0.011*"and" + 0.010*"in"'),
 (2,
  '0.035*"I" + 0.034*"the" + 0.021*"a" + 0.019*"you" + 0.014*"to" + 0.014*"in" + 0.011*"and" + 0.011*"me" + 0.011*"my" + 0.010*"I\'m"'),
 (3,
  '0.050*"the" + 0.022*"to" + 0.021*"I" + 0.019*"you" + 0.016*"a" + 0.016*"and" + 0.014*"in" + 0.013*"on" + 0.010*"my" + 0.009*"me"'),
 (4,
  '0.027*"the" + 0.026*"I" + 0.024*"a" + 0.018*"you" + 0.017*"and" + 0.015*"to" + 0.012*"I\'m" + 0.011*"in" + 0.010*"on" + 0.010*"my"'),
 (5,
  '0.046*"the" + 0.022*"a" + 0.022*"I" + 0.022*"to" + 0.021*"you" + 0.011*"and" + 0.011*"my" + 0.010*"me" + 0.009*"that" + 0.008*"in"'),
 (6,
  '0.051*"the" + 0.021*"to" + 0.017*"a" + 0.015*"my" + 0.014*"I" + 0.013*"you" + 0.010*"in" + 0.008*"and" + 0.008*"of" + 0.008*"it"'),
 (7,
  '0.035*