## Import

In [1]:
import pandas as pd
import numpy as np
import json
import glob
import re

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

from wordcloud import WordCloud, STOPWORDS

#spacy
import spacy
#from nltk.corpus import STOPWORDS

#vis
import pyLDAvis
import pyLDAvis.gensim_models

import warnings


In [2]:
# Relative path of the CSV that feeds the rest of the notebook.
DATA_PATH = 'data/wordclouds/unfiltered_polarity.csv'
data = pd.read_csv(DATA_PATH)
# Preview five random rows; no random_state is passed, so the rows shown differ between runs.
data.sample(5)

Unnamed: 0.1,Unnamed: 0,Artist,Song Name,release date,GPE,Line,Line Before,Line After,polarity,pos,neg,neu,compound
4054,4054,Immortal Technique,Harlem Streets,2011-10-27,Harlem,"""Homicide Harlem—blaow! What\'s the problem?""","""Homicide Harlem—blaow! What\'s the problem?""","""—blaow!""13Embed', ""Hip hop speech LyricsYou n...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0.0,1.0,0.0
658,658,De La Soul,Come on Down,1989-03-03,Jersey,Memphis (c\'mon) Utah (c\'mon) Jersey (c\'mon)...,New Orleans (c\'mon) Little Rock (c\'mon) B-Mo...,Atlanta (c\'mon) Brooklyn (c\'mon) Philly (c\'...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0.0,1.0,0.0
1568,1568,Queen Latifah,Brownsville,1993-11-16,Brooklyn,Lay my head all up in Brooklyn,That\'s where the fuck I\'m at,But Jersey\'s on the map,"{'neg': 0.412, 'neu': 0.588, 'pos': 0.0, 'comp...",0.0,0.412,0.588,-0.5423
2761,2761,Ma$e,Do You Remember,,Harlem,"Harlem World style, pursue my dream, cuz see",I\'m just a young cat tryin\' to do his thing,"The things that went 3 mil, I didn\'t even lik...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0.0,1.0,0.0
3602,3602,Big Pun,New York Giants,1999-11-16,South Bronx,"[Pun] South Bronx, Brook-lawn pa-pa",[MOP] Form the alliance,[MOP] New York Giants,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0.0,1.0,0.0


In [3]:
# Discretise the `compound` score into a three-way sentiment label:
#    1 -> compound >=  POLARITY_THRESHOLD
#   -1 -> compound <= -POLARITY_THRESHOLD
#    0 -> everything else (a NaN score fails both comparisons and also gets 0)
# np.select evaluates both conditions on the whole column at once, which replaces
# the row-by-row iterrows() loop while producing the same labels.
POLARITY_THRESHOLD = 0.5
labels = np.select(
    [data['compound'] >= POLARITY_THRESHOLD, data['compound'] <= -POLARITY_THRESHOLD],
    [1, -1],
    default=0,
).tolist()  # keep `labels` a plain Python list, as before
data['labels'] = labels

In [4]:
# Display the `labels` column as a quick sanity check of the assigned values.
data.labels

0       0
1       0
2       0
3       0
4       0
       ..
4336    1
4337    1
4338    0
4339    0
4340    1
Name: labels, Length: 4341, dtype: int64

In [5]:
# Collapse spelling variants of the same place onto one canonical name.
# Each key is matched against the whole cell value, exactly like the
# one-at-a-time replacements this mapping stands for.
gpe_aliases = {
    'NYC': 'New York City',
    'New York': 'New York City',
    'South Bronx': 'Bronx',
}
data['GPE'] = data['GPE'].replace(gpe_aliases)

In [6]:
# List the distinct GPE values present in `data`.
data.GPE.unique()

array(['Harlem', 'New York City', 'Bronx', 'Jersey', 'East Coast',
       'West Coast', 'Broadway', 'Brooklyn', 'America', 'Jamaica',
       'Manhattan', 'Queens', 'Staten Island', 'Chinatown', 'Long Island',
       'Queensbridge', 'Brownsville', 'SoHo', "Hell's Kitchen", 'Chelsea'],
      dtype=object)

In [7]:
# Show the column names currently present in `data`.
data.columns

Index(['Unnamed: 0', 'Artist', 'Song Name', 'release date', 'GPE', 'Line',
       'Line Before', 'Line After', 'polarity', 'pos', 'neg', 'neu',
       'compound', 'labels'],
      dtype='object')

In [8]:
# Sum the numeric columns per sentiment label.
# numeric_only=True makes the exclusion of the text columns (Artist, Line, ...)
# explicit instead of relying on pandas silently dropping them, which newer
# pandas versions no longer do by default.
# NOTE(review): the 'Unnamed: 0' total is a sum of row ids and carries no meaning.
data.groupby('labels').sum(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 0,pos,neg,neu,compound
labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
-1,960687,5.114,185.997,249.889,-285.28
0,7902231,216.44,182.701,3249.864,35.4176
1,557052,112.446,2.191,135.364,162.2661


## Dropping duplicated and NA lines

In [9]:
# Stack the previous, current and next lyric line of every row into one long
# frame with a single 'Line' column. All three copies carry the row's `labels`
# value, so the context lines inherit the label of the row they came from.
meta_cols = ['Artist', 'Song Name', 'release date', 'GPE']
df1, df2, df3 = (
    data[meta_cols + [source_col, 'labels']].rename(columns={source_col: 'Line'})
    for source_col in ('Line Before', 'Line', 'Line After')
)
# The original row index is kept, so each index value can appear up to three times.
df_lines = pd.concat([df1, df2, df3])
df_lines.sample(5)

Unnamed: 0,Artist,Song Name,release date,GPE,Line,labels
2061,Big L,American Dream,,Harlem,Yea What? Harlem on the Rise,0
4153,Joey bada$$,Born Day (AquariUS),2012-09-06,Brooklyn,To the world,0
4231,Ol' Dirty Bastard,Brooklyn Zoo II,2020-03-28,Brooklyn,Is the Brooklyn Zoo in the muthafucka house?,-1
3907,Action Bronson,Industry Shakedown Cypher,2015-03-23,New York City,"Cock Glocks, pop shots, turn your face to a bl...",-1
2879,Puff Daddy,Intro,2015-11-04,West Coast,Left my East Coast girl the Bentley to twirl (...,0


In [10]:
# Compare the (rows, columns) shape of the source frame with that of the stacked per-line frame.
data.shape, df_lines.shape

((4341, 14), (13023, 6))

In [11]:
# Count rows whose 'Line' text repeats an earlier row (the first occurrence is not counted).
df_lines.duplicated(subset=['Line']).sum()

4417

In [12]:
# Keep one row per distinct 'Line' text, then discard rows with no text at all.
# drop_duplicates keeps the first occurrence, i.e. the earliest row in the
# order in which df_lines was concatenated.
df_lines = (
    df_lines
    .drop_duplicates(subset=['Line'])
    .dropna(subset=['Line'])
)
df_lines.shape

(8605, 6)

In [13]:
# Distinct GPE values still present in df_lines.
df_lines.GPE.unique()

array(['Harlem', 'New York City', 'Bronx', 'Jersey', 'East Coast',
       'West Coast', 'Broadway', 'Brooklyn', 'America', 'Jamaica',
       'Manhattan', 'Queens', 'Chinatown', 'Long Island', 'Queensbridge',
       'Brownsville', 'Staten Island', 'SoHo', "Hell's Kitchen",
       'Chelsea'], dtype=object)

In [14]:
# Distinct label values in df_lines, in order of first appearance.
df_lines.labels.unique()

array([ 0, -1,  1])

In [15]:
# NOTE(review): repeats the earlier `df_lines.GPE.unique()` check verbatim;
# one of the two cells can probably be removed.
df_lines.GPE.unique()

array(['Harlem', 'New York City', 'Bronx', 'Jersey', 'East Coast',
       'West Coast', 'Broadway', 'Brooklyn', 'America', 'Jamaica',
       'Manhattan', 'Queens', 'Chinatown', 'Long Island', 'Queensbridge',
       'Brownsville', 'Staten Island', 'SoHo', "Hell's Kitchen",
       'Chelsea'], dtype=object)

## Creating corpus by labels

In [16]:
def _clean_lyric_line(raw_line):
    """Normalise one lyric line before it is added to a corpus document.

    Steps, in order: replace the literal two-character sequence backslash+n
    with a space, drop apostrophes, drop remaining backslashes, remove any
    (...) or [...] span, lower-case, and discard words shorter than 3
    characters.

    Args:
        raw_line: the lyric text as a string.

    Returns:
        The cleaned line, words separated by single spaces.
    """
    line = raw_line.replace('\\n', ' ')
    line = line.replace("'", '')
    line = line.replace('\\', '')
    # Raw string: same pattern as before, without the invalid-escape warning.
    line = re.sub(r"[\(\[].*?[\)\]]", "", line)
    line = line.lower()
    return ' '.join(w for w in line.split() if len(w) > 2)


# Build one document per (neighbourhood, label) pair: all cleaned lines of that
# pair joined by spaces. Documents are appended neighbourhood by neighbourhood,
# and within a neighbourhood in the order returned by df_lines.labels.unique().
corpus = []
i = 0  # running count of lyric lines processed
for neighborhood in df_lines.GPE.unique():
    # Filter by neighbourhood once instead of once per label.
    df_neighborhood = df_lines[df_lines.GPE == neighborhood]
    for label in df_lines.labels.unique():
        label_lines = df_neighborhood.loc[df_neighborhood.labels == label, 'Line']
        temp = [_clean_lyric_line(text) for text in label_lines]
        i += len(temp)
        corpus.append(" ".join(temp))


  line = re.sub("[\(\[].*?[\)\]]", "", line)


In [17]:
# Character lengths (the documents are strings) of three of the joined corpus documents.
len(corpus[0]), len(corpus[1]), len(corpus[10])


(26674, 4677, 2288)

In [18]:
# Number of documents in the corpus.
len(corpus)

60

In [19]:
# First 100 characters of the first document.
corpus[0][0:100]

'motthaven, you know, tremont stunting the stunners, smoking out the whips its sort like all the real'

## Lemmatization

In [20]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Reduce each text to the lemmas of the tokens whose POS tag is allowed.

    Args:
        texts: iterable of strings, one per document.
        allowed_postags: part-of-speech tags (compared against `token.pos_`)
            whose tokens are kept; all other tokens are dropped.

    Returns:
        A list with one string per input text: the kept lemmas in token
        order, joined by single spaces.
    """
    # The spaCy pipeline is loaded on every call, with parser and NER disabled.
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

    def _kept_lemmas(text):
        # Lemmas of the tokens that pass the part-of-speech filter.
        return " ".join(tok.lemma_ for tok in nlp(text) if tok.pos_ in allowed_postags)

    return [_kept_lemmas(text) for text in texts]

# Lemmatization is intentionally skipped here: `lemmatized_corpus` is only
# another name for the very same list object as `corpus` (no copy is made).
lemmatized_corpus = corpus  #Not activated in this case
# Character length of the first document (unchanged, since nothing was lemmatized).
len(lemmatized_corpus[0])

26674

In [21]:
# Character length of the third document.
len(corpus[2])

2274

## Removing stop words

In [22]:

def gen_words(texts):
    """Tokenise every text with gensim's `simple_preprocess`.

    Args:
        texts: iterable of strings, one per document.

    Returns:
        A list with one token list per input text. `deacc=True` asks gensim
        to strip accent marks from the tokens.
    """
    return [gensim.utils.simple_preprocess(text, deacc=True) for text in texts]

# Tokenise every corpus document; `data_words` holds one token list per document.
data_words = gen_words(lemmatized_corpus)  

In [23]:
# Number of tokens in the first document.
len(data_words[0])

4591

In [24]:
# Stopwords removal
# Assemble one combined stopword list from four sources: a project CSV, the
# wordcloud library, a handpicked list, and spaCy's English defaults.
# The result stays a list, so duplicates across sources are kept.
PATH_DATA = './data/wordclouds/'
PATH_STOPWORD = PATH_DATA + 'stopword_list.csv'
sw = pd.read_csv(PATH_STOPWORD, header=None)
# Drop empty (NaN) cells wherever they occur, instead of assuming that exactly
# the last two flattened values are NaN.
sw_list = [w for w in sw.values.flatten() if pd.notna(w)]

# Wordcloud library stopwords
stopwords = list(STOPWORDS) + sw_list

# Additional handpicked stopwords (mostly place names and filler words).
# NOTE(review): 'Coast' is capitalised; presumably tokens are lower-cased
# upstream, in which case only the lower-case 'coast' entry can ever match.
add_sw = ['//', 'yeah', 'huh', 'yo', 's', 'nt', 'lyric', 'lyrics', 'll','harlem', 'new','york', 
        'bronx', 'jersey', 'west',
       'manhattan', 'brooklyn', 'taten', 'america', 'east', 'Coast',
       'long', 'island', 'queensbridge', 'brownsville', 'talk', 'man', 'dont', 
       'aint', 'fuck', 'nyc', 'yall', 'rap', 'ill', 'wanna', 'gotta', 'staten', 'youre','coast', 'queens', 'nigga', 'niggas',
       'city', 'em']
stopwords = stopwords + add_sw

# Adding default spacy stopword list
en = spacy.load('en_core_web_sm')
spacy_stopwords = en.Defaults.stop_words
stopwords = stopwords + list(spacy_stopwords)
len(stopwords)

1105

In [25]:
# Token count of every document before stopword removal.
a = [len(tokens) for tokens in data_words]

a

[4591,
 793,
 385,
 12725,
 1483,
 1118,
 2535,
 409,
 173,
 1742,
 393,
 132,
 650,
 78,
 66,
 364,
 157,
 93,
 642,
 56,
 19,
 8646,
 1371,
 741,
 8549,
 1572,
 1616,
 1452,
 110,
 130,
 620,
 116,
 29,
 4155,
 476,
 191,
 87,
 17,
 0,
 441,
 0,
 0,
 504,
 186,
 63,
 558,
 94,
 51,
 1084,
 33,
 36,
 40,
 23,
 0,
 20,
 0,
 0,
 14,
 0,
 0]

In [26]:

# Remove stopwords from every document.
# The stopword list is turned into a set once so that each membership test is
# a hash lookup rather than a scan of the whole list; the surviving tokens and
# their order are unchanged.
stopword_set = set(stopwords)

new_data_words = [
    [word for word in text if word not in stopword_set]
    for text in data_words
]

# From here on `data_words` refers to the stopword-free token lists.
data_words = new_data_words

In [27]:
# Token count of every document after stopword removal.
a = [len(tokens) for tokens in new_data_words]

a

[2078,
 396,
 182,
 6444,
 710,
 534,
 1180,
 169,
 80,
 825,
 199,
 64,
 278,
 35,
 42,
 153,
 75,
 41,
 341,
 35,
 8,
 4015,
 659,
 358,
 4156,
 803,
 785,
 856,
 60,
 85,
 292,
 57,
 11,
 2002,
 236,
 97,
 52,
 8,
 0,
 193,
 0,
 0,
 249,
 86,
 31,
 257,
 38,
 27,
 503,
 14,
 16,
 26,
 16,
 0,
 12,
 0,
 0,
 8,
 0,
 0]

## Bigrams/trigrams

In [28]:
from gensim.models.phrases import Phraser
from gensim.models import Phrases

In [29]:
# Learn frequent word pairs from the tokenised documents, then learn a second
# model on the bigram-transformed documents so that it can join a bigram with
# a neighbouring word. `min_count` is set only for the bigram model; the
# trigram model uses the library default.
bigram_phrases = Phrases(data_words, min_count=2, threshold=10)
trigram_phrases = Phrases(bigram_phrases[data_words], threshold=10)

bigram = Phraser(bigram_phrases)
trigram = Phraser(trigram_phrases)


def make_bigrams(texts):
    """Apply the bigram model to every token list in `texts`.

    Args:
        texts: iterable of token lists.

    Returns:
        A list with the bigram-transformed version of each token list.
    """
    return [bigram[doc] for doc in texts]


def make_trigrams(texts):
    """Apply the bigram model and then the trigram model to every token list.

    Args:
        texts: iterable of raw token lists (NOT already bigram-transformed,
            because this function applies the bigram model itself).

    Returns:
        A list with the bigram+trigram-transformed version of each token list.
    """
    return [trigram[bigram[doc]] for doc in texts]


data_bigrams = make_bigrams(data_words)
# make_trigrams applies the bigram model internally, so it is given the raw
# tokens; passing data_bigrams would run the bigram model twice over each document.
data_bigrams_trigrams = make_trigrams(data_words)

#print(data_bigrams_trigrams[1])

## TF-IDF dictionary

In [30]:
from gensim.models import TfidfModel
texts = data_bigrams_trigrams

# Map every token to an integer id and encode each document as a bag of words.
# NOTE: from here on `corpus` is a list of (term_id, count) lists, no longer
# the list of joined strings built earlier.
id2word = corpora.Dictionary(texts)
corpus = [id2word.doc2bow(text) for text in texts]

tfidf = TfidfModel(corpus, id2word=id2word)

# Terms whose TF-IDF weight within a document is below this value are removed
# from that document's bag of words.
low_value = 0.05
words = []                   # every removed term, as a string, across all documents
words_missing_in_tfidf = []  # after the loop: ids the model omitted for the last document

for i in range(len(corpus)):
    bow = corpus[i]
    scored = tfidf[bow]  # computed once per document
    tfidf_ids = {term_id for term_id, _ in scored}

    low_value_words = [term_id for term_id, score in scored if score < low_value]
    # Ids present in the bag of words but left out of the model's output.
    # Computed BEFORE logging so `words` records this document's ids rather
    # than the previous document's.
    words_missing_in_tfidf = [term_id for term_id, _ in bow if term_id not in tfidf_ids]

    words.extend(id2word[term_id] for term_id in low_value_words + words_missing_in_tfidf)

    dropped_ids = set(low_value_words) | set(words_missing_in_tfidf)
    corpus[i] = [pair for pair in bow if pair[0] not in dropped_ids]

## Bag of words and dictionary

# NOTE(review): these statements rebuild `id2word` and `corpus` from
# `data_words`, overwriting whatever dictionary and bag-of-words corpus were
# assigned to those names before, while any `tfidf` model fitted earlier is
# left untouched. No execution count or output is recorded for them in the
# saved notebook, so they were presumably skipped in the saved run — confirm
# whether they are meant to run on Restart & Run All.
id2word = corpora.Dictionary(data_words)

corpus = []

# Encode each token list as a list of (term_id, count) pairs.
for i, text in enumerate(data_words):
    new = id2word.doc2bow(text)
    corpus.append(new)


# `[0][:1][0]` evaluates to 0, so this looks up the token with id 0.
id2word[[0][:1][0]]

# Number of distinct tokens in the dictionary.
len(id2word)

## Visualization

In [31]:
# Fit a 10-topic LDA model on the current bag-of-words corpus.
# random_state is fixed at 100 so repeated runs start from the same seed.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha="auto",
)

In [32]:
# Render the interactive pyLDAvis view of the fitted LDA model inline.
pyLDAvis.enable_notebook()
# mds="mmds" and R=30 are passed through to pyLDAvis unchanged;
# R is presumably the number of terms listed per topic — TODO confirm.
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
vis

  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


In [41]:
# Look up the token stored under id 26 in the dictionary.
id2word[26]

'ayy'

In [46]:
# Term id of the first (term_id, weight) pair in the TF-IDF vector of the first document.
tfidf[corpus[0]][0][0]

26

In [52]:
# Build one DataFrame per document holding its terms (column 0) and their
# TF-IDF weights (column 1).
word_list = []
df_list = []
for bow in corpus:
    # Start a fresh list for every document so each frame contains only that
    # document's terms rather than everything accumulated so far. The TF-IDF
    # vector is computed once per document and iterated directly, which also
    # avoids indexing past its end if the model returns fewer entries than the
    # bag of words has.
    word_list = [[id2word[word_id], weight] for word_id, weight in tfidf[bow]]

    # Explicit integer column labels keep 0 and 1 available even for empty documents.
    df_temp = pd.DataFrame(word_list, columns=[0, 1])
    df_list.append(df_temp)

df_list[0]

Unnamed: 0,0,1
0,ayy,0.107507
1,baby,0.096049
2,bout,0.115515
3,bruh,0.097183
4,cam,0.107507
5,check,0.09132
6,coming_coming,0.146933
7,cruise,0.107507
8,danger_zone,0.122058
9,dimes_sprint,0.091543


In [55]:
# Rank terms by the value in column 1, highest first.
# NOTE(review): `df` is not defined in any cell visible here, so this line
# depends on state created elsewhere — confirm where `df` comes from
# before relying on Restart & Run All.
df.sort_values(by=1, ascending=False)

Unnamed: 0,0,1
1413,broadway,0.898954
1413,broadway,0.898954
1413,broadway,0.898954
1413,broadway,0.898954
1413,broadway,0.898954
...,...,...
2898,gun,0.050489
2898,gun,0.050489
2898,gun,0.050489
2898,gun,0.050489
