## Imports

In [1]:
import pandas as pd
import numpy as np
import json
import glob
import re

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

from wordcloud import WordCloud, STOPWORDS

#spacy
import spacy
#from nltk.corpus import STOPWORDS

#vis
import pyLDAvis
import pyLDAvis.gensim_models

import warnings


In [2]:
# Location of the input CSV, given as a path relative to the working directory.
DATA_PATH = 'data/wordclouds/unfiltered_polarity.csv'
# Load the whole table into a DataFrame.
data = pd.read_csv(DATA_PATH)
# Preview five randomly drawn rows (no random_state is passed, so the rows differ between runs).
data.sample(5)

Unnamed: 0.1,Unnamed: 0,Artist,Song Name,release date,GPE,Line,Line Before,Line After,polarity,pos,neg,neu,compound
4125,4125,Immortal Technique,The Rebel,2001-09-14,America,They used to call us communists for fighting A...,I'm coming in here to take your head off4Embed...,Now they call us terrorists to spread hysteria,"{'neg': 0.0, 'neu': 0.812, 'pos': 0.188, 'comp...",0.188,0.0,0.812,0.4019
1753,1753,Masta Ace,Nana,2009-01-01,Jamaica,"On Red, who run the spot, this old Jamaican","In a tall can, he go the the door and start br...","Like forty-nine or fifty years old, he\'s maki...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0.0,1.0,0.0
977,977,Public Enemy,Get It In,1988-06-28,Bronx,"In the Bronx, we rock the block, you gotta (Ge...","Riding on the block, you gotta (Get it in)","Nassau county on the rock, you gotta (Get it in)","{'neg': 0.266, 'neu': 0.734, 'pos': 0.0, 'comp...",0.0,0.266,0.734,-0.4404
2470,2470,Inspectah Deck,Crazy,2019-07-12,America,"Trying to live that American dream, a three ca...",He was good til he caught the charge,"Man, it\'s hard and it\'s ways to go","{'neg': 0.0, 'neu': 0.707, 'pos': 0.293, 'comp...",0.293,0.0,0.707,0.4404
3368,3368,Nas,American Way,1996-07-02,America,"Yeah I think about this every day, that\'s the...","Yeah I think about this every day, that\'s the...",,"{'neg': 0.243, 'neu': 0.608, 'pos': 0.149, 'co...",0.149,0.243,0.608,-0.34


In [3]:
# Bucket the `compound` polarity score into a three-class sentiment label:
#    1 (positive) when compound >= POS_THRESHOLD,
#   -1 (negative) when compound <= NEG_THRESHOLD,
#    0 (neutral)  otherwise (this also covers missing scores, whose comparisons are False).
POS_THRESHOLD = 0.3
NEG_THRESHOLD = -0.3

# Vectorised replacement for a row-by-row loop; np.select takes the first matching condition.
# `.tolist()` keeps `labels` a plain Python list of ints, as before.
labels = np.select(
    [data['compound'] >= POS_THRESHOLD, data['compound'] <= NEG_THRESHOLD],
    [1, -1],
    default=0,
).tolist()
data['labels'] = labels

In [4]:
# List the distinct place names found in the GPE column.
data.GPE.unique()

array(['Harlem', 'New York', 'Bronx', 'South Bronx', 'Jersey',
       'East Coast', 'West Coast', 'Broadway', 'Brooklyn', 'NYC',
       'America', 'Jamaica', 'Manhattan', 'Queens', 'Staten Island',
       'Chinatown', 'New York City', 'Long Island', 'Queensbridge',
       'Brownsville', 'SoHo', "Hell's Kitchen", 'Chelsea'], dtype=object)

## Aggregating neighborhoods

In [5]:
# Count the non-null values of every column for each GPE value.
data.groupby('GPE').count()

Unnamed: 0_level_0,Unnamed: 0,Artist,Song Name,release date,Line,Line Before,Line After,polarity,pos,neg,neu,compound,labels
GPE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
America,556,556,556,435,556,523,485,556,556,556,556,556,556
Broadway,46,46,46,28,46,46,45,46,46,46,46,46,46
Bronx,242,242,242,186,242,238,221,242,242,242,242,242,242
Brooklyn,835,835,835,712,835,807,749,835,835,835,835,835,835
Brownsville,52,52,52,47,52,52,50,52,52,52,52,52,52
Chelsea,1,1,1,1,1,1,1,1,1,1,1,1,1
Chinatown,7,7,7,6,7,7,6,7,7,7,7,7,7
East Coast,75,75,75,61,75,73,69,75,75,75,75,75,75
Harlem,434,434,434,344,434,428,414,434,434,434,434,434,434
Hell's Kitchen,1,1,1,1,1,1,1,1,1,1,1,1,1


In [6]:
# Collapse fine-grained place names into the coarse location buckets used below.
# No source name is also a target name, so one dictionary lookup gives the same
# result as applying the replacements one after another.
# NOTE(review): Jersey / Long Island / Staten Island are folded into 'Harlem', and
# Brooklyn / Queens / Queensbridge / Brownsville into 'Bronx' — confirm this grouping is intended.
gpe_groups = {
    'New York City': ['NYC', 'New York', 'SoHo', 'Broadway', 'Manhattan', 'Chinatown'],
    'Harlem': ['Jersey', 'Long Island', 'Staten Island'],
    'Bronx': ['South Bronx', 'Brooklyn', 'Queensbridge', 'Queens', 'Brownsville'],
    'America': ['East Coast', 'West Coast'],
}
# Invert the bucket table into {original name: bucket name}.
gpe_lookup = {
    alias: bucket
    for bucket, aliases in gpe_groups.items()
    for alias in aliases
}
data['GPE'] = data['GPE'].replace(gpe_lookup)


## Dropping unwanted locations

In [7]:
# Move GPE into the index so that rows can be dropped by location name.
data = data.set_index('GPE')

In [8]:
# Remove every row whose index label (the location) is one of the unwanted places,
# then turn the GPE index back into an ordinary column.
unwanted_locations = ['Chelsea', 'Jamaica', "Hell's Kitchen", 'America']
data = data.drop(index=unwanted_locations).reset_index()

In [9]:
# Re-check the per-location non-null counts now that locations have been merged and dropped.
data.groupby('GPE').count()

Unnamed: 0_level_0,Unnamed: 0,Artist,Song Name,release date,Line,Line Before,Line After,polarity,pos,neg,neu,compound,labels
GPE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Bronx,1645,1645,1645,1372,1645,1607,1509,1645,1645,1645,1645,1645,1645
Harlem,708,708,708,562,708,694,668,708,708,708,708,708,708
New York City,1178,1178,1178,933,1178,1158,1090,1178,1178,1178,1178,1178,1178


In [10]:
# Show the column names of the working DataFrame.
data.columns

Index(['GPE', 'Unnamed: 0', 'Artist', 'Song Name', 'release date', 'Line',
       'Line Before', 'Line After', 'polarity', 'pos', 'neg', 'neu',
       'compound', 'labels'],
      dtype='object')

## Dropping duplicate and NA lines

In [11]:
# Stack the preceding, matching and following lyric lines into one long table with a
# single 'Line' text column. Each copy keeps the song metadata and the label of the
# row it was taken from.
meta_cols = ['Artist', 'Song Name', 'release date', 'GPE']
df1, df2, df3 = (
    data[meta_cols + [text_col, 'labels']].rename(columns={text_col: 'Line'})
    for text_col in ('Line Before', 'Line', 'Line After')
)
df_lines = pd.concat([df1, df2, df3])
# Preview five random rows of the stacked table.
df_lines.sample(5)

Unnamed: 0,Artist,Song Name,release date,GPE,Line,labels
2192,Young M.A,Body Bag,2019-09-27,Bronx,I'm 'bout to talk my shit,-1
1164,U-God,Climate,2008-01-01,New York City,So we sat at the table with the top heads and ...,0
2626,Keith Murray,Some Shit,2003-07-15,Harlem,And feed it to the jigga-boos wit fried chicke...,0
1401,Masta Ace,Me & AG,2012-07-17,New York City,"I'm so underground, my Queen named Harriet",0
2630,Keith Murray,World Be Free,1994-11-08,Harlem,"Like Philly, D.C., Cincinnati, and M.D",1


In [12]:
# Compare the shape of the source table with that of the stacked one-line-per-row table.
data.shape, df_lines.shape

((3531, 14), (10593, 6))

In [13]:
# Number of rows whose 'Line' text repeats that of an earlier row.
df_lines.duplicated(subset=['Line']).sum()

3576

In [14]:
# Keep one row per distinct lyric line, then discard rows that have no text at all.
df_lines = (
    df_lines
    .drop_duplicates(subset=['Line'])
    .dropna(subset=['Line'])
)
df_lines.shape

(7016, 6)

In [15]:
# Locations still present in the de-duplicated table.
df_lines.GPE.unique()

array(['Harlem', 'New York City', 'Bronx'], dtype=object)

In [16]:
# Sentiment label values still present in the de-duplicated table.
df_lines.labels.unique()

array([ 0,  1, -1])

## Creating one corpus per neighborhood

In [17]:
# Matches a bracketed or parenthesised span such as "(Get it in)" or "[Chorus]".
# Written as a raw string so the backslashes reach the regex engine unchanged and
# Python does not warn about invalid escape sequences.
BRACKETED_SPAN = re.compile(r"[\(\[].*?[\)\]]")


def clean_line(raw_line):
    """Normalise one lyric line for the word-level corpus.

    Steps, in order: turn literal backslash-n sequences into spaces, remove
    apostrophes and any remaining backslashes, remove bracketed/parenthesised
    spans, lowercase, and keep only whitespace-separated words of three or
    more characters.

    Args:
        raw_line: the lyric text as a string.

    Returns:
        The cleaned line, with the surviving words joined by single spaces.
    """
    line = raw_line.replace('\\n', ' ')
    line = line.replace("'", '')
    line = line.replace('\\', '')
    line = BRACKETED_SPAN.sub('', line)
    line = line.lower()
    return ' '.join(word for word in line.split() if len(word) > 2)


# One long string per location: all of its cleaned lyric lines joined by spaces,
# in the order the locations appear in df_lines.
corpus = [
    ' '.join(
        clean_line(raw_line)
        for raw_line in df_lines.loc[df_lines.GPE == neighborhood, 'Line']
    )
    for neighborhood in df_lines.GPE.unique()
]


  line = re.sub("[\(\[].*?[\)\]]", "", line)


In [18]:
# Number of corpus strings (one per location).
len(corpus)

3

In [19]:
#len(corpus[0]),len(corpus[1]),len(corpus[2])#, len(corpus[3])


In [20]:
# Peek at the first 100 characters of the first corpus string.
corpus[0][0:100]

'motthaven, you know, tremont shawty sucked out the condom, kissed hubby the mouth, stunting the stun'

## Lemmatization

In [21]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Reduce each text to the lemmas of its tokens that carry an allowed POS tag.

    The spaCy pipeline 'en_core_web_sm' is loaded on every call, with the
    'parser' and 'ner' components disabled.

    Args:
        texts: iterable of strings to process.
        allowed_postags: part-of-speech tags (compared with ``token.pos_``)
            whose tokens are kept.

    Returns:
        A list with one string per input text: the kept lemmas joined by spaces.
    """
    pipeline = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    return [
        " ".join(tok.lemma_ for tok in pipeline(text) if tok.pos_ in allowed_postags)
        for text in texts
    ]

# Lemmatization is skipped here: `lemmatized_corpus` is simply another name for `corpus`,
# and the lemmatization() function above is not called.
lemmatized_corpus = corpus  #Not activated in this case
# Character length of the first corpus string.
len(lemmatized_corpus[0])

56696

In [22]:
# Character length of the first raw corpus string, for comparison with lemmatized_corpus.
len(corpus[0])

56696

## Removing stop words

In [23]:

def gen_words(texts):
    """Tokenise every text with gensim's ``simple_preprocess``.

    ``deacc=True`` is passed so that accents are stripped from the tokens.

    Args:
        texts: iterable of strings.

    Returns:
        A list holding, for each input text, the result of ``simple_preprocess``.
    """
    return [gensim.utils.simple_preprocess(doc, deacc=True) for doc in texts]

# Tokenise each per-location corpus string into a list of words.
data_words = gen_words(lemmatized_corpus)  

In [24]:
# Number of tokens in the first document.
len(data_words[0])

9677

In [25]:
# Stopwords removal 
PATH_DATA = './data/wordclouds/'
PATH_STOPWORD = PATH_DATA + 'stopword_list.csv'
sw = pd.read_csv(PATH_STOPWORD, header=None)
sw_list = list(sw.values.flatten())
sw_list =  sw_list[:-2] #remove 2 NaNs at the end of the list

# Wordcloud library stopwords
stopwords =  list(STOPWORDS) + sw_list

# Additionnal handpicked stopwords
add_sw = ['//', 'yeah', 'huh', 'yo', 's', 'nt', 'lyric', 'lyrics', 'll','harlem', 'new','york', 
        'bronx', 'jersey', 'west',
       'manhattan', 'brooklyn', 'taten', 'america', 'east', 'Coast',
       'long', 'island', 'queensbridge', 'brownsville', 'talk', 'man', 'dont', 
       'aint', 'fuck', 'nyc', 'yall', 'rap', 'ill', 'wanna', 'gotta', 'staten', 'youre','coast', 'queens', 'nigga', 'niggas',
       'city', 'em']
stopwords = stopwords + add_sw

# Adding default spacy stopword list
en = spacy.load('en_core_web_sm')
spacy_stopwords = en.Defaults.stop_words
stopwords = stopwords + list(spacy_stopwords)
len(stopwords)

1105

In [26]:
# Token count of each document before stop-word removal.
a = [len(doc_tokens) for doc_tokens in data_words]

a

[9677, 17035, 20249]

In [27]:

# Drop every token that appears in the stop-word list.
# The list is converted to a set once so that each membership test is a hash lookup
# rather than a scan over the whole list.
stopword_set = set(stopwords)

new_data_words = [
    [word for word in text if word not in stopword_set]
    for text in data_words
]

# From here on `data_words` refers to the filtered token lists.
data_words = new_data_words

In [28]:
# Token count of each document after stop-word removal.
a = [len(doc_tokens) for doc_tokens in new_data_words]

a

[4497, 8560, 9531]

## Bigrams/trigrams

In [29]:
from gensim.models.phrases import Phraser
from gensim.models import Phrases

In [30]:
# Learn two-word phrase statistics from the stop-word-filtered token lists.
bigram_phrases = Phrases(data_words, min_count=10, threshold=50)
# Learn a second-level model on the bigram-merged tokens so that longer phrases can form.
trigram_phrases = Phrases(bigram_phrases[data_words], min_count=100, threshold=100)

# Wrap both models in Phraser objects; these are what make_bigrams / make_trigrams apply.
bigram = Phraser(bigram_phrases)
trigram = Phraser(trigram_phrases)

#function changing the individual words by their corresponding bigrams and trigrams
def make_bigrams(texts):
    """Apply the module-level ``bigram`` model to every document.

    Args:
        texts: iterable of token lists.

    Returns:
        A list with ``bigram[doc]`` for each document, in input order.
    """
    return [bigram[tokens] for tokens in texts]

def make_trigrams(texts):
    """Apply the module-level ``bigram`` model and then ``trigram`` to every document.

    Args:
        texts: iterable of token lists.

    Returns:
        A list with ``trigram[bigram[doc]]`` for each document, in input order.
    """
    return [trigram[bigram[tokens]] for tokens in texts]

# Merge detected phrases into single tokens, first bigrams and then trigrams.
data_bigrams = make_bigrams(data_words)
# NOTE(review): make_trigrams applies the bigram model itself, so passing the already
# bigram-merged documents runs that model twice — presumably harmless, confirm.
data_bigrams_trigrams = make_trigrams(data_bigrams)

#print(data_bigrams_trigrams[1])

## TF-IDF dictionary

In [31]:
from gensim.models import TfidfModel
texts = data_bigrams_trigrams

# Map every token to an integer id and encode each document as (token id, count) pairs.
id2word = corpora.Dictionary(texts)
corpus = [id2word.doc2bow(text) for text in texts]

tfidf = TfidfModel(corpus, id2word=id2word)

low_value = 0.001  # tokens whose TF-IDF score in a document is below this are dropped from it
words = []  # text of every token dropped from any document, in processing order
low_value_words = []
words_missing_in_tfidf = []

# Rebuild each bag of words without its uninformative tokens.
for i, bow in enumerate(corpus):
    # Score the document once and keep the result as {token id: score}.
    scores = dict(tfidf[bow])
    low_value_words = [term_id for term_id, score in scores.items() if score < low_value]
    # Tokens present in the document but absent from the model's output
    # (per the original note, these are the ones with a TF-IDF score of 0).
    words_missing_in_tfidf = [term_id for term_id, _ in bow if term_id not in scores]

    # Record what THIS document loses. Both lists are computed before they are used,
    # so the log no longer lags one document behind or misses the last document.
    drops = low_value_words + words_missing_in_tfidf
    words.extend(id2word[term_id] for term_id in drops)

    # Set lookup keeps the filter linear in the document size.
    drop_ids = set(drops)
    corpus[i] = [entry for entry in bow if entry[0] not in drop_ids]

# Number of below-threshold tokens in the last processed document only.
len(low_value_words)

0

## Bag of words and dictionary

id2word = corpora.Dictionary(data_words)

corpus = []

for i, text in enumerate(data_words):
    new = id2word.doc2bow(text)
    corpus.append(new)


id2word[[0][:1][0]]

len(id2word)

## Visualization

In [32]:
# Fit a 4-topic LDA model on the bag-of-words corpus with its matching dictionary.
# random_state is fixed so that repeated runs start from the same seed.
# NOTE(review): the recorded outputs show 3 documents (one per location), fewer than
# num_topics=4 — confirm this is intended.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=4, random_state=100, 
update_every=1, chunksize=100, passes=10, alpha="auto")

In [33]:
# Switch pyLDAvis to notebook display mode.
pyLDAvis.enable_notebook()
# Prepare the topic visualisation from the fitted model, the corpus and the dictionary.
# presumably mds selects the 2-D layout method and R the number of terms listed — TODO confirm
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
# Display the visualisation as the cell output.
vis

  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
