## Import

In [30]:
import pandas as pd
import numpy as np
import json
import glob
import re

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

from wordcloud import WordCloud, STOPWORDS

#spacy
import spacy
#from nltk.corpus import STOPWORDS

#vis
import pyLDAvis
import pyLDAvis.gensim_models

import warnings


In [31]:
# CSV of lyric lines with their polarity scores; the path is relative to the
# notebook's working directory.
DATA_PATH = 'data/wordclouds/unfiltered_polarity.csv'
data = pd.read_csv(DATA_PATH)
# Show five random rows as a quick sanity check of the loaded frame.
data.sample(5)

Unnamed: 0.1,Unnamed: 0,Artist,Song Name,release date,GPE,Line,Line Before,Line After,polarity,pos,neg,neu,compound
4064,4064,Immortal Technique,In Da Club Freestyle,,America,Corporate America dancin' offbeat to the rhythm,"Indigenous holocaust, and the home of the slaves",They really think this country never sponsored...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0.0,1.0,0.0
978,978,Public Enemy,Get Off My Back,1990-04-10,New York,I\'m in the streets of New York (Go away!),"Which tape should I rock, LL\'s or ours?",I pop in my Kool G Rap and Polo tape,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0.0,1.0,0.0
3927,3927,50 Cent,Body On It,2009-06-16,Queens,"Police Arrested a 34 year old man, they believ...",Reporter:,,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0.0,1.0,0.0
244,244,Redman,Blow Your Mind,1998-12-08,America,This black man made it in America,Hey,Facts,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0.0,1.0,0.0
1095,1095,Jay Z,Brooklyn (Go Hard),2003-04-18,Brooklyn,"Brooklyn we go hard, we go hard (B-R, O, O-K)","Brooklyn we go hard, we go hard","Brooklyn we go hard, we go hard (L-Y, N, come ...","{'neg': 0.359, 'neu': 0.641, 'pos': 0.0, 'comp...",0.0,0.359,0.641,-0.2023


In [32]:
# Turn the `compound` polarity score into a three-way sentiment label:
#    1 -> positive (compound >= POLARITY_THRESHOLD)
#   -1 -> negative (compound <= -POLARITY_THRESHOLD)
#    0 -> neutral  (everything else, including missing scores)
# Vectorised with np.select instead of a row-by-row iterrows() loop; the
# conditions are evaluated in order, mirroring the original if/elif/else.
POLARITY_THRESHOLD = 0.3
compound = data['compound']
labels = np.select(
    [compound >= POLARITY_THRESHOLD, compound <= -POLARITY_THRESHOLD],
    [1, -1],
    default=0,
).astype('int64')
data['labels'] = labels

In [33]:
data.labels

0       0
1       0
2       0
3       0
4       1
       ..
4336    1
4337    1
4338    1
4339    0
4340    1
Name: labels, Length: 4341, dtype: int64

In [34]:
# Collapse alternate spellings onto one canonical place name per location.
GPE_ALIASES = {
    'NYC': 'New York City',
    'New York': 'New York City',
    'South Bronx': 'Bronx',
}
data['GPE'] = data['GPE'].replace(GPE_ALIASES)


In [52]:
# Spot check: pull every lyric line whose GPE tag is 'Queensbridge'.
# NOTE(review): the execution counter of this cell is out of sequence with its
# neighbours -- re-run the notebook top to bottom to confirm it does not rely
# on state created later.
data[data.GPE == 'Queensbridge'][['Line']].values

array([['While my Queensbridge people stay roughing you up'],
       ["I'm Queensbridge most respected rapper"],
       ["I'm Queensbridge most respected rapper"],
       ['Queensbridge projects is lounging on a cocaine mountain'],
       ['I had the illest gun in Queensbridge history'],
       ["I hear you the reason he can\\'t come to Queensbridge"],
       ['I had the illest gun in Queensbridge history'],
       ["I hear you the reason he can\\'t come to Queensbridge"],
       ["Son, you think I only know niggas in Queensbridge, i\\'m a made nigga"],
       ['Whens the last time you came to Queensbridge to see the kids'],
       ['Straight Outta Queensbridge a crazy motherf**ker named Cormega'],
       ["Even in Queensbridge, you'll be surprised who's informing yo"],
       ['Queensbridge representative, ghetto entrepeneur, stay on tour'],
       ['Hustle mad bricks, Queensbridge, no doubt, rep the clique'],
       ["Repped to the fullest, Queensbridge representin'"],
       ['Queen

In [6]:
data.GPE.unique()

array(['Harlem', 'New York City', 'Bronx', 'Jersey', 'East Coast',
       'West Coast', 'Broadway', 'Brooklyn', 'America', 'Jamaica',
       'Manhattan', 'Queens', 'Staten Island', 'Chinatown', 'Long Island',
       'Queensbridge', 'Brownsville', 'SoHo', "Hell's Kitchen", 'Chelsea'],
      dtype=object)

In [7]:
data.columns

Index(['Unnamed: 0', 'Artist', 'Song Name', 'release date', 'GPE', 'Line',
       'Line Before', 'Line After', 'polarity', 'pos', 'neg', 'neu',
       'compound', 'labels'],
      dtype='object')

## Dropping duplicate and NA lines

In [8]:
# Stack the preceding, tagged and following lyric lines into one long frame
# with a single `Line` column; every stacked row keeps the metadata and the
# `labels` value of the row it came from.
# NOTE(review): `labels` is derived from the row-level `compound` score, so the
# neighbouring lines inherit that label rather than a score of their own --
# confirm this is intended.
meta_cols = ['Artist', 'Song Name', 'release date', 'GPE']
df1 = data[meta_cols + ['Line Before', 'labels']].rename(columns={'Line Before': 'Line'})
df2 = data[meta_cols + ['Line', 'labels']]
df3 = data[meta_cols + ['Line After', 'labels']].rename(columns={'Line After': 'Line'})
df_lines = pd.concat([df1, df2, df3])
df_lines.sample(5)

Unnamed: 0,Artist,Song Name,release date,GPE,Line,labels
3306,Beastie Boys,An Open Letter to NYC,,Bronx,Right next to High Bridge across from Harlem,0
2252,Fat Joe,Another Wild Nigga From the Bronx,,Bronx,"[""Sawed off shotgun, hand on the pump"" -] Cypr...",1
2477,Inspectah Deck,Get Right,2019-07-12,Queens,"That's overlookin', Bronx, Queens, Brooklyn",0
1888,KRS-One,Build ya Skillz,1993-09-28,Bronx,Overthrowing the comp Big up Bronx!,1
27,A$AP Rocky,Angels,,Harlem,Rollin\' in my Benzo,0


In [9]:
data.shape, df_lines.shape

((4341, 14), (13023, 6))

In [10]:
df_lines.duplicated(subset=['Line']).sum()

4417

In [11]:
# Keep the first occurrence of each distinct lyric line, then discard rows
# whose line text is missing.
df_lines = (
    df_lines
    .drop_duplicates(subset=['Line'])
    .dropna(subset=['Line'])
)
df_lines.shape

(8605, 6)

In [12]:
df_lines.GPE.unique()

array(['Harlem', 'New York City', 'Bronx', 'Jersey', 'East Coast',
       'West Coast', 'Broadway', 'Brooklyn', 'America', 'Jamaica',
       'Manhattan', 'Queens', 'Chinatown', 'Long Island', 'Queensbridge',
       'Brownsville', 'Staten Island', 'SoHo', "Hell's Kitchen",
       'Chelsea'], dtype=object)

In [13]:
df_lines.labels.unique()

array([ 0,  1, -1])

## Creating corpus by labels

In [14]:
# Matches a parenthesised or bracketed span, e.g. "(Go away!)" or "[Hook]".
# Written as a raw string: the previous non-raw pattern relied on "\(" / "\["
# escapes, which Python reports as invalid escape sequences.
BRACKETED_SPAN = re.compile(r"[\(\[].*?[\)\]]")


def clean_line(line):
    """Normalise one lyric line before it is added to a label corpus.

    Steps, in order: replace literal ``\\n`` markers with a space, drop
    apostrophes and stray backslashes, remove parenthesised/bracketed spans,
    lowercase, and discard whitespace-separated words shorter than three
    characters.

    Args:
        line: the raw lyric line (a string).

    Returns:
        The cleaned line as a single space-joined string.
    """
    line = line.replace('\\n', ' ')
    line = line.replace("'", '')
    line = line.replace("\\", '')
    line = BRACKETED_SPAN.sub("", line)
    line = line.lower()
    return ' '.join(w for w in line.split() if len(w) > 2)


# One corpus string per sentiment label, in the order returned by
# df_lines.labels.unique(); later cells index `corpus` positionally.
corpus = []
for label in df_lines.labels.unique():
    label_lines = df_lines.loc[df_lines.labels == label, 'Line']
    corpus.append(" ".join(clean_line(line) for line in label_lines))


  line = re.sub("[\(\[].*?[\)\]]", "", line)


In [15]:
# Character length of each label's corpus; positions follow the order of
# df_lines.labels.unique() that was used to build `corpus`.
len(corpus[0]),len(corpus[1]),len(corpus[2])


(232228, 65518, 68707)

In [16]:
corpus[0][0:100]

'motthaven, you know, tremont still watching out for and torres cars, chains2embed", "50s 100s lyrics'

## Lemmatization

In [17]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Reduce each text to the lemmas of its tokens with an allowed POS tag.

    Args:
        texts: iterable of strings to process.
        allowed_postags: part-of-speech tags (compared against ``token.pos_``)
            whose tokens are kept; all other tokens are dropped.

    Returns:
        A list with one space-joined string of lemmas per input text.
    """
    # The pipeline is loaded with the parser and NER components disabled.
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    return [
        " ".join(
            token.lemma_ for token in nlp(text) if token.pos_ in allowed_postags
        )
        for text in texts
    ]

# Lemmatization is bypassed here: lemmatization() is not called and the
# label corpora are passed on unchanged under the name `lemmatized_corpus`.
lemmatized_corpus = corpus  # lemmatization step not activated in this case
len(lemmatized_corpus[0])

232228

In [18]:
len(corpus[2])

68707

## Removing stop words

In [19]:

def gen_words(texts):
    """Tokenise every text with gensim's ``simple_preprocess``.

    ``deacc=True`` is passed so that accents are removed from the tokens.

    Args:
        texts: iterable of strings.

    Returns:
        A list holding one tokenised result per input text, in input order.
    """
    return [gensim.utils.simple_preprocess(text, deacc=True) for text in texts]

data_words = gen_words(lemmatized_corpus)  

In [20]:
len(data_words[0])

38892

In [21]:
# Stopwords removal
PATH_DATA = './data/wordclouds/'
PATH_STOPWORD = PATH_DATA + 'stopword_list.csv'
sw = pd.read_csv(PATH_STOPWORD, header=None)
# Drop NaN cells by value instead of slicing a fixed number of entries off the
# end: a positional slice silently discards real stopwords (or keeps NaNs) as
# soon as the CSV gains or loses a row.
sw_list = [word for word in sw.values.flatten() if pd.notna(word)]

# Wordcloud library stopwords
stopwords = list(STOPWORDS) + sw_list

# Additional handpicked stopwords (place names, slang and filler words).
# NOTE(review): 'Coast' can never match because lines are lowercased before
# tokenisation (lowercase 'coast' is already listed), and 'taten' looks like a
# truncated 'staten' -- both are kept here so the list contents are unchanged.
add_sw = [
    '//', 'yeah', 'huh', 'yo', 's', 'nt', 'lyric', 'lyrics', 'll', 'harlem',
    'new', 'york', 'bronx', 'jersey', 'west', 'manhattan', 'brooklyn',
    'taten', 'america', 'east', 'Coast', 'long', 'island', 'queensbridge',
    'brownsville', 'talk', 'man', 'dont', 'aint', 'fuck', 'nyc', 'yall',
    'rap', 'ill', 'wanna', 'gotta', 'staten', 'youre', 'coast', 'queens',
    'nigga', 'niggas', 'city', 'em',
]
stopwords = stopwords + add_sw

# Adding default spacy stopword list
en = spacy.load('en_core_web_sm')
spacy_stopwords = en.Defaults.stop_words
stopwords = stopwords + list(spacy_stopwords)
# The combined list may contain duplicates, so this is not a count of distinct words.
len(stopwords)

1105

In [22]:
# Token count of each document currently held in data_words.
a = [len(tokens) for tokens in data_words]

a

[38892, 11096, 11641]

In [23]:

# Membership is tested against a set: the `stopwords` list holds over a
# thousand entries and the original rescanned it for every single token.
stopword_set = set(stopwords)

# For each document keep only the tokens that are not stopwords, preserving
# token order.
new_data_words = [
    [word for word in text if word not in stopword_set]
    for text in data_words
]

data_words = new_data_words

In [24]:
# Token count of each document in new_data_words (after stopword filtering).
a = [len(tokens) for tokens in new_data_words]

a

[18896, 5353, 5628]

## Bigrams/trigrams

In [25]:
from gensim.models.phrases import Phraser
from gensim.models import Phrases

In [26]:
# Learn bigram candidates from the token lists, then train a second Phrases
# model on the bigram-transformed documents so that a detected bigram can be
# joined with a neighbouring token.
bigram_phrases = Phrases(data_words, min_count=2, threshold=100)
trigram_phrases = Phrases(bigram_phrases[data_words], threshold=100)

# Wrap each trained model in a Phraser; these wrappers are what the
# make_bigrams / make_trigrams helpers index into.
bigram = Phraser(bigram_phrases)
trigram = Phraser(trigram_phrases)

#function changing the individual words by their corresponding bigrams and trigrams
def make_bigrams(texts):
    """Apply the module-level ``bigram`` model to every document in ``texts``.

    Returns a list with one transformed document per input document.
    """
    return [bigram[doc] for doc in texts]

def make_trigrams(texts):
    """Apply ``bigram`` and then ``trigram`` to every document in ``texts``.

    Both models are module-level objects; the bigram transform runs first and
    its output is fed to the trigram transform.
    """
    return [trigram[bigram[doc]] for doc in texts]

data_bigrams = make_bigrams(data_words)
# make_trigrams() already applies the bigram model before the trigram model,
# so it is given the plain token lists; feeding it data_bigrams ran the bigram
# pass a second time on documents that were already transformed.
data_bigrams_trigrams = make_trigrams(data_words)

#print(data_bigrams_trigrams[1])

## TF-IDF dictionary

In [27]:
from gensim.models import TfidfModel
texts = data_bigrams_trigrams

id2word = corpora.Dictionary(texts)

# Bag-of-words representation: one list of (term id, count) pairs per document.
corpus = [id2word.doc2bow(text) for text in texts]

tfidf = TfidfModel(corpus, id2word=id2word)

# Terms whose TF-IDF weight within a document falls below this value are
# removed from that document's bag of words.
low_value = 0.01
words = []                   # every dropped term (as text), across all documents
words_missing_in_tfidf = []  # term ids absent from the TF-IDF output of the current document
low_value_words = []         # defined up front so the final display works on an empty corpus

# Rebuild each document without its low-weight and zero-weight terms.
for i, bow in enumerate(corpus):
    weights = tfidf[bow]  # evaluated once per document
    tfidf_ids = {term_id for term_id, _ in weights}

    low_value_words = [term_id for term_id, value in weights if value < low_value]
    # Terms with a TF-IDF score of 0 are missing from the model output.
    # This is computed *before* building `drops`; previously it was refreshed
    # after use, so `words` recorded the missing terms of the previous document.
    words_missing_in_tfidf = [term_id for term_id, _ in bow if term_id not in tfidf_ids]

    drops = low_value_words + words_missing_in_tfidf
    words.extend(id2word[term_id] for term_id in drops)

    # Set lookup instead of scanning two lists for every bag-of-words entry.
    drop_ids = set(drops)
    corpus[i] = [entry for entry in bow if entry[0] not in drop_ids]
low_value_words

[16,
 29,
 33,
 73,
 115,
 156,
 172,
 180,
 215,
 218,
 252,
 253,
 257,
 301,
 307,
 309,
 310,
 315,
 317,
 325,
 327,
 333,
 335,
 336,
 339,
 340,
 350,
 358,
 362,
 366,
 372,
 377,
 398,
 399,
 417,
 418,
 419,
 445,
 451,
 458,
 471,
 474,
 494,
 501,
 511,
 518,
 539,
 542,
 547,
 548,
 553,
 572,
 574,
 579,
 585,
 586,
 636,
 638,
 643,
 656,
 659,
 669,
 681,
 691,
 709,
 713,
 717,
 728,
 746,
 747,
 755,
 761,
 779,
 783,
 793,
 799,
 801,
 819,
 834,
 854,
 858,
 869,
 871,
 881,
 890,
 902,
 927,
 947,
 959,
 1003,
 1008,
 1013,
 1027,
 1035,
 1037,
 1041,
 1045,
 1061,
 1067,
 1070,
 1090,
 1091,
 1123,
 1134,
 1136,
 1139,
 1148,
 1157,
 1164,
 1172,
 1182,
 1216,
 1219,
 1226,
 1230,
 1244,
 1256,
 1264,
 1270,
 1310,
 1319,
 1326,
 1328,
 1346,
 1355,
 1361,
 1375,
 1393,
 1412,
 1413,
 1427,
 1431,
 1445,
 1447,
 1452,
 1453,
 1454,
 1460,
 1485,
 1486,
 1500,
 1508,
 1511,
 1512,
 1526,
 1559,
 1563,
 1567,
 1597,
 1623,
 1657,
 1662,
 1676,
 1701,
 1710,
 1718,
 

## Bag of words and dictionary

# NOTE(review): this rebinds `id2word` and `corpus` from the unigram
# `data_words`. If it runs after the TF-IDF section it replaces the filtered
# bigram/trigram corpus that the LDA cell consumes -- confirm it is still wanted.
id2word = corpora.Dictionary(data_words)

# One bag-of-words list per document.
corpus = [id2word.doc2bow(tokens) for tokens in data_words]


# Token stored under id 0 in the dictionary.
id2word[0]

len(id2word)

## Visualization

In [28]:
# Training settings gathered in one place; random_state is fixed so the run is seeded.
lda_params = dict(
    num_topics=4,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha="auto",
)
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, **lda_params)

In [29]:
# Switch pyLDAvis to notebook display mode.
pyLDAvis.enable_notebook()
# Prepare the visualisation from the trained model together with the corpus
# and dictionary it was trained on, then display it as the cell result.
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
vis

  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
