## Import

In [54]:
import pandas as pd
import numpy as np
import json
import glob
import re

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

from wordcloud import WordCloud, STOPWORDS

#spacy
import spacy
#from nltk.corpus import STOPWORDS

#vis
import pyLDAvis
import pyLDAvis.gensim_models

import warnings


In [55]:
# Path of the CSV holding one row per selected lyric line, together with its
# neighbouring lines and sentiment scores (see the preview below).
DATA_PATH = 'data/wordclouds/selected_polarity-2022-05-08 11:43:02.920751.csv'
data = pd.read_csv(DATA_PATH)
# Preview the first five rows.
data.head(5)

Unnamed: 0.1,Unnamed: 0,Artist,Song Name,release date,GPE,Line,Line Before,Line After,polarity,pos,neg,neu,compound
0,8,French Montana,100,2012-04-13,Harlem,Shout out to ma niggas in Harlem,"Motthaven, you know, Tremont",Can't call no blocks out there,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0.0,1.0,0.0
1,14,French Montana,2 Times,,New York,New York undercovers amongst the others,Still watching out for JC and Torres,The family be couple hundred of us,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0.0,1.0,0.0
2,16,French Montana,50's & 100's,2016-11-05,South Bronx,Who run it? (South Bronx),"40 cars, 40 chains2Embed"", ""50's & 100's Lyric...","Who run it? (North Memphis, let me chirp these...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0.0,1.0,0.0
3,95,French Montana,Bag,,New York,"Talk about New York, big estate","And grab a casket, my chest christens in the l...",Got the bank and I ain't fit (That's a stash),"{'neg': 0.0, 'neu': 0.714, 'pos': 0.286, 'comp...",0.286,0.0,0.714,0.4939
4,98,French Montana,Bag,,New York,Runnin' through New York with a strap (That's ...,[Verse 2: French Montana],"'Cause we comin' for they head, not they chaps...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0.0,1.0,0.0


In [56]:
# (number of rows, number of columns) of the loaded table.
data.shape

(1620, 13)

In [57]:
# Collapse spelling variants of the same place onto a single canonical label.
# Whole-value matches only: 'New York City' itself is not affected by the
# 'New York' rule.
data['GPE'] = data['GPE'].replace({
    'NYC': 'New York City',
    'New York': 'New York City',
    'South Bronx': 'Bronx',
})

In [58]:
# Distinct place labels remaining after normalisation.
data['GPE'].unique()

array(['Harlem', 'New York City', 'Bronx', 'Jersey', 'West Coast',
       'Manhattan', 'Brooklyn', 'Staten Island', 'America', 'East Coast',
       'Long Island', 'Queensbridge', 'Brownsville'], dtype=object)

In [59]:
# Column labels of the loaded table.
data.columns

Index(['Unnamed: 0', 'Artist', 'Song Name', 'release date', 'GPE', 'Line',
       'Line Before', 'Line After', 'polarity', 'pos', 'neg', 'neu',
       'compound'],
      dtype='object')

## Dropping duplicate and NA lines

In [60]:
# Columns that identify where a lyric line comes from.
meta_cols = ['Artist', 'Song Name', 'release date', 'GPE']

# One frame per text column, each exposing its text under the common name 'Line'.
df1 = data[meta_cols + ['Line Before']].rename(columns={'Line Before': 'Line'})
df2 = data[meta_cols + ['Line']]
df3 = data[meta_cols + ['Line After']].rename(columns={'Line After': 'Line'})

# Stack in the order: preceding line, the line itself, following line.
df_lines = pd.concat([df1, df2, df3])
df_lines.sample(5)

Unnamed: 0,Artist,Song Name,release date,GPE,Line
1411,A$AP Ferg,Focus,,Harlem,"Flip phone shorty got a Motorola, feds can't l..."
549,EPMD,House Party,,Brownsville,
481,Das EFX,If Only,1993-11-16,Brooklyn,"The boogity-woogity Brooklyn boy, he\'s cool l..."
122,A Tribe Called Quest,His Name is Mutty Ranks,1993-11-09,New York City,"From New York to ATLiens, youknowhatI'msayin?"
780,Ja Rule,Clap Back,,Brooklyn,"[Ja Rule] All my Brooklyn niggas, Brooklyn sir..."


In [61]:
# Shape of the source table next to the shape of the stacked per-line table.
data.shape, df_lines.shape

((1620, 13), (4860, 5))

In [62]:
# Number of rows whose 'Line' text repeats an earlier row.
df_lines['Line'].duplicated().sum()

1522

In [63]:
# Keep the first occurrence of each distinct line text, then discard rows
# that have no text at all.
df_lines = (
    df_lines
    .drop_duplicates(subset=['Line'])
    .dropna(subset=['Line'])
)
df_lines.shape

(3337, 5)

In [64]:
# Place labels still represented after de-duplication.
df_lines['GPE'].unique()

array(['Harlem', 'New York City', 'Bronx', 'Jersey', 'West Coast',
       'Manhattan', 'Brooklyn', 'America', 'Long Island', 'Queensbridge',
       'Brownsville', 'Staten Island', 'East Coast'], dtype=object)

## Creating corpus by neighborhood

In [65]:
# Matches a parenthesised or bracketed span such as "(ad-lib)" or "[Verse 2: ...]".
# Written as a raw string so that "\(" and "\[" are not treated as (invalid)
# Python string escapes, which triggers a warning at compile time.
BRACKETED_SPAN = re.compile(r"[\(\[].*?[\)\]]")


def clean_line(line):
    """Normalise one lyric line before it is added to a neighbourhood corpus.

    Steps, in order: replace literal backslash-n sequences with a space, strip
    apostrophes and remaining backslashes, remove bracketed/parenthesised
    spans, lower-case, and drop whitespace-separated words of fewer than
    three characters.

    Parameters
    ----------
    line : str
        Raw lyric text.

    Returns
    -------
    str
        The cleaned line, words separated by single spaces.
    """
    line = line.replace('\\n', ' ')
    line = line.replace("'", '')
    line = line.replace('\\', '')
    line = BRACKETED_SPAN.sub('', line)
    line = line.lower()
    # Remove words shorter than 3 characters
    return ' '.join(word for word in line.split() if len(word) > 2)


# For each neighborhood, build one document made of all its cleaned lines.
# Documents are ordered like df_lines.GPE.unique().
corpus = []
for neighborhood in df_lines.GPE.unique():
    neighborhood_lines = df_lines.loc[df_lines.GPE == neighborhood, 'Line']
    corpus.append(' '.join(clean_line(line) for line in neighborhood_lines))


  line = re.sub("[\(\[].*?[\)\]]", "", line)


In [66]:
# Character length of the first neighbourhood document.
len(corpus[0])


13873

In [67]:
# First 100 characters of the first neighbourhood document.
corpus[0][:100]

'motthaven, you know, tremont all niggas the scene like some sparta shit brought the garden back over'

## Lemmatization

In [68]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Reduce each text to the lemmas of its tokens with an allowed part of speech.

    Parameters
    ----------
    texts : iterable of str
        Documents to process.
    allowed_postags : list of str
        Coarse part-of-speech tags (spaCy ``token.pos_``) to keep.

    Returns
    -------
    list of str
        One space-joined string of lemmas per input text, in input order.
    """
    # The parser and NER components are disabled when loading the pipeline.
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    return [
        " ".join(
            token.lemma_ for token in nlp(text) if token.pos_ in allowed_postags
        )
        for text in texts
    ]

# Lemmatization is skipped here: the name is simply bound to the same list as
# `corpus`, so later steps operate on the un-lemmatized documents.
lemmatized_corpus = corpus  #Not activated in this case
# Character length of the first document (same as for `corpus`).
len(lemmatized_corpus[0])

13873

In [69]:
# First 100 characters of the first document.
corpus[0][:100]

'motthaven, you know, tremont all niggas the scene like some sparta shit brought the garden back over'

## Removing stop words

In [70]:

def gen_words(texts):
    """Tokenize every text with gensim's ``simple_preprocess``.

    ``deacc=True`` is passed so that accents are removed from tokens.

    Parameters
    ----------
    texts : iterable of str
        Documents to tokenize.

    Returns
    -------
    list of list of str
        One token list per input text, in input order.
    """
    return [gensim.utils.simple_preprocess(text, deacc=True) for text in texts]

# Token lists, one per neighbourhood document.
data_words = gen_words(lemmatized_corpus)  

In [71]:
# Number of tokens in the first document, before stop-word removal.
len(data_words[0])

2335

In [72]:
# Stopwords removal
PATH_DATA = './data/wordclouds/'
PATH_STOPWORD = PATH_DATA + 'stopword_list.csv'
sw = pd.read_csv(PATH_STOPWORD, header=None)
# Flatten the table to a plain list and drop empty (NaN) cells by value rather
# than by position, so the result stays correct if the file's padding changes.
sw_list = [word for word in sw.values.flatten() if pd.notna(word)]

# Wordcloud library stopwords
stopwords =  list(STOPWORDS) + sw_list

# Additional handpicked stopwords (place names and frequent filler words)
add_sw = ['//', 'yeah', 'huh', 'yo', 's', 'nt', 'lyric', 'lyrics', 'll','harlem', 'new','york', 
        'bronx', 'jersey', 'west',
       'manhattan', 'brooklyn', 'taten', 'america', 'east', 'Coast',
       'long', 'island', 'queensbridge', 'brownsville', 'talk', 'man', 'dont', 
       'aint', 'fuck', 'nyc', 'yall', 'rap', 'ill', 'wanna', 'gotta', 'staten', 'youre','coast', 'queens', 'nigga', 'niggas',
       'city', 'em']
stopwords = stopwords + add_sw

# Adding default spacy stopword list
en = spacy.load('en_core_web_sm')
spacy_stopwords = en.Defaults.stop_words
stopwords = stopwords + list(spacy_stopwords)
# Total number of entries; the sources overlap, so duplicates are included.
len(stopwords)

1105

In [73]:
# Token count of every document before stop-word removal.
a = [len(tokens) for tokens in data_words]

a

[2335, 7120, 1836, 937, 300, 554, 5231, 3415, 319, 405, 442, 606, 289]

In [74]:

# Build the lookup set once: `stopwords` is a long list, and testing every
# token against it would rescan the whole list each time.
stopword_set = set(stopwords)

# Same documents, with every token found in the stop-word collection removed.
new_data_words = [
    [word for word in text if word not in stopword_set]
    for text in data_words
]

data_words = new_data_words

In [75]:
# Token count of every document after stop-word removal.
a = [len(tokens) for tokens in new_data_words]

a

[1096, 3267, 791, 471, 125, 256, 2410, 1574, 142, 187, 225, 269, 121]

## Bigrams/trigrams

In [76]:
from gensim.models.phrases import Phraser
from gensim.models import Phrases

In [77]:
# Train a phrase model on the stop-word-filtered token lists
# (min_count=2, threshold=100).
bigram_phrases = Phrases(data_words, min_count=2, threshold=100)
# Train a second phrase model on the bigram-transformed documents, so that it
# can join an already-merged bigram with a neighbouring token.
trigram_phrases = Phrases(bigram_phrases[data_words], threshold=100)

# Wrap both trained models in Phraser objects; these are what the helper
# functions below index into.
bigram = Phraser(bigram_phrases)
trigram = Phraser(trigram_phrases)

def make_bigrams(texts):
    """Apply the `bigram` phraser to every document.

    Parameters
    ----------
    texts : iterable of list of str
        Tokenized documents.

    Returns
    -------
    list
        One entry per document: the result of ``bigram[doc]``.
    """
    return [bigram[tokens] for tokens in texts]

def make_trigrams(texts):
    """Apply the `bigram` phraser and then the `trigram` phraser to every document.

    Parameters
    ----------
    texts : iterable of list of str
        Tokenized documents.

    Returns
    -------
    list
        One entry per document: the result of ``trigram[bigram[doc]]``.
    """
    return [trigram[bigram[tokens]] for tokens in texts]

data_bigrams = make_bigrams(data_words)
# make_trigrams applies the bigram phraser itself before the trigram phraser,
# so it must receive the plain token lists; feeding it `data_bigrams` would run
# the bigram model a second time on already-merged tokens.
data_bigrams_trigrams = make_trigrams(data_words)

#print(data_bigrams_trigrams[1])

## TF-IDF dictionary

In [78]:
from gensim.models import TfidfModel

texts = data_bigrams_trigrams

# Vocabulary and bag-of-words representation of the phrase-merged documents.
id2word = corpora.Dictionary(texts)
corpus = [id2word.doc2bow(text) for text in texts]

tfidf = TfidfModel(corpus, id2word=id2word)

# A term is removed from a document when its TF-IDF weight in that document
# is below this threshold.
low_value = 0.01
words = []  # text of every (document, term) pair that gets removed, for inspection
low_value_words = []  # defined up front so the final display works on an empty corpus
words_missing_in_tfidf = []

# Rewrite each document's bag of words without its uninformative terms.
for i, bow in enumerate(corpus):
    scored = tfidf[bow]  # evaluated once per document
    scored_ids = {term_id for term_id, _ in scored}

    low_value_words = [term_id for term_id, score in scored if score < low_value]
    # Terms present in the bag of words but absent from the TF-IDF output
    # (the original note: words with a TF-IDF score of 0 are missing).
    words_missing_in_tfidf = [term_id for term_id, _ in bow if term_id not in scored_ids]

    # Both lists now describe the current document, so `words` records
    # exactly what is removed from it.
    drops = low_value_words + words_missing_in_tfidf
    words.extend(id2word[term_id] for term_id in drops)

    dropped = set(drops)
    corpus[i] = [(term_id, count) for term_id, count in bow if term_id not in dropped]

# Low-weight term ids found in the last document processed.
low_value_words

[]

## Bag of words and dictionary

# Dictionary and bag-of-words built directly from `data_words`.
# NOTE(review): when run, this rebinds `id2word` and `corpus`, replacing the
# phrase-merged, TF-IDF-filtered versions built earlier - confirm that is intended.
id2word = corpora.Dictionary(data_words)

corpus = [id2word.doc2bow(tokens) for tokens in data_words]


# Token stored under id 0 in the dictionary.
id2word[0]

# Number of distinct tokens in the dictionary.
len(id2word)

## Visualization

In [81]:
# Fit a 4-topic LDA model on the bag-of-words corpus; random_state is fixed
# so that repeated runs use the same seed.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=4,
    random_state=100,
    update_every=1,
    chunksize=50,
    passes=10,
    alpha="auto",
)

In [82]:
# Turn on inline rendering of pyLDAvis output in the notebook.
pyLDAvis.enable_notebook()
# Prepare the interactive topic view for the fitted model
# (mds="mmds", R=30 are passed through to pyLDAvis).
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
vis

  default_term_info = default_term_info.sort_values(
