## Import

In [1]:
import pandas as pd
import numpy as np
import json
import glob
import re

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

from wordcloud import WordCloud, STOPWORDS

#spacy
import spacy
#from nltk.corpus import STOPWORDS

#vis
import pyLDAvis
import pyLDAvis.gensim_models

import warnings


In [2]:
# CSV of lyric lines together with their polarity scores.
DATA_PATH = 'data/wordclouds/selected_polarity-2022-05-08 11:43:02.920751.csv'
data = pd.read_csv(DATA_PATH)
# Preview five random rows; no seed is passed, so the sample differs between runs.
data.sample(5)

Unnamed: 0.1,Unnamed: 0,Artist,Song Name,release date,GPE,Line,Line Before,Line After,polarity,pos,neg,neu,compound
1251,6910,Beastie Boys,An Open Letter to NYC,,Brooklyn,"Brooklyn, Bronx, Queens, and Staten",Through your gates at Ellis Island we passed i...,From the Battery to the top of Manhattan,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0.0,1.0,0.0
89,1030,Redman,Boodah Break,2007-03-27,New York,My foul mouth turn a New York crowd out,Before Mr. T my jewels was too gaudy,Reach out and touch and use nine to dial out,"{'neg': 0.0, 'neu': 0.667, 'pos': 0.333, 'comp...",0.333,0.0,0.667,0.4588
62,973,Redman,All I Do,,East Coast,"(Jersey in the buildin\', New York, East Coast...",It starts when I feel the flow,I\'m talkin\' \'bout music (I\'m talkin\' \'bo...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0.0,1.0,0.0
145,1476,Kurtis Blow,Party Time,1985-01-01,Harlem,A place called Harlem was my home,I'm Kurtis Blow on the microphone,"I was rocking one day, it started to shake","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0.0,1.0,0.0
672,4355,KRS-One,Beware,2008-02-19,New York,When you advertise in New York,[Hook: KRS-One],"You best to beware, oh yes","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0.0,1.0,0.0


In [3]:
# Map each row's `compound` score to a discrete label:
#    1 -> compound >= 0.5
#   -1 -> compound <= -0.5
#    0 -> everything else (including missing scores, which fail both tests)
# Vectorised with np.select instead of a row-by-row iterrows() loop; the
# conditions are checked in order and the first match wins.
labels = np.select(
    [data['compound'] >= 0.5, data['compound'] <= -0.5],
    [1, -1],
    default=0,
)
data['labels'] = labels

In [4]:
# Inspect the label column that was just added.
data.labels

0       0
1       0
2       0
3       0
4       0
       ..
1615   -1
1616    0
1617    0
1618    1
1619    0
Name: labels, Length: 1620, dtype: int64

In [5]:
# Collapse spelling variants of the same place into a single GPE name.
GPE_ALIASES = {
    'NYC': 'New York City',
    'New York': 'New York City',
    'South Bronx': 'Bronx',
}
data['GPE'] = data['GPE'].replace(GPE_ALIASES)

In [6]:
# Distinct GPE values remaining after the replacements above.
data.GPE.unique()

array(['Harlem', 'New York City', 'Bronx', 'Jersey', 'West Coast',
       'Manhattan', 'Brooklyn', 'Staten Island', 'America', 'East Coast',
       'Long Island', 'Queensbridge', 'Brownsville'], dtype=object)

In [7]:
# List the available columns, including the new 'labels' column.
data.columns

Index(['Unnamed: 0', 'Artist', 'Song Name', 'release date', 'GPE', 'Line',
       'Line Before', 'Line After', 'polarity', 'pos', 'neg', 'neu',
       'compound', 'labels'],
      dtype='object')

## Dropping duplicated and NA lines

In [8]:
# Stack the preceding, current and following lyric lines into one long frame
# sharing a single 'Line' column; every stacked row keeps the metadata and
# label of the row it was taken from.
META_COLS = ['Artist', 'Song Name', 'release date', 'GPE']

df1 = data[META_COLS + ['Line Before', 'labels']].rename(columns={'Line Before': 'Line'})
df2 = data[META_COLS + ['Line', 'labels']]
df3 = data[META_COLS + ['Line After', 'labels']].rename(columns={'Line After': 'Line'})

df_lines = pd.concat([df1, df2, df3])
df_lines.sample(5)

Unnamed: 0,Artist,Song Name,release date,GPE,Line,labels
174,Cormega,Glory Days,2005-02-22,Brooklyn,Alpo and all them Mob Style niggas doin' it up...,1
1177,Mos Def,Brooklyn (Alternate Version),,Brooklyn,Brooklyn my habitat,0
841,Fat Joe,Another Wild Nigga From the Bronx,,Bronx,"I come from the Bronx, a.k.a., West Bubblefuck",0
585,2Pac,All Out,1993-02-16,West Coast,"Just West Coast, slut, to my real niggas stuck...",1
1305,Nas,Black President,,America,,0


In [9]:
# Compare the original frame's shape with the stacked frame's shape.
data.shape, df_lines.shape

((1620, 14), (4860, 6))

In [10]:
# Count the rows flagged as duplicates when comparing on the 'Line' column only.
df_lines.duplicated(subset=['Line']).sum()

1522

In [11]:
# Remove rows with a repeated 'Line' value, then rows with no line text.
df_lines = (
    df_lines
    .drop_duplicates(subset=['Line'])
    .dropna(subset=['Line'])
)
df_lines.shape

(3337, 6)

In [12]:
# Distinct GPE values remaining after de-duplication.
df_lines.GPE.unique()

array(['Harlem', 'New York City', 'Bronx', 'Jersey', 'West Coast',
       'Manhattan', 'Brooklyn', 'America', 'Long Island', 'Queensbridge',
       'Brownsville', 'Staten Island', 'East Coast'], dtype=object)

In [13]:
# Distinct labels, in order of first appearance; the corpus built below has one entry per value.
df_lines.labels.unique()

array([ 0, -1,  1])

## Creating corpus by labels

In [14]:
# Matches a parenthesised or bracketed span such as "(...)" or "[...]".
# Written as a raw string so that "\(" and "\[" are not parsed as (invalid)
# string escapes, which raises a warning with a plain string literal.
BRACKETED_SPAN = re.compile(r"[\(\[].*?[\)\]]")


def clean_line(raw_line):
    """Normalise one lyric line before it is added to a label's corpus.

    Steps, in order: replace literal backslash-n sequences with a space,
    remove apostrophes and any remaining backslashes, remove parenthesised
    or bracketed spans, lowercase, and drop whitespace-separated words of
    fewer than three characters.

    Parameters
    ----------
    raw_line : str
        The lyric line as stored in the 'Line' column.

    Returns
    -------
    str
        The cleaned line, with the remaining words joined by single spaces.
    """
    line = raw_line.replace('\\n', ' ')
    line = line.replace("'", '')
    line = line.replace("\\", '')
    line = BRACKETED_SPAN.sub("", line)
    line = line.lower()
    return ' '.join(w for w in line.split() if len(w) > 2)


# One corpus entry per label: every cleaned line carrying that label, joined
# into a single space-separated string. Entries follow the order of
# df_lines.labels.unique().
corpus = [
    " ".join(
        clean_line(text)
        for text in df_lines.loc[df_lines.labels == label, 'Line']
    )
    for label in df_lines.labels.unique()
]


  line = re.sub("[\(\[].*?[\)\]]", "", line)


In [15]:
# Character length of the first corpus entry (each entry is one joined string).
len(corpus[0])


111966

In [16]:
# Preview the first 100 characters of the first corpus entry.
corpus[0][0:100]

'motthaven, you know, tremont still watching out for and torres cars, chains2embed", "50s 100s lyrics'

## Lemmatization

In [18]:
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Reduce each text to the lemmas of its tokens with an allowed part of speech.

    Parameters
    ----------
    texts : iterable of str
        Documents to process.
    allowed_postags : collection of str, optional
        Values of ``token.pos_`` to keep. The default is an immutable tuple
        rather than a list so it cannot be modified between calls.

    Returns
    -------
    list of str
        One space-joined string of lemmas per input text, in input order.
    """
    # The parser and NER components are disabled when the pipeline is loaded.
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

    # nlp.pipe streams the documents through the pipeline in batches.
    return [
        " ".join(token.lemma_ for token in doc if token.pos_ in allowed_postags)
        for doc in nlp.pipe(texts)
    ]

# Lemmatization is skipped here: lemmatization() is not called and the
# corpus is passed through unchanged under the new name.
lemmatized_corpus = corpus  #Not activated in this case
len(lemmatized_corpus[0])

111966

In [22]:
# Character length of the third corpus entry.
len(corpus[2])

13077

## Removing stop words

In [23]:

def gen_words(texts):
    """Tokenize every text with gensim's ``simple_preprocess``.

    ``deacc=True`` is passed so that accents are removed from the tokens.
    Returns one list of tokens per input text, in input order.
    """
    return [gensim.utils.simple_preprocess(doc, deacc=True) for doc in texts]

data_words = gen_words(lemmatized_corpus)

In [24]:
# Number of tokens in the first document.
len(data_words[0])

19041

In [25]:
# Stopwords removal 
# --- Custom stopword file -------------------------------------------------
PATH_DATA = './data/wordclouds/'
PATH_STOPWORD = PATH_DATA + 'stopword_list.csv'
sw = pd.read_csv(PATH_STOPWORD, header=None)
# Drop NaN padding cells by value instead of slicing off the last two
# entries, so the list stays correct if the amount of padding in the file
# ever changes.
sw_list = [word for word in sw.values.flatten() if pd.notna(word)]

# --- Wordcloud library stopwords -------------------------------------------
stopwords =  list(STOPWORDS) + sw_list

# --- Additional handpicked stopwords ----------------------------------------
# Mostly filler words plus the place names themselves (every selected line
# mentions one, so they carry no information about a label).
add_sw = ['//', 'yeah', 'huh', 'yo', 's', 'nt', 'lyric', 'lyrics', 'll','harlem', 'new','york', 
        'bronx', 'jersey', 'west',
       'manhattan', 'brooklyn', 'taten', 'america', 'east', 'Coast',
       'long', 'island', 'queensbridge', 'brownsville', 'talk', 'man', 'dont', 
       'aint', 'fuck', 'nyc', 'yall', 'rap', 'ill', 'wanna', 'gotta', 'staten', 'youre','coast', 'queens', 'nigga', 'niggas',
       'city', 'em']
stopwords = stopwords + add_sw

# --- spaCy default stopwords -------------------------------------------------
en = spacy.load('en_core_web_sm')
spacy_stopwords = en.Defaults.stop_words
stopwords = stopwords + list(spacy_stopwords)
# The sources are simply concatenated, so this count includes duplicates.
len(stopwords)

1105

In [26]:
# Token count of each document before stopword removal.
a = [len(tokens) for tokens in data_words]

a

[19041, 2527, 2221]

In [27]:

# Test membership against a set: the stopword list is long and contains
# duplicates, so scanning it once per token is needlessly slow.
stopword_set = set(stopwords)

# Keep, for every document, only the tokens that are not stopwords.
new_data_words = [
    [word for word in text if word not in stopword_set]
    for text in data_words
]

data_words = new_data_words

In [28]:
# Token count of each document after stopword removal.
a = [len(tokens) for tokens in new_data_words]

a

[8701, 1218, 1015]

## Bigrams/trigrams

In [29]:
from gensim.models.phrases import Phraser
from gensim.models import Phrases

In [30]:
# Fit a phrase model on the stopword-filtered tokens, then fit a second one
# on the output of the first so it can join bigrams with a further token.
bigram_phrases = Phrases(data_words, min_count=2, threshold=100)
trigram_phrases = Phrases(bigram_phrases[data_words], threshold=100)

# Wrap both fitted models in Phraser objects; these are what the helper
# functions below index into.
bigram = Phraser(bigram_phrases)
trigram = Phraser(trigram_phrases)

#Helpers replacing individual words with their detected bigrams / trigrams
def make_bigrams(texts):
    """Apply the module-level ``bigram`` phraser to every document in ``texts``.

    Returns a list with one transformed document per input document.
    """
    return [bigram[doc] for doc in texts]

def make_trigrams(texts):
    """Apply the ``bigram`` phraser and then the ``trigram`` phraser to each document.

    Returns a list with one transformed document per input document.
    """
    return [trigram[bigram[doc]] for doc in texts]

data_bigrams = make_bigrams(data_words)
# make_trigrams() applies the bigram phraser itself, so it is given the
# un-merged tokens; passing data_bigrams would run the bigram pass twice.
data_bigrams_trigrams = make_trigrams(data_words)

## TF-IDF dictionary

In [31]:
from gensim.models import TfidfModel

texts = data_bigrams_trigrams

# Dictionary and bag-of-words corpus built from the phrase-merged documents.
id2word = corpora.Dictionary(texts)
corpus = [id2word.doc2bow(text) for text in texts]

tfidf = TfidfModel(corpus, id2word=id2word)

# Terms whose TF-IDF weight in a document is below this value are removed
# from that document's bag of words.
low_value = 0.01
words = []  # text of every term removed from some document
low_value_words = []
words_missing_in_tfidf = []

for i, bow in enumerate(corpus):
    weighted = tfidf[bow]  # computed once per document
    tfidf_ids = {term_id for term_id, _ in weighted}

    low_value_words = [term_id for term_id, weight in weighted if weight < low_value]
    # Terms with a TF-IDF score of 0 are missing from the model output.
    # This is computed before it is used, so the log of removed words refers
    # to the current document rather than to the previous one.
    words_missing_in_tfidf = [term_id for term_id, _ in bow if term_id not in tfidf_ids]

    drops = low_value_words + words_missing_in_tfidf
    words.extend(id2word[term_id] for term_id in drops)

    # Replace the document's bag of words with its filtered version.
    dropped_ids = set(drops)
    corpus[i] = [entry for entry in bow if entry[0] not in dropped_ids]

# Low-weight term ids found in the last document processed.
low_value_words

[]

## Bag of words and dictionary

# NOTE(review): this rebuilds `id2word` and `corpus` from data_words. If it is
# executed after the TF-IDF filtering it replaces that dictionary and the
# filtered corpus — confirm which version the LDA model is meant to use.
id2word = corpora.Dictionary(data_words)

# One bag-of-words vector per document.
corpus = [id2word.doc2bow(text) for text in data_words]


# Look up the token stored under id 0.
id2word[0]

len(id2word)

## Visualization

In [32]:
# Train a 4-topic LDA model on the current `corpus` / `id2word`;
# random_state is fixed for reproducibility.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=4,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha="auto",
)

In [33]:
# Render pyLDAvis output inline in the notebook.
pyLDAvis.enable_notebook()
# Prepare the visualisation from the trained model, the corpus and the dictionary.
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
vis

  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
