In [2]:
# Run in python console
import nltk; nltk.download('stopwords')

# Run in terminal or command prompt
!python3 -m spacy download en


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/preetgandhi95/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

[93m    Linking successful[0m
    /Users/preetgandhi95/miniconda3/envs/nlpclass/lib/python3.6/site-packages/en_core_web_sm
    -->
    /Users/preetgandhi95/miniconda3/envs/nlpclass/lib/python3.6/site-packages/spacy/data/en

    You can now load the model via spacy.load('en')



In [3]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [4]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])


In [5]:
lyr=pd.read_csv('merged_eng_lyrics_genre_explicit.csv')

In [6]:
def repl(e):
    return [re.sub('\n', ' ', sent) for sent in e]

In [7]:
blues=repl(list(lyr[lyr['Blues']==1]['lyrics']))
cg=repl(list(lyr[lyr['Christian/Gospel']==1]['lyrics']))
country=repl(list(lyr[lyr['Country']==1]['lyrics']))
de=repl(list(lyr[lyr['Dance/Electro']==1]['lyrics']))
disco=repl(list(lyr[lyr['Disco']==1]['lyrics']))
dh=repl(list(lyr[lyr['Dutch-House']==1]['lyrics']))
folk=repl(list(lyr[lyr['Folk']==1]['lyrics']))
hiphop=repl(list(lyr[lyr['Hip-Hop']==1]['lyrics']))
indie=repl(list(lyr[lyr['Indie']==1]['lyrics']))
jazz=repl(list(lyr[lyr['Jazz']==1]['lyrics'] ) )
latin=repl(list(lyr[lyr['Latin']==1]['lyrics']))
metal=repl(list(lyr[lyr['Metal']==1]['lyrics']))
other=repl(list(lyr[lyr['Other']==1]['lyrics']))
pop=repl(list(lyr[lyr['Pop']==1]['lyrics']))
rb=repl(list(lyr[lyr['R&B']==1]['lyrics']))
reggae=repl(list(lyr[lyr['Reggae']==1]['lyrics']))
ra=repl(list(lyr[lyr['Rock/Alt']==1]['lyrics']))
rap=repl(list(lyr[lyr['Rock/Alt/Pop']==1]['lyrics']))
missing=repl(list(lyr[lyr['missing']==1]['lyrics']))
explicit=repl(list(lyr[lyr['Explicit']==1]['lyrics']))



In [8]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

blues_words = list(sent_to_words(blues))
cg_words = list(sent_to_words(cg))
country_words = list(sent_to_words(country))
de_words = list(sent_to_words(de))
disco_words = list(sent_to_words(disco))
dh_words = list(sent_to_words(dh))
folk_words = list(sent_to_words(folk))
hiphop_words = list(sent_to_words(hiphop))
jazz_words = list(sent_to_words(jazz))
latin_words = list(sent_to_words(latin))
metal_words = list(sent_to_words(metal))
other_words = list(sent_to_words(other))
pop_words = list(sent_to_words(pop))
rb_words = list(sent_to_words(rb))
reggae_words = list(sent_to_words(reggae))
ra_words = list(sent_to_words(ra))
rap_words = list(sent_to_words(rap))
missing_words = list(sent_to_words(missing))
explicit_words = list(sent_to_words(explicit))

In [9]:
def build(wrd):
# Build the bigram and trigram models
    bigram = gensim.models.Phrases(wrd, min_count=1, threshold=1) # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[wrd], threshold=1)  

# Faster way to get a sentence clubbed as a trigram/bigram
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    return bigram_mod,trigram_mod

blues_b,blues_t=build(blues_words)
cg_b,cg_t=build(cg_words)
country_b,country_t=build(country_words)
de_b,de_t=build(de_words)
disco_b,disco_t=build(disco_words)
dh_b,dh_t=build(dh_words)
folk_b,folk_t=build(folk_words)
hiphop_b,hiphop_t=build(hiphop_words)
jazz_b,jazz_t=build(jazz_words)
latin_b,latin_t=build(latin_words)
metal_b,metal_t=build(metal_words)
other_b,other_t=build(other_words)
pop_b,pop_t=build(pop_words)
rb_b,rb_t=build(rb_words)
reggae_b,reggae_t=build(reggae_words)
ra_b,ra_t=build(ra_words)
rap_b,rap_t=build(rap_words)
missing_b,missing_t=build(missing_words)
explicit_b,explicit_t=build(explicit_words)    

In [17]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts,bi):
    return [bi[doc] for doc in texts]

def make_trigrams(texts,tri,bi):
    return [tri[bi[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        nlp = spacy.load('en', disable=['parser', 'ner'])
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [18]:
def lem(data_words,bi,tri):# Remove Stop Words
    data_words_nostops = remove_stopwords(data_words)

    # Form Bigrams
    data_words_bigrams = make_bigrams(data_words_nostops,bi)

    # Initialize spacy 'en' model, keeping only tagger component (for efficiency)
    # python3 -m spacy download en
    nlp = spacy.load('en', disable=['parser', 'ner'])

    # Do lemmatization keeping only noun, adj, vb, adv
    return lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])



In [None]:
# blues_lem=lem(blues_words)
# cg_lem=lem(cg_words)
# country_lem=lem(country_words)
# de_lem=lem(de_words)
# disco_lem=lem(disco_words)
# dh_lem=lem(dh_words)
# folk_lem=lem(folk_words)
hiphop_lem=lem(hiphop_words,hiphop_b,hiphop_t)
# jazz_lem=lem(jazz_words)
# latin_lem=lem(latin_words)
# metal_lem=lem(metal_words)
# other_lem=lem(other_words)
# pop_lem=lem(pop_words)
# rb_lem=lem(rb_words)
# reggae_lem =lem(reggae_words)
# ra_lem=lem(ra_words)
# rap_lem=lem(rap_words)
# missing_lem=lem(missing_words)
# explicit_lem=lem(explicit_words)

In [78]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]



In [81]:
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('because', 1),
  ('broken_heart', 1),
  ('care', 1),
  ('clown', 7),
  ('come', 1),
  ('cry', 1),
  ('die', 1),
  ('dream', 4),
  ('easy', 1),
  ('everybody', 3),
  ('feeling', 1),
  ('guess', 1),
  ('guy', 1),
  ('joke', 2),
  ('know', 5),
  ('laugh', 4),
  ('little_closer', 1),
  ('look', 2),
  ('love', 7),
  ('part', 1),
  ('party', 1),
  ('play', 1),
  ('run', 1),
  ('say', 2),
  ('say_love', 1),
  ('see', 2),
  ('side', 1),
  ('smile', 1),
  ('start', 4),
  ('tell', 1),
  ('thing', 1),
  ('want', 1),
  ('wonder', 1),
  ('would', 1)]]

In [82]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [83]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.037*"wanna" + 0.031*"duck" + 0.024*"saturday" + 0.016*"mama" + '
  '0.015*"dealer" + 0.014*"wall" + 0.014*"wig" + 0.014*"trudi" + 0.012*"get" + '
  '0.010*"motor"'),
 (1,
  '0.017*"look" + 0.015*"tonight" + 0.011*"moon" + 0.010*"music" + '
  '0.010*"gonna_make" + 0.009*"catch" + 0.009*"star" + 0.009*"life" + '
  '0.008*"doo" + 0.007*"breathe"'),
 (2,
  '0.043*"want" + 0.032*"tired" + 0.026*"baby" + 0.016*"keep" + 0.015*"get" + '
  '0.014*"mouth" + 0.012*"despair" + 0.010*"money" + 0.009*"fine" + '
  '0.009*"sittin"'),
 (3,
  '0.036*"know" + 0.033*"go" + 0.026*"love" + 0.016*"come" + 0.016*"day" + '
  '0.015*"get" + 0.014*"baby" + 0.014*"heart" + 0.012*"see" + 0.012*"girl"'),
 (4,
  '0.008*"day" + 0.008*"water" + 0.007*"say" + 0.007*"glenarvan" + '
  '0.006*"boat" + 0.006*"make" + 0.006*"shore" + 0.005*"birthday" + 0.005*"go" '
  '+ 0.004*"sea"'),
 (5,
  '0.055*"get" + 0.015*"shit" + 0.010*"try" + 0.009*"bump" + 0.009*"go" + '
  '0.009*"say" + 0.009*"really" + 0.008*"business"

In [84]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.141460404241398

Coherence Score:  0.3827779398472426


In [85]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
