In [43]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [None]:
df = pd.read_csv('lyrics_data/all_lyrics.csv', index_col = 0)


In [90]:
text_corpus = df.text.values.tolist()
text_corpus = [re.sub('\S*@\S*\s?', '', doc) for doc in text_corpus] #removing email addresses
text_corpus = [re.sub('\s+', ' ', doc) for doc in text_corpus] #removing newline characters
text_corpus = [re.sub("\'", "", doc) for doc in text_corpus] #removing single quote characters


In [93]:
warnings.simplefilter("ignore", DeprecationWarning)

def doc_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

words = list(doc_to_words(text_corpus))

In [95]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
 
def remove_stopwords(text):
    return [[word for word in gensim.utils.simple_preprocess(str(doc)) if word not in stop_words] for doc in text_corpus]
 
words = remove_stopwords(words)

In [97]:
import spacy

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
 
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for idx, sent in enumerate(texts):
        if (idx) % 500 == 0:
            print(str(idx) + ' documents lemmatised')
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out
 
data_lemmatized = lemmatization(words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

0 documents lemmatised


In [102]:
import gensim.corpora as corpora
id2word = corpora.Dictionary(data_lemmatized)
 
# Create Corpus
corpus = [id2word.doc2bow(text) for text in data_lemmatized]
 
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           per_word_topics=True)

In [103]:
doc_lda = lda_model[corpus]

In [104]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [106]:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=text_corpus):
    # Array of top 10 topics
    top10array = []
 
    for row in range(ldamodel.num_topics):
        wp = ldamodel.show_topic(row)
        topic_keywords = ", ".join([word for word, prop in wp])
        top10array.append((row+1, topic_keywords))
 
    top10dict = dict(top10array)
 
    sent_topics_df = pd.DataFrame(pd.DataFrame([sorted(topic[0], key=lambda x: (x[1]), reverse=True) for topic in ldamodel[corpus]])[0])
    sent_topics_df.columns=["Data"]
    sent_topics_df['Dominant_Topic'] = sent_topics_df.Data.apply(lambda x: x[0]+1)
    sent_topics_df['Perc_Contribution'] = sent_topics_df.Data.apply(lambda x: round(x[1],4))
    sent_topics_df['Topic_Keywords'] = sent_topics_df.Dominant_Topic.apply(lambda x: top10dict[x])
 
    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents.rename("Text")], axis=1)
    sent_topics_df = sent_topics_df[['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords', 'Text']]
    return(sent_topics_df)
 
df_topic_sents_keywords = format_topics_sentences()

In [114]:
df_topic_sents_keywords

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,Text
0,7,0.8635,"go, hand, feel, let, good, get, dream, save, m...",you’re the sun that rose again in my life a r...
1,5,0.9891,"go, not, want, wanna, even, do, know, get, sti...",if you ask me about that moment bright sunshi...
2,13,0.8124,"love, feel, know, get, heart, look, time, even...",at first sight i could recognize you as if we...
3,17,0.9916,"get, baby, call, time, go, feel, know, day, il...",you hide but only appear when you smile where...
4,4,0.8612,"love, go, crazy, say, want, even, world, heart...",is this love is this love sometimes i know so...
5,14,0.4421,"girl, day, baby, get, know, go, say, show, wor...",the world is a complex we wus lookin for love...
6,5,0.6485,"go, not, want, wanna, even, do, know, get, sti...",a sound of something breaking i awake from sl...
7,4,0.5792,"love, go, crazy, say, want, even, world, heart...",for you i could pretend like i was happy when...
8,5,0.9915,"go, not, want, wanna, even, do, know, get, sti...",full of loneliness this garden bloomed full o...
9,5,0.6046,"go, not, want, wanna, even, do, know, get, sti...",it was a good start in itself before i knew i...


In [109]:
sent_topics_sorteddf_mallet = pd.DataFrame()
 
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')
 
for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)], 
                                            axis=0)
 
# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)
 
# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]
 
# Show
sent_topics_sorteddf_mallet.head()

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Text
0,1,0.9953,"go, get, feel, heart, love, know, want, come, ...",like snow piles up i remember what you gave m...
1,2,0.9952,"go, head, rock, let, even, look, get, feel, sa...",you’re my type you’re my type even if you don...
2,3,0.9965,"call, baby, let, run, want, go, get, know, lov...",i had the same dream in an empty desert at the...
3,4,0.9957,"love, go, crazy, say, want, even, world, heart...",it’s what you hear at least once when you’re y...
4,5,0.9952,"go, not, want, wanna, even, do, know, get, sti...",goodbye goodbye if i knew we were going to br...


In [111]:
df.text[0]

'  you’re the sun that rose again in my life a reincarnation of my childhood dreams i don’t know what these emotions are am i still dreaming  this dream is a blue mirage in the desert a priori deep inside of me i’m so happy i can’t breathe my surroundings are getting more and more transparent  i hear the faraway ocean across the dream over the horizon i’m going to the place that’s getting clearer take my hands now you are the cause of my euphoria  euphoria take my hands now you are the cause of my euphoria euphoria close the door now when i’m with you i’m in utopia  were you wandering around looking for an erased dream too it’s different from the typical definition of destiny your pained eyes are looking at the same place as me wont you please stay in dreams  i hear the ocean from far away across the dream over the horizon i’m going to the place that’s getting clearer take my hands now you are the cause of my euphoria  euphoria take my hands now you are the cause of my euphoria  even i