In [1]:
import pandas as pd
import lyricsgenius as lg
import textdistance as td
import multiprocessing as mp


# Data preprepocessing:
Here we make a dataset with all unique song as entry. 

In [None]:
song = pd.read_csv("lastfm-dataset-1K/userid-timestamp-artid-artname-traid-traname.tsv",sep= '\t',error_bad_lines = False,names = ["userID","DateTime","MBID_Artistid","Artist","MBID_Songid","Song"])
song = song[["MBID_Songid","Song","Artist"]]
song["Song_Artist"] = song.Song +' '+song.Artist
songs_count = song.Song_Artist.value_counts()
songs_count = pd.DataFrame(songs_count).reset_index().rename(columns = {"Song_Artist" : "Count", "index":"Song_Artist"})
song = song[~song.duplicated('Song_Artist')] 
song = pd.merge(song,songs_count, on = "Song_Artist")
song = song.sort_values("Count", ascending = False).reset_index(drop=True)

# Lyrics Scraping

Here we query the lyrics using the genius API with the python library LyricsGenius. Since the query does not always return the song we asked, we check using the levenshtein similiraty if the returned song name and artist is similar enough to the one we queried

In [None]:
genius = lg.Genius("Xa7lZ8kmkbGSE0Pj5rnbcVqMdlNYMDy3QZ5eye0wF8GJzittEMZ-r5BZ_L8fhW72",skip_non_songs=True, verbose = False)

def clean_text(text):
    if type(text) == str:
        text = text.lower()
        text = text.replace("remix","")
        text = text.replace("radio edit","")
        text = text.replace("/","")
        text = ' '.join(text.split())
        for pair in ["()","[]","{}"]:
            start=0
            end = 0
            while start !=-1 and end != -1 and start <= end and len(text)>0:
                start = text.find(pair[0])
                end = text.find(pair[1])
                if text[0] == pair[0] and text[-1] == pair[1]:
                    text = text[1:-1]
                    if text == 'unknown':
                        text = ''
                elif start !=-1 and end != -1:
                    text = text[:start] + text[end+1:]
        text = ' '.join(text.split())
    return text

def levenshtein_verification(row)
    song = row.Song
    artist = row.Artist
    query = row.Lyrics
    song_sim = max(td.levenshtein.normalized_similarity(song.lower(),query["title"].lower()),
                   td.levenshtein.normalized_similarity(clean_text(song),query["title"].lower()))
    artist_sim = max(td.levenshtein.normalized_similarity(artist.lower(),query["artist"].lower()),
                    td.levenshtein.normalized_similarity(clean_text(artist),query["artist"].lower()))
    
    if (song_sim >0.6) and (artist_sim > 0.6):
        row.Lyrics = {'artist':query["artist"], 'title':query["title"], 'lyrics': query['lyrics']}
    else:
        row.Lyrics = '[unfound]'
    return row

song['Lyrics'] = ''

for ind,row in song.iterrows():
    if type(row.Song_Artist) == str:
        if row.Lyrics == '':
            try:
                row.Lyrics = genius.search_song(row.Song,row.Artist).to_dict()
                lyrics = levenshtein_verification(row).Lyrics
            except:
                lyrics = '[unfound]'
            song.loc[ind,"Lyrics"] = lyrics

# Topic Modeling using LDA model

In [None]:
import spacy as sp
from spacy_langdetect import LanguageDetector
from spacy.language import Language
import numpy as np
from deep_translator import GoogleTranslator

@Language.factory("language_detector")
def create_language_detector(nlp, name):
    return LanguageDetector(language_detection_function=None)


nlp = sp.load("en_core_web_md")
nlp.add_pipe('language_detector')
lemmatizer = nlp.get_pipe("lemmatizer")

In [None]:
lyrics = pd.read_csv("Lyrics/lyrics",index_col=0)

In [4]:
lyrics = lyrics[lyrics.Lyrics.apply(lambda x : '{' in x)][["Song_Artist","Lyrics"]]
lyrics = lyrics[~lyrics.Song_Artist.duplicated()]
lyrics.Lyrics = lyrics.Lyrics.apply(eval)
lyrics.Lyrics = lyrics.Lyrics[lyrics.Lyrics.apply(lambda x : type(x) == dict)].apply(lambda x : x['lyrics'])
lyrics = lyrics[lyrics.Lyrics.apply(lambda x :  len(x) >5 and len(x)<5000) ]

In [2]:
#a function to filter and tokenize the lyrics
count = 0
def tokenize_lyrics(lyric):
    global count
    try:
        #remove bracket ([]) and its content
        #print(lyric)
        lyric = lyric.replace('\n',' ')
        pair='[]'
        start=0
        end = 0
        while start !=-1 and end != -1 and start <= end and len(lyric)>0:
            start = lyric.find(pair[0])
            end = lyric.find(pair[1])
            if start !=-1 and end != -1:
                lyric = lyric[:start] + lyric[end+1:]

        #remove unwanted character
        chars = '*%"-_/&=#@^~¨$€£'
        for char in chars:
            lyric = lyric.replace(char,' ')
        #print(lyric)
        lyric = ' '.join(lyric.split()).lower()

        #translate lyric to english
        length = len(lyric)
        if length> 5 and length < 5000: #around 4% of the dataset are bigger than 5000 char
            doc = nlp(lyric)
            lang = doc._.language
            if lang["language"] != 'en' or lang["score"]<0.7:
                if lang["score"]<0.7:
                    try:
                        lyric=GoogleTranslator(source='auto', target='en').translate(lyric)
                    except:
                        lyric = []
                else:
                    try:
                        lyric=GoogleTranslator(source=lang["language"], target='en').translate(lyric)
                    except:
                        lyric = []

                if len(lyric) != 0:
                    doc = nlp(lyric)
            if len(lyric) != 0:
                lyric = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct and not token.is_oov and len(token.text)>2]
        else:
            lyric = []
    except:
        print(count)
        lyric = []
    count+=1
    return lyric

def remove_word(word_list):
    word_to_remove = ["get","come","like","way","tell","look", "man", "hear", "right", "'cause","because","yeah","let","'em","gon","hey","ho","oh","ooh","oooh",'go']
    for word in  word_to_remove:
        if word in word_list:
            word_list = [w for w in word_list if w!=word]
    return word_list

In [None]:
#preprocessing (translation of non english lyrics, removing stop word and out-of-vocabulary word)
n_cpu = 3
with mp.Pool(n_cpu) as pool:
    lyrics["Lyrics_en"] = pool.map(tokenize_lyrics, lyrics["Lyrics"])


In [4]:
import gensim
from gensim.models import CoherenceModel
import numpy as np
import pickle



In [6]:


# Build the bigram and trigram models
bigram = gensim.models.Phrases(lyrics.Lyrics_en.to_list(), min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[lyrics.Lyrics_en.to_list()], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]


id2word = gensim.corpora.Dictionary([remove_word(doc) for doc in make_trigrams(lyrics.Lyrics_en.to_list())]) # we remove the word after the trigram making
id2word.filter_extremes(no_below = 40, no_above = 0.75) #remove word present in less than 50 lyrics or in more than 75% of the lyrics
corpus = [id2word.doc2bow(lyric) for lyric in lyrics.Lyrics_en.to_list()]
lyrics = lyrics.Lyrics_en.to_list()

In [8]:
num_topics = 6
num_workers = 4
#Model training
lda_model = gensim.models.LdaMulticore(workers = num_workers,
                                       corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       chunksize = 500,
                                       passes=20,
                                       iterations=100,
                                       eta = None,
                                       alpha = np.ones(num_topics))

doc_lda = lda_model[corpus]
lda_model.print_topics()

[(0,
  '0.055*"know" + 0.039*"time" + 0.024*"think" + 0.020*"life" + 0.018*"want" + 0.017*"feel" + 0.017*"thing" + 0.016*"try" + 0.015*"day" + 0.015*"leave"'),
 (1,
  '0.010*"say" + 0.009*"old" + 0.007*"people" + 0.007*"big" + 0.007*"little" + 0.007*"new" + 0.007*"home" + 0.006*"good" + 0.006*"work" + 0.005*"drink"'),
 (2,
  '0.013*"die" + 0.012*"god" + 0.011*"life" + 0.009*"world" + 0.009*"kill" + 0.009*"dead" + 0.008*"soul" + 0.008*"blood" + 0.007*"death" + 0.006*"fight"'),
 (3,
  '0.014*"shit" + 0.014*"fuck" + 0.011*"rock" + 0.010*"know" + 0.009*"bitch" + 0.009*"nigga" + 0.008*"cause" + 0.008*"hit" + 0.008*"niggas" + 0.007*"beat"'),
 (4,
  '0.128*"love" + 0.051*"baby" + 0.044*"want" + 0.032*"girl" + 0.026*"wanna" + 0.024*"know" + 0.023*"feel" + 0.020*"little" + 0.020*"need" + 0.018*"dance"'),
 (5,
  '0.019*"eye" + 0.018*"light" + 0.018*"night" + 0.017*"dream" + 0.017*"away" + 0.016*"fall" + 0.013*"day" + 0.012*"heart" + 0.012*"sun" + 0.011*"run"')]

In [16]:
num_topics = 5
id2word = gensim.corpora.dictionary.Dictionary.load("LDAmodel/id2word.txt")
with open("LDAmodel/corpus.txt", "rb") as fp:   # Unpickling
    corpus = pickle.load(fp)
lda_model = gensim.models.LdaMulticore.load("LDAmodel/5topasym")
doc_lda = lda_model[corpus]

In [17]:
#topic visualisation
topic_term_dists=lda_model.get_topics()
doc_topic_dists = np.array([np.array([pair[1] for pair in topics]+[0]*(num_topics-len(topics))) for topics in doc_lda])
doc_topic_dists = doc_topic_dists/np.sum(doc_topic_dists,axis=1)[:, None]
doc_lengths = [len(doc) for doc in corpus]
term_freq = id2word.cfs
term_freq = [term_freq[key] for key in sorted(term_freq)]
vocab = [key for key in id2word.token2id]

import pyLDAvis# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.prepare(topic_term_dists,
                                   doc_topic_dists,
                                   doc_lengths,
                                   vocab, term_freq)
LDAvis_prepared


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_Gram=True, verbose=0,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-note

### Topic 1 : Love/Romance
### Topic 2 : Vulgarity/ Street language
### Topic 3 : Festivity/ Pop Culture
### Topic 4 : Violence
### Topic 5 : Contemplative


In [18]:
num_topics = 6
#id2word = gensim.corpora.dictionary.Dictionary.load("LDAmodel/id2word.txt")
#with open("LDAmodel/corpus.txt", "rb") as fp:   # Unpickling
#    corpus = pickle.load(fp)
lda_model = gensim.models.LdaMulticore.load("LDAmodel/6topsym")
doc_lda = lda_model[corpus]

In [19]:
#topic visualisation
topic_term_dists=lda_model.get_topics()
doc_topic_dists = np.array([np.array([pair[1] for pair in topics]+[0]*(num_topics-len(topics))) for topics in doc_lda])
doc_topic_dists = doc_topic_dists/np.sum(doc_topic_dists,axis=1)[:, None]
doc_lengths = [len(doc) for doc in corpus]
term_freq = id2word.cfs
term_freq = [term_freq[key] for key in sorted(term_freq)]
vocab = [key for key in id2word.token2id]

import pyLDAvis# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.prepare(topic_term_dists,
                                   doc_topic_dists,
                                   doc_lengths,
                                   vocab, term_freq)
LDAvis_prepared

### Topic 1 : Melancholy/Regret
### Topic 2 : Violence/ Spirituality
### Topic 3 : Daily Life/ Pop Culture
### Topic 4 : Contemplative/ Journey
### Topic 5 : Vulgarity/Street language
### Topic 6 : Romance/Festivity


## Hyper Parameter tuning

In [8]:
def hp_tuning(id2word,texts,corpus,k,eta,alpha) :
    num_topics = k
    num_workers = 4
    lda_model = gensim.models.LdaMulticore(workers = num_workers,
                                       corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       chunksize = 2000,
                                       passes=20,
                                       iterations=80,
                                       eta = eta,
                                       alpha = alpha)
    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, corpus = corpus, coherence='c_v')
    return coherence_model_lda.get_coherence()

In [23]:
coherence_scores = {}
n_topics = range(4,20)
etas = [None,'auto']
for k in n_topics:
    alphas = ['asymmetric',np.array([i for i in range(k*2,k,-1)]),np.ones(k)]
    for ind1,eta in enumerate(etas):
            for ind2,alpha in enumerate(alphas):
                try :
                    coherence_scores[k,ind1,ind2]
                except:
                    coherence_scores[k,ind1,ind2]=hp_tuning(id2word,lyrics.Lyrics_en.to_list(),corpus,k,eta,alpha)
     
            

In [41]:
n_topics=[]
etas=[[],[]]
alphas=[[],[],[]]
for k in range(4,20):
    topic = []
    for eta in range(2):
        for alpha in range(3):
            topic.append(coherence_scores[k,eta,alpha])
            etas[eta].append(coherence_scores[k,eta,alpha])
            alphas[alpha].append(coherence_scores[k,eta,alpha])
    n_topics.append(topic)          

In [45]:
print([np.mean(eta) for eta in etas])
print([np.mean(alpha) for alpha in alphas])
print([np.mean(n_topic) for n_topic in n_topics])

[0.40192152969925693, 0.3979923396619694]
[0.41270533321635394, 0.379927606997542, 0.4072378638279436]
[0.4270167873483935, 0.43713811098785943, 0.43500973083801564, 0.41576445726926514, 0.4007052831323778, 0.40254840617623594, 0.3961988217011776, 0.3955474866553958, 0.3910367393503569, 0.3937775396460643, 0.38756780727465484, 0.3887611831957401, 0.3813925729867229, 0.3855265890636847, 0.38219131210522383, 0.37912812715864247]
