In [None]:
import os

import pandas as pd
import lyricsgenius as lg
import textdistance as td


# Data preprocessing
Here we build a dataset in which each unique song (title and artist) is a single entry.

In [None]:
# Load the raw listening log; lines that cannot be parsed are skipped.
# `on_bad_lines="skip"` is the replacement for `error_bad_lines=False`,
# which was deprecated in pandas 1.3 and removed in pandas 2.0.
song = pd.read_csv(
    "lastfm-dataset-1K/userid-timestamp-artid-artname-traid-traname.tsv",
    sep='\t',
    on_bad_lines="skip",
    names=["userID", "DateTime", "MBID_Artistid", "Artist", "MBID_Songid", "Song"],
)

# Keep only the columns that identify a track. The explicit copy makes the
# column added below belong to an independent frame, not a slice of the raw log.
song = song[["MBID_Songid", "Song", "Artist"]].copy()

# "<song> <artist>" is the key used to recognise a unique song.
song["Song_Artist"] = song.Song + ' ' + song.Artist

# Number of rows per unique song. Naming the index and the counts explicitly
# gives the same two columns ("Song_Artist", "Count") on every pandas version
# (the default names produced by value_counts changed in pandas 2.0).
songs_count = (
    song.Song_Artist
    .value_counts()
    .rename_axis("Song_Artist")
    .reset_index(name="Count")
)

# One row per unique song, with its count attached, most frequent first.
song = song.drop_duplicates("Song_Artist")
song = pd.merge(song, songs_count, on="Song_Artist")
song = song.sort_values("Count", ascending=False).reset_index(drop=True)

# Lyrics Scraping

Here we query the lyrics through the Genius API using the Python library LyricsGenius. Since a query does not always return the song we asked for, we use the normalized Levenshtein similarity to check that the returned song title and artist are similar enough to the ones we queried.

In [None]:
# Genius API client. The access token is read from the GENIUS_ACCESS_TOKEN
# environment variable so that the credential is never stored in the notebook.
# NOTE(review): a token was previously committed in this cell; it should be
# treated as leaked and revoked.
genius_token = os.environ.get("GENIUS_ACCESS_TOKEN")
if not genius_token:
    raise RuntimeError(
        "Set the GENIUS_ACCESS_TOKEN environment variable to your Genius API access token."
    )
genius = lg.Genius(genius_token, skip_non_songs=True, verbose=False)

def _matching_close(text, start, opener, closer):
    """Return the index of the bracket closing the opener at `start`, or -1 if it is never closed."""
    depth = 0
    for pos in range(start, len(text)):
        char = text[pos]
        if char == opener:
            depth += 1
        elif char == closer:
            depth -= 1
            if depth == 0:
                return pos
    return -1


def clean_text(text):
    """Normalise a song title or artist name before it is compared.

    For a string input the function:
      * lower-cases it and removes "remix", "radio edit" and "/";
      * collapses runs of whitespace;
      * for each of (), [] and {}: if the whole text is enclosed in one
        matching pair, drops the two brackets and keeps the content (a content
        of exactly "unknown" becomes the empty string); otherwise removes each
        bracketed group together with its content.

    A closing bracket that appears before any opening bracket, or an opening
    bracket that is never closed, is left untouched rather than being used as
    a slice boundary (which would duplicate part of the text).

    Parameters
    ----------
    text : str or any
        Value to clean. Non-string values (e.g. NaN) are returned unchanged.

    Returns
    -------
    str or any
        The cleaned string, or the input itself when it is not a string.
    """
    if not isinstance(text, str):
        return text

    text = text.lower()
    for noise in ("remix", "radio edit", "/"):
        text = text.replace(noise, "")
    text = ' '.join(text.split())

    for opener, closer in ("()", "[]", "{}"):
        while text:
            # The whole text is wrapped only if the first bracket is closed by the last character.
            if text[0] == opener and _matching_close(text, 0, opener, closer) == len(text) - 1:
                text = text[1:-1]
                if text == 'unknown':
                    text = ''
                continue

            start = text.find(opener)
            if start == -1:
                break
            end = _matching_close(text, start, opener, closer)
            if end == -1:
                # unbalanced bracket: nothing can be removed safely
                break
            text = text[:start] + text[end + 1:]

    return ' '.join(text.split())

def levenshtein_verification(row, threshold=0.6):
    """Check that the song returned by the lyrics query is the one that was asked for.

    The queried title and artist (`row.Song`, `row.Artist`) are compared with the
    returned ones (`row.Lyrics["title"]`, `row.Lyrics["artist"]`) using the
    normalized Levenshtein similarity. Each queried value is tried both simply
    lower-cased and passed through `clean_text`; the better score is kept.

    Parameters
    ----------
    row : row of the song table
        Must expose `Song` and `Artist` (strings) and `Lyrics`, a dict with the
        keys "title", "artist" and "lyrics".
    threshold : float, default 0.6
        Both the title and the artist similarity must be strictly greater than
        this value for the result to be accepted.

    Returns
    -------
    The same `row`, whose `Lyrics` is replaced by a dict with the keys
    'artist', 'title' and 'lyrics' when the match is accepted, and by the
    string '[unfound]' otherwise.
    """
    query = row.Lyrics
    found_title = query["title"].lower()
    found_artist = query["artist"].lower()

    def best_similarity(expected, found):
        # best of the raw (lower-cased) and the cleaned form of the queried value
        return max(td.levenshtein.normalized_similarity(expected.lower(), found),
                   td.levenshtein.normalized_similarity(clean_text(expected), found))

    song_sim = best_similarity(row.Song, found_title)
    artist_sim = best_similarity(row.Artist, found_artist)

    if (song_sim > threshold) and (artist_sim > threshold):
        row.Lyrics = {'artist': query["artist"], 'title': query["title"], 'lyrics': query['lyrics']}
    else:
        row.Lyrics = '[unfound]'
    return row

# Create the result column only when it does not exist yet, so that re-running
# this cell continues an interrupted scrape instead of discarding its results.
if "Lyrics" not in song.columns:
    song["Lyrics"] = ''

for ind, row in song.iterrows():
    # Skip rows without a usable "song artist" key and rows already processed
    # (their Lyrics is either a dict or '[unfound]').
    if not isinstance(row.Song_Artist, str) or row.Lyrics != '':
        continue

    try:
        result = genius.search_song(row.Song, row.Artist)
        if result is None:
            lyrics = '[unfound]'
        else:
            # levenshtein_verification reads the query result from row.Lyrics
            row.Lyrics = result.to_dict()
            lyrics = levenshtein_verification(row).Lyrics
    except Exception:
        # Best effort: any failure of the lookup or of the verification marks the
        # song as not found. KeyboardInterrupt is deliberately not caught so the
        # loop can be stopped by hand.
        lyrics = '[unfound]'

    # .at sets exactly one cell, which is what is needed to store a dict as a value.
    song.at[ind, "Lyrics"] = lyrics

# Topic Modeling
Topic modeling using LDA. \
We have not managed to find meaningful topics yet.

In [None]:
import spacy as sp
from spacy_langdetect import LanguageDetector
from spacy.language import Language
import numpy as np
from deep_translator import GoogleTranslator,batch_detection, single_detection
import gensim

@Language.factory("language_detector")
def create_language_detector(nlp, name):
    """Factory registered under the name "language_detector".

    The `nlp` and `name` arguments are accepted but not used here. Returns a
    LanguageDetector created with `language_detection_function=None`.
    """
    detector = LanguageDetector(language_detection_function=None)
    return detector


# English pipeline used for language detection and lemmatization.
nlp = sp.load("en_core_web_sm")
# Add the component registered under the "language_detector" factory name.
nlp.add_pipe(factory_name="language_detector")
# Handle on the pipeline's "lemmatizer" component.
lemmatizer = nlp.get_pipe("lemmatizer")

In [None]:
# Number of lyrics processed so far; printed when a translation fails so the
# offending entry can be located.
count = 0

# Characters that are replaced by a space before tokenizing.
_UNWANTED_CHARS = '*%"-_/&=#@^~¨$€£'
_UNWANTED_TABLE = str.maketrans(_UNWANTED_CHARS, ' ' * len(_UNWANTED_CHARS))


def tokenize_lyrics(lyric):
    """Filter a raw lyric and turn it into a list of lemmas.

    Steps:
      1. newlines become spaces and every "[...]" group is removed together
         with its content;
      2. the characters in `_UNWANTED_CHARS` become spaces, whitespace is
         collapsed and the text is lower-cased;
      3. if the detected language is not English, or the detection score is
         below 0.7, the first 4999 characters are translated to English
         (source language 'auto' when the score is below 0.7, the detected
         language otherwise). If the translation fails or returns nothing, the
         untranslated text is used and the counter and text are printed;
      4. the lemmas of all tokens that are not stop words, punctuation or
         out-of-vocabulary are returned.

    Parameters
    ----------
    lyric : str
        Raw lyric text.

    Returns
    -------
    list of str
        The kept lemmas; an empty list when nothing is left after filtering.

    Side effects
    ------------
    Increments the module-level `count` on every call.
    """
    global count

    lyric = lyric.replace('\n', ' ')

    # Remove bracketed groups. The closing bracket is searched *after* the
    # opening one, so a stray "]" earlier in the text can neither stop the
    # clean-up nor be used as a slice boundary.
    start = lyric.find('[')
    while start != -1:
        end = lyric.find(']', start + 1)
        if end == -1:
            break
        lyric = lyric[:start] + lyric[end + 1:]
        start = lyric.find('[', start)

    # remove unwanted characters
    lyric = lyric.translate(_UNWANTED_TABLE)
    lyric = ' '.join(lyric.split()).lower()

    tokens = []
    if lyric:
        doc = nlp(lyric)
        lang = doc._.language

        # translate the lyric to English when needed
        if lang["language"] != 'en' or lang["score"] < 0.7:
            source = 'auto' if lang["score"] < 0.7 else lang["language"]
            try:
                translated = GoogleTranslator(source=source, target='en').translate(lyric[:4999])
            except Exception:
                print(count)
                print(lyric)
            else:
                if translated:
                    lyric = translated
                    doc = nlp(lyric)

        # NOTE(review): if the loaded model ships without word vectors, is_oov may be
        # True for every token, which would make this list always empty - confirm.
        tokens = [token.lemma_ for token in doc
                  if not (token.is_stop or token.is_punct or token.is_oov)]

    count += 1
    return tokens

In [None]:
#preprocessing
# Rows whose Lyrics entry is a dict are the ones for which a lookup was accepted.
found_mask = song.Lyrics.apply(lambda entry: type(entry) == dict)
# Pull the raw lyric text out of each of those dicts...
raw_lyrics = song.Lyrics[found_mask].apply(lambda entry: entry['lyrics'])
# ...and turn every lyric into its list of tokens.
lyrics = raw_lyrics.apply(tokenize_lyrics)

In [None]:
#Model training
# Map every token to an integer id and turn each lyric into a bag of words.
id2word = gensim.corpora.Dictionary(lyrics)
corpus = [id2word.doc2bow(lyric) for lyric in lyrics]

num_topics = 5
# Fixed seed so that successive runs start from the same random initialisation.
# NOTE(review): with several worker processes small run-to-run differences may
# remain - confirm if exact reproducibility is required.
lda_seed = 42
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       chunksize=100,
                                       passes=5,
                                       random_state=lda_seed)

# Topic distribution of every document, then the topics themselves as cell output.
doc_lda = lda_model[corpus]
lda_model.print_topics()

In [None]:
#topic visualisation
import pyLDAvis

topic_term_dists = lda_model.get_topics()

# Document/topic matrix. Each document yields (topic_id, probability) pairs and
# may omit some topics, so every probability is written to the column of its
# own topic id; omitted topics stay at 0.
doc_topic_dists = np.zeros((len(corpus), num_topics))
for doc_index, topics in enumerate(doc_lda):
    for topic_id, probability in topics:
        doc_topic_dists[doc_index, topic_id] = probability
# re-normalise every row so that it sums to 1
doc_topic_dists = doc_topic_dists / doc_topic_dists.sum(axis=1, keepdims=True)

# Document length = total number of tokens, i.e. the sum of the bag-of-words
# counts (the length of the bag is only the number of distinct tokens).
doc_lengths = [sum(token_count for _, token_count in doc) for doc in corpus]

# Term frequencies and vocabulary, both ordered by token id so they line up.
term_freq = [id2word.cfs[token_id] for token_id in sorted(id2word.cfs)]
vocab = sorted(id2word.token2id, key=id2word.token2id.get)

# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.prepare(topic_term_dists,
                                   doc_topic_dists,
                                   doc_lengths,
                                   vocab, term_freq)
LDAvis_prepared
