In [1]:
import itertools
import json
import os
import re
import string
from collections import OrderedDict
from random import randint
from time import sleep
from time import time

import dbus
import en_core_web_sm
import flatdict
import langid
import numpy as np
import pandas as pd
import requests
import spacy
import spotipy
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from IPython.core.display import clear_output
from spotipy.oauth2 import SpotifyClientCredentials

In [2]:
class TrackInfo:
    '''Retrieve metadata and lyrics of Spotify tracks.

    The instance keeps the state of the track most recently passed to
    ``unpack`` (``track_id``, ``title``, ``artist``, ``album``, ``album_uri``,
    ``artist_uri``, ``trackDict``) together with ``req``, a counter of the
    HTTP/API requests issued for that track.
    '''

    def __init__(self):
        '''Build the language-code table and try to open a Spotify client.

        On failure a message is printed and ``self.spotify`` stays ``None``.
        '''

        #Dictionary with ISO-language codes mapped to languages
        self.codes = {
            "af": "Afrikaans", "am": "Amharic", "an": "Aragonese", "ar": "Arabic", "as": "Assamese", "az": "Azerbaijani",
            "be": "Belarusian", "bg": "Bulgarian", "bn": "Bengali", "br": "Breton", "bs": "Bosnian",
            "ca": "Catalan", "cs": "Czech", "cy": "Welsh", "da": "Danish", "de": "German", "dz": "Dzongkha",
            "el": "Greek", "en": "English", "eo": "Esperanto", "es": "Spanish", "et": "Estonian", "eu": "Basque",
            "fa": "Persian", "fi": "Finnish", "fo": "Faroese", "fr": "French", "ga": "Irish", "gl": "Galician", "gu": "Gujarati",
            "he": "Hebrew", "hi": "Hindi", "hr": "Croatian", "ht": "Haitian", "hu": "Hungarian", "hy": "Armenian",
            "id": "Indonesian", "is": "Icelandic", "it": "Italian", "ja": "Japanese", "jv": "Javanese",
            "ka": "Georgian", "kk": "Kazakh", "km": "Central Khmer", "kn": "Kannada", "ko": "Korean", "ku": "Kurdish", "ky": "Kirghiz",
            "la": "Latin", "lb": "Luxembourgish", "lo": "Lao", "lt": "Lithuanian", "lv": "Latvian", "mg": "Malagasy", "mk": "Macedonian",
            "ml": "Malayalam", "mn": "Mongolian", "mr": "Marathi", "ms": "Malay", "mt": "Maltese", "nb": "Norwegian Bokmål", "ne": "Nepali",
            # "nl" used to map to an empty string, which made Dutch lyrics look like "no language".
            "nl": "Dutch", "nn": "Norwegian Nynorsk", "no": "Norwegian", "oc": "Occitan", "or": "Oriya", "pa": "Punjabi", "pl": "Polish",
            "ps": "Pashto", "pt": "Portuguese", "qu": "Quechua", "ro": "Romanian", "ru": "Russian", "rw": "Kinyarwanda",
            "se": "Northern Sami", "si": "Sinhala", "sk": "Slovak", "sl": "Slovenian", "sq": "Albanian", "sr": "Serbian",
            "sv": "Swedish", "sw": "Swahili", "ta": "Tamil", "te": "Telugu", "th": "Thai", "tl": "Tagalog", "tr": "Turkish",
            "ug": "Uighur", "uk": "Ukrainian", "ur": "Urdu", "vi": "Vietnamese", "vo": "Volapük",
            "wa": "Walloon", "xh": "Xhosa", "zh": "Chinese", "zu": "Zulu",
        }

        self.spotify = None
        self.trackDict = dict()
        # Request counter; unpack() resets it for every new track.
        self.req = 0

        # API tokens: prefer the environment over the values embedded in the notebook.
        # FIXME(security): the literal fallbacks are committed secrets; rotate them and drop the defaults.
        client_id = os.environ.get("SPOTIPY_CLIENT_ID", "314c62651741485eb3bdc9c07f8d5b73")
        client_secret = os.environ.get("SPOTIPY_CLIENT_SECRET", "e6d17930c5df497ba2fa1f1bfce6321b")

        #Spotify api call
        try:
            client_credentials_manager = SpotifyClientCredentials(client_id, client_secret)
            self.spotify = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
        except Exception as exc:
            # Best effort: callers must check that self.spotify is not None.
            print("Exception during Spotify session initialization:", exc)

    def unpack(self, result):
        '''Load one Spotify track object into the instance.

        Resets the request counter. When ``result`` is truthy, stores the
        flattened metadata in ``trackDict`` (nested keys joined with '.') and
        sets ``track_id``, ``title``, ``artist`` (multiple artists joined with
        ' & '), ``album``, ``album_uri`` and ``artist_uri``. When ``result`` is
        falsy, only the counter is reset and the previous track's attributes
        are left untouched.
        '''
        self.req = 0
        if result:
            # Flatten the nested metadata dictionary. Nested keys are joined with '.' delimiter
            self.trackDict = flatdict.FlatDict(result, delimiter='.')
            # Read the individual fields from the raw object so the values do not
            # depend on how FlatDict represents nested lists.
            artists = result['artists']
            self.track_id = result['uri']
            self.title = result['name']
            self.artist = ' & '.join([entry['name'] for entry in artists])
            self.album = result['album']['name']
            self.album_uri = result['album']['uri']
            self.artist_uri = [entry['uri'] for entry in artists]

    def processTrackName(self, title, album):
        '''Return the track title with appended details removed.

        Text inside (), [] or {} is dropped and only the part before the first
        '-' is kept, so the bare title can be used in a Genius search.
        '''
        title = re.sub(r'[\(\[\{].*?[\)\]\}]', "", title)
        title = (title.split('-')[0]).strip()
        if album:
            ratio1 = fuzz.partial_ratio(album, title)
            ratio2 = fuzz.token_set_ratio(album, title)
            # NOTE(review): the album is only stripped when it is *dissimilar* to the
            # title, in which case it is unlikely to occur in the title at all, so this
            # replace() probably never changes anything. Confirm the intended threshold
            # direction before relying on it.
            if ratio1 < 45 and ratio2 < 45:
                title = title.replace(album, '')
        return title.strip()

    def fetch_lyrics_page(self, song_title, artist_name):
        '''Search Genius for the track and return the decoded JSON response.

        The first query uses the title plus the full artist string. While the
        response has no hits, the search is retried with each '&'-separated
        artist name in turn, pausing 1-3 seconds after every retry.
        Returns ``None`` (after printing a message) if anything goes wrong.
        '''
        base_url = 'https://api.genius.com'
        # FIXME(security): the literal fallback is a committed secret; rotate it and drop the default.
        token = os.environ.get(
            'GENIUS_ACCESS_TOKEN',
            'YkmxZvf4kNPOxSd3aTuuczGjWDZintTlgGLQYtjxAy4__gQbpl-EENB_LRb1Nwaz')
        headers = {'Authorization': 'Bearer ' + token}
        search_url = base_url + '/search'
        try:
            # The query belongs in the URL query string of a GET request, not in its body.
            response = requests.get(search_url, params={'q': song_title + ' ' + artist_name}, headers=headers)
            self.req = self.req + 1
            geniusResponse = response.json()
            for name in artist_name.split('&'):
                if geniusResponse['response']['hits']:
                    break
                response = requests.get(search_url, params={'q': song_title + ' ' + name.strip()}, headers=headers)
                self.req = self.req + 1
                geniusResponse = response.json()
                sleep(randint(1, 3))
            return geniusResponse
        except Exception as exc:
            print("Exception during request call to Genius:", exc)
            return None

    def processStrings(self, strng):
        '''Return the string lower-cased with all non-word characters and underscores removed.'''
        strng = re.sub(r'[\W_]+', u'', strng, flags=re.UNICODE)
        return strng.lower()

    def findResponseMatch(self, geniusResponse, artist_names, trackTitle, albumName):
        '''Pick the Genius search hit that corresponds to the given track.

        A hit is considered only if its primary artist matches: either one
        normalised name contains the other, or both fuzzy ratios exceed 70.

        For hits with a matching artist the title is compared in decreasing
        order of confidence:
          1) the normalised input title equals one of the hit's titles
             (exact match; the search stops immediately);
          2) the part of the title before the first '-', or the title cleaned
             by ``processTrackName``, equals one of the hit's titles;
          3) both fuzzy ratios against the hit's title exceed 70; the hit
             with the highest combined ratio is remembered.

        Returns: the ``result`` entry of the best hit, or ``None``.
        '''
        track_info = None
        bestMatch = None
        bestRatio = {'Ratio': 0, 'Result': None}
        inputArtist = self.processStrings(artist_names)
        inputTitle = self.processStrings(trackTitle)
        # Title variants are the same for every hit, so compute them once.
        shortTitle = self.processStrings(trackTitle.split('-')[0])
        # Clean the raw title first and normalise afterwards; cleaning an already
        # normalised string cannot find any brackets to remove.
        cleanedTitle = self.processStrings(self.processTrackName(trackTitle, albumName))

        hits = ((geniusResponse or {}).get('response') or {}).get('hits') or []
        for match in hits:
            candidate = match['result']
            responseArtist = self.processStrings(candidate['primary_artist']['name'])
            if not responseArtist:
                continue

            artistMatches = (responseArtist in inputArtist) or (inputArtist in responseArtist)
            if not artistMatches:
                ratio1 = fuzz.partial_ratio(inputArtist, responseArtist)
                ratio2 = fuzz.token_set_ratio(inputArtist, responseArtist)
                artistMatches = ratio1 > 70 and ratio2 > 70
            if not artistMatches:
                continue

            rtitle = self.processStrings(candidate['title'])
            candidateTitles = (
                rtitle,
                self.processStrings(candidate['title_with_featured']),
                self.processStrings(candidate['full_title']),
            )
            if inputTitle in candidateTitles:
                track_info = candidate
                break
            if shortTitle in candidateTitles or cleanedTitle in candidateTitles:
                bestMatch = candidate
                continue

            ratio1 = fuzz.partial_ratio(inputTitle, rtitle)
            ratio2 = fuzz.token_set_ratio(inputTitle, rtitle)
            if ratio1 > 70 and ratio2 > 70 and (ratio1 + ratio2) > bestRatio['Ratio']:
                # Remember the score as well, otherwise a later, weaker hit would win.
                bestRatio['Ratio'] = ratio1 + ratio2
                bestRatio['Result'] = candidate

        # An exact match must never be replaced by a weaker candidate.
        if track_info:
            return track_info
        if bestMatch:
            return bestMatch
        return bestRatio['Result']

    def scrape_lyrics_artist_title(self, genius_track_url):
        '''Scrape a Genius track page.

        Returns: tuple of (lyrics, track title, artist name, other_track_info).
        The first three are ``None`` when the corresponding element is not on
        the page; ``other_track_info`` maps the label/value pairs found in the
        page's <h3> metadata units.
        '''
        lyrics = None
        foundArtistName = None
        foundTrackTitle = None
        other_track_info = {}
        page = requests.get(genius_track_url)
        self.req = self.req + 1
        html = BeautifulSoup(page.text, 'html.parser')
        lyricsClass = html.find('div', class_='lyrics')
        if lyricsClass:
            lyrics = lyricsClass.get_text()
        if html.find('div', class_='header_with_cover_art-primary_info'):
            titleTag = html.find('h1', class_='header_with_cover_art-primary_info-title')
            artistTag = html.find('a', class_='header_with_cover_art-primary_info-primary_artist')
            for unit in html.find_all('h3'):
                key = unit.find('span', {'class': 'metadata_unit-label'})
                value = unit.find('span', {'class': 'metadata_unit-info'})
                # A label without a value element would otherwise raise AttributeError.
                if key and value:
                    other_track_info[key.get_text()] = value.get_text()

            if titleTag:
                foundTrackTitle = titleTag.get_text()
            if artistTag:
                foundArtistName = artistTag.get_text()

        return (lyrics, foundTrackTitle, foundArtistName, other_track_info)

    def scrape_artist_bio(self, genius_artist_url):
        '''Scrape the artist description from a Genius artist page; ``None`` if absent.'''
        artist_bio = None
        page = requests.get(genius_artist_url)
        self.req = self.req + 1
        html = BeautifulSoup(page.text, 'html.parser')
        infoTag = html.find('div', class_='rich_text_formatting')
        if infoTag:
            artist_bio = infoTag.get_text()
        return artist_bio

    def getTrackLanguage(self, lyrics):
        '''Return the language name of the lyrics as classified by langid.

        Falls back to the raw language code if it is missing from ``self.codes``.
        '''
        code = langid.classify(lyrics)[0]
        return self.codes.get(code, code)

    def getAudioFeatures(self):
        '''Return the Spotify audio features of the current track.'''
        self.req = self.req + 1
        return self.spotify.audio_features(self.track_id)

    def getArtistGenre(self):
        '''Return the combined genre list of all artists of the current track.'''
        genres = []
        for uri in self.artist_uri:
            genres.append((self.spotify.artist(uri))['genres'])
            # One API call per artist.
            self.req = self.req + 1
        return list(itertools.chain.from_iterable(genres))

    def getAlbumGenre(self):
        '''Return the genres of the current track's album.'''
        album = self.spotify.albums([self.album_uri])
        self.req = self.req + 1
        return album['albums'][0]['genres']

    def processArtistName(self, artist):
        '''Return the artist name with appended details removed.

        Text inside parentheses is dropped and only the part before the first
        '-' is kept.
        '''
        name = re.sub(r'\(.*?\)', "", artist)
        name = name.split('-')[0]
        return name.strip()

    def getArtistCountry(self, songlang, artist_url):
        '''Return a country/nationality hint for the artist.

        If the song language is known and is not English, the language name
        itself is returned. Otherwise the artist biography is scraped from
        ``artist_url``: a non-English biography yields its language name; an
        English one has its first (up to three) sentences run through spaCy,
        returning the first NORP entity, or else the GPE entities joined with
        ', '. Returns '' when nothing can be determined.
        '''
        country = ''
        # An unknown language (None / '') is not a country hint; fall through to the biography.
        if songlang and songlang != "English":
            return songlang

        #Use the artist url to scrape the Artist biography from Genius
        if not artist_url:
            return country
        artist_info = self.scrape_artist_bio(artist_url)
        if not artist_info:
            return country

        infoLang = langid.classify(artist_info)
        if infoLang[0] != 'en':
            return self.codes.get(infoLang[0], infoLang[0])

        # split('.') leaves a trailing piece after the final full stop, hence "- 1".
        splitText = artist_info.split('.')
        sentenceCount = min(max(len(splitText) - 1, 1), 3)
        shortText = ". ".join(splitText[:sentenceCount])

        if shortText:
            # Loading the spaCy model is expensive; do it once per instance.
            nlp = getattr(self, '_nlp', None)
            if nlp is None:
                nlp = self._nlp = en_core_web_sm.load()
            doc = nlp(shortText)
            places = []
            for ent in doc.ents:
                if ent.label_ == "NORP":
                    return ent.text
                elif ent.label_ == "GPE":
                    places.append(ent.text)
            country = ', '.join(places)

        return country
    


In [3]:
def main(input_csv='results_id_queries.csv', start=0, stop=2,
         lyrics_path='lyrics.txt', songs_path='songs.csv', max_requests=13):
    '''Collect Spotify metadata, Genius lyrics and audio features for a slice of tracks.

    Parameters:
        input_csv: CSV file with the columns 'uri', 'music.title',
            'music.artist' and 'music.album'.
        start, stop: row range of the CSV to process (``rows[start:stop]``).
        lyrics_path: JSON file receiving the ``{'lyrics:<track id>': text}`` mapping.
        songs_path: CSV file receiving one row of features per processed track.
        max_requests: the loop stops once a single track needed more requests than this.

    Both output files are rewritten after every processed track so that
    partial results survive an interruption.
    '''
    tracks = []
    trackLyrics = dict()

    #Import the songs data into a dataframe
    songResults = pd.read_csv(input_csv)

    #Give the range of data frame rows for which the lyrics is required
    sn = songResults[start:stop]

    # Get the uri list
    uri_list = sn['uri'].tolist()
    if not uri_list:
        print('No tracks selected.')
        return

    trackInfo = TrackInfo()
    if trackInfo.spotify is None:
        print('Spotify session is not available.')
        return

    tracksDetails = trackInfo.spotify.tracks(uri_list)
    trackItems = tracksDetails['tracks']
    # Entries can be None when a track is not found; skip those when collecting artists.
    artists_uris = [m['artists'][0]['uri'] for m in trackItems if m]
    artists_genres = trackInfo.spotify.artists(artists_uris) if artists_uris else {'artists': []}
    # Key genres by artist uri so that skipped tracks cannot shift the alignment.
    genres_by_uri = {a['uri']: a['genres'] for a in artists_genres['artists'] if a}
    audio_features = trackInfo.spotify.audio_features(uri_list)

    #Iterate over the rows of the dataframe
    for i in range(sn.shape[0]):
        value = sn.iloc[i, :]
        track = trackItems[i]
        if not track:
            print('No Spotify metadata for', uri_list[i])
            continue

        featuresDict = OrderedDict()
        song_url = None
        artist_url = None
        track_lyrics = None
        songlang = None
        lyrics_state = None

        #Load the track into the trackInfo instance (also resets its request counter)
        trackInfo.unpack(track)

        trackid = trackInfo.track_id.split(":")[-1]

        # Add values from dataframe to the dictionary
        featuresDict['track_uri'] = trackInfo.track_id
        featuresDict['music_title'] = value['music.title']
        featuresDict['music_artist'] = value['music.artist']
        featuresDict['music_album'] = value['music.album']
        featuresDict['found_title_spotify'] = trackInfo.title
        featuresDict['found_album_spotify'] = trackInfo.album
        featuresDict['found_artist_spotify'] = trackInfo.artist

        #Get the cleaned track title by removing appended details in brackets and albumName.
        #Split the string on  '-' and pass the first token in the call which is the actual title
        trackTitle = trackInfo.processTrackName(trackInfo.title, trackInfo.album)

        #Make a call to Genius and retrieve matches of title-artist combination in the genius lyrics database
        geniusResponse = trackInfo.fetch_lyrics_page(trackTitle, trackInfo.artist)

        #Genius call returns the best possible matches. Find the title-artist that best matched spotify input title-artist
        matchedTrack = trackInfo.findResponseMatch(geniusResponse, trackInfo.artist, trackInfo.title, trackInfo.album)

        #If a match is found in genius, fetch the track and artist url from the response
        #Lyrics State specifies if the lyrics is complete
        if matchedTrack:
            song_url = matchedTrack['url']
            artist_url = matchedTrack['primary_artist']['url']
            lyrics_state = matchedTrack['lyrics_state']

        #Store the urls in the dictionary
        if artist_url:
            featuresDict['genius_artist_url'] = artist_url
        #Use the song url to scrape the lyrics page from Genius
        if song_url:
            featuresDict['genius_track_url'] = song_url
            featuresDict['lyrics_state'] = lyrics_state
            #Retrieve the lyrics, track title, artist and other info as found in Genius
            track_lyrics, geniusTitle, geniusArtist, otherInfo = trackInfo.scrape_lyrics_artist_title(song_url)

            #Only when lyrics were actually found, register them under "lyrics:<trackid>"
            #and reference that id from the track metadata.
            if track_lyrics:
                trackLyrics['lyrics:' + trackid] = track_lyrics
                trackInfo.trackDict['lyricsId'] = 'lyrics:' + trackid

            if geniusTitle:
                featuresDict['found_genius_title'] = geniusTitle
            if geniusArtist:
                featuresDict['found_genius_artist'] = geniusArtist
            if otherInfo:
                featuresDict['other_track_info'] = otherInfo

        #Detect the language of the lyrics
        if track_lyrics:
            songlang = trackInfo.getTrackLanguage(track_lyrics)
            featuresDict['lyrics_language'] = songlang

        #Get the country of the Artist
        country = trackInfo.getArtistCountry(songlang, artist_url)
        featuresDict['artist_country'] = country

        #Get the album and artist genres and store in a dictionary
        primary_artist_uri = track['artists'][0]['uri']
        featuresDict['artist_genres'] = str(genres_by_uri.get(primary_artist_uri, []))
        featuresDict['album_genres'] = str([])

        #Add the metadata dictionary to the ordered dictionary
        featuresDict.update(trackInfo.trackDict)

        #Store the audio analysis url in the dictionary
        featuresDict['analysis_url'] = 'https://api.spotify.com/v1/audio-analysis/' + trackid

        #Find audio features of the track and merge with the trackDict
        features = audio_features[i] if audio_features else None
        if features:
            featuresDict.update(features)

        tracks.append(featuresDict)

        #Persist everything collected so far *before* any early exit,
        #so the track that was just processed is never lost.
        try:
            with open(lyrics_path, 'w') as f:
                json.dump(trackLyrics, f)
        except IOError as exc:
            print('Error during file handling', exc)

        try:
            dfn = pd.DataFrame(tracks)
            dfn.to_csv(songs_path, index=False)
        except Exception as exc:
            print("Error while writing to csv file", exc)

        # Break the loop if the number of requests is greater than expected
        if trackInfo.req > max_requests:
            print(i)
            print('Number of requests was greater than expected.')
            break

        #Enforce Rate limit
        #Pause the loop
        sleep(randint(2, 4))
    

# Run the collection pipeline when this cell is executed.
main()