In [35]:
from lyricsgenius import Genius
import json
import pandas as pd
import asyncio
import re
import os
from requests.exceptions import HTTPError
from IPython.display import clear_output

In [36]:
# Read the Genius API token from the secrets file one directory above this
# notebook, then build the client object the search functions call into.
with open('../secrets.json') as secrets_file:
    secrets = json.load(secrets_file)

token = secrets['GENIUS_ACCESS_TOKEN']
genius = Genius(token)

In [49]:
# Query table for the rank_1 dataset; its 'Artist' and 'Song' columns drive
# the Genius lookups performed later in this notebook.
qdf = pd.read_csv('../datasets/rank_1/query.csv')

In [50]:
# In-memory memo of artist name -> search result. search_artist reads it
# before calling the API and writes to it after a successful lookup.
# Re-running this cell empties the memo.
artist_cache = {}

In [56]:
async def search_artist(artist, retries, delay):
    """Look up an artist on Genius, retrying transient HTTP failures.

    The blocking ``genius.search_artist`` call runs in a worker thread so
    several lookups can be awaited concurrently.

    Args:
        artist: Artist name to search for; also the key used in
            ``artist_cache``.
        retries: Maximum number of attempts in total (``1`` means a single
            try with no retry).
        delay: Seconds to wait between a failed attempt and the next one.

    Returns:
        The value returned by ``genius.search_artist`` (also stored in
        ``artist_cache``), or ``None`` when a non-retryable error occurs or
        every attempt fails.

    Note:
        The cache is only filled once a lookup finishes, so calls for the
        same artist that are already in flight concurrently each hit the API.
    """
    if artist in artist_cache:
        return artist_cache[artist]

    retryable_statuses = {429, 500, 503, 504}

    for attempt in range(1, retries + 1):
        try:
            data = await asyncio.to_thread(genius.search_artist, artist, max_songs=0)
        except Exception as e:
            status = None
            if isinstance(e, HTTPError):
                response = getattr(e, 'response', None)
                if response is not None:
                    # Compare with None explicitly: a requests.Response is
                    # falsy for 4xx/5xx, so a bare truth test never passes
                    # here. The attribute is ``status_code``, not ``status``.
                    status = getattr(response, 'status_code', None)
                elif e.args and isinstance(e.args[0], int):
                    # NOTE(review): lyricsgenius appears to re-raise as
                    # HTTPError(status_code, message) with no response
                    # attached -- confirm against the installed version.
                    status = e.args[0]

            if status not in retryable_statuses:
                print(e)
                return None

            # Only wait when another attempt will actually follow.
            if attempt < retries:
                print('Retrying...')
                await asyncio.sleep(delay)
        else:
            artist_cache[artist] = data
            return data

    return None

In [None]:
genius_artists = await asyncio.gather(*(search_artist(artist, retries=1, delay=50) for artist in qdf['Artist']))

In [27]:
async def search_song(song, artist, retries, delay):
    """Look up a song on Genius for a given artist, retrying transient errors.

    The blocking ``genius.search_song`` call runs in a worker thread so
    several lookups can be awaited concurrently.

    Args:
        song: Song title to search for.
        artist: Artist object whose ``.name`` is passed to the search, or
            ``None`` when the artist lookup produced nothing.
        retries: Maximum number of attempts in total.
        delay: Seconds to wait between a failed attempt and the next one.

    Returns:
        The truthy result of ``genius.search_song``, or ``None`` when the
        artist is missing, nothing was found, a non-retryable error occurs,
        or every attempt fails.
    """
    # Without an artist there is nothing to query; bail out instead of
    # failing on ``artist.name`` inside the retry loop.
    if artist is None:
        return None

    retryable_statuses = {429, 500, 503, 504}

    for attempt in range(1, retries + 1):
        try:
            data = await asyncio.to_thread(genius.search_song, song, artist.name)
        except Exception as e:
            status = None
            if isinstance(e, HTTPError):
                response = getattr(e, 'response', None)
                if response is not None:
                    # Compare with None explicitly: a requests.Response is
                    # falsy for 4xx/5xx, so a bare truth test never passes
                    # here. The attribute is ``status_code``, not ``status``.
                    status = getattr(response, 'status_code', None)
                elif e.args and isinstance(e.args[0], int):
                    # NOTE(review): lyricsgenius appears to re-raise as
                    # HTTPError(status_code, message) with no response
                    # attached -- confirm against the installed version.
                    status = e.args[0]

            if status not in retryable_statuses:
                # Give up right away; looping again without counting the
                # attempt would never terminate.
                print(f"Error: {e}")
                return None

            # Only wait when another attempt will actually follow.
            if attempt < retries:
                print(f"Retrying due to error: {e}")
                await asyncio.sleep(delay)
        else:
            return data if data else None

    return None

In [None]:
songs_results = await asyncio.gather(*(search_song(song, artist, retries=2, delay=3) for song, artist in zip(qdf['Song'], genius_artists)))

In [10]:
# Pull the lyrics text from every hit; rows with no hit get the literal
# string 'None' as a placeholder.
lyrics = ['None' if not hit else hit.lyrics for hit in songs_results]

In [11]:
def clean_lyrics(raw_lyrics):
    """Reduce a scraped lyrics string to plain, space-separated words.

    Steps, in order:
      1. drop a leading ``<N> Contributors...`` token,
      2. drop the rest of the first line up to and including ``Lyrics``,
      3. drop bracketed tags such as ``[Chorus]``,
      4. drop every character that is not an ASCII letter, digit,
         whitespace or apostrophe,
      5. collapse whitespace runs into single spaces,
      6. drop a trailing ``<digits>Embed`` marker.

    Args:
        raw_lyrics: Lyrics text as scraped; may be empty or ``None``.

    Returns:
        The cleaned text, or an empty string for empty/``None`` input.
    """
    if not raw_lyrics:
        return ""

    cleaned = re.sub(r'^\d+\s+Contributors\S*\s+', '', raw_lyrics)
    cleaned = re.sub(r"^.*Lyrics", '', cleaned)
    cleaned = re.sub(r"\[.*?\]", "", cleaned)
    cleaned = re.sub(r"[^a-zA-Z0-9\s']", "", cleaned)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()
    # The text is a single line by now, so the marker must be anchored to the
    # end: an unanchored "Embed.*$" would cut everything after the first
    # "Embed" anywhere in the lyrics (e.g. the word "Embedded"). ``\d*`` also
    # removes a multi-digit count glued in front of the marker.
    cleaned = re.sub(r"\d*Embed$", "", cleaned).strip()
    return cleaned

In [12]:
# Run every raw lyrics string through the cleaner, keeping row order.
lyrics_clean = list(map(clean_lyrics, lyrics))

In [13]:
# Track IDs are read from a different file than the queries and are later
# matched to the cleaned lyrics purely by row position.
# NOTE(review): assumes unique.csv rows line up one-to-one with query.csv -- confirm.
idf = pd.read_csv('../datasets/rank_1/unique.csv')
track_ids = idf['Track ID']

In [14]:
# Build {row position: {"track_id": ..., "lyrics": ...}} by walking the track
# IDs and cleaned lyrics in lockstep; zip stops at the shorter of the two.
cleaned_with_ids = dict(enumerate(
    {"track_id": tid, "lyrics": text}
    for tid, text in zip(track_ids, lyrics_clean)
))

In [15]:
# Persist the position -> {track_id, lyrics} mapping as JSON, overwriting any
# previous export at this path.
with open('../datasets/rank_1/lyrics.json', 'w') as out_file:
    json.dump(cleaned_with_ids, out_file)