In [6]:
import pandas as pd
import lyricsgenius
import ast
import time
import re
import os
import random
from dotenv import load_dotenv

In [7]:
# Load variables from a local .env file, then build the Genius API client
# from the TOKEN_GENIUS environment variable.
load_dotenv()
genius = lyricsgenius.Genius(os.environ.get("TOKEN_GENIUS"))

# Client tuning knobs (see the lyricsgenius docs for exact semantics).
genius.sleep_time = 0.5
genius.timeout = 15

In [8]:
# Source table for the scrape; later cells read its id, artists, name and decade columns.
df = pd.read_csv(filepath_or_buffer="data/dataset_processed.csv")

In [9]:
def clean_lyrics(lyrics):
    """Strip scraper boilerplate from a lyrics string and flatten it to one line.

    Removes the first "...Lyrics<digits>" run (presumably the page header that
    precedes the actual lyrics -- confirm against the scraper's output), removes
    a trailing "<digits>Embed" marker, and replaces newlines with spaces.

    Args:
        lyrics: Raw lyrics text. None, empty, or non-string values are accepted.

    Returns:
        The cleaned, whitespace-trimmed string, or "" when there is nothing
        usable to clean.
    """
    # Non-string values (e.g. NaN) cannot be passed to re.sub; treat them as empty.
    if not lyrics or not isinstance(lyrics, str):
        return ""
    # count=1 limits the removal to the first matching line. Without it every
    # later line containing the word "Lyrics" was truncated up to that word,
    # silently deleting real lyric text.
    lyrics = re.sub(r'.*Lyrics\d*', '', lyrics, count=1)
    # `$` without re.MULTILINE only matches at the very end of the text.
    lyrics = re.sub(r'\d*Embed$', '', lyrics)
    lyrics = lyrics.replace('\n', ' ')
    return lyrics.strip()

def get_first_artist(artists_str):
    """Return the first element of a stringified list of artists.

    Args:
        artists_str: Text holding a Python list literal, e.g. "['A', 'B']".

    Returns:
        The first element of the parsed list, or "" when the text cannot be
        parsed, is not a list/tuple, or is empty.
    """
    try:
        artists_list = ast.literal_eval(artists_str)
    # Only the errors literal_eval is documented to raise; a bare `except:`
    # would also swallow KeyboardInterrupt and SystemExit.
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return ""
    # A plain string literal would otherwise be indexed and yield its first
    # character instead of an artist name.
    if isinstance(artists_list, (list, tuple)) and artists_list:
        return artists_list[0]
    return ""

In [None]:
# Build the work list: one entry per row of `df` with a usable first artist
# and a title.
songs_to_search = []
for _, row in df.iterrows():
    artist = get_first_artist(row['artists'])
    title = row['name']
    # NaN is truthy, so a missing title must be checked explicitly or it
    # would be sent to the API as a search term.
    if artist and pd.notna(title) and title:
        songs_to_search.append({
            "id": row['id'],
            "artist": artist,
            "title": title,
            "decade": row['decade']
        })

print(f"Buscando letras para {len(songs_to_search)} canciones...")
total = len(songs_to_search)

lyrics_data = []            # rows fetched since the last checkpoint write
checkpoint_interval = 100   # write to disk once this many new rows are buffered
checkpoint_file = "data/lyrics_dataset.csv"


def _flush_checkpoint(saved_df, pending_rows):
    """Append pending_rows to saved_df, rewrite the checkpoint CSV, return the result.

    Returns saved_df untouched (and writes nothing) when pending_rows is empty.
    """
    if not pending_rows:
        return saved_df
    merged = pd.concat([saved_df, pd.DataFrame(pending_rows)], ignore_index=True)
    merged.to_csv(checkpoint_file, index=False, encoding="utf-8")
    return merged


# Resume from a previous run: ids already in the checkpoint are skipped below.
if os.path.exists(checkpoint_file):
    lyrics_df = pd.read_csv(checkpoint_file)
    processed_ids = set(lyrics_df["id"])
    print(f"Se cargaron {len(processed_ids)} canciones ya guardadas.")
else:
    lyrics_df = pd.DataFrame(columns=["id", "artist", "song_name", "lyrics", "decade"])
    processed_ids = set()

random.shuffle(songs_to_search)

try:
    for i, s in enumerate(songs_to_search):
        if s["id"] in processed_ids:
            continue

        try:
            if i % 10 == 0:
                print(f"Progreso: {i}/{total} canciones procesadas")

            song = genius.search_song(s["title"], s["artist"])

            if song:
                lyrics_data.append({
                    "id": s["id"],
                    "artist": s["artist"],
                    "song_name": s["title"],
                    "lyrics": clean_lyrics(song.lyrics),
                    "decade": s["decade"]
                })
                # Keep the skip-set current so a duplicate id later in the
                # list is not fetched and stored a second time.
                processed_ids.add(s["id"])
            else:
                print(f"No se encontró: {s['artist']} - {s['title']}")

            # `len(...) % interval == 0` is also true for an empty buffer,
            # which rewrote the whole CSV on every miss; only flush once the
            # buffer has actually filled up.
            if len(lyrics_data) >= checkpoint_interval:
                lyrics_df = _flush_checkpoint(lyrics_df, lyrics_data)
                lyrics_data = []

            time.sleep(random.uniform(1, 2))

        except Exception as e:
            error_str = str(e)
            # The error text is matched for "429" / "1015" to detect a rate
            # limit or block.
            if "429" in error_str or "1015" in error_str:
                print("Límite alcanzado o bloqueo detectado.")
                retry_after = 3600  # fallback wait (seconds) when no Retry-After is found
                match = re.search(r"Retry-After': '(\d+)'", error_str)
                if match:
                    retry_after = int(match.group(1))
                print(f"Esperando {retry_after / 60:.1f} minutos antes de continuar...")
                # Persist what we have before the long sleep.
                lyrics_df = _flush_checkpoint(lyrics_df, lyrics_data)
                lyrics_data = []
                # NOTE(review): the song that triggered the limit is not
                # retried in this run; it is never added to the checkpoint, so
                # a later run picks it up again.
                time.sleep(retry_after + 10)
            else:
                print(f"Error con {s['title']} de {s['artist']}: {e}")
                time.sleep(2)
finally:
    # Runs on normal completion and on interruption (e.g. KeyboardInterrupt),
    # so buffered rows are not lost when the run is stopped by hand.
    lyrics_df = _flush_checkpoint(lyrics_df, lyrics_data)
    lyrics_data = []

Buscando letras para 120750 canciones...
Se cargaron 9200 canciones ya guardadas.
Progreso: 0/120750 canciones procesadas
Searching for "Bittersweet Tragedy" by Melanie Martinez...
Done.
Searching for "Por el Resto" by Los Enanitos Verdes...
Done.
Searching for "You Still Believe In Me - Instrumental Stereo Mix" by The Beach Boys...
No results found for: 'You Still Believe In Me - Instrumental Stereo Mix The Beach Boys'
No se encontró: The Beach Boys - You Still Believe In Me - Instrumental Stereo Mix
Searching for "Please, Please, Please - Live At The Apollo Theater, 1967" by James Brown & The Famous Flames...
Done.
Searching for "Angie Girl" by Stevie Wonder...
Done.
Searching for "Southern Accents" by Tom Petty and the Heartbreakers...
Done.
Searching for "O.P. Jebediah" by The Dip...
Done.
Searching for "Trouble Every Day" by The Mothers Of Invention...
Done.
Searching for "Hey Ma" by Cam’ron...
Done.
Searching for "The Rover - Remaster" by Led Zeppelin...
No results found for: 'Th