In [10]:
import os
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import lyricsgenius
from dotenv import load_dotenv
import time

# --- 1. Load environment variables and build the API clients ---
# Credentials are read from the project's .env file rather than hard-coded.
load_dotenv(dotenv_path="../env/.env")

# Spotify client authenticated through SpotifyClientCredentials.
auth_manager = SpotifyClientCredentials(
    client_id=os.getenv("SPOTIFY_CLIENT_ID"),
    client_secret=os.getenv("SPOTIFY_CLIENT_SECRET"),
)
sp = spotipy.Spotify(auth_manager=auth_manager)

# Genius client: skip non-song results, keep section headers such as
# "[Refrain]" in the lyrics, and silence the per-search console output.
genius = lyricsgenius.Genius(
    os.getenv("GENIUS_API_TOKEN"),
    skip_non_songs=True,
    remove_section_headers=False,
    verbose=False,
)

In [12]:
# --- 2. Set artist ---
artist_name = "Booba"
# Use the top-ranked Spotify search hit for the artist name.
artist_matches = sp.search(q=artist_name, type="artist", limit=1)["artists"]["items"]
if not artist_matches:
    # Fail with an explicit message instead of an unexplained IndexError on [0].
    raise ValueError(f"No Spotify artist found for {artist_name!r}")
spotify_artist = artist_matches[0]
artist_id = spotify_artist["id"]

# --- 3. Get albums ---
# One request returns at most `limit` albums, so follow the paging links:
# otherwise a discography larger than one page is silently truncated.
albums = []
albums_page = sp.artist_albums(artist_id=artist_id, album_type="album", country="FR", limit=50)
while albums_page:
    albums.extend(albums_page["items"])
    albums_page = sp.next(albums_page)  # None once there is no further page

# Keep only the first album returned for each name; later entries with the
# same name are treated as duplicates and dropped.
seen, unique_albums = set(), []
for album in albums:
    if album["name"] not in seen:
        seen.add(album["name"])
        unique_albums.append(album)

# --- 4. Build dataset ---
# One record per track: Spotify metadata plus the lyrics found on Genius.
all_tracks = []
for album in unique_albums:
    album_name = album["name"]
    release_date = album["release_date"]
    album_id = album["id"]

    # Follow the paging links so albums longer than one page are read in full.
    tracks = []
    tracks_page = sp.album_tracks(album_id)
    while tracks_page:
        tracks.extend(tracks_page["items"])
        tracks_page = sp.next(tracks_page)  # None once there is no further page

    for track in tracks:
        title = track["name"]
        duration = track["duration_ms"] // 1000  # milliseconds -> whole seconds
        track_id = track["id"]

        # Popularity is read from the full track object (a separate request).
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt still stops this long-running loop.
        try:
            popularity = sp.track(track_id)["popularity"]
        except Exception as exc:
            print(f"⚠️ Popularity lookup failed for {title!r}: {exc}")
            popularity = None

        # Collect every credited artist (main artist + featured artists)
        artists_list = [artist['name'] for artist in track['artists']]
        artistes = ", ".join(artists_list)

        # Look up the lyrics on Genius; best effort, None when nothing is found
        try:
            genius_song = genius.search_song(title=title, artist=artist_name)
            lyrics = genius_song.lyrics if genius_song else None
        except Exception as exc:
            print(f"⚠️ Lyrics lookup failed for {title!r}: {exc}")
            lyrics = None

        all_tracks.append({
            "titre": title,
            "album": album_name,
            "date": release_date,
            "durée_sec": duration,
            "popularité": popularity,
            "artistes": artistes,
            "paroles": lyrics
        })

        time.sleep(0.5)  # throttle requests to stay under the API rate limits

# --- 5. Save ---
df = pd.DataFrame(all_tracks)
# df = df.dropna(subset=["paroles"])  # optional: keep only the tracks that have lyrics

# Create the target folder first: to_csv does not create missing directories,
# so the export would otherwise fail on a fresh checkout after the long scrape.
output_path = "../data/raw/booba_songs_dataset.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
print("✅ Dataset enregistré dans data/raw/booba_songs_dataset.csv")

# --- Preview ---
df.head()


✅ Dataset enregistré dans data/raw/booba_songs_dataset.csv


Unnamed: 0,titre,album,date,durée_sec,popularité,artistes,paroles
0,Rebel,AD VITAM ÆTERNAM,2024-02-08,133,52,Booba,"21 ContributorsRebel Lyrics[Paroles de ""Rebel""..."
1,Saga,AD VITAM ÆTERNAM,2024-02-08,194,66,Booba,"20 ContributorsSaga Lyrics[Paroles de ""Saga""]\..."
2,Dolce Camara,AD VITAM ÆTERNAM,2024-02-08,178,73,"Booba, SDM",54 ContributorsTranslationsEnglishDolce Camara...
3,Sport Billy,AD VITAM ÆTERNAM,2024-02-08,126,46,Booba,"32 ContributorsSport Billy Lyrics[Paroles de ""..."
4,Signé,AD VITAM ÆTERNAM,2024-02-08,180,48,Booba,16 ContributorsSigné LyricsC’est courant mai 2...
