## Build merged Spotify lyrics dataset
Combine `songs.csv` and `spotify_songs.csv`, normalize key columns, and persist a cleaned dataset we can reuse for model training or lyric retrieval.

In [6]:
import ast
import re
from IPython.display import display
from pathlib import Path
import pandas as pd
import numpy as np

NOTEBOOK_DIR = Path.cwd().resolve()
if (NOTEBOOK_DIR / "datasets").exists():
    PROJECT_ROOT = NOTEBOOK_DIR
else:
    PROJECT_ROOT = NOTEBOOK_DIR.parent

DATASET_ROOT = PROJECT_ROOT / "datasets" / "emotions_NLP"

SONGS_META_PATH = PROJECT_ROOT / "datasets" / "song_features" / "songs.csv"
SPOTIFY_SONGS_PATH = PROJECT_ROOT / "datasets" / "song_features" / "spotify_songs.csv"
COMBINED_SONGS_PATH = PROJECT_ROOT / "datasets" / "song_features" / "merged_spotify_songs.csv"

CANONICAL_COLUMNS = [
    "song_id",
    "name",
    "artists",
    "album_name",
    "album_release_date",
    "playlist_genre",
    "playlist_subgenre",
    "danceability",
    "energy",
    "key",
    "loudness",
    "mode",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo",
    "duration_ms",
    "track_popularity",
    "lyrics",
    "language",
    "source",
]

def _parse_lyrics(raw: str) -> str:
    if pd.isna(raw):
        return ""
    text = str(raw).strip()
    if not text:
        return ""
    if text.startswith("[") and text.endswith("]"):
        try:
            tokens = ast.literal_eval(text)
            if isinstance(tokens, (list, tuple)):
                text = " ".join(str(tok) for tok in tokens)
        except Exception:
            text = text.replace("[", " ").replace("]", " ")
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def _clean_artists(value: str) -> str:
    if pd.isna(value):
        return ""
    value = str(value)
    value = value.replace("[", "").replace("]", "")
    value = value.replace("'", "")
    value = re.sub(r"\s+", " ", value)
    return value.strip()

def _standardize_song_frame(df: pd.DataFrame, source: str) -> pd.DataFrame:
    rename_map = {
        "track_id": "song_id",
        "track_name": "name",
        "track_artist": "artists",
        "track_album_name": "album_name",
        "track_album_release_date": "album_release_date",
        "playlist_subgenre": "playlist_subgenre",
        "language": "language",
    }
    df = df.rename(columns=rename_map)
    keep_cols = [col for col in CANONICAL_COLUMNS if col in df.columns]
    missing_cols = [col for col in CANONICAL_COLUMNS if col not in df.columns]
    for col in missing_cols:
        df[col] = np.nan
    df = df[CANONICAL_COLUMNS].copy()
    df["source"] = source
    df["lyrics"] = df["lyrics"].apply(_parse_lyrics)
    df = df[df["lyrics"].str.len() >= 30]
    df["artists"] = df["artists"].apply(_clean_artists)
    df["album_name"] = df["album_name"].fillna("")
    df["language"] = df["language"].fillna("unknown")
    numeric_cols = [
        "danceability",
        "energy",
        "key",
        "loudness",
        "mode",
        "speechiness",
        "acousticness",
        "instrumentalness",
        "liveness",
        "valence",
        "tempo",
        "duration_ms",
        "track_popularity",
    ]
    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col], errors="coerce")
    pop = df["track_popularity"]
    if pop.notna().any() and pop.max() <= 1:
        df["track_popularity"] = pop * 100
    df = df.dropna(subset=["song_id", "lyrics"])
    df["lyrics"] = df["lyrics"].str.lower()
    df["lyrics"] = df["lyrics"].str.replace(r"\s+", " ", regex=True).str.strip()
    df["lyrics_length"] = df["lyrics"].str.len()
    return df

songs_base = pd.read_csv(SONGS_META_PATH)
spotify_additional = pd.read_csv(SPOTIFY_SONGS_PATH)

songs_base = _standardize_song_frame(songs_base, source="songs_csv")
spotify_additional = _standardize_song_frame(spotify_additional, source="spotify_songs_csv")

merged_songs = pd.concat([songs_base, spotify_additional], ignore_index=True)
merged_songs = merged_songs.drop_duplicates(subset=["song_id"])
merged_songs = merged_songs.reset_index(drop=True)

print(f"Merged songs: {len(merged_songs):,} rows")
display(merged_songs.sample(3))

Merged songs: 18,247 rows


Unnamed: 0,song_id,name,artists,album_name,album_release_date,playlist_genre,playlist_subgenre,danceability,energy,key,...,instrumentalness,liveness,valence,tempo,duration_ms,track_popularity,lyrics,language,source,lyrics_length
9182,3rYil4c9oKuTVUBquB1izZ,Twork It Out,Usher,8701,2001-08-07,r&b,new jack swing,0.627,0.675,6.0,...,0.0,0.0766,0.519,89.973,282867,43.0,"yo, check this here there's nothing that i'd r...",en,spotify_songs_csv,3028
12602,5Df6eVUr5N0eZInCo3VmJG,Claimen,Jayh,Claimen,2014-12-12,latin,latin hip hop,0.888,0.616,2.0,...,1e-06,0.0627,0.751,97.918,148547,40.0,"je wilt claimen, noh ze willen claimen, ze wil...",nl,spotify_songs_csv,1839
4726,1QDsm9dZ4IETeluYNER6rX,Everyday & Everynight - Straight Pass,Yvette Michele,My Dream,1997-10-28,r&b,new jack swing,0.739,0.456,6.0,...,0.000776,0.108,0.658,94.893,232107,29.0,i'm crazy for you mr. dj i just wanna get to k...,en,spotify_songs_csv,1478


Persist the cleaned corpus for downstream training/reranking.

In [7]:
merged_songs.to_csv(COMBINED_SONGS_PATH, index=False)
print(f"Saved merged dataset to {COMBINED_SONGS_PATH}")
merged_songs.head(2)

Saved merged dataset to /Users/himanshu/Documents/Github/prompt2song/datasets/song_features/merged_spotify_songs.csv


Unnamed: 0,song_id,name,artists,album_name,album_release_date,playlist_genre,playlist_subgenre,danceability,energy,key,...,instrumentalness,liveness,valence,tempo,duration_ms,track_popularity,lyrics,language,source,lyrics_length
0,6oJ6le65B3SEqPwMRNXWjY,higher love,Kygo,,2019-06-28,Pop,,0.63268,0.667346,0.727273,...,,0.086004,0.39137,0.290605,228267,0.5,"bring me higher love, love bring me higher lov...",unknown,songs_csv,1683
1,3yNZ5r3LKfdmjoS3gkhUCT,bad guy (with justin bieber),Billieeilish,,2019-07-11,Pop,,0.602614,0.425904,0.0,...,,0.10293,0.687634,0.508374,194840,0.318182,"yeah, yeah oh, ah white shirt now red, my bloo...",unknown,songs_csv,5531
