### Create a unified dataset with song ID, artist name, track name, and valence–arousal (V-A) mean and std columns

In [2]:
import sys
import os
import pandas as pd
from pathlib import Path

sys.path.append('..')

# import toolkits from src
from src.deam_loader import create_deam_base_dataset
from src.lyric_utils import LyricsFetcher

from pathlib import Path

# Song-level static annotation CSV; passed as `annotations_path` to create_deam_base_dataset.
ANNOTATIONS_PATH = Path("../data/raw/DEAM_Annotations/annotations/annotations averaged per song/song_level/static_annotations_averaged_songs_1_2000.csv")
# Metadata folder; passed as `metadata_dir` to create_deam_base_dataset.
METADATA_DIR = Path("../data/raw/metadata_DEAM/metadata")
# Text file whose whitespace-stripped contents are used as the Genius API token.
# NOTE(review): presumably excluded from version control — confirm.
TOKEN_FILE = Path("../secrets.txt")
# JSON lyrics cache: handed to LyricsFetcher as `cache_path` and reloaded later
# when the lyrics column is refreshed from the curated cache.
CACHE_FILE = Path("../data/processed/DEAM/lyrics_cache.json")
# Destination of the final dataset (Parquet); a CSV copy with the same stem is
# written next to it by a later cell.
OUTPUT_FILE = Path("../data/processed/deam_core_with_lyrics.parquet")

In [3]:
# Step 1: assemble the lyric-free base table from the annotation CSV and the metadata folder.
print("Step 1: Creating base dataset ")
base_df = create_deam_base_dataset(annotations_path=ANNOTATIONS_PATH, metadata_dir=METADATA_DIR)

Step 1: Creating base dataset 
Loaded 58 test songs
Created dataset with 1802 songs
  Train/val (≤2000): 1744
  Test (>2000): 58


### Add lyrics

In [4]:
# Step 2: attach lyrics to the base table through the Genius-backed fetcher.
print("\nStep 2: Enriching with lyrics")

# The token file may end with a newline; strip it before handing the token over.
token = TOKEN_FILE.read_text().strip()
fetcher = LyricsFetcher(
    genius_api_token=token,
    cache_path=CACHE_FILE,
)

# The cache is written to disk in batches of 50 newly processed songs.
final_df = fetcher.enrich_dataframe(base_df, batch_save_size=50)

2025-08-22 07:33:35,398 - INFO - LyricsFetcher initialized with fuzzy matching threshold of 85.
2025-08-22 07:33:35,400 - INFO - Found 1802 songs needing lyrics.



Step 2: Enriching with lyrics


Fetching Lyrics:   0%|          | 0/1802 [00:00<?, ?it/s]

  song = Song(self, song_info, lyrics)
2025-08-22 07:39:54,480 - INFO - Cache saved after processing 50 new songs.
2025-08-22 07:43:09,164 - INFO - Lyrics fetching complete. Final cache saved.
2025-08-22 07:43:09,165 - INFO - Found lyrics for 241 out of 1802 songs.


In [5]:
# Step 3: persist the lyric-enriched table as Parquet.
print("\n--- Step 3: Saving final dataset ---")

# Create the destination folder (and any missing parents) before writing.
OUTPUT_FILE.parent.mkdir(exist_ok=True, parents=True)
final_df.to_parquet(OUTPUT_FILE, index=False)

print(f"Successfully saved final dataset to {OUTPUT_FILE}")


--- Step 3: Saving final dataset ---
Successfully saved final dataset to ../data/processed/deam_core_with_lyrics.parquet


#### Lyrics fetched with the Genius API via the lyricsgenius library
#### Each (artist, track) was normalized (lowercasing, removing punctuation, trimming "feat." and parentheses)
#### Multiple name variants were tried for robust matching (e.g. "and" vs. "&", shortened artist names)
#### Fuzzy string matching (rapidfuzz) was used when exact matches were not reliable (Threshold = 85)
#### A two-pass search strategy was used: first with artist and title, then with title only, followed by post-validation
#### Retrieved lyrics were cleaned: ads, "embed" markers, contributor info, and section headers were removed. Short/empty results were discarded
#### A cache file (lyrics_cache.json) ensured that songs were not refetched, that progress could resume after an interruption, and that repeated API calls were avoided

In [6]:
# Sanity check: column dtypes and non-null counts first, then the first five rows.
print("\nFinal DataFrame Info:")
final_df.info()
display(final_df.head(5))


Final DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1802 entries, 0 to 1801
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   song_id       1802 non-null   int64  
 1   track_name    1802 non-null   object 
 2   artist_name   1802 non-null   object 
 3   valence_mean  1744 non-null   float64
 4   arousal_mean  1744 non-null   float64
 5   valence_std   1744 non-null   float64
 6   arousal_std   1744 non-null   float64
 7   lyrics        241 non-null    object 
dtypes: float64(4), int64(1), object(3)
memory usage: 112.8+ KB


Unnamed: 0,song_id,track_name,artist_name,valence_mean,arousal_mean,valence_std,arousal_std,lyrics
0,2,Tonight A Lonely Century,The New Mystikal Troubadours,3.1,3.0,0.94,0.63,
1,3,DD Groove,Kevin MacLeod,3.5,3.3,1.75,1.62,
2,4,Slow Burn,Kevin MacLeod,5.7,5.5,1.42,1.63,
3,5,Nothing Much,My Bubba & Mi,4.4,5.3,2.01,1.85,
4,7,Hustle,Kevin MacLeod,5.8,6.4,1.47,1.69,


### Save the final lyric dataset
#### Manual changes: lyrics were added for niche songs from Bandcamp and other lesser-known lyric sources
#### Each song that has lyrics was listened to in order to verify that the lyrics match (non-matching lyrics were deleted)
#### Non-English lyrics were translated with ChatGPT (GPT-4o), using a prompt that asked it to preserve the emotional content

In [7]:
import json

# Reload the saved dataset and read the curated lyrics cache (a JSON mapping).
final_df = pd.read_parquet(OUTPUT_FILE)
cache = json.loads(CACHE_FILE.read_text(encoding="utf-8"))

# Lookup keys follow the cache's "artist|track" lowercased format.
# NOTE(review): assumes the cache keys carry no normalization beyond lowercasing —
# confirm against how LyricsFetcher builds its keys.
artist_lower = final_df["artist_name"].str.lower()
track_lower = final_df["track_name"].str.lower()
keys = artist_lower + "|" + track_lower

# Replace the lyrics column wholesale; rows whose key is absent from the cache become NaN.
final_df["lyrics"] = keys.map(cache)

# Write the refreshed table back over the Parquet file.
final_df.to_parquet(OUTPUT_FILE, index=False)

# Also export a CSV copy; utf-8-sig adds a BOM so Excel displays the text correctly.
final_df.to_csv(OUTPUT_FILE.with_suffix(".csv"), index=False, encoding="utf-8-sig")

print(f"Updated lyrics saved to: {OUTPUT_FILE} and {OUTPUT_FILE.with_suffix('.csv')}")

Updated lyrics saved to: ../data/processed/deam_core_with_lyrics.parquet and ../data/processed/deam_core_with_lyrics.csv


In [8]:
# Re-inspect the table after the lyrics refresh: schema and null counts, then the first five rows.
print("\nFinal DataFrame Info:")
final_df.info()
display(final_df.head(5))


Final DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1802 entries, 0 to 1801
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   song_id       1802 non-null   int64  
 1   track_name    1802 non-null   object 
 2   artist_name   1802 non-null   object 
 3   valence_mean  1744 non-null   float64
 4   arousal_mean  1744 non-null   float64
 5   valence_std   1744 non-null   float64
 6   arousal_std   1744 non-null   float64
 7   lyrics        241 non-null    object 
dtypes: float64(4), int64(1), object(3)
memory usage: 112.8+ KB


Unnamed: 0,song_id,track_name,artist_name,valence_mean,arousal_mean,valence_std,arousal_std,lyrics
0,2,Tonight A Lonely Century,The New Mystikal Troubadours,3.1,3.0,0.94,0.63,
1,3,DD Groove,Kevin MacLeod,3.5,3.3,1.75,1.62,
2,4,Slow Burn,Kevin MacLeod,5.7,5.5,1.42,1.63,
3,5,Nothing Much,My Bubba & Mi,4.4,5.3,2.01,1.85,
4,7,Hustle,Kevin MacLeod,5.8,6.4,1.47,1.69,
