# Day 2 — Recommender Training + Evaluation (Option C)

This notebook consumes the **Day 1 parquet outputs** and produces:

- Sessionized listens (`listens_with_sessions.parquet`)
- Baseline **co-occurrence** recommender (`cooc_index.parquet`)
- **Item2Vec** embedding recommender (Gensim Word2Vec model + vectors parquet)
- Offline evaluation: **Next-track HitRate@K** for both models

**Expected inputs (from Day 1):**
- `data/processed/listens_event_level.parquet`
- `data/processed/tracks_modeling_table.parquet` (schema-clean)

**Artifacts written (Day 2):**
- `data/processed/listens_with_sessions.parquet`
- `data/models/cooc_index.parquet`
- `data/models/item2vec.model`
- `data/models/item2vec_vectors.parquet`
- `data/models/eval_metrics.json`


In [1]:
# --- Imports ---
from __future__ import annotations

from pathlib import Path
from collections import defaultdict, Counter
import json

import numpy as np
import pandas as pd
from tqdm.auto import tqdm


In [2]:
# --- Project paths (robust when notebook lives in notebooks/) ---
# Finds the repo root by checking the working directory and then up to five of
# its ancestors for a 'data/' directory or a 'config.py' file.

HERE = Path().resolve()
MAX_ROOT_CANDIDATES = 6  # HERE itself plus five ancestors

ROOT = next(
    (
        candidate
        for candidate in [HERE, *HERE.parents][:MAX_ROOT_CANDIDATES]
        if (candidate / "data").exists() or (candidate / "config.py").exists()
    ),
    None,
)
if ROOT is None:
    # No marker found: stay in the working directory rather than ending up on an
    # unchecked ancestor, where the mkdir calls below would create stray folders.
    print("WARNING: no 'data/' or 'config.py' found at or above", HERE, "- using it as ROOT")
    ROOT = HERE

DATA_DIR = ROOT / "data"
PROCESSED_DIR = DATA_DIR / "processed"
MODEL_DIR = DATA_DIR / "models"

# Output folders are created up front so later cells can write without checks.
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
MODEL_DIR.mkdir(parents=True, exist_ok=True)

LISTENS_FP = PROCESSED_DIR / "listens_event_level.parquet"
TRACKS_FP  = PROCESSED_DIR / "tracks_modeling_table.parquet"

print("ROOT:", ROOT)
print("LISTENS_FP exists:", LISTENS_FP.exists(), LISTENS_FP)
print("TRACKS_FP exists:", TRACKS_FP.exists(), TRACKS_FP)


ROOT: C:\Users\maxma\Documents\Spotify Project
LISTENS_FP exists: True C:\Users\maxma\Documents\Spotify Project\data\processed\listens_event_level.parquet
TRACKS_FP exists: True C:\Users\maxma\Documents\Spotify Project\data\processed\tracks_modeling_table.parquet


In [3]:
# --- Load Day 1 data ---
# Read both Day 1 parquet tables, then report their sizes and schemas.
listens, tracks = (pd.read_parquet(fp) for fp in (LISTENS_FP, TRACKS_FP))

print("listens rows:", listens.shape[0])
print("tracks rows:", tracks.shape[0])
print("listens columns:", list(listens.columns))
print("tracks columns:", list(tracks.columns))


listens rows: 136946
tracks rows: 25516
listens columns: ['ts', 'date', 'year', 'month', 'year_month', 'dayofweek', 'hour', 'spotify_track_uri', 'track_id', 'track_name', 'artist_name', 'artist_id_primary', 'artist_name_primary', 'album_name', 'ms_played', 'seconds_played', 'duration_ms', 'listen_ratio', 'listen_bucket', 'skipped', 'primary_genre', 'artist_genres', 'track_popularity', 'album_release_date', 'reason_start', 'reason_end', 'shuffle', 'platform', 'conn_country', 'source_file']
tracks columns: ['spotify_track_uri', 'first_played_ts', 'last_played_ts', 'play_count', 'total_ms_played', 'avg_ms_played', 'avg_listen_ratio', 'median_listen_ratio', 'skip_rate', 'avg_seconds_played', 'median_seconds_played', 'duration_ms', 'track_id', 'artist_id_primary', 'artist_name_primary', 'track_popularity', 'explicit', 'album_release_date', 'album_release_date_precision', 'track_name', 'artist_name', 'album_name', 'primary_genre', 'days_since_last_play']


## 1) Sessionize listens

We create `session_id` based on a gap threshold (default: **45 minutes**).
We also create `next_track_uri` for **next-track prediction** evaluation.


In [4]:
# --- Sessionize ---
# A new session starts at the first row and whenever the gap to the previous
# kept play exceeds SESSION_GAP_MIN minutes.
SESSION_GAP_MIN = 45

# Parse timestamps (unparseable values become NaT), drop rows without a
# timestamp or track URI, and order everything chronologically.
listens = (
    listens.assign(ts=pd.to_datetime(listens["ts"], errors="coerce"))
    .dropna(subset=["ts", "spotify_track_uri"])
    .sort_values("ts")
    .reset_index(drop=True)
)

# Optional noise filter: drop extremely short plays (10 seconds)
if "ms_played" in listens.columns:
    long_enough = listens["ms_played"].fillna(0).ge(10_000)
    listens = listens.loc[long_enough].copy()

# Minutes elapsed since the previous row; NaN on the very first row.
gap_min = listens["ts"].diff().dt.total_seconds() / 60
listens["new_session"] = gap_min.isna() | gap_min.gt(SESSION_GAP_MIN)
listens["session_id"] = listens["new_session"].cumsum().astype(int)

by_session = listens.groupby("session_id")
listens["session_pos"] = by_session.cumcount()
# Next track inside the same session; NaN for the last play of each session.
listens["next_track_uri"] = by_session["spotify_track_uri"].shift(-1)

print("sessions:", listens["session_id"].nunique())
listens[["ts","session_id","session_pos","spotify_track_uri","next_track_uri"]].head(10)


sessions: 9044


Unnamed: 0,ts,session_id,session_pos,spotify_track_uri,next_track_uri
0,2012-08-31 17:21:11+00:00,1,0,spotify:track:3eMfBkKz0ZuffMqIVHhNr1,spotify:track:44eZ0RG3gWBfiD5o9pvIV9
1,2012-08-31 17:30:20+00:00,1,1,spotify:track:44eZ0RG3gWBfiD5o9pvIV9,spotify:track:0uQWGMWQAtpISoXTEi5as6
2,2012-08-31 17:31:17+00:00,1,2,spotify:track:0uQWGMWQAtpISoXTEi5as6,spotify:track:44eZ0RG3gWBfiD5o9pvIV9
3,2012-08-31 17:35:54+00:00,1,3,spotify:track:44eZ0RG3gWBfiD5o9pvIV9,spotify:track:0uQWGMWQAtpISoXTEi5as6
4,2012-08-31 17:36:17+00:00,1,4,spotify:track:0uQWGMWQAtpISoXTEi5as6,spotify:track:44eZ0RG3gWBfiD5o9pvIV9
5,2012-08-31 17:36:52+00:00,1,5,spotify:track:44eZ0RG3gWBfiD5o9pvIV9,spotify:track:45Zo6ftGzq6wRckCUrMoBJ
6,2012-08-31 17:39:34+00:00,1,6,spotify:track:45Zo6ftGzq6wRckCUrMoBJ,spotify:track:6b7TVeW6F7YLK1ZHDb2VA1
7,2012-08-31 17:43:03+00:00,1,7,spotify:track:6b7TVeW6F7YLK1ZHDb2VA1,spotify:track:44SNh4cxnJPTbT03o4H5qy
8,2012-08-31 17:43:14+00:00,1,8,spotify:track:44SNh4cxnJPTbT03o4H5qy,spotify:track:7r2tLINapK54V3oPmwfq06
11,2012-08-31 17:46:41+00:00,1,9,spotify:track:7r2tLINapK54V3oPmwfq06,


In [5]:
# Persist the sessionized listens (index dropped) for later steps
OUT_SESS_FP = PROCESSED_DIR.joinpath("listens_with_sessions.parquet")
listens.to_parquet(path=OUT_SESS_FP, index=False)
print("Wrote:", OUT_SESS_FP)


Wrote: C:\Users\maxma\Documents\Spotify Project\data\processed\listens_with_sessions.parquet


## 2) Build session sequences

Convert each session into an ordered list of `spotify_track_uri`.
Filter out very short sessions (<2 items).


In [6]:
# One list of track URIs per session, in row order, keeping only sessions with
# at least two plays.
sessions = [
    track_seq
    for track_seq in listens.groupby("session_id")["spotify_track_uri"].agg(list)
    if len(track_seq) >= 2
]

print("num sessions (len>=2):", len(sessions))
print("example session len:", len(sessions[0]), "example:", sessions[0][:10])


num sessions (len>=2): 6983
example session len: 10 example: ['spotify:track:3eMfBkKz0ZuffMqIVHhNr1', 'spotify:track:44eZ0RG3gWBfiD5o9pvIV9', 'spotify:track:0uQWGMWQAtpISoXTEi5as6', 'spotify:track:44eZ0RG3gWBfiD5o9pvIV9', 'spotify:track:0uQWGMWQAtpISoXTEi5as6', 'spotify:track:44eZ0RG3gWBfiD5o9pvIV9', 'spotify:track:45Zo6ftGzq6wRckCUrMoBJ', 'spotify:track:6b7TVeW6F7YLK1ZHDb2VA1', 'spotify:track:44SNh4cxnJPTbT03o4H5qy', 'spotify:track:7r2tLINapK54V3oPmwfq06']


## 3) Baseline model — Co-occurrence recommender

Count track co-occurrence within a window (default: ±3 positions).
Writes `data/models/cooc_index.parquet`.


In [7]:
WINDOW = 3
TOPK_STORE = 200  # store for reranking later

# cooc[seed][rec] counts how often `rec` appears within WINDOW positions of
# `seed` inside the same session; a track never counts as its own neighbour.
cooc = defaultdict(Counter)

for seq in tqdm(sessions, desc="Building co-occurrence"):
    for pos, seed in enumerate(seq):
        start = max(0, pos - WINDOW)
        stop = pos + WINDOW + 1  # slicing clamps this at the end of the session
        for rec in seq[start:stop]:
            # Also skips position `pos` itself, since seq[pos] == seed.
            if rec != seed:
                cooc[seed][rec] += 1

print("seed tracks with neighbors:", len(cooc))


Building co-occurrence:   0%|          | 0/6983 [00:00<?, ?it/s]

seed tracks with neighbors: 22756


In [8]:
# Flatten the counters into long format, keeping at most TOPK_STORE neighbours
# per seed (highest counts first).
rows = [
    {"seed_uri": seed, "rec_uri": rec, "score": int(score)}
    for seed, ctr in cooc.items()
    for rec, score in ctr.most_common(TOPK_STORE)
]

cooc_index = pd.DataFrame(rows)
print("cooc_index rows:", len(cooc_index))
cooc_index.head()


cooc_index rows: 356534


Unnamed: 0,seed_uri,rec_uri,score
0,spotify:track:3eMfBkKz0ZuffMqIVHhNr1,spotify:track:44eZ0RG3gWBfiD5o9pvIV9,2
1,spotify:track:3eMfBkKz0ZuffMqIVHhNr1,spotify:track:0uQWGMWQAtpISoXTEi5as6,1
2,spotify:track:44eZ0RG3gWBfiD5o9pvIV9,spotify:track:0uQWGMWQAtpISoXTEi5as6,6
3,spotify:track:44eZ0RG3gWBfiD5o9pvIV9,spotify:track:6b7TVeW6F7YLK1ZHDb2VA1,5
4,spotify:track:44eZ0RG3gWBfiD5o9pvIV9,spotify:track:45Zo6ftGzq6wRckCUrMoBJ,3


In [9]:
# Persist the co-occurrence index (index dropped)
COOC_FP = MODEL_DIR.joinpath("cooc_index.parquet")
cooc_index.to_parquet(path=COOC_FP, index=False)
print("Wrote:", COOC_FP)


Wrote: C:\Users\maxma\Documents\Spotify Project\data\models\cooc_index.parquet


## 4) Offline evaluation — next-track HitRate@K

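Time-based split by session start time:
- first 80% of sessions → train
- last 20% of sessions → test

Compute HitRate@K for co-occurrence and Item2Vec.

**Caveat:** the co-occurrence index (Section 3) and the Item2Vec model (Section 5) are both fit on *all* sessions, and `train_sessions` is not used to restrict fitting in the cells of this notebook. The test transitions are therefore seen during fitting, so the HitRate@K values reported here are optimistic rather than true held-out estimates.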


In [10]:
# Time split by session start time
# Sessions are ordered by their earliest timestamp; the first 80% form the
# train set and the remainder the test set.
session_start = listens.groupby("session_id")["ts"].min().sort_values()
cut = int(len(session_start) * 0.8)

ordered_session_ids = session_start.index
# NOTE(review): train_sessions is defined here but not referenced by any other
# cell visible in this notebook - confirm whether models should be fit on it.
train_sessions, test_sessions = set(ordered_session_ids[:cut]), set(ordered_session_ids[cut:])

# Evaluation rows: test-session plays that have a following track in-session.
in_test = listens["session_id"].isin(test_sessions)
test_events = (
    listens.loc[in_test]
    .dropna(subset=["spotify_track_uri", "next_track_uri"])
    .copy()
)

print("test events:", len(test_events))


test events: 19956


In [11]:
# Seed -> ranked rec list for cooc
# Rows are ordered by seed and then by descending score, so each seed's list
# runs from strongest to weakest neighbour.
cooc_ranked = cooc_index.sort_values(by=["seed_uri", "score"], ascending=[True, False])
seed_to_recs_cooc = {
    seed_key: rec_group.tolist()
    for seed_key, rec_group in cooc_ranked.groupby("seed_uri")["rec_uri"]
}

def hitrate_at_k_from_map(seed_series, target_series, seed_to_recs: dict, k: int) -> float:
    """Fraction of (seed, target) pairs whose target is in the seed's top-k recs.

    Pairs whose seed has no (or an empty) recommendation list in
    ``seed_to_recs`` are left out of both numerator and denominator, so the
    result is a hit rate over covered seeds only. Returns 0.0 when no pair is
    covered.
    """
    outcomes = [
        target in seed_to_recs[seed][:k]
        for seed, target in zip(seed_series, target_series)
        if seed_to_recs.get(seed)
    ]
    return sum(outcomes) / len(outcomes) if outcomes else 0.0

# Next-track HitRate@K of the co-occurrence baseline on the test events.
metrics = {
    f"cooc_HitRate@{k}": hitrate_at_k_from_map(
        test_events["spotify_track_uri"],
        test_events["next_track_uri"],
        seed_to_recs_cooc,
        k,
    )
    for k in [10, 25, 50]
}

metrics


{'cooc_HitRate@10': 0.6044099223252317,
 'cooc_HitRate@25': 0.7408669506389376,
 'cooc_HitRate@50': 0.8295164119268353}

## 5) Item2Vec — Word2Vec embeddings on sessions

Learns dense track embeddings using session sequences (tracks as tokens).
Artifacts:
- `data/models/item2vec.model`
- `data/models/item2vec_vectors.parquet`


In [12]:
# Try to import gensim; if missing, install:
# !pip install gensim

try:
    from gensim.models import Word2Vec
except Exception as e:
    raise RuntimeError(
        "gensim is required for Item2Vec. Install with: pip install gensim"
    ) from e


In [13]:
# Train Word2Vec on sessions (Item2Vec)
# Each session is treated as a sentence and each track URI as a token.
W2V_WINDOW = 5
VECTOR_SIZE = 64
MIN_COUNT = 3
EPOCHS = 10
WORKERS = 4

# NOTE(review): no `seed` is passed and WORKERS > 1, so the learned vectors will
# presumably differ slightly between runs - confirm against the gensim docs if
# exact reproducibility matters.
W2V_PARAMS = {
    "vector_size": VECTOR_SIZE,
    "window": W2V_WINDOW,
    "min_count": MIN_COUNT,
    "sg": 1,  # skip-gram
    "negative": 10,
    "workers": WORKERS,
    "epochs": EPOCHS,
}
w2v = Word2Vec(sentences=sessions, **W2V_PARAMS)

print("Item2Vec vocab size:", len(w2v.wv))


Item2Vec vocab size: 7242


In [14]:
# Persist the trained model (gensim's save is given the path as a string)
ITEM2VEC_FP = MODEL_DIR.joinpath("item2vec.model")
w2v.save(str(ITEM2VEC_FP))
print("Wrote:", ITEM2VEC_FP)


Wrote: C:\Users\maxma\Documents\Spotify Project\data\models\item2vec.model


In [15]:
# Export vectors to parquet (uri + vector list)
# One row per vocabulary track; each vector is stored as a plain list of floats.
uris = list(w2v.wv.index_to_key)
vecs = [w2v.wv[uri].astype(float) for uri in uris]

item2vec_vectors = pd.DataFrame(
    {
        "spotify_track_uri": uris,
        "vector": list(map(np.ndarray.tolist, vecs)),
    }
)

ITEM2VEC_VECTORS_FP = MODEL_DIR.joinpath("item2vec_vectors.parquet")
item2vec_vectors.to_parquet(path=ITEM2VEC_VECTORS_FP, index=False)
print("Wrote:", ITEM2VEC_VECTORS_FP)
item2vec_vectors.head()


Wrote: C:\Users\maxma\Documents\Spotify Project\data\models\item2vec_vectors.parquet


Unnamed: 0,spotify_track_uri,vector
0,spotify:track:4X8VB7lmL7o4QiypDYmqaA,"[0.31985145807266235, -0.03834826499223709, -0..."
1,spotify:track:4mVzCmTZeBOkvSQsuvFQWh,"[0.5267911553382874, -0.10194297879934311, -0...."
2,spotify:track:4XTJFHSPamFmR6Xs6rM0kD,"[-0.2505874037742615, -0.10925857722759247, 1...."
3,spotify:track:64BvArPNcub2hM9m90lRFw,"[-0.06827805191278458, -0.08065135776996613, 1..."
4,spotify:track:22bdXU26ewL0ji9K9acT5J,"[0.23278702795505524, -0.2809610962867737, -0...."


In [16]:
# Item2Vec recommendations are computed on demand from the trained model
# (no precomputed seed->recs map is built).
def recommend_item2vec(seed_uri: str, k: int = 10) -> list[str]:
    """Return the URIs of the k tracks most similar to ``seed_uri``.

    Returns an empty list when the seed is not in the model vocabulary.
    """
    if seed_uri not in w2v.wv:
        return []
    neighbours = w2v.wv.most_similar(seed_uri, topn=k)
    return [uri for uri, _similarity in neighbours]

# Evaluate Item2Vec HitRate@K (on-demand similarity)
def hitrate_item2vec_at_k(k: int) -> float:
    """Share of test events whose next track is among the seed's top-k neighbours.

    Events whose seed is not in the model vocabulary are excluded from both
    numerator and denominator; returns 0.0 when no event is covered.
    """
    pairs = zip(test_events["spotify_track_uri"], test_events["next_track_uri"])
    outcomes = [
        target in recommend_item2vec(seed, k)
        for seed, target in pairs
        if seed in w2v.wv
    ]
    return sum(outcomes) / len(outcomes) if outcomes else 0.0

# Add the Item2Vec hit rates next to the co-occurrence ones.
metrics.update(
    {f"item2vec_HitRate@{k}": hitrate_item2vec_at_k(k) for k in [10, 25, 50]}
)

metrics


{'cooc_HitRate@10': 0.6044099223252317,
 'cooc_HitRate@25': 0.7408669506389376,
 'cooc_HitRate@50': 0.8295164119268353,
 'item2vec_HitRate@10': 0.2961576233734348,
 'item2vec_HitRate@25': 0.351583599312546,
 'item2vec_HitRate@50': 0.3962067272280874}

In [17]:
# Persist metrics as indented JSON
METRICS_FP = MODEL_DIR.joinpath("eval_metrics.json")
METRICS_FP.write_text(json.dumps(metrics, indent=2), encoding="utf-8")
print("Wrote:", METRICS_FP)


Wrote: C:\Users\maxma\Documents\Spotify Project\data\models\eval_metrics.json


## 6) Quick inference helpers (for Day 3 dashboard)

These can be moved into `src/recs/recommender.py` later.


In [18]:
def recommend_cooc(seed_uri: str, k: int = 10) -> list[str]:
    """Return the first k co-occurrence recs stored for ``seed_uri`` ([] if none)."""
    ranked = seed_to_recs_cooc.get(seed_uri)
    return ranked[:k] if ranked else []

# Smoke test: recommend from both models for the first listened track.
seed = listens["spotify_track_uri"].iat[0]
print("seed:", seed)
print("cooc:", recommend_cooc(seed, k=10))
print("item2vec:", recommend_item2vec(seed, k=10))


seed: spotify:track:3eMfBkKz0ZuffMqIVHhNr1
cooc: ['spotify:track:44eZ0RG3gWBfiD5o9pvIV9', 'spotify:track:0uQWGMWQAtpISoXTEi5as6']
item2vec: []


## 7) Stretch — Lyrics-based reranking (Genius + VADER)

This section uses the `src/lyrics/` scripts you created to:
1) Fetch/cache lyrics for your most-played tracks (local cache; **DO NOT COMMIT RAW LYRICS**)
2) Build `lyrics_features.parquet` (safe, derived features only)
3) Compute your *recent mood profile* from listened tracks
4) Rerank candidate recommendations from both models to better match lyric mood

### Expected config
Your `config.py` at repo root should include:

```python
genius = {"access_token": "..."}
```

### Outputs
- `data/processed/lyrics_features.parquet`
- `data/models/recs_cooc_lyrics_reranked.parquet`
- `data/models/recs_item2vec_lyrics_reranked.parquet`
- `data/models/eval_metrics.json` (updated with rerank metrics)


In [19]:
# --- Imports for lyrics pipeline ---
import sys

if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))

try:
    import config
except Exception as e:
    raise RuntimeError(
        "Could not import config.py from repo root. "
        "Ensure config.py exists at ROOT and contains genius['access_token']."
    ) from e

from src.lyrics.fetch_lyrics import fetch_lyrics_for_tracks, GeniusConfig
from src.lyrics.pipeline import write_lyrics_features

print("Imported lyrics modules OK.")


Imported lyrics modules OK.


In [20]:
# --- Build lyrics features for top-N tracks ---
TOP_N_LYRICS = 300
# Passed as `limit` to every fetch call. NOTE(review): the fetch log suggests
# this caps the number of new lyrics fetched per call - confirm in
# src/lyrics/fetch_lyrics.py.
LYRICS_FETCH_BATCH = 150

needed_cols = ["spotify_track_uri", "artist_name_primary"]
missing_cols = [c for c in needed_cols if c not in tracks.columns]
if missing_cols:
    raise RuntimeError(f"tracks missing required columns for lyrics fetch: {missing_cols}")

# track_name is required for good Genius search. If absent, try to derive from listens.
if "track_name" not in tracks.columns:
    print("WARNING: tracks['track_name'] not found. Trying to derive from listens if available.")
    if "track_name" in listens.columns:
        # Most frequent track_name observed for each URI in the listens table.
        name_map = (
            listens.dropna(subset=["spotify_track_uri", "track_name"])
            .groupby("spotify_track_uri")["track_name"]
            .agg(lambda x: x.value_counts().index[0])
            .to_dict()
        )
        tracks["track_name"] = tracks["spotify_track_uri"].map(name_map)
    else:
        raise RuntimeError(
            "No track_name available in tracks or listens. "
            "Add track_name during Day 1 build for best Genius matching."
        )

# Select top tracks by play_count (fallback: total_ms_played)
if "play_count" in tracks.columns:
    top_tracks = tracks.sort_values("play_count", ascending=False).head(TOP_N_LYRICS)
elif "total_ms_played" in tracks.columns:
    top_tracks = tracks.sort_values("total_ms_played", ascending=False).head(TOP_N_LYRICS)
else:
    raise RuntimeError("tracks must include play_count or total_ms_played for TOP_N selection.")

# Rows missing any of the three fields are dropped, so there may be fewer than
# TOP_N_LYRICS candidates.
candidates = top_tracks[["spotify_track_uri", "artist_name_primary", "track_name"]].dropna().to_dict("records")

LYRICS_CACHE_FP = (DATA_DIR / "cache" / "lyrics_raw.jsonl")
LYRICS_OUT_FP = (PROCESSED_DIR / "lyrics_features.parquet")
LYRICS_CACHE_FP.parent.mkdir(parents=True, exist_ok=True)

if not hasattr(config, "genius") or "access_token" not in config.genius:
    raise RuntimeError("config.genius['access_token'] missing. Add genius={'access_token': '...'} to config.py")

genius_cfg = GeniusConfig(access_token=config.genius["access_token"], sleep_seconds=0.3, retries=1, timeout=8)

print("Fetching lyrics for candidates:", len(candidates))
print("Cache:", LYRICS_CACHE_FP)
print("Output:", LYRICS_OUT_FP)

# Number of fetch passes follows the candidate count (ceil division, at least
# one pass). This replaces two hard-coded calls, which would stop matching
# TOP_N_LYRICS as soon as it is raised above 2 * LYRICS_FETCH_BATCH.
n_fetch_passes = max(1, -(-len(candidates) // LYRICS_FETCH_BATCH))
for _fetch_pass in range(n_fetch_passes):
    fetch_lyrics_for_tracks(
        candidates,
        cache_fp=LYRICS_CACHE_FP,
        genius_cfg=genius_cfg,
        limit=LYRICS_FETCH_BATCH,
    )

write_lyrics_features(cache_fp=LYRICS_CACHE_FP, out_fp=LYRICS_OUT_FP)

lyrics_feats = pd.read_parquet(LYRICS_OUT_FP)
print("lyrics_feats rows:", len(lyrics_feats))
lyrics_feats.head()


Fetching lyrics for candidates: 300
Cache: C:\Users\maxma\Documents\Spotify Project\data\cache\lyrics_raw.jsonl
Output: C:\Users\maxma\Documents\Spotify Project\data\processed\lyrics_features.parquet
[lyrics] fetched 25 new (seen=25, cache=25), elapsed=46.0s
[lyrics] fetched 50 new (seen=50, cache=50), elapsed=99.6s
[lyrics] fetched 75 new (seen=75, cache=75), elapsed=151.0s
[lyrics] fetched 100 new (seen=100, cache=100), elapsed=199.6s
[lyrics] fetched 125 new (seen=125, cache=125), elapsed=251.4s
[lyrics] fetched 150 new (seen=150, cache=150), elapsed=303.6s
[lyrics] done. new=150, seen=151, cache=150, elapsed=303.6s
[lyrics] fetched 25 new (seen=175, cache=175), elapsed=51.2s
[lyrics] fetched 50 new (seen=200, cache=200), elapsed=105.2s
[lyrics] fetched 75 new (seen=225, cache=225), elapsed=153.9s
[lyrics] fetched 100 new (seen=250, cache=250), elapsed=199.6s
[lyrics] fetched 125 new (seen=275, cache=275), elapsed=242.6s
[lyrics] fetched 150 new (seen=300, cache=300), elapsed=287.4s

Unnamed: 0,spotify_track_uri,genius_song_id,genius_full_title,lyrics_found,contains_cjk,lyric_len_chars,lyric_len_words,unique_word_ratio,vader_compound,vader_pos,vader_neg,vader_neu
0,spotify:track:4X8VB7lmL7o4QiypDYmqaA,,Passing Through a Screen Door by The Wonder Years,True,False,1747,333,0.483483,-0.9309,0.109,0.135,0.756
1,spotify:track:4mVzCmTZeBOkvSQsuvFQWh,,Came Out Swinging by The Wonder Years,True,False,2062,401,0.413965,0.8761,0.142,0.14,0.719
2,spotify:track:4XTJFHSPamFmR6Xs6rM0kD,,Forty Days by Streetlight Manifesto,True,False,1613,325,0.421538,-0.9836,0.107,0.172,0.722
3,spotify:track:2SSFvQBwsxeazzo7z5l9gD,,"Delirious (Boneless) by Steve Aoki, Chris Lake...",True,False,2511,480,0.295833,0.97,0.084,0.027,0.889
4,spotify:track:64BvArPNcub2hM9m90lRFw,,Would You Be Impressed? by Streetlight Manifesto,True,False,1459,299,0.51505,-0.6863,0.108,0.094,0.798


In [21]:
# --- Join lyrics features into track table ---
# Left join keeps every track; validate="1:1" makes the merge fail if either
# table repeats a spotify_track_uri.
tracks_with_lyrics = pd.merge(
    tracks,
    lyrics_feats,
    how="left",
    on="spotify_track_uri",
    validate="1:1",
)

preview_cols = ("spotify_track_uri", "track_name", "artist_name_primary", "lyrics_found", "vader_compound")
cols = [c for c in preview_cols if c in tracks_with_lyrics.columns]
tracks_with_lyrics[cols].head(10)


Unnamed: 0,spotify_track_uri,track_name,artist_name_primary,lyrics_found,vader_compound
0,spotify:track:002AzLaJtX4Tyi7Yv0J49w,USED TO,Woo,,
1,spotify:track:003FTlCpBTM4eSqYSWPv4H,"Swing, Swing",The All-American Rejects,,
2,spotify:track:003vvx7Niy0yvhvHt4a68B,Mr. Brightside,The Killers,,
3,spotify:track:004VU4cWTkRqVMrlv8KW3D,Magnolia,The Hush Sound,,
4,spotify:track:0054onP0Kw05Ws8rrhkIlx,Suburbia,The Wonder Years,,
5,spotify:track:005lwxGU1tms6HGELIcUv9,I Kissed A Girl,Katy Perry,,
6,spotify:track:006hfStQk9L7MVT9RcDeva,A Part Of Me (feat. Laura Whiteside),Neck Deep,,
7,spotify:track:007PPvZtGDYHSEhYPxqIfC,Been A While,6LACK,,
8,spotify:track:007lsEHi6fP9LoYB7czYUa,Divine Sorrow,Wyclef Jean,,
9,spotify:track:008DIFJdXuk1Up3xdimYAs,Flute of the Traveler,StreamBeats by Harris Heller,,


In [22]:
# --- Compute a 'recent mood profile' from listens (last 30 days) ---
# The window is the 30 days ending at the latest listen in the data, not at
# the current date.
listen_mood = listens.merge(
    tracks_with_lyrics[["spotify_track_uri","vader_compound","lyrics_found"]],
    on="spotify_track_uri",
    how="left"
)

max_ts = listen_mood["ts"].max()
start_ts = max_ts - pd.Timedelta(days=30)

recent = listen_mood[listen_mood["ts"] >= start_ts].copy()

# Per-listen weights: listen_ratio if present, else ms_played, else uniform.
# Missing values count as 0 and negatives are clipped to 0.
if "listen_ratio" in recent.columns:
    w = recent["listen_ratio"].fillna(0.0).clip(lower=0.0)
elif "ms_played" in recent.columns:
    w = recent["ms_played"].fillna(0.0).clip(lower=0.0)
else:
    w = pd.Series(np.ones(len(recent)), index=recent.index)

# Only listens whose track has a lyric sentiment score contribute to the mood.
recent = recent.dropna(subset=["vader_compound"])
if len(recent) == 0:
    raise RuntimeError("No recent listens have vader_compound. Increase TOP_N_LYRICS or check lyric coverage.")

mood_values = recent["vader_compound"].to_numpy()
mood_weights = w.loc[recent.index].to_numpy()
if mood_weights.sum() > 0:
    user_mood = float(np.average(mood_values, weights=mood_weights))
else:
    # np.average raises ZeroDivisionError when every weight is zero (e.g. all
    # listen_ratio values were missing), so fall back to the plain mean.
    user_mood = float(mood_values.mean())
print("Recent mood window:", start_ts, "to", max_ts)
print("User recent mood (weighted avg vader_compound):", user_mood)


Recent mood window: 2023-07-01 21:19:07+00:00 to 2023-07-31 21:19:07+00:00
User recent mood (weighted avg vader_compound): 0.1344522877158427


In [23]:
# --- Reranking function ---
def rerank_by_lyrics_mood(
    seed_uri: str,
    rec_uris: list[str],
    base_scores: dict[str, float] | None,
    track_lyrics_df: pd.DataFrame,
    user_mood: float,
    k: int = 25,
    alpha_base: float = 0.35,
    beta_mood: float = 0.65,
) -> pd.DataFrame:
    '''
    Rerank candidate recommendations by blending base score and lyric-mood match.

    Parameters
    ----------
    seed_uri : URI the candidates were generated for (copied into the output).
    rec_uris : candidate URIs, in their original (base-model) rank order.
    base_scores : optional rec_uri -> base-model score; missing entries and
        ``None`` are treated as 0.0.
    track_lyrics_df : frame with ``spotify_track_uri`` and ``vader_compound``
        columns. If a URI appears more than once, its first row is used.
    user_mood : target sentiment the candidates are compared against.
    k : number of rows to return.
    alpha_base, beta_mood : weights of the normalised base score and the mood
        similarity in ``final_score``.

    Returns top-k reranked recs as a DataFrame:
    seed_uri, rec_uri, base_score, vader_compound, mood_distance, final_score

    Candidates with equal ``final_score`` keep their order from ``rec_uris``.
    '''
    out_cols = ["seed_uri","rec_uri","base_score","vader_compound","mood_distance","final_score"]
    if not rec_uris:
        return pd.DataFrame(columns=out_cols)

    cand = pd.DataFrame({"rec_uri": rec_uris})
    # De-duplicate the lookup so the left merge cannot multiply candidate rows.
    tmp = (
        track_lyrics_df[["spotify_track_uri","vader_compound"]]
        .rename(columns={"spotify_track_uri":"rec_uri"})
        .drop_duplicates(subset="rec_uri")
    )
    cand = cand.merge(tmp, on="rec_uri", how="left")

    if base_scores is None:
        cand["base_score"] = 0.0
    else:
        cand["base_score"] = cand["rec_uri"].map(base_scores).fillna(0.0).astype(float)

    # Candidates without a sentiment score get the worst distance observed in
    # this candidate set (or 1.0 when no candidate has a score).
    cand["mood_distance"] = (cand["vader_compound"] - user_mood).abs()
    if cand["mood_distance"].notna().any():
        cand["mood_distance"] = cand["mood_distance"].fillna(cand["mood_distance"].max())
    else:
        cand["mood_distance"] = 1.0

    # Base scores are scaled by the set's maximum when that maximum is positive.
    if cand["base_score"].max() > 0:
        cand["base_norm"] = cand["base_score"] / cand["base_score"].max()
    else:
        cand["base_norm"] = 0.0

    # Mood similarity is relative to the largest distance in this candidate set.
    md = cand["mood_distance"]
    if md.max() > 0:
        cand["mood_sim"] = 1.0 - (md / md.max())
    else:
        cand["mood_sim"] = 1.0

    cand["final_score"] = alpha_base * cand["base_norm"] + beta_mood * cand["mood_sim"]
    # Stable sort: ties are common (many tracks lack lyrics), and the default
    # unstable sort would reorder them arbitrarily instead of keeping base rank.
    cand = cand.sort_values("final_score", ascending=False, kind="mergesort").head(k).copy()
    cand.insert(0, "seed_uri", seed_uri)
    return cand[out_cols]


In [24]:
# --- Build reranked recommendation indices for both models ---
# (seed_uri, rec_uri) -> stored co-occurrence score
cooc_scores_map = dict(
    zip(
        zip(cooc_index["seed_uri"], cooc_index["rec_uri"]),
        cooc_index["score"],
    )
)

def _cooc_base_scores(seed_uri: str) -> dict[str, float]:
    """Map each stored cooc rec of ``seed_uri`` to its score as a float (0.0 if absent)."""
    scores = {}
    for rec in seed_to_recs_cooc.get(seed_uri, []):
        scores[rec] = float(cooc_scores_map.get((seed_uri, rec), 0.0))
    return scores

def _item2vec_recs_and_scores(seed_uri: str, topn: int = 200):
    """Return (rec URIs, rec_uri -> similarity) for the seed's ``topn`` neighbours.

    Returns ``([], {})`` when the seed is not in the model vocabulary.
    """
    if seed_uri not in w2v.wv:
        return [], {}
    sims = w2v.wv.most_similar(seed_uri, topn=topn)
    recs = [pair[0] for pair in sims]
    scores = {uri: float(sim) for uri, sim in sims}
    return recs, scores

RERANK_TOPK = 50
SEEDS_TO_BUILD = 5000  # cap runtime; raise later if desired
TOPK_STORE_LOCAL = 200

track_lyrics_df = tracks_with_lyrics.copy()

# Both models are reranked for the same seeds: the first SEEDS_TO_BUILD keys of
# the co-occurrence map.
seed_list = list(seed_to_recs_cooc.keys())[:SEEDS_TO_BUILD]

# Co-occurrence: rerank each seed's stored candidates.
reranked_cooc_rows = [
    rerank_by_lyrics_mood(
        seed_uri,
        seed_to_recs_cooc.get(seed_uri, [])[:TOPK_STORE_LOCAL],
        _cooc_base_scores(seed_uri),
        track_lyrics_df,
        user_mood,
        k=RERANK_TOPK,
    )
    for seed_uri in tqdm(seed_list, desc="Reranking cooc")
]

reranked_cooc = pd.concat(reranked_cooc_rows, ignore_index=True) if reranked_cooc_rows else pd.DataFrame()
print("reranked_cooc rows:", len(reranked_cooc))

# Item2Vec: seeds without neighbours (not in the model vocabulary) are skipped.
reranked_i2v_rows = []
for seed_uri in tqdm(seed_list, desc="Reranking item2vec"):
    recs, base_scores = _item2vec_recs_and_scores(seed_uri, topn=TOPK_STORE_LOCAL)
    if recs:
        reranked_i2v_rows.append(
            rerank_by_lyrics_mood(seed_uri, recs, base_scores, track_lyrics_df, user_mood, k=RERANK_TOPK)
        )

reranked_item2vec = pd.concat(reranked_i2v_rows, ignore_index=True) if reranked_i2v_rows else pd.DataFrame()
print("reranked_item2vec rows:", len(reranked_item2vec))

reranked_cooc.head()


Reranking cooc:   0%|          | 0/5000 [00:00<?, ?it/s]

reranked_cooc rows: 59027


Reranking item2vec:   0%|          | 0/5000 [00:00<?, ?it/s]

reranked_item2vec rows: 79450


Unnamed: 0,seed_uri,rec_uri,base_score,vader_compound,mood_distance,final_score
0,spotify:track:002AzLaJtX4Tyi7Yv0J49w,spotify:track:1a9VIDVGr7rF2eNPQX9zIy,2.0,,1.0,0.35
1,spotify:track:002AzLaJtX4Tyi7Yv0J49w,spotify:track:7GU4sdexteLwpjIZ36QPFL,2.0,,1.0,0.35
2,spotify:track:002AzLaJtX4Tyi7Yv0J49w,spotify:track:3C2iUQ6S34tW7C6BKt8der,2.0,,1.0,0.35
3,spotify:track:002AzLaJtX4Tyi7Yv0J49w,spotify:track:4FFC7McrnnwAWKbAqbckTP,2.0,,1.0,0.35
4,spotify:track:002AzLaJtX4Tyi7Yv0J49w,spotify:track:1llynwiSRD0fZJbpObGy4v,1.0,,1.0,0.175


In [25]:
# --- Save reranked indices ---
COOC_RR_FP = MODEL_DIR.joinpath("recs_cooc_lyrics_reranked.parquet")
I2V_RR_FP = MODEL_DIR.joinpath("recs_item2vec_lyrics_reranked.parquet")

# Write the co-occurrence table first, then the Item2Vec one (index dropped).
reranked_cooc.to_parquet(path=COOC_RR_FP, index=False)
reranked_item2vec.to_parquet(path=I2V_RR_FP, index=False)

print("Wrote:", COOC_RR_FP)
print("Wrote:", I2V_RR_FP)


Wrote: C:\Users\maxma\Documents\Spotify Project\data\models\recs_cooc_lyrics_reranked.parquet
Wrote: C:\Users\maxma\Documents\Spotify Project\data\models\recs_item2vec_lyrics_reranked.parquet


In [26]:
# --- Evaluate reranked models (recommended) ---
def build_seed_to_recs_from_reranked(df_rr: pd.DataFrame) -> dict[str, list[str]]:
    """Turn a reranked table into a seed_uri -> rec_uri list mapping.

    Each list is ordered by descending ``final_score``. An empty frame yields
    an empty dict.
    """
    if df_rr.empty:
        return {}
    ranked = df_rr.sort_values(by=["seed_uri", "final_score"], ascending=[True, False])
    return {
        seed_key: rec_group.tolist()
        for seed_key, rec_group in ranked.groupby("seed_uri")["rec_uri"]
    }

seed_to_recs_cooc_rr, seed_to_recs_i2v_rr = map(
    build_seed_to_recs_from_reranked, (reranked_cooc, reranked_item2vec)
)

# NOTE(review): the reranked maps only contain the seeds processed when the
# reranked tables were built, and hitrate_at_k_from_map skips seeds without
# recs, so these rates are averaged over a different set of test events than
# the un-reranked metrics - compare them with care.
test_seeds = test_events["spotify_track_uri"]
test_targets = test_events["next_track_uri"]
for k in [10, 25, 50]:
    metrics.update(
        {
            f"cooc_lyrics_rerank_HitRate@{k}": hitrate_at_k_from_map(
                test_seeds, test_targets, seed_to_recs_cooc_rr, k
            ),
            f"item2vec_lyrics_rerank_HitRate@{k}": hitrate_at_k_from_map(
                test_seeds, test_targets, seed_to_recs_i2v_rr, k
            ),
        }
    )

# Persist updated metrics
METRICS_FP = MODEL_DIR.joinpath("eval_metrics.json")
METRICS_FP.write_text(json.dumps(metrics, indent=2), encoding="utf-8")

metrics


{'cooc_HitRate@10': 0.6044099223252317,
 'cooc_HitRate@25': 0.7408669506389376,
 'cooc_HitRate@50': 0.8295164119268353,
 'item2vec_HitRate@10': 0.2961576233734348,
 'item2vec_HitRate@25': 0.351583599312546,
 'item2vec_HitRate@50': 0.3962067272280874,
 'cooc_lyrics_rerank_HitRate@10': 0.615032080659945,
 'item2vec_lyrics_rerank_HitRate@10': 0.31536537927202,
 'cooc_lyrics_rerank_HitRate@25': 0.7703941338221815,
 'item2vec_lyrics_rerank_HitRate@25': 0.3642678521811614,
 'cooc_lyrics_rerank_HitRate@50': 0.8716773602199817,
 'item2vec_lyrics_rerank_HitRate@50': 0.40955821061405945}