# Spotify Mood + Popularity Recommender (Dataset B)
This notebook:
1. Loads Dataset B (charts + audio features)
2. Loads the trained mood Pipeline (feature engineering + LGBM)
3. Infers mood probabilities `P_0..P_3` + `mood_pred` + `mood_conf`
4. Builds **time-series trend features** from `snapshot_date`
5. Creates a `recommend()` function (dedup + optional artist diversity)

**Note:** Adjust file paths in the first section to match your repo.

## 0) Imports & paths

In [1]:
import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import joblib

pd.set_option("display.max_columns", 200)


In [2]:
# ---- Paths (edit if needed) ----
# Both paths are relative to the current working directory of the kernel.
RAW_PATH = Path("../data/processed/filtered_countries.csv")   # Dataset B
MODEL_PATH = Path("../artifacts/lgbm_only/lgbm/model.joblib") # trained Pipeline

# Make repo root importable (fixes: ModuleNotFoundError: No module named 'src')
ROOT = Path.cwd().resolve().parent
# Keep exactly one entry for ROOT, at the front of sys.path, so that re-running
# this cell does not keep prepending duplicate entries.
sys.path[:] = [str(ROOT)] + [p for p in sys.path if p != str(ROOT)]

print("RAW_PATH:", RAW_PATH.resolve())
print("MODEL_PATH:", MODEL_PATH.resolve())
print("ROOT:", ROOT)


RAW_PATH: E:\NTI-project\spotify-mood-mlops\data\processed\filtered_countries.csv
MODEL_PATH: E:\NTI-project\spotify-mood-mlops\artifacts\lgbm_only\lgbm\model.joblib
ROOT: E:\NTI-project\spotify-mood-mlops


## 1) Load Dataset B (full)

In [3]:
# Read the full Dataset B CSV into memory, report its size and preview the first rows.
df_B = pd.read_csv(RAW_PATH)
row_count = df_B.shape[0]
print("Rows:", row_count)
display(df_B.iloc[:3])


Rows: 231646


Unnamed: 0,spotify_id,name,artists,daily_rank,daily_movement,weekly_movement,country,snapshot_date,popularity,is_explicit,duration_ms,album_name,album_release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,2HRgqmZQC0MC7GeNuDIXHN,Seven (feat. Latto) (Explicit Ver.),"Jung Kook, Latto",1,0,0,VN,2025-06-11,85,True,183550,GOLDEN,2023-11-03,0.79,0.831,11,-4.185,1,0.044,0.312,0.0,0.0797,0.872,124.987,4
1,01qFKNWq73UfEslI0GvumE,3D (feat. Jack Harlow),"Jung Kook, Jack Harlow",2,2,2,VN,2025-06-11,79,True,201812,3D : The Remixes,2023-10-02,0.853,0.824,1,-3.287,1,0.103,0.0322,0.0,0.0859,0.888,108.044,4
2,1DVYafsLmcQySKkJnY4RCs,Phép Màu - Đàn Cá Gỗ Original Soundtrack,"MAYDAYs, Minh Tốc & Lam",3,-1,-1,VN,2025-06-11,74,False,266666,Phép Màu (Đàn Cá Gỗ Original Soundtrack),2025-02-21,0.492,0.508,7,-6.907,1,0.0355,0.128,0.0,0.254,0.249,143.857,4


In [4]:
# Basic cleanup / normalization
# Country codes are compared as trimmed, upper-cased strings further down.
df_B["country"] = (
    df_B["country"]
        .astype(str)
        .str.strip()
        .str.upper()
)
# Unparseable dates become NaT and are removed by the dropna below.
df_B["snapshot_date"] = pd.to_datetime(df_B["snapshot_date"], errors="coerce")

# Dataset A (training) names the duration feature "duration (ms)"; align Dataset B to it.
has_raw_duration = "duration_ms" in df_B.columns
has_training_duration = "duration (ms)" in df_B.columns
if has_raw_duration and not has_training_duration:
    df_B = df_B.rename(columns={"duration_ms": "duration (ms)"})

# Coerce every expected numeric column that is present; bad values become NaN.
num_cols = [
    "daily_rank", "daily_movement", "weekly_movement", "popularity", "loudness",
    "danceability", "energy", "speechiness", "acousticness", "instrumentalness",
    "liveness", "valence", "tempo", "duration (ms)",
]
for c in [name for name in num_cols if name in df_B.columns]:
    df_B[c] = pd.to_numeric(df_B[c], errors="coerce")

# Rows without an id, a date, a popularity or a rank cannot be scored or trended.
required_cols = ["spotify_id", "snapshot_date", "popularity", "daily_rank"]
df_B = df_B.dropna(subset=required_cols)
print("After cleanup rows:", len(df_B))


After cleanup rows: 231646


## 2) Load mood model Pipeline and infer mood on Dataset B

In [5]:
# Load the trained mood Pipeline from disk and print its steps.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load artifacts you produced or trust.
# The pipeline's feature-engineering step refers to a Python function by name, so
# the module that defines it must be importable here; the paths cell puts the repo
# root on sys.path for that reason.
model = joblib.load(MODEL_PATH)
print(model)


Pipeline(steps=[('feat_eng',
                 FunctionTransformer(func=<function add_features at 0x00000254A9F793A0>,
                                     kw_args={'feature_cols': ['duration (ms)',
                                                               'danceability',
                                                               'energy',
                                                               'loudness',
                                                               'speechiness',
                                                               'acousticness',
                                                               'instrumentalness',
                                                               'liveness',
                                                               'valence',
                                                               'tempo']})),
                ('model',
                 LGBMClassifier(colsample_bytree=0.9, learning_rate=0.03,
          

In [6]:
# Feature engineering lives inside the loaded pipeline, so the only inputs it needs
# are the 10 base audio columns listed below.
BASE_COLS = [
    "duration (ms)", "danceability", "energy", "loudness", "speechiness",
    "acousticness", "instrumentalness", "liveness", "valence", "tempo",
]

# Fail early, naming every absent column, instead of failing inside the pipeline.
available_cols = set(df_B.columns)
missing = [c for c in BASE_COLS if c not in available_cols]
if len(missing) > 0:
    raise KeyError(f"Missing required base columns: {missing}")

X_B = df_B.loc[:, BASE_COLS].copy()

# proba has one column per entry of model.classes_, in the same order.
proba = model.predict_proba(X_B)
classes = model.classes_
print("Classes:", classes)

# One probability column per class: P_<class label>.
for c, class_proba in zip(classes, proba.T):
    df_B[f"P_{c}"] = class_proba

# Most likely class and its probability.
best_class_pos = proba.argmax(axis=1)
df_B["mood_pred"] = classes[best_class_pos]
df_B["mood_conf"] = np.max(proba, axis=1)

preview_cols = ["spotify_id", "name", "artists", "country", "mood_pred", "mood_conf"]
preview_cols = preview_cols + [f"P_{c}" for c in classes]
display(df_B[preview_cols].head(5))


Classes: [0 1 2 3]


Unnamed: 0,spotify_id,name,artists,country,mood_pred,mood_conf,P_0,P_1,P_2,P_3
0,2HRgqmZQC0MC7GeNuDIXHN,Seven (feat. Latto) (Explicit Ver.),"Jung Kook, Latto",VN,1,0.999064,1.107526e-06,0.999064,0.000935,2.158877e-07
1,01qFKNWq73UfEslI0GvumE,3D (feat. Jack Harlow),"Jung Kook, Jack Harlow",VN,1,0.999903,1.593838e-07,0.999903,9.7e-05,1.150299e-07
2,1DVYafsLmcQySKkJnY4RCs,Phép Màu - Đàn Cá Gỗ Original Soundtrack,"MAYDAYs, Minh Tốc & Lam",VN,0,0.992342,0.9923418,0.007654,3e-06,7.995122e-07
3,6cc5PzJUddXskJGHJINw9F,DANCING IN THE DARK,SOOBIN,VN,1,0.964305,0.03569211,0.964305,1e-06,1.676231e-06
4,27xkOIER6uDLKALIelHylZ,Don’t Say You Love Me,Jin,VN,1,0.999943,5.394443e-05,0.999943,3e-06,3.324864e-07


## 3) Popularity normalization

In [7]:
# Min-max scale popularity inside each country (reduces market-size bias).
# The 1e-6 in the denominator avoids division by zero when a country has a single
# popularity value; any remaining NaN is mapped to 0.0.
pop_by_country = df_B.groupby("country")["popularity"]
pop_low = pop_by_country.transform("min")
pop_high = pop_by_country.transform("max")
pop_scaled = (df_B["popularity"] - pop_low) / (pop_high - pop_low + 1e-6)
df_B["pop_norm"] = pop_scaled.fillna(0.0)


## 4) Time-series trend features from snapshot_date
We compute a rolling **slope of rank** over a window of the last `WINDOW` observations per (country, track) — about 30 days when snapshots are daily with no gaps. Negative slope means rank is improving (getting smaller).

In [8]:
# Rolling-window settings for the trend features below.
# WINDOW is passed to .rolling() as an integer, so it counts observations (rows)
# per (country, track); it equals days only when a track has exactly one snapshot
# per day with no gaps.
WINDOW = 30   # observations (~days for gap-free daily snapshots)
MIN_PTS = 7   # min points in window

# Chronological order inside each (country, track) so the positional rolling
# windows below run forward in time.
df_B = df_B.sort_values(["country","spotify_id","snapshot_date"])

def _rolling_slope(arr: np.ndarray) -> float:
    if len(arr) < MIN_PTS:
        return np.nan
    y = arr.astype(float)
    t = np.arange(len(y), dtype=float)
    return np.polyfit(t, y, 1)[0]

# Smooth rank first: rolling mean of daily_rank inside each (country, track).
# The first MIN_PTS - 1 rows of every track have no value yet (NaN).
df_B["rank_roll_mean"] = (
    df_B.groupby(["country","spotify_id"])["daily_rank"]
        .transform(lambda x: x.rolling(WINDOW, min_periods=MIN_PTS).mean())
)

# Rolling slope on smoothed rank.
# raw=True hands each window to _rolling_slope as a NumPy array directly, instead
# of building a pandas Series per window only to unwrap it again.
df_B["rank_trend_slope"] = (
    df_B.groupby(["country","spotify_id"])["rank_roll_mean"]
        .transform(lambda x: x.rolling(WINDOW, min_periods=MIN_PTS)
                           .apply(_rolling_slope, raw=True))
)

# Convert to trend score where "improving rank" => higher
df_B["trend_raw"] = -df_B["rank_trend_slope"]

def _minmax_ignoring_nan(x: pd.Series) -> pd.Series:
    """Scale x to [0, 1) using its non-NaN min/max; NaN inputs stay NaN."""
    lo = x.min()   # skipna by default; NaN (without a warning) if the group is all-NaN
    hi = x.max()
    return (x - lo) / (hi - lo + 1e-6)

# Min-max scale within each country, over all snapshot dates of that country.
df_B["trend_score"] = df_B.groupby("country")["trend_raw"].transform(_minmax_ignoring_nan)

# Rows without enough history get a neutral mid-range score.
df_B["trend_score"] = df_B["trend_score"].fillna(0.5)

display(df_B[["spotify_id","country","snapshot_date","daily_rank","rank_roll_mean","rank_trend_slope","trend_score"]].head(5))


Unnamed: 0,spotify_id,country,snapshot_date,daily_rank,rank_roll_mean,rank_trend_slope,trend_score
231633,003vvx7Niy0yvhvHt4a68B,AU,2023-10-18,38,,,0.5
231233,003vvx7Niy0yvhvHt4a68B,AU,2023-10-19,38,,,0.5
230832,003vvx7Niy0yvhvHt4a68B,AU,2023-10-20,37,,,0.5
230429,003vvx7Niy0yvhvHt4a68B,AU,2023-10-21,34,,,0.5
230030,003vvx7Niy0yvhvHt4a68B,AU,2023-10-22,35,,,0.5


## 5) Build a serving table for the latest snapshot_date
We recommend from the latest date only, and deduplicate per (country, spotify_id).

In [9]:
# Serving table: rows from the most recent snapshot date found anywhere in df_B.
latest_date = df_B["snapshot_date"].max()
on_latest_date = df_B["snapshot_date"].eq(latest_date)

# Dedup within latest snapshot: after sorting by rank, the first row of each
# (country, spotify_id) pair is the one with the best (smallest) daily_rank.
dedup_keys = ["country", "spotify_id"]
df_latest = (
    df_B.loc[on_latest_date]
        .sort_values(dedup_keys + ["daily_rank"])
        .drop_duplicates(subset=dedup_keys, keep="first")
        .copy()
)

print("latest_date:", latest_date)
print("df_latest rows:", len(df_latest))
display(df_latest.head(3))


latest_date: 2025-06-11 00:00:00
df_latest rows: 400


Unnamed: 0,spotify_id,name,artists,daily_rank,daily_movement,weekly_movement,country,snapshot_date,popularity,is_explicit,duration (ms),album_name,album_release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,P_0,P_1,P_2,P_3,mood_pred,mood_conf,pop_norm,rank_roll_mean,rank_trend_slope,trend_raw,trend_score
389,003vvx7Niy0yvhvHt4a68B,Mr. Brightside,The Killers,40,1,-3,AU,2025-06-11,90,False,222973,Hot Fuss,2004-01-01,0.352,0.911,1,-5.23,1,0.0747,0.00121,0.0,0.0995,0.236,148.033,4,2e-06,6.2e-05,0.999936,4.460687e-07,2,0.999936,0.9,38.6,-0.04657,0.04657,0.467786
357,04emojnbYkrRmv5qtJcgVP,What I Want (feat. Tate McRae),"Morgan Wallen, Tate McRae",8,1,0,AU,2025-06-11,92,False,184517,I’m The Problem,2025-05-16,0.657,0.699,9,-3.92,1,0.0262,0.639,0.0,0.148,0.495,115.998,4,0.002388,0.997425,0.000185,2.414959e-06,1,0.997425,0.92,13.038462,,,0.5
353,0FTmksd2dxiE5e3rWyJXs6,back to friends,sombr,4,0,-1,AU,2025-06-11,98,False,199032,back to friends,2024-12-27,0.436,0.723,1,-2.291,1,0.0301,9.4e-05,8.8e-05,0.0929,0.235,92.855,4,0.272782,0.028701,0.698497,1.997288e-05,2,0.698497,0.98,3.266667,-0.453148,0.453148,0.634124


## 6) Recommender function (mood + trend + popularity)
Includes:
- optional country filter
- explicit filter
- confidence filter
- dedup per (country, spotify_id) (already done in df_latest); without a country filter the same track can appear once per country
- optional artist diversity cap

In [10]:
def recommend(
    df_latest: pd.DataFrame,
    mood_idx: int,
    k: int = 20,
    country: str | None = None,
    allow_explicit: bool = True,
    min_conf: float = 0.0,
    diversify_artist: bool = True,
    max_per_artist: int = 2,
    weights: tuple[float, float, float] = (0.60, 0.25, 0.15),  # (mood, trend, pop)
) -> pd.DataFrame:
    w_mood, w_trend, w_pop = weights
    d = df_latest.copy()

    if country is not None:
        country = str(country).strip().upper()
        d = d[d["country"] == country]

    if not allow_explicit:
        d = d[d["is_explicit"] == False]

    if min_conf > 0:
        d = d[d["mood_conf"] >= min_conf]

    pcol = f"P_{mood_idx}"
    if pcol not in d.columns:
        raise KeyError(f"Missing {pcol} in df_latest")

    d["score"] = w_mood * d[pcol] + w_trend * d["trend_score"] + w_pop * d["pop_norm"]
    d = d.sort_values("score", ascending=False)

    if diversify_artist:
        out = []
        counts = {}
        for _, row in d.iterrows():
            a = row["artists"]
            counts[a] = counts.get(a, 0)
            if counts[a] < max_per_artist:
                out.append(row)
                counts[a] += 1
            if len(out) >= k:
                break
        d = pd.DataFrame(out)
    else:
        d = d.head(k)

    out_cols = [
        "spotify_id","name","artists","country",
        "score","popularity","daily_rank","daily_movement","weekly_movement",
        "trend_score","mood_pred","mood_conf",pcol
    ]
    out_cols = [c for c in out_cols if c in d.columns]
    return d[out_cols].reset_index(drop=True)


## 7) Demo & quick checks

In [14]:
# Human-readable names for the class indices; update to your own mapping.
# NOTE(review): the labels must match the class encoding used at training time -- confirm.
MOOD_MAP = {0:"calm", 1:"happy", 2:"sad", 3:"energy"}

# Example request (adjust country / mood_idx)
demo_request = dict(mood_idx=1, country="VN", k=10, allow_explicit=True, min_conf=0.2)
demo = recommend(df_latest, **demo_request)
print("Demo mood:", MOOD_MAP.get(demo_request["mood_idx"], demo_request["mood_idx"]))
display(demo)


Demo mood: happy


Unnamed: 0,spotify_id,name,artists,country,score,popularity,daily_rank,daily_movement,weekly_movement,trend_score,mood_pred,mood_conf,P_1
0,354swDEk1Zdo9y57fEqmRg,ĐOÁ HOA,"TeuYungBoy, BIG WIND, DONAL",VN,0.870657,69,18,-7,-5,0.668837,1,0.999913,0.999913
1,27xkOIER6uDLKALIelHylZ,Don’t Say You Love Me,Jin,VN,0.864466,93,5,0,-2,0.5,1,0.999943,0.999943
2,2HRgqmZQC0MC7GeNuDIXHN,Seven (feat. Latto) (Explicit Ver.),"Jung Kook, Latto",VN,0.842793,85,1,0,0,0.46342,1,0.999064,0.999064
3,4wJ5Qq0jBN4ajy7ouZIV1c,APT.,"ROSÉ, Bruno Mars",VN,0.841553,89,31,-11,3,0.432875,1,0.999724,0.999724
4,7tI8dRuH2Yc6RuoTjxo4dU,Who,Jimin,VN,0.836249,90,19,-5,1,0.521661,1,0.951389,0.951389
5,4tqXSj46umlymdIxqY8zso,giá như,SOOBIN,VN,0.833271,66,24,-5,-8,0.583925,1,0.980482,0.980482
6,31VNCmwspR7nVJ6kruUuJt,Đừng Làm Trái Tim Anh Đau,Sơn Tùng M-TP,VN,0.830461,71,15,-2,-3,0.495875,1,0.999987,0.999987
7,01qFKNWq73UfEslI0GvumE,3D (feat. Jack Harlow),"Jung Kook, Jack Harlow",VN,0.826263,79,2,2,2,0.431283,1,0.999903,0.999903
8,1K0HQ30Wc11okzlcnFA7Ub,Không Thể Say,HIEUTHUHAI,VN,0.818784,70,23,-1,-8,0.460208,1,0.997887,0.997887
9,26zqmmkqUK6mCc87XDzPym,"CHÂN THÀNH (feat. RHYDER, Captain, Quang Hùng ...","ANH TRAI ""SAY HI"", RHYDER, CAPTAIN BOY, Quang ...",VN,0.811839,67,37,5,-4,0.449485,1,0.99828,0.99828


In [12]:
# Sanity checks on the serving table and on the demo recommendations.
mood_counts = df_latest["mood_pred"].value_counts()
print("Mood distribution (latest):")
print(mood_counts)

country_counts = df_latest["country"].value_counts()
print("\nCountry distribution (latest):")
print(country_counts.head(15))

# The demo targets mood 1, so its probability column is P_1 (absent => None).
if "P_1" in demo.columns:
    avg_target_proba = demo["P_1"].mean()
else:
    avg_target_proba = None
print("\nAverage P_target_mood in demo:", avg_target_proba)

avg_conf = demo["mood_conf"].mean()
print("Average mood_conf in demo:", avg_conf)


Mood distribution (latest):
mood_pred
1    245
0    107
2     48
Name: count, dtype: int64

Country distribution (latest):
country
AU    50
CA    50
HK    50
JP    50
KR    50
TW    50
US    50
VN    50
Name: count, dtype: int64

Average P_target_mood in demo: 0.9908909784702583
Average mood_conf in demo: 0.9908909784702583


## 8) Save outputs (optional)

In [13]:
# Persist the latest-snapshot serving table (with mood, popularity and trend columns)
# as Parquet. The path is relative to the kernel's current working directory.
# index=False leaves the DataFrame index out of the file.
# to_parquet needs a Parquet engine (pyarrow or fastparquet) to be installed.
OUT_PATH = Path("datasetB_latest_with_scores.parquet")
df_latest.to_parquet(OUT_PATH, index=False)
print("Saved:", OUT_PATH.resolve())


Saved: E:\NTI-project\spotify-mood-mlops\notebooks\datasetB_latest_with_scores.parquet
