In [12]:
# Confirmed paths
csv_path   = "data/spotify_dataset.csv"
json1_path = "data/900k Definitive Spotify Dataset.json"
json2_path = "data/final_milliondataset_BERT_500K_revised.json"

import os, re, json
from pathlib import Path
from tqdm import tqdm
import pandas as pd
import numpy as np

# Output dirs
Path("data/interim").mkdir(parents=True, exist_ok=True)
Path("data/processed").mkdir(parents=True, exist_ok=True)
Path("app/artifacts").mkdir(parents=True, exist_ok=True)

In [14]:
def read_jsonl(path, n=None):
    rows = []
    with open(path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            line = line.strip()
            if not line:
                continue
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError:
                # light repair attempt (handles escaped slashes)
                try:
                    rows.append(json.loads(line.replace("\\/", "/")))
                except Exception:
                    continue
            if n and (i+1) >= n:
                break
    return pd.DataFrame(rows)

def to_float(x):
    if pd.isna(x): return np.nan
    s = str(x).strip().lower().replace("db", "").replace("−","-")
    s = re.sub(",", "", s)
    m = re.search(r"[-+]?\d*\.?\d+", s)
    return float(m.group(0)) if m else np.nan

def parse_tempo(x):
    if pd.isna(x): return np.nan
    v = to_float(x)
    if v is np.nan: return np.nan
    # If given as 0-1 normalize to ~40–220 BPM
    if 0 < v <= 1.0:
        return 40.0 + v * (220.0 - 40.0)
    return v

def parse_loudness_db(x):
    if pd.isna(x): return np.nan
    v = to_float(x)
    if v is np.nan: return np.nan
    if 0 <= v <= 1.0:
        return -30.0 + v * 30.0
    return v

def parse_length_mmss(x):
    if pd.isna(x): return np.nan
    s = str(x).strip()
    m = re.match(r"^\s*(\d{1,2}):(\d{1,2})\s*$", s)
    if m:
        return int(m.group(1))*60 + int(m.group(2))
    try:
        return float(s)
    except:
        return np.nan

_MONTHS = {
    "january":1,"february":2,"march":3,"april":4,"may":5,"june":6,
    "july":7,"august":8,"september":9,"october":10,"november":11,"december":12
}
def parse_release_date(x):
    if pd.isna(x): return pd.NaT
    s = str(x).strip()
    # ISO first
    ts = pd.to_datetime(s, errors="coerce")
    if pd.notna(ts): return ts
    # "29th April 2013"
    s2 = re.sub(r"(\d+)(st|nd|rd|th)", r"\1", s.lower())
    m = re.match(r"^\s*(\d{1,2})\s+([a-z]+)\s+(\d{4})\s*$", s2)
    if m:
        day = int(m.group(1)); mon = _MONTHS.get(m.group(2), None); year = int(m.group(3))
        if mon:
            try:
                return pd.Timestamp(year=year, month=mon, day=day)
            except:
                return pd.NaT
    return pd.to_datetime(s, errors="coerce")

def norm_key(s):
    if pd.isna(s): return ""
    return re.sub(r"\s+", " ", str(s).strip().lower())


In [None]:
df_j2 = read_jsonl(json2_path)   # revised BERT 500K (primary)
df_j1 = read_jsonl(json1_path)
df_csv = pd.read_csv(csv_path, low_memory=False)

print("json2:", df_j2.shape)
print("json1:", df_j1.shape)
print("csv  :", df_csv.shape)

In [None]:
common_map = {
    "Artist(s)": "artist", "artist": "artist", "artists": "artist",
    "song": "track_name", "Song": "track_name", "track_name": "track_name",
    "text": "lyrics", "Lyrics": "lyrics",
    "Genre": "genre", "genre": "genre",
    "Album": "album", "album": "album",
    "ISRC": "isrc", "isrc": "isrc",
    "Release Date": "release_date", "release_date": "release_date",
    "Key": "key", "key": "key",
    "Time signature": "time_signature", "time_signature": "time_signature",
    "Length": "length", "length": "length",
    "Explicit": "explicit", "explicit": "explicit",
    "emotion": "emotion",
    "Tempo": "tempo", "tempo": "tempo",
    "Loudness (db)": "loudness_db", "loudness": "loudness_db",
    "Energy": "energy", "energy": "energy",
    "Danceability": "danceability", "danceability": "danceability",
    "Positiveness": "valence", "valence": "valence",
    "Speechiness": "speechiness", "speechiness": "speechiness",
    "Liveness": "liveness", "liveness": "liveness",
    "Acousticness": "acousticness", "acousticness": "acousticness",
    "Instrumentalness": "instrumentalness", "instrumentalness": "instrumentalness",
    "Popularity": "popularity", "popularity": "popularity",
}

def rename_keep(df):
    df = df.rename(columns={c: common_map.get(c, c) for c in df.columns})
    keep = [
        "isrc","artist","track_name","lyrics","genre","album","release_date","key",
        "time_signature","length","explicit","emotion","tempo","loudness_db",
        "energy","danceability","valence","speechiness","liveness","acousticness",
        "instrumentalness","popularity"
    ]
    return df[[c for c in keep if c in df.columns]].copy()

df_j2 = rename_keep(df_j2)
df_j1 = rename_keep(df_j1)
df_csv = rename_keep(df_csv)

df_j2.head(2).T

In [5]:
import re
import numpy as np

def clean_df(df):
    if "tempo" in df:          df["tempo"] = df["tempo"].apply(parse_tempo)
    if "loudness_db" in df:    df["loudness_db"] = df["loudness_db"].apply(parse_loudness_db)
    if "length" in df:         df["length_s"] = df["length"].apply(parse_length_mmss)
    if "release_date" in df:   df["release_date"] = df["release_date"].apply(parse_release_date)
    # numeric ints that came as strings
    for col in ["energy","danceability","valence","speechiness","liveness",
                "acousticness","instrumentalness","popularity"]:
        if col in df:
            df[col] = pd.to_numeric(df[col], errors="coerce")
    # explicit to bool
    if "explicit" in df:
        df["explicit"] = df["explicit"].astype(str).str.strip().str.lower().map(
            {"true": True, "1": True, "yes": True, "y": True, "explicit": True,
             "false": False, "0": False, "no": False, "n": False}).fillna(np.nan)

    # normalized join key
    for c in ["artist","track_name"]:
        if c in df:
            df[c] = df[c].astype(str).str.strip()
    df["join_key"] = (df.get("artist","").apply(norm_key) + " — " +
                      df.get("track_name","").apply(norm_key))
    # prefer isrc uppercase
    if "isrc" in df:
        df["isrc"] = df["isrc"].astype(str).str.strip().str.upper()
    return df

df_j2 = clean_df(df_j2)
df_j1 = clean_df(df_j1)
df_csv = clean_df(df_csv)

df_j2.shape, df_j1.shape, df_csv.shape

((551443, 24), (498052, 23), (551443, 23))

In [6]:
# choose base
base = df_j2.copy()

# helper: left-prefer combine
def coalesce(left, right, cols):
    out = left.copy()
    for c in cols:
        if c in right.columns:
            if c not in out.columns:
                out[c] = np.nan
            out[c] = out[c].combine_first(right[c])
    return out

# first join on ISRC where available
def smart_merge(left, right):
    left = left.copy()
    right = right.copy()

    # split rows with and without isrc
    l_isrc = left[left["isrc"].notna()] if "isrc" in left else left.iloc[0:0]
    l_no   = left[left["isrc"].isna()]  if "isrc" in left else left

    r_isrc = right[right["isrc"].notna()] if "isrc" in right else right.iloc[0:0]
    r_no   = right[right["isrc"].isna()]  if "isrc" in right else right

    # 1) isrc join
    if len(l_isrc) and len(r_isrc):
        m1 = pd.merge(l_isrc, r_isrc, on="isrc", how="left", suffixes=("", "_r"))
    else:
        m1 = l_isrc

    # 2) join-key join for those without isrc
    if len(l_no) and len(r_no):
        m2 = pd.merge(l_no, r_no, on="join_key", how="left", suffixes=("", "_r"))
    else:
        m2 = l_no

    merged = pd.concat([m1, m2], ignore_index=True)

    # coalesce _r columns
    all_cols = set(keep_cols + ["length_s","join_key","isrc"])
    for c in list(all_cols):
        c_r = c + "_r"
        if c in merged.columns and c_r in merged.columns:
            merged[c] = merged[c].combine_first(merged[c_r])
            merged.drop(columns=[c_r], inplace=True, errors="ignore")

    # drop any residual *_r
    merged = merged[[c for c in merged.columns if not c.endswith("_r")]]
    return merged

# merge json1 then csv
merged = smart_merge(base, df_j1)
merged = smart_merge(merged, df_csv)

# final column order
final_cols = [
    "isrc","artist","track_name","album","genre","lyrics","emotion",
    "release_date","key","time_signature","length_s","explicit",
    "tempo","loudness_db","energy","danceability","valence","speechiness",
    "liveness","acousticness","instrumentalness","popularity","join_key"
]
final = merged[[c for c in final_cols if c in merged.columns]].copy()

# deduplicate by ISRC first, then by join_key
if "isrc" in final:
    final = final.sort_values(["isrc","release_date"], na_position="last").drop_duplicates(subset=["isrc"], keep="first")
final = final.sort_values(["join_key","release_date"], na_position="last").drop_duplicates(subset=["join_key"], keep="first")

final.shape

(445069, 23)

In [10]:
from pathlib import Path
import pandas as pd

# paths
full_pkl   = Path("data/interim/spotify_clean.pkl")
subset_pkl = Path("data/interim/spotify_50k.pkl")
full_csv   = Path("data/interim/spotify_clean.csv")
subset_csv = Path("data/interim/spotify_50k.csv")

# ensure dir exists
full_pkl.parent.mkdir(parents=True, exist_ok=True)

# make subset
n_subset = min(50000, len(final))
subset = final.sample(n_subset, random_state=42)

# save (engine-free)
final.to_pickle(full_pkl)
subset.to_pickle(subset_pkl)
final.to_csv(full_csv, index=False)
subset.to_csv(subset_csv, index=False)

print(" Saved without Parquet:")
print("  PKL:", full_pkl, "|", subset_pkl)
print("  CSV:", full_csv,  "|", subset_csv)
print("Shapes:", final.shape, subset.shape)

 Saved without Parquet:
  PKL: data\interim\spotify_clean.pkl | data\interim\spotify_50k.pkl
  CSV: data\interim\spotify_clean.csv | data\interim\spotify_50k.csv
Shapes: (445069, 23) (50000, 23)


In [11]:
import pandas as pd

# fast dev loop
df = pd.read_pickle("data/interim/spotify_50k.pkl")

# later (full set)
# df = pd.read_pickle("data/interim/spotify_clean.pkl")


# Load the 50k subset for fast iteration (switch to full later)
import os, numpy as np, pandas as pd
from tqdm import tqdm

subset_path = "data/interim/spotify_50k.parquet"   # <- use full later: data/interim/spotify_clean.parquet
df = pd.read_parquet(subset_path)
len(df), df.columns.tolist()[:12]

ImportError: Unable to find a usable engine; tried using: 'pyarrow', 'fastparquet'.
A suitable version of pyarrow or fastparquet is required for parquet support.
Trying to import the above resulted in these errors:
 - Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.
 - Missing optional dependency 'fastparquet'. fastparquet is required for parquet support. Use pip or conda to install fastparquet.

In [None]:
# Lyric + meta embeddings with Sentence-BERT
from sentence_transformers import SentenceTransformer

model_name = "sentence-transformers/all-MiniLM-L6-v2"  # fast & good
model = SentenceTransformer(model_name)

# fallback text if lyrics missing
texts_lyrics = df["lyrics"].fillna(df["track_name"] + " " + df["artist"])
texts_meta   = (df["artist"].fillna("") + " | "
                + df.get("genre", pd.Series([""]*len(df))).fillna("") + " | "
                + df["track_name"].fillna(""))

batch = 512

def batched_encode(texts):
    out = []
    for i in tqdm(range(0, len(texts), batch)):
        chunk = texts.iloc[i:i+batch].tolist()
        E = model.encode(chunk, normalize_embeddings=True, show_progress_bar=False)
        out.append(E)
    return np.vstack(out).astype("float32")

lyrics_emb = batched_encode(texts_lyrics)
meta_emb   = batched_encode(texts_meta)

lyrics_emb.shape, meta_emb.shape

In [None]:
# Numeric features block (scaled)
from sklearn.preprocessing import StandardScaler

num_cols = [c for c in [
    "energy","danceability","valence","speechiness","liveness","acousticness",
    "instrumentalness","popularity","tempo","loudness_db","length_s"
] if c in df.columns]

X_num = df[num_cols].fillna(df[num_cols].median())
scaler = StandardScaler()
X_num_scaled = scaler.fit_transform(X_num).astype("float32")
X_num_scaled.shape

In [None]:
# Emotion block (optional one-hot if present)
from sklearn.preprocessing import MultiLabelBinarizer

if "emotion" in df.columns:
    emo_lists = df["emotion"].fillna("").astype(str).str.split(",").apply(
        lambda xs: [x.strip().lower() for x in xs if x.strip()]
    )
    mlb = MultiLabelBinarizer(sparse_output=False)
    emo_mat = mlb.fit_transform(emo_lists).astype("float32")
else:
    emo_mat = np.zeros((len(df), 0), dtype="float32")

emo_mat.shape


In [None]:
# Fuse into a single hybrid vector and L2-normalize
from sklearn.preprocessing import normalize
import numpy as np

w_lyrics, w_meta, w_num, w_emo = 0.6, 0.2, 0.15, 0.05

# normalize numeric & emotion blocks first (safeguard)
X_num_n = normalize(X_num_scaled) if X_num_scaled.shape[1] else X_num_scaled
emo_n   = normalize(emo_mat) if emo_mat.shape[1] else emo_mat

# combine: text channels are same dim; then append numeric/emotion channels
text_block = (w_lyrics*lyrics_emb + w_meta*meta_emb)
hybrid = np.hstack([text_block, w_num*X_num_n, w_emo*emo_n]).astype("float32")
hybrid = normalize(hybrid).astype("float32")

hybrid.shape


In [None]:
# Save artifacts for reuse
import joblib, os

os.makedirs("data/processed", exist_ok=True)
np.save("data/processed/hybrid_emb_50k.npy", hybrid)
df_items = df[["artist","track_name","album","genre","release_date"]].copy()
df_items.to_parquet("data/processed/items_50k.parquet", index=False)
joblib.dump({"scaler": scaler, "num_cols": num_cols}, "data/processed/prepro.pkl")

"saved: hybrid_emb_50k.npy, items_50k.parquet, prepro.pkl"

In [None]:
import faiss, numpy as np, os

xb = np.load("data/processed/hybrid_emb_50k.npy")
d = xb.shape[1]
index = faiss.IndexFlatIP(d)      # inner product == cosine if vectors are L2-normalized
index.add(xb)
faiss.write_index(index, "data/processed/faiss_ip_50k.index")
d, index.ntotal

In [None]:
# figure out "tail" size (numeric + emotion dims)
text_dim = lyrics_emb.shape[1]        # same as meta_emb.shape[1]
tail_dim = hybrid.shape[1] - (text_dim + text_dim)  # rest is numeric+emotion

import joblib
from sklearn.preprocessing import normalize

class HybridQueryEncoder:
    def __init__(self, model_name, w_lyrics=0.6, w_meta=0.2, tail_dim=0):
        from sentence_transformers import SentenceTransformer
        self.model = SentenceTransformer(model_name)
        self.w_lyrics = w_lyrics
        self.w_meta = w_meta
        self.tail_dim = tail_dim

    def encode(self, text: str):
        E = self.model.encode([text], normalize_embeddings=True).astype("float32")
        q_text = self.w_lyrics*E + self.w_meta*E
        tail = np.zeros((1, self.tail_dim), dtype="float32")
        q = np.hstack([q_text, tail])
        return normalize(q).astype("float32")

enc = HybridQueryEncoder(model_name, w_lyrics=0.6, w_meta=0.2, tail_dim=tail_dim)
joblib.dump(enc, "data/processed/query_encoder.pkl"), tail_dim

In [None]:
import pandas as pd, numpy as np, faiss, joblib

items = pd.read_parquet("data/processed/items_50k.parquet")
index = faiss.read_index("data/processed/faiss_ip_50k.index")
enc   = joblib.load("data/processed/query_encoder.pkl")
emb   = np.load("data/processed/hybrid_emb_50k.npy")

def rec_by_text(q, k=10):
    qv = enc.encode(q)
    scores, ids = index.search(qv, k)
    out = items.iloc[ids[0]].copy()
    out["score"] = scores[0]
    return out

def rec_like_seed(seed_row_index, k=10):
    qv = emb[seed_row_index:seed_row_index+1]
    scores, ids = index.search(qv, k+1)
    ids = [i for i in ids[0] if i != seed_row_index][:k]
    out = items.iloc[ids].copy()
    out["score"] = (emb[ids] @ qv.T).ravel()
    return out

# Try it:
display(rec_by_text("melancholic acoustic ballad", 10))
seed_idx = 0
print("Seed:", items.iloc[seed_idx].to_dict())
display(rec_like_seed(seed_idx, 10))

In [None]:
import shutil, os
os.makedirs("app/artifacts", exist_ok=True)

for fn in [
    "data/processed/faiss_ip_50k.index",
    "data/processed/items_50k.parquet",
    "data/processed/query_encoder.pkl",
    # optional: bring the embedding if you want same-like-seed search without FAISS
    # "data/processed/hybrid_emb_50k.npy",
]:
    shutil.copy(fn, "app/artifacts/")

"artifacts copied to app/artifacts"