In [1]:
from pathlib import Path
# your confirmed paths
csv_path   = "data/spotify_dataset.csv"
json1_path = "data/900k Definitive Spotify Dataset.json"
json2_path = "data/final_milliondataset_BERT_500K_revised.json"

# outputs
Path("data/interim").mkdir(parents=True, exist_ok=True)
Path("data/processed").mkdir(parents=True, exist_ok=True)
Path("app/artifacts").mkdir(parents=True, exist_ok=True)
print("Folders ready ")

Folders ready 


In [2]:
import json, re
import numpy as np
import pandas as pd
from tqdm import tqdm

def read_jsonl(path, n=None):
    rows = []
    with open(path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            line = line.strip()
            if not line: 
                continue
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError:
                try:
                    rows.append(json.loads(line.replace("\\/", "/")))
                except Exception:
                    continue
            if n and (i+1) >= n:
                break
    return pd.DataFrame(rows)

def to_float(x):
    if pd.isna(x): return np.nan
    s = str(x).strip().lower().replace("db","").replace("−","-")
    s = re.sub(",", "", s)
    m = re.search(r"[-+]?\d*\.?\d+", s)
    return float(m.group(0)) if m else np.nan

def parse_tempo(x):
    if pd.isna(x): return np.nan
    v = to_float(x)
    if v is np.nan: return np.nan
    return 40.0 + v*(220.0-40.0) if 0 < v <= 1.0 else v

def parse_loudness_db(x):
    if pd.isna(x): return np.nan
    v = to_float(x)
    if v is np.nan: return np.nan
    return -30.0 + v*30.0 if 0 <= v <= 1.0 else v

def parse_length_mmss(x):
    if pd.isna(x): return np.nan
    s = str(x).strip()
    m = re.match(r"^\s*(\d{1,2}):(\d{1,2})\s*$", s)
    if m: return int(m.group(1))*60 + int(m.group(2))
    try: return float(s)
    except: return np.nan

_MONTHS = {m:i for i,m in enumerate(
    ["january","february","march","april","may","june","july","august","september","october","november","december"],1)}
def parse_release_date(x):
    if pd.isna(x): return pd.NaT
    s = str(x).strip()
    ts = pd.to_datetime(s, errors="ignore")
    if isinstance(ts, pd.Timestamp): return ts
    s2 = re.sub(r"(\d+)(st|nd|rd|th)", r"\1", s.lower())
    m = re.match(r"^\s*(\d{1,2})\s+([a-z]+)\s+(\d{4})\s*$", s2)
    if m:
        day = int(m.group(1)); mon = _MONTHS.get(m.group(2)); year = int(m.group(3))
        if mon:
            try: return pd.Timestamp(year, mon, day)
            except: return pd.NaT
    return pd.to_datetime(s, errors="coerce")

def norm_key(s):
    if pd.isna(s): return ""
    return re.sub(r"\s+"," ", str(s).strip().lower())

In [3]:
df_j2 = read_jsonl(json2_path)             # revised BERT 500K (primary)
df_j1 = read_jsonl(json1_path)
df_csv = pd.read_csv(csv_path, low_memory=False)
print("json2:", df_j2.shape, "| json1:", df_j1.shape, "| csv:", df_csv.shape)

json2: (551443, 33) | json1: (498052, 31) | csv: (551443, 39)


In [4]:
common_map = {
    "Artist(s)":"artist","artists":"artist","artist":"artist",
    "song":"track_name","Song":"track_name","track_name":"track_name",
    "text":"lyrics","Lyrics":"lyrics",
    "Genre":"genre","genre":"genre",
    "Album":"album","album":"album",
    "ISRC":"isrc","isrc":"isrc",
    "Release Date":"release_date","release_date":"release_date",
    "Key":"key","key":"key",
    "Time signature":"time_signature","time_signature":"time_signature",
    "Length":"length","length":"length",
    "Explicit":"explicit","explicit":"explicit",
    "emotion":"emotion",
    "Tempo":"tempo","tempo":"tempo",
    "Loudness (db)":"loudness_db","loudness":"loudness_db",
    "Energy":"energy","energy":"energy",
    "Danceability":"danceability","danceability":"danceability",
    "Positiveness":"valence","valence":"valence",
    "Speechiness":"speechiness","speechiness":"speechiness",
    "Liveness":"liveness","liveness":"liveness",
    "Acousticness":"acousticness","acousticness":"acousticness",
    "Instrumentalness":"instrumentalness","instrumentalness":"instrumentalness",
    "Popularity":"popularity","popularity":"popularity",
}
def rename_keep(df):
    df = df.rename(columns={c: common_map.get(c,c) for c in df.columns})
    keep = ["isrc","artist","track_name","lyrics","genre","album","release_date","key",
            "time_signature","length","explicit","emotion","tempo","loudness_db",
            "energy","danceability","valence","speechiness","liveness","acousticness",
            "instrumentalness","popularity"]
    return df[[c for c in keep if c in df.columns]].copy()

df_j2 = rename_keep(df_j2)
df_j1 = rename_keep(df_j1)
df_csv = rename_keep(df_csv)
df_j2.head(2).T

Unnamed: 0,0,1
isrc,GBBPW1200312,GBBPW1200314
artist,!!!,!!!
track_name,Even When the Waters Cold,One Girl / One Boy
lyrics,Friends told her she was better off at the bot...,"Well I heard it, playing soft\nFrom a drunken ..."
genre,hip hop,hip hop
album,Thr!!!er,Thr!!!er
release_date,29th April 2013,29th April 2013
key,D min,A# min
time_signature,4/4,4/4
length,03:47,04:03


In [5]:
def clean_df(df):
    df = df.copy()
    if "tempo" in df:          df["tempo"] = df["tempo"].apply(parse_tempo)
    if "loudness_db" in df:    df["loudness_db"] = df["loudness_db"].apply(parse_loudness_db)
    if "length" in df:         df["length_s"] = df["length"].apply(parse_length_mmss)
    if "release_date" in df:   df["release_date"] = df["release_date"].apply(parse_release_date)
    for c in ["energy","danceability","valence","speechiness","liveness",
              "acousticness","instrumentalness","popularity"]:
        if c in df: df[c] = pd.to_numeric(df[c], errors="coerce")
    if "explicit" in df:
        df["explicit"] = df["explicit"].astype(str).str.strip().str.lower().map(
            {"true": True,"1": True,"yes": True,"y": True,"explicit": True,
             "false": False,"0": False,"no": False,"n": False}
        ).astype("boolean")
    for c in ["artist","track_name"]:
        if c in df: df[c] = df[c].astype(str).str.strip()
    df["join_key"] = (df.get("artist","").apply(norm_key) + " — " +
                      df.get("track_name","").apply(norm_key))
    if "isrc" in df: df["isrc"] = df["isrc"].astype(str).str.strip().str.upper()
    return df

df_j2 = clean_df(df_j2)
df_j1 = clean_df(df_j1)
df_csv = clean_df(df_csv)
df_j2.shape, df_j1.shape, df_csv.shape

  ts = pd.to_datetime(s, errors="ignore")
  ts = pd.to_datetime(s, errors="ignore")
  ts = pd.to_datetime(s, errors="ignore")


((551443, 24), (498052, 23), (551443, 23))

In [6]:
def smart_merge(left, right):
    left = left.copy(); right = right.copy()
    l_isrc = left[left["isrc"].notna()] if "isrc" in left else left.iloc[0:0]
    l_no   = left[left["isrc"].isna()]  if "isrc" in left else left
    r_isrc = right[right["isrc"].notna()] if "isrc" in right else right.iloc[0:0]
    r_no   = right[right["isrc"].isna()]  if "isrc" in right else right

    m1 = pd.merge(l_isrc, r_isrc, on="isrc", how="left", suffixes=("", "_r")) if len(l_isrc) and len(r_isrc) else l_isrc
    m2 = pd.merge(l_no,   r_no,   on="join_key", how="left", suffixes=("", "_r")) if len(l_no) and len(r_no) else l_no
    merged = pd.concat([m1, m2], ignore_index=True)

    for c in list(merged.columns):
        if c.endswith("_r"):
            base = c[:-2]
            if base in merged.columns:
                merged[base] = merged[base].combine_first(merged[c])
            merged.drop(columns=[c], inplace=True, errors="ignore")
    return merged

merged = smart_merge(df_j2, df_j1)
merged = smart_merge(merged, df_csv)

final_cols = ["isrc","artist","track_name","album","genre","lyrics","emotion",
              "release_date","key","time_signature","length_s","explicit",
              "tempo","loudness_db","energy","danceability","valence","speechiness",
              "liveness","acousticness","instrumentalness","popularity","join_key"]
final = merged[[c for c in final_cols if c in merged.columns]].copy()

if "isrc" in final:
    final = final.sort_values(["isrc","release_date"], na_position="last") \
                 .drop_duplicates(subset=["isrc"], keep="first")
final = final.sort_values(["join_key","release_date"], na_position="last") \
             .drop_duplicates(subset=["join_key"], keep="first")

display(final.sample(5))
print("Shape:", final.shape)

Unnamed: 0,isrc,artist,track_name,album,genre,lyrics,emotion,release_date,key,time_signature,...,loudness_db,energy,danceability,valence,speechiness,liveness,acousticness,instrumentalness,popularity,join_key
406590,USA2P0721820,Sagat,Funk Dat,Funk Dat (Why Is It?),hip hop,[Intro]\nFunk dat!\n\n[Verse 1]\nQuestion:\nWh...,sadness,1993-01-01,D# min,4/4,...,-15.3,44,89,78,10,4,47,2,27,sagat — funk dat
315461,USUM70607498,"Method Man,Fat Joe,Styles P",Yameen,4:21...The Day After,hip hop,"[Intro: Method Man]\nYo, I'm bout to hit you w...",love,2006-01-01,D Maj,4/4,...,-7.09,67,61,73,32,5,9,0,24,"method man,fat joe,styles p — yameen"
47497,USSM10024329,Bessie Smith,Any Womans Blues,"The Complete Recordings, Vol. 1","blues,jazz",My man ain't acting right\nHe stays out late a...,sadness,1991-04-08,B Maj,4/4,...,-18.46,3,56,33,4,15,100,3,11,bessie smith — any womans blues
299218,NO5281301050,Madcon,Unbreakable,Icon,"hip hop,electropop",[Hook: Tshawe]\nDon't judge my way\nCan't take...,joy,2013-09-26,B min,4/4,...,-1.24,90,68,34,6,8,4,0,6,madcon — unbreakable
176844,US4D40400300,Gentle Giant,His Last Voyage,Free Hand,"progressive rock,rock","Rose in early morning, as the light came throu...",sadness,1975-07-01,F Maj,3/4,...,-11.77,36,32,42,4,11,74,88,10,gentle giant — his last voyage


Shape: (445069, 23)


In [7]:
from pathlib import Path
full_pkl   = Path("data/interim/spotify_clean.pkl")
subset_pkl = Path("data/interim/spotify_50k.pkl")
full_csv   = Path("data/interim/spotify_clean.csv")
subset_csv = Path("data/interim/spotify_50k.csv")
full_pkl.parent.mkdir(parents=True, exist_ok=True)

n_subset = min(50000, len(final))
subset = final.sample(n_subset, random_state=42)

final.to_pickle(full_pkl)
subset.to_pickle(subset_pkl)
final.to_csv(full_csv, index=False)
subset.to_csv(subset_csv, index=False)

print("Saved:", full_pkl, subset_pkl, full_csv, subset_csv)

Saved: data\interim\spotify_clean.pkl data\interim\spotify_50k.pkl data\interim\spotify_clean.csv data\interim\spotify_50k.csv


In [8]:
df = pd.read_pickle("data/interim/spotify_50k.pkl")
len(df), df.columns.tolist()[:12]

(50000,
 ['isrc',
  'artist',
  'track_name',
  'album',
  'genre',
  'lyrics',
  'emotion',
  'release_date',
  'key',
  'time_signature',
  'length_s',
  'explicit'])

In [9]:
from sentence_transformers import SentenceTransformer

model_name = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)

texts_lyrics = df["lyrics"].fillna(df["track_name"] + " " + df["artist"])
texts_meta   = (df["artist"].fillna("") + " | "
                + df.get("genre", pd.Series([""]*len(df))).fillna("") + " | "
                + df["track_name"].fillna(""))

batch = 512
def batched_encode(texts):
    out = []
    for i in tqdm(range(0, len(texts), batch)):
        chunk = texts.iloc[i:i+batch].tolist()
        E = model.encode(chunk, normalize_embeddings=True, show_progress_bar=False)
        out.append(E)
    return np.vstack(out).astype("float32")

lyrics_emb = batched_encode(texts_lyrics)
meta_emb   = batched_encode(texts_meta)
lyrics_emb.shape, meta_emb.shape

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
100%|███████████████████████████████████████████████████████████████████████████████| 98/98 [5:02:01<00:00, 184.91s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 98/98 [10:43<00:00,  6.57s/it]


((50000, 384), (50000, 384))

In [10]:
from sklearn.preprocessing import StandardScaler

num_cols = [c for c in [
    "energy","danceability","valence","speechiness","liveness",
    "acousticness","instrumentalness","popularity","tempo","loudness_db","length_s"
] if c in df.columns]

X_num = df[num_cols].fillna(df[num_cols].median())
scaler = StandardScaler()
X_num_scaled = scaler.fit_transform(X_num).astype("float32")

from sklearn.preprocessing import MultiLabelBinarizer
if "emotion" in df.columns:
    emo_lists = df["emotion"].fillna("").astype(str).str.split(",").apply(
        lambda xs: [x.strip().lower() for x in xs if x.strip()]
    )
    mlb = MultiLabelBinarizer(sparse_output=False)
    emo_mat = mlb.fit_transform(emo_lists).astype("float32")
else:
    emo_mat = np.zeros((len(df), 0), dtype="float32")

X_num_scaled.shape, emo_mat.shape

((50000, 11), (50000, 7))

In [11]:
from sklearn.preprocessing import normalize

w_lyrics, w_meta, w_num, w_emo = 0.6, 0.2, 0.15, 0.05
X_num_n = normalize(X_num_scaled) if X_num_scaled.shape[1] else X_num_scaled
emo_n   = normalize(emo_mat) if emo_mat.shape[1] else emo_mat

text_block = (w_lyrics*lyrics_emb + w_meta*meta_emb)
hybrid = np.hstack([text_block, w_num*X_num_n, w_emo*emo_n]).astype("float32")
hybrid = normalize(hybrid).astype("float32")

np.save("data/processed/hybrid_emb_50k.npy", hybrid)

items = df[["artist","track_name","album","genre","release_date"]].copy().reset_index(drop=True)
items.to_pickle("data/processed/items_50k.pkl")
hybrid.shape, items.shape

((50000, 402), (50000, 5))

In [12]:
import faiss, numpy as np

xb = np.load("data/processed/hybrid_emb_50k.npy")
d = xb.shape[1]
index = faiss.IndexFlatIP(d)   # cosine if L2-normalized
index.add(xb)
faiss.write_index(index, "data/processed/faiss_ip_50k.index")
d, index.ntotal

(402, 50000)

In [13]:
import joblib
from sklearn.preprocessing import normalize

text_dim = lyrics_emb.shape[1]
tail_dim = hybrid.shape[1] - (text_dim + text_dim)

class HybridQueryEncoder:
    def __init__(self, model_name, w_lyrics=0.6, w_meta=0.2, tail_dim=0):
        from sentence_transformers import SentenceTransformer
        self.model = SentenceTransformer(model_name)
        self.w_lyrics, self.w_meta, self.tail_dim = w_lyrics, w_meta, tail_dim
    def encode(self, text: str):
        E = self.model.encode([text], normalize_embeddings=True).astype("float32")
        q_text = self.w_lyrics*E + self.w_meta*E
        tail = np.zeros((1, self.tail_dim), dtype="float32")
        q = np.hstack([q_text, tail])
        return normalize(q).astype("float32")

enc = HybridQueryEncoder(model_name, 0.6, 0.2, tail_dim)
joblib.dump(enc, "app/artifacts/query_encoder.pkl")

import shutil
shutil.copy("data/processed/faiss_ip_50k.index", "app/artifacts/faiss_ip_50k.index")
shutil.copy("data/processed/items_50k.pkl",       "app/artifacts/items_50k.pkl")
print("Artifacts ready in app/artifacts ")

Artifacts ready in app/artifacts 


In [16]:
import numpy as np, joblib
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize

# 1) figure out dimensions from your saved artifacts
hybrid = np.load("data/processed/hybrid_emb_50k.npy")
hybrid_dim = hybrid.shape[1]

model_name = "sentence-transformers/all-MiniLM-L6-v2"
_text_model = SentenceTransformer(model_name)
text_dim = _text_model.get_sentence_embedding_dimension()

# since we SUMMED lyrics+meta, tail = hybrid - text_dim
tail_dim = int(hybrid_dim - text_dim)
assert tail_dim >= 0, (hybrid_dim, text_dim, tail_dim)

# 2) rebuild a consistent query encoder
class HybridQueryEncoder:
    def __init__(self, model_name, w_lyrics=0.6, w_meta=0.2, tail_dim=0):
        from sentence_transformers import SentenceTransformer
        self.model = SentenceTransformer(model_name)
        self.w_lyrics = w_lyrics
        self.w_meta = w_meta
        self.tail_dim = tail_dim
    def encode(self, text: str):
        # we used the SAME embedding for lyrics+meta and then weighted SUM
        E = self.model.encode([text], normalize_embeddings=True).astype("float32")  # (1, text_dim)
        q_text = (self.w_lyrics + self.w_meta) * E                                  # (1, text_dim)
        tail = np.zeros((1, self.tail_dim), dtype="float32")                        # (1, tail_dim)
        q = np.hstack([q_text, tail])                                               # (1, hybrid_dim)
        return normalize(q).astype("float32")

enc = HybridQueryEncoder(model_name, w_lyrics=0.6, w_meta=0.2, tail_dim=tail_dim)

# 3) overwrite the old encoder file
joblib.dump(enc, "app/artifacts/query_encoder.pkl");

print("Rebuilt encoder")
print("hybrid_dim:", hybrid_dim, "| text_dim:", text_dim, "| tail_dim:", tail_dim)

Rebuilt encoder
hybrid_dim: 402 | text_dim: 384 | tail_dim: 18


In [None]:
from pathlib import Path
import shutil

src_files = {
    "data/processed/items_50k.pkl": "app/artifacts/items_50k.pkl",
    "data/processed/faiss_ip_50k.index": "app/artifacts/faiss_ip_50k.index",
    # encoder is already in the right place — copy only if it’s elsewhere
}

Path("app/artifacts").mkdir(parents=True, exist_ok=True)

for src, dst in src_files.items():
    src, dst = Path(src), Path(dst)
    if not dst.exists():
        shutil.copy(src, dst)
        print(f" Copied {src.name} → {dst.parent}")
    else:
        print(f" Skipped (already exists): {dst.name}")

print("\n All artifacts are now in app/artifacts/")
