In [1]:
import os

# Limit BLAS thread pools BEFORE importing numpy / implicit
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"
os.environ["OMP_NUM_THREADS"] = "1"

In [2]:
import numpy as np
import pandas as pd

from pathlib import Path
from scipy.sparse import csr_matrix

from implicit.als import AlternatingLeastSquares

from sklearn.model_selection import train_test_split

PROJECT_ROOT = Path("..").resolve()
DATA_PROCESSED = PROJECT_ROOT / "data" / "processed"

INTERACTIONS_PATH = DATA_PROCESSED / "interactions.parquet"
TRACK_META_PATH   = DATA_PROCESSED / "track_metadata.csv"

print("PROJECT_ROOT:", PROJECT_ROOT)
print("INTERACTIONS_PATH:", INTERACTIONS_PATH)
print("TRACK_META_PATH:", TRACK_META_PATH)



  from .autonotebook import tqdm as notebook_tqdm


PROJECT_ROOT: C:\Users\Kiera\Music_Recommender
INTERACTIONS_PATH: C:\Users\Kiera\Music_Recommender\data\processed\interactions.parquet
TRACK_META_PATH: C:\Users\Kiera\Music_Recommender\data\processed\track_metadata.csv


In [3]:
interactions = pd.read_parquet(INTERACTIONS_PATH)

print("Raw interactions shape:", interactions.shape)
display(interactions.head())

# Optional: sample playlists to keep ALS light
# (adjust N_PLAYLISTS_UPPER if you want more)
N_PLAYLISTS_UPPER = 50000

unique_pids = interactions["pid"].unique()
print("Total unique playlists:", len(unique_pids))

if len(unique_pids) > N_PLAYLISTS_UPPER:
    sampled_pids = np.random.choice(unique_pids, size=N_PLAYLISTS_UPPER, replace=False)
    interactions = interactions[interactions["pid"].isin(sampled_pids)].copy()
    print("Subsampled interactions shape:", interactions.shape)
else:
    print("No subsampling applied.")


Raw interactions shape: (6685101, 4)


Unnamed: 0,pid,track_id,pos,duration_ms
0,0,0UaMYEvWZi0ZqiDOoHU3YI,0,226863
1,0,6I9VzXrHxO9rA9A5euc8Ak,1,198800
2,0,0WqIKmW4BTrj3eJFmnCKMv,2,235933
3,0,1AWQoqb9bSvzTjaLralEkT,3,267266
4,0,1lzr43nnXAijIGYnCT8M8H,4,227600


Total unique playlists: 100000
Subsampled interactions shape: (3351650, 4)


In [4]:
train_pids, test_pids = train_test_split(
    interactions["pid"].unique(),
    test_size=0.2,
    random_state=42,
)

train_df = interactions[interactions["pid"].isin(train_pids)].copy()
test_df  = interactions[interactions["pid"].isin(test_pids)].copy()

print("Train interactions:", train_df.shape)
print("Test interactions:", test_df.shape)


Train interactions: (2680519, 4)
Test interactions: (671131, 4)


In [5]:
# Use only tracks that appear in TRAIN (this keeps the item set tight)
unique_train_tids = np.sort(train_df["track_id"].unique())
unique_train_pids = np.sort(train_df["pid"].unique())

tid_to_idx = {tid: i for i, tid in enumerate(unique_train_tids)}
idx_to_tid = np.array(unique_train_tids)

pid_to_idx = {pid: i for i, pid in enumerate(unique_train_pids)}
idx_to_pid = np.array(unique_train_pids)

print("Num playlists (train):", len(unique_train_pids))
print("Num tracks (train):   ", len(unique_train_tids))

# rows = playlists, cols = tracks
rows = train_df["pid"].map(pid_to_idx).values
cols = train_df["track_id"].map(tid_to_idx).values
data = np.ones_like(rows, dtype=np.float32)

user_item = csr_matrix(
    (data, (rows, cols)),
    shape=(len(unique_train_pids), len(unique_train_tids)),
)

print("user_item shape:", user_item.shape)
print("user_item nnz:", user_item.nnz)


Num playlists (train): 40000
Num tracks (train):    405224
user_item shape: (40000, 405224)
user_item nnz: 2644343


In [6]:
# implicit expects item-user matrix for training
item_user = user_item.T.tocsr()

factors = 64
regularization = 0.01
iterations = 10  # keep this modest to avoid long runs

als_model = AlternatingLeastSquares(
    factors=factors,
    regularization=regularization,
    iterations=iterations,
    random_state=42,
    use_gpu=False,      # make sure we stay on CPU
    num_threads=4,      # adjust if you want
)

print("Fitting ALS...")
als_model.fit(item_user)
print("Done.")


Fitting ALS...


100%|██████████| 10/10 [00:09<00:00,  1.04it/s]

Done.





In [None]:
print("user_item shape:", user_item.shape)
print("als_model.user_factors:", als_model.user_factors.shape)
print("als_model.item_factors:", als_model.item_factors.shape)
print("len(idx_to_tid):", len(idx_to_tid))

In [None]:
track_meta = pd.read_csv(TRACK_META_PATH)

print("track_metadata.csv shape:", track_meta.shape)
display(track_meta.head())

# Metadata helper stays as you had it
meta_simple = track_meta[["track_id", "track_name", "artist_name", "album_name"]].drop_duplicates()
meta_simple = meta_simple.set_index("track_id")

def describe_tracks(track_ids, top_n=10):
    """Return small dataframe with names for a list/array of track_ids."""
    df = meta_simple.loc[meta_simple.index.intersection(track_ids)].copy()
    df = df.reset_index().rename(columns={"track_id": "track_id"})
    if top_n is not None:
        df = df.head(top_n)
    return df


track_metadata.csv shape: (679889, 7)


Unnamed: 0,track_id,track_name,artist_name,album_name,artist_uri,album_uri,duration_ms
0,0000uJA4xCdxThagdLkkLR,Heart As Cold As Stone,Cherryholmes,Cherryholmes,spotify:artist:5kEVfWQGTw0rIDO2Jqq1ww,spotify:album:3SPMBGMEvPw21lmT5b1ApW,161186
1,00039MgrmLoIzSpuYKurn9,Thas What I Do,Zach Farlow,The Great Escape 2,spotify:artist:2jTojc4rAsOMx6200a8Ah1,spotify:album:0UHfgx3ITlxePDXLaN5Y6x,222727
2,0005w1bMJ7QAMl6DY98oxa,"Sonata in G Major, BuxWV 271: Allegro -",Dietrich Buxtehude,"Chamber Music: London Baroque - Rosenmuller, J...",spotify:artist:2YfFYZnshSzgfLsKZMM4VL,spotify:album:6oRWciCAwKegipcCc5FlWe,111573
3,0007AYhg2UQbEm88mxu7js,Mandarin Oranges Part 2,Little Simz,E.D.G.E,spotify:artist:6eXZu6O7nAUA5z6vLV8NKI,spotify:album:32RJzqlapfiU0fr2l4SSW9,198000
4,000CTwOSsvRs0bgXlwB64e,Shady World,Celly Cel,The Lost Tapes,spotify:artist:6k7TX5dxiKZiwMWTn8ZKbl,spotify:album:6LOmO9x8uPwlc3gSXZqqi8,155238


In [None]:
def recommend_for_playlist_manual(
    pid: int,
    model,
    user_item_csr,
    pid_to_idx: dict,
    idx_to_tid: np.ndarray,
    N: int = 10,
    filter_seen: bool = True,
):
    """
    Recommend tracks for a single playlist using ALS factors directly
    (no call to model.recommend).

    Parameters
    ----------
    pid : int
        Original playlist ID from MPD.
    model : implicit.als.AlternatingLeastSquares
        Trained ALS model.
    user_item_csr : csr_matrix (n_playlists × n_tracks)
        Playlist–track interaction matrix used to train the model
        (before transpose).
    pid_to_idx : dict
        Mapping from playlist ID -> row index in user_item_csr.
    idx_to_tid : np.ndarray
        Array mapping track index -> track_id string (len = n_tracks).
    N : int
        Number of recommendations.
    filter_seen : bool
        Whether to remove tracks already in the playlist.
    """

    if pid not in pid_to_idx:
        raise ValueError(f"Playlist {pid} not in pid_to_idx mapping")

    u_idx = pid_to_idx[pid]

    # Sanity checks
    n_users, n_items = user_item_csr.shape
    uf_users, factors_u = model.user_factors.shape
    if_users = model.item_factors.shape[0]
    assert uf_users == n_users, f"user_factors mismatch: {uf_users} vs {n_users}"
    assert if_users == n_items, f"item_factors mismatch: {if_users} vs {n_items}"
    assert len(idx_to_tid) == n_items, f"idx_to_tid length mismatch: {len(idx_to_tid)} vs {n_items}"

    # 1) Get the latent vector for this playlist (user)
    user_vec = model.user_factors[u_idx]  # shape (factors,)

    # 2) Compute scores for all items = item_factors · user_vec
    #    This is a dense (n_items,) vector of preference scores
    scores = model.item_factors @ user_vec  # (n_items,)

    # 3) Optionally filter out tracks already in the playlist
    if filter_seen:
        u_row = user_item_csr[u_idx]       # 1×n_items sparse
        seen_item_indices = u_row.indices  # columns where this playlist has interactions
        scores[seen_item_indices] = -np.inf

    # 4) Take top-N items by score
    # argpartition for efficiency, then full sort of that small subset
    if N >= len(scores):
        topN_idx = np.argsort(-scores)
    else:
        topN_idx = np.argpartition(-scores, N)[:N]
        topN_idx = topN_idx[np.argsort(-scores[topN_idx])]

    top_scores = scores[topN_idx]
    rec_track_ids = idx_to_tid[topN_idx]

    # 5) Attach metadata
    rec_df = describe_tracks(rec_track_ids, top_n=None).copy()
    # In case some track_ids have no metadata (rare), align scores carefully
    score_map = dict(zip(rec_track_ids, top_scores))
    rec_df["score"] = rec_df["track_id"].map(score_map)
    return rec_df

In [9]:
RANDOM_STATE = 42

some_pid = int(train_df["pid"].sample(1, random_state=RANDOM_STATE).iloc[0])
print("Example playlist ID:", some_pid)

true_tracks = train_df[train_df["pid"] == some_pid]["track_id"].unique()
print(f"Playlist has {len(true_tracks)} tracks in train set.\n")

print("Some existing tracks in this playlist:")
display(describe_tracks(true_tracks[:10]))

print("\nALS recommendations:")
recs = recommend_for_playlist(
    some_pid,
    als_model,
    user_item,     # full user×item matrix
    pid_to_idx,
    idx_to_tid,
    N=15,
)
display(recs)


Example playlist ID: 150229
Playlist has 79 tracks in train set.

Some existing tracks in this playlist:


Unnamed: 0,track_id,track_name,artist_name,album_name
0,05KOgYg8PGeJyyWBPi5ja8,Free Smoke,Drake,More Life
1,0NdxbFFknA7kQ4E2zvJfey,Let's Get Blown,Snoop Dogg,R&G (Rhythm & Gangsta): The Masterpiece
2,0jx8zY5JQsS4YEQcfkoc5C,Angels (feat. Saba),Chance The Rapper,Coloring Book
3,1wYZZtamWTQAoj8B812uKQ,Here,Alessia Cara,Know-It-All
4,2Gyc6e2cLxA5hoX1NOvYnU,Say It,Tory Lanez,I Told You
5,3N1p1YDidgHABxyKfG5P6z,Thirsty,PARTYNEXTDOOR,PARTYNEXTDOOR TWO
6,4SYUUlkScpNR1QvPscXf8t,Too Young,Post Malone,Stoney
7,4XQQovRSltOGWevTCgacXY,Caretaker (feat. SZA),DRAM,Gahdamn!
8,5VPFATm85G3P04Q5g8yxqr,Nelly,Isaiah Rashad,Nelly
9,7Mb7jOst43wPQELas93ARE,Optimistic,Sounds Of Blackness,Evolution Of Gospel



ALS recommendations:
[DEBUG] Recommending for pid=150229, internal user idx=24515
[DEBUG] Row nnz (tracks in playlist): 79, row shape: (1, 405224)


IndexError: index 47929 is out of bounds for axis 1 with size 40000