In [None]:
# 📦 1. Install & Import Libraries
!pip install pandas scikit-learn scipy tqdm

import ast
import io

import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm





In [None]:
# 📂 2. Upload Flattened Features
from google.colab import files
uploaded = files.upload()

# Default filename; only read from disk when nothing was uploaded
# (e.g. the upload dialog was cancelled and the file is already present).
features_path = 'final_merged_with_extras.csv'

if uploaded:
    # Colab renames the on-disk copy when a file with the same name already
    # exists ("name.csv" -> "name (1).csv"), so reading a hard-coded path can
    # silently load an older copy. Parse the uploaded bytes directly instead.
    features_path, uploaded_bytes = next(iter(uploaded.items()))
    df = pd.read_csv(io.BytesIO(uploaded_bytes))
else:
    df = pd.read_csv(features_path)


Saving final_merged_with_extras.csv to final_merged_with_extras (1).csv


In [None]:
# 🚹 3. Preprocess Features
# Keep every column whose name starts with one of the audio-feature prefixes.
feature_cols = list(filter(
    lambda name: name.startswith(
        ('mfcc_', 'chroma_', 'spec_contrast_', 'tempo', 'centroid', 'rms', 'zcr')
    ),
    df.columns,
))

def safe_parse_tempo(val):
    """Convert one raw tempo cell into a float, or NaN when it cannot be parsed.

    Accepted inputs:
      * a string holding a Python literal, e.g. "[120.5]", "(120.5,)" or "120.5"
      * a list, tuple or NumPy array (the first element is used)
      * a plain or NumPy numeric scalar

    Parameters
    ----------
    val : object
        Raw cell value from the ``tempo`` column.

    Returns
    -------
    float
        The parsed tempo, or ``np.nan`` for empty, malformed or unsupported input.
    """
    if isinstance(val, str):
        try:
            # literal_eval only accepts literals, so text coming from the CSV
            # can never execute code (unlike eval).
            val = ast.literal_eval(val.strip())
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return np.nan

    if isinstance(val, np.ndarray):
        # Flatten so 0-d and nested arrays are handled like a flat sequence.
        val = val.ravel()

    if isinstance(val, (list, tuple, np.ndarray)):
        if len(val) == 0:
            return np.nan
        val = val[0]

    if isinstance(val, (int, float, np.integer, np.floating)):
        return float(val)
    return np.nan

# Normalise the tempo column to plain floats first (only when it is present).
if 'tempo' in df.columns:
    df['tempo'] = df['tempo'].map(safe_parse_tempo)

# Coerce every feature column to numeric; unparseable cells become NaN and are
# then replaced by 0 before scaling.
X = (
    df[feature_cols]
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
)

# Fit the scaler on the full feature matrix, then transform that same matrix.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# 🌟 4. Train Nearest Neighbors Model (initial: cosine)
# Cosine-distance index: 15 neighbours are returned per query by default.
knn_cosine = NearestNeighbors(metric='cosine', n_neighbors=15).fit(X_scaled)

# ✨ 4b. Euclidean-distance alternative: 11 neighbours per query by default.
knn_euclidean = NearestNeighbors(metric='euclidean', n_neighbors=11)
knn_euclidean.fit(X_scaled)

In [None]:
# 🔍 5. Basic Content-Based KNN Recommendation
# Toggle between cosine and euclidean with the 'metric' argument

def recommend_by_knn(song_id, top_k=3, metric='cosine', weighted=True):
    """Recommend the tracks whose feature vectors are closest to one song.

    Parameters
    ----------
    song_id : scalar
        Value looked up in ``df['track_id']``; the first matching row is used.
    top_k : int
        Maximum number of recommendations to return. Values <= 0 give an
        empty result.
    metric : str
        ``'euclidean'`` selects ``knn_euclidean``; any other value selects
        ``knn_cosine``.
    weighted : bool
        When true, neighbours are re-sorted by inverse distance. That order is
        the same as ascending distance, which is already the order
        ``kneighbors`` returns, so this flag does not change the ranking.

    Returns
    -------
    pandas.DataFrame or list
        Rows of ``df`` with columns ``track_id``, ``title``, ``artist_name``
        and ``genre_top``, nearest first. An empty list is returned (after
        printing a message) when ``song_id`` is not in the dataset.
    """
    model = knn_euclidean if metric == 'euclidean' else knn_cosine

    # Work with positional row numbers, not index labels: X_scaled rows and
    # the indices returned by kneighbors() are both positional, so labels
    # would be wrong whenever df does not have a default 0..n-1 index.
    matches = np.flatnonzero((df['track_id'] == song_id).to_numpy())
    if matches.size == 0:
        print(f"❌ track_id {song_id} not found")
        return []

    query_index = int(matches[0])
    query_vector = X_scaled[query_index].reshape(1, -1)

    # Ask for at least top_k + 1 neighbours (the +1 covers the query itself)
    # so a large top_k is not silently capped by the model's default, while
    # never asking for more rows than were fitted.
    top_k = max(int(top_k), 0)
    n_query = min(max(model.n_neighbors, top_k + 1), len(X_scaled))
    distances, indices = model.kneighbors(query_vector, n_neighbors=n_query)

    all_neighbors = [(i, d) for i, d in zip(indices[0], distances[0]) if i != query_index]
    if weighted:
        # Rank by weighted similarity (inverse distance); the small epsilon
        # avoids division by zero for exact duplicates.
        sorted_neighbors = sorted(all_neighbors, key=lambda x: 1 / (x[1] + 1e-6), reverse=True)
    else:
        sorted_neighbors = all_neighbors

    top_indices = [i for i, _ in sorted_neighbors[:top_k]]
    return df.iloc[top_indices][['track_id', 'title', 'artist_name', 'genre_top']]

In [None]:
# 🧠 6. Playlist-Aware Recommender
def recommend_within_playlist(song_id, playlist_df, top_k=3, metric='cosine', weighted=True):
    """Recommend tracks closest to the centroid of a playlist's feature vectors.

    The playlist tracks found in ``df`` are averaged in scaled feature space
    and every dataset track is ranked by its distance to that centroid.
    ``song_id`` is not used for ranking; it is only excluded from the result,
    together with all tracks already in the playlist.

    Parameters
    ----------
    song_id : scalar
        Track to exclude from the recommendations (and the seed for the
        fallback described below).
    playlist_df : pandas.DataFrame
        Must contain a ``track_id`` column.
    top_k : int
        Maximum number of recommendations. Values <= 0 give an empty frame.
    metric : str
        Distance name passed to ``scipy.spatial.distance.cdist``.
    weighted : bool
        Only forwarded to ``recommend_by_knn`` on the fallback path; it has no
        effect on the centroid ranking.

    Returns
    -------
    pandas.DataFrame
        Columns ``track_id``, ``title``, ``artist_name``, ``genre_top``,
        nearest first. When no playlist track exists in ``df`` the result of
        ``recommend_by_knn`` is returned instead.
    """
    track_ids = df['track_id']

    # Boolean, positional mask of dataset rows that belong to the playlist.
    # Positions (not index labels) are required because X_scaled is a plain
    # array aligned with df by row order.
    in_playlist = track_ids.isin(playlist_df['track_id']).to_numpy()

    if not in_playlist.any():
        print("⚠️ No matching playlist songs found in dataset")
        return recommend_by_knn(song_id, top_k, metric=metric, weighted=weighted)

    centroid = np.mean(X_scaled[in_playlist], axis=0).reshape(1, -1)

    all_dists = cdist(X_scaled, centroid, metric=metric).flatten()
    ranked = np.argsort(all_dists)

    # Drop the seed song and everything already in the playlist in one
    # vectorised step instead of a per-row Python loop with list membership.
    excluded = in_playlist | (track_ids == song_id).to_numpy()
    candidates = ranked[~excluded[ranked]]
    chosen = candidates[:max(int(top_k), 0)]

    return df.iloc[chosen][['track_id', 'title', 'artist_name', 'genre_top']]

In [None]:
# 🔄 7. Unified Entry Point
def recommend_songs(song_id, playlist_df=None, top_k=3, metric='cosine', weighted=True):
    """Single entry point that picks the recommender based on the inputs.

    Parameters
    ----------
    song_id : scalar
        Seed track id.
    playlist_df : pandas.DataFrame or None
        When ``None`` or empty, plain content-based KNN is used; otherwise the
        playlist-aware recommender is used.
    top_k : int
        Number of recommendations requested.
    metric : str
        Distance metric name forwarded to the chosen recommender.
    weighted : bool
        Forwarded to the chosen recommender.

    Returns
    -------
    Whatever the selected recommender returns.
    """
    if playlist_df is None or playlist_df.empty:
        print("📢 No playlist provided. Using content-based recommendation.")
        return recommend_by_knn(song_id, top_k, metric=metric, weighted=weighted)

    print("🎷 Playlist detected. Using playlist-aware recommendation.")
    # Forward `weighted` here as well: the playlist recommender falls back to
    # KNN when no playlist track is in the dataset, and would otherwise ignore
    # the caller's choice.
    return recommend_within_playlist(song_id, playlist_df, top_k, metric=metric, weighted=weighted)


In [None]:
# 📊 8. Evaluation (Optional: Genre Match Accuracy)
def evaluate_accuracy_at_k(k=3, metric='cosine', weighted=False):
    """Measure how often a track's genre appears among its top-k recommendations.

    Every row of ``df`` is used as a query for ``recommend_by_knn``; a query
    counts as a hit when its own ``genre_top`` occurs in the ``genre_top`` of
    at least one returned track.

    Parameters
    ----------
    k : int
        Number of recommendations requested per query.
    metric : str
        Forwarded to ``recommend_by_knn``.
    weighted : bool
        Forwarded to ``recommend_by_knn``.

    Returns
    -------
    float
        hits / number of rows, or NaN when ``df`` has no rows.
    """
    total = len(df)
    if total == 0:
        # Nothing to evaluate; avoid dividing by zero below.
        print("⚠️ Dataset is empty; accuracy is undefined")
        return float('nan')

    hits = 0
    # Iterate the two columns directly rather than doing two df.iloc[idx]
    # row lookups per query.
    queries = zip(df['track_id'], df['genre_top'])
    for song_id, true_genre in tqdm(queries, total=total):
        recs = recommend_by_knn(song_id, top_k=k, metric=metric, weighted=weighted)
        if true_genre in recs['genre_top'].tolist():
            hits += 1

    acc = hits / total
    print(f"\n✅ Top-{k} genre match accuracy using {metric}: {acc:.3f}")
    return acc


In [None]:
# 🚀 Example usage:
# Use a track id that is guaranteed to exist: a hard-coded id (previously 4)
# is not present in every feature file and then yields no recommendations.
example_song_id = df['track_id'].iloc[0]

# Print explicitly; only the last expression of a cell is displayed automatically.
print(recommend_songs(song_id=example_song_id, top_k=3, metric='cosine'))

evaluate_accuracy_at_k(k=5, metric='cosine')


📢 No playlist provided. Using content-based recommendation.
❌ track_id 4 not found


100%|██████████| 4995/4995 [00:24<00:00, 202.99it/s]


✅ Top-5 genre match accuracy using cosine: 0.820





0.8196196196196196

In [None]:
# 🔁 KNN Recommender Benchmark Runner

# Each entry describes one evaluation run; "top_k" is optional and falls back to 3.
configs = [
    dict(metric="cosine", weighted=False, label="Cosine @3"),
    dict(metric="cosine", weighted=True, label="Cosine + Weighted @3"),
    dict(metric="euclidean", weighted=False, label="Euclidean @3"),
    dict(metric="euclidean", weighted=True, label="Euclidean + Weighted @3"),
    dict(metric="cosine", weighted=True, label="Cosine + Weighted @5", top_k=5),
]

# Collected as (label, accuracy) pairs, in run order.
results = []

for config in configs:
    print(f"running: {config['label']}")
    k = config.get("top_k", 3)
    acc = evaluate_accuracy_at_k(
        k=k,
        metric=config["metric"],
        weighted=config["weighted"],
    )
    results.append((config["label"], acc))

# 📊 Print Summary
print("📈 Accuracy Summary:")
for label, acc in results:
    print(f"{label:<30}: {acc:.3f}")


running: Cosine @3


100%|██████████| 4995/4995 [00:22<00:00, 219.35it/s]



✅ Top-3 genre match accuracy using cosine: 0.742
running: Cosine + Weighted @3


100%|██████████| 4995/4995 [00:23<00:00, 214.63it/s]



✅ Top-3 genre match accuracy using cosine: 0.742
running: Euclidean @3


100%|██████████| 4995/4995 [00:13<00:00, 380.02it/s]



✅ Top-3 genre match accuracy using euclidean: 0.733
running: Euclidean + Weighted @3


100%|██████████| 4995/4995 [00:13<00:00, 375.62it/s]



✅ Top-3 genre match accuracy using euclidean: 0.733
running: Cosine + Weighted @5


100%|██████████| 4995/4995 [00:23<00:00, 213.00it/s]


✅ Top-5 genre match accuracy using cosine: 0.820
📈 Accuracy Summary:
Cosine @3                     : 0.742
Cosine + Weighted @3          : 0.742
Euclidean @3                  : 0.733
Euclidean + Weighted @3       : 0.733
Cosine + Weighted @5          : 0.820





In [None]:
# 📝 Note for README tracking:
# - Content-based KNN recommender implemented
# - Playlist-aware fallback added
# - Accuracy@3 via genre match as proxy for relevance
# - R^2 not applicable here since task is not regression
# - Tempo parsing bug fixed with `safe_parse_tempo()`
# - Evaluation baseline (cosine distance): 0.735 accuracy
# - Switched to Euclidean distance option (toggle with `metric` arg)
# - Added 3 new audio features: centroid, RMS, zero-crossing rate (ZCR)
# - Updated feature_cols to include the new features
# - Using `final_merged_with_extras.csv` for updated feature set
# - Latest cosine evaluation with extra features (15 neighbors): 0.742 accuracy (↑ from 0.735)
# - Top-5 genre match accuracy using cosine (unweighted): 0.820
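# - Added optional weighted KNN recommendation support via inverse distance
#   (ranking by 1/distance is the same order as ranking by ascending distance,
#   so weighted and unweighted runs score identically: 0.742 cosine, 0.733 euclidean)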
# - Example usage now defaults to cosine with weighted=True

