In [1]:
# Item-Based Collaborative Filtering Music Recommendation System

#This notebook implements an item-based collaborative filtering system for music recommendations. The approach focuses on finding similarities between items (songs) based on user interaction patterns, and then recommends songs similar to those a user has already interacted with.

## Overview of the Approach
# - **Data**: Music metadata and user listening behavior
# - **Method**: Item-based collaborative filtering
# - **Focus**: User experience metrics including diversity, novelty
# - **Evaluation**: Ranking-sensitive metrics (NDCG@k)


## Step 1: Data Loading

First, we'll load the datasets and examine their structure to understand what we're working with.


In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from collections import defaultdict, Counter
import warnings
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
np.random.seed(42)

print("Libraries imported successfully!")


Libraries imported successfully!


In [3]:
# Step 1: Load Data (Load CSVs)
print("=== STEP 1: LOAD DATA ===")
print("Loading datasets...")

# Load music metadata
music_df = pd.read_csv('music_list.csv')
print(f"Music dataset shape: {music_df.shape}")

# Load user behavior data  
behavior_df = pd.read_csv('user_behavior_list.csv')
print(f"User behavior dataset shape: {behavior_df.shape}")

print("\nDatasets loaded successfully!")


=== STEP 1: LOAD DATA ===
Loading datasets...
Music dataset shape: (50683, 21)
User behavior dataset shape: (9711301, 3)

Datasets loaded successfully!


In [4]:
# Examine the structure of the datasets
print("=== MUSIC DATASET STRUCTURE ===")
print("Columns:", music_df.columns.tolist())
print("\nFirst few rows:")
print(music_df.head())
print(f"\nDataset info:")
print(f"- Number of tracks: {len(music_df)}")
print(f"- Number of unique artists: {music_df['artist'].nunique()}")
print(f"- Missing values per column:")
print(music_df.isnull().sum())


=== MUSIC DATASET STRUCTURE ===
Columns: ['track_id', 'name', 'artist', 'spotify_preview_url', 'spotify_id', 'tags', 'genre', 'year', 'duration_ms', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature']

First few rows:
             track_id             name           artist  \
0  TRIOREW128F424EAF0   Mr. Brightside      The Killers   
1  TRRIVDJ128F429B0E8       Wonderwall            Oasis   
2  TROUVHL128F426C441  Come as You Are          Nirvana   
3  TRUEIND128F93038C4      Take Me Out  Franz Ferdinand   
4  TRLNZBD128F935E4D8            Creep        Radiohead   

                                 spotify_preview_url              spotify_id  \
0  https://p.scdn.co/mp3-preview/4d26180e6961fd46...  09ZQ5TmUG8TSL56n0knqrj   
1  https://p.scdn.co/mp3-preview/d012e536916c927b...  06UfBBDISthj1ZJAtX4xjj   
2  https://p.scdn.co/mp3-preview/a1c11bb1cb231031...  0keNu0t0tqsWtExGM3nT1D   
3  ht

In [5]:
print("=== USER BEHAVIOR DATASET STRUCTURE ===")
print("Columns:", behavior_df.columns.tolist())
print("\nFirst few rows:")
print(behavior_df.head())
print(f"\nDataset info:")
print(f"- Number of interactions: {len(behavior_df)}")
print(f"- Number of unique users: {behavior_df['user_id'].nunique()}")
print(f"- Number of unique tracks: {behavior_df['track_id'].nunique()}")
print(f"- Playcount statistics:")
print(behavior_df['playcount'].describe())
print(f"- Missing values per column:")
print(behavior_df.isnull().sum())


=== USER BEHAVIOR DATASET STRUCTURE ===
Columns: ['track_id', 'user_id', 'playcount']

First few rows:
             track_id                                   user_id  playcount
0  TRIRLYL128F42539D1  b80344d063b5ccb3212f76538f3d9e43d87dca9e          1
1  TRFUPBA128F934F7E1  b80344d063b5ccb3212f76538f3d9e43d87dca9e          1
2  TRLQPQJ128F42AA94F  b80344d063b5ccb3212f76538f3d9e43d87dca9e          1
3  TRTUCUY128F92E1D24  b80344d063b5ccb3212f76538f3d9e43d87dca9e          1
4  TRHDDQG12903CB53EE  b80344d063b5ccb3212f76538f3d9e43d87dca9e          1

Dataset info:
- Number of interactions: 9711301
- Number of unique users: 962037
- Number of unique tracks: 30459
- Playcount statistics:
count    9.711301e+06
mean     2.630946e+00
std      5.706324e+00
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      2.000000e+00
max      2.948000e+03
Name: playcount, dtype: float64
- Missing values per column:
track_id     0
user_id      0
playcount    0
dtype: int64


## Step 2: Data Preprocessing

We'll clean the data and perform exploratory analysis to understand user-item interaction patterns.


In [6]:
# Step 2: Group by each user (Group User Behavior)
print("\n=== STEP 2: GROUP BY EACH USER ===")
print("Grouping songs of each user...")

# Group by user_id and create user-song lists with track_id and playcount pairs
user_song_list = (
    behavior_df
    .groupby('user_id', observed=True)[['track_id', 'playcount']]
    .apply(lambda x: list(zip(x['track_id'], x['playcount'])))
    .to_dict()
)

print(f"Number of users with listening history: {len(user_song_list)}")
print(f"Sample user-song list (first 2 users):")
for i, (user_id, songs) in enumerate(list(user_song_list.items())[:2]):
    print(f"  User {i+1}: {user_id}")
    print(f"    Songs: {len(songs)} tracks")
    print(f"    Sample: {songs[:3]}...")
    print()



=== STEP 2: GROUP BY EACH USER ===
Grouping songs of each user...
Number of users with listening history: 962037
Sample user-song list (first 2 users):
  User 1: 00000b722001882066dff9d2da8a775658053ea0
    Songs: 1 tracks
    Sample: [('TRQEBOU128F425D087', 1)]...

  User 2: 00001638d6189236866af9bbf309ae6c2347ffdc
    Songs: 1 tracks
    Sample: [('TRBCDMC128F1452976', 1)]...



In [7]:
# Step 3: Filter listening history < 50 (Filter Listening History)
print("\n=== STEP 3: FILTER LISTENING HISTORY < 50 ===")
print("Removing users with less than 50 listening records...")

# Filter users with at least 50 songs in their listening history
user_song_list = {user: songs for user, songs in user_song_list.items() if len(songs) >= 50}

print(f"Number of users after filtering (>= 50 songs): {len(user_song_list)}")

# Update behavior_df to include only filtered users
behavior_df = behavior_df[behavior_df['user_id'].isin(user_song_list.keys())]
print(f"Updated behavior_df shape: {behavior_df.shape}")

# Display sample of filtered data
print(f"\nSample of filtered user-song list (first 2 users):")
for i, (user_id, songs) in enumerate(list(user_song_list.items())[:2]):
    print(f"  User {i+1}: {user_id}")
    print(f"    Songs: {len(songs)} tracks")
    print(f"    Sample: {songs[:3]}...")
    print()



=== STEP 3: FILTER LISTENING HISTORY < 50 ===
Removing users with less than 50 listening records...
Number of users after filtering (>= 50 songs): 23795
Updated behavior_df shape: (1808122, 3)

Sample of filtered user-song list (first 2 users):
  User 1: 0003a64f7a69e5b87a80b09c3772227185c235c7
    Songs: 64 tracks
    Sample: [('TRRCWXH128F42B917C', 1), ('TRMHZLT12903CEA646', 1), ('TROVIQH128F42B91A1', 1)]...

  User 2: 00043d7bc800ceff4a90459e189eba5d442a1d3d
    Songs: 66 tracks
    Sample: [('TRLBHAD128F93519FC', 1), ('TRKEKLH128F428ABD4', 1), ('TRNNGMK128F423F034', 2)]...



In [8]:
# Step 4: Sample users (n = 5000) (User Sampling)
print("\n=== STEP 4: SAMPLE USERS (n = 5000) ===")
print("Sampling 5000 users randomly with seed=42...")

# Get list of active users (users with >= 50 songs)
active_users = list(user_song_list.keys())
print(f"Total active users available: {len(active_users)}")

# Sample 5000 users randomly with seed=42
rng = np.random.default_rng(seed=42)
sampled_users = rng.choice(active_users, size=5000, replace=False)

print(f"Sampled users: {len(sampled_users)}")

# Update user_song_list to include only sampled users
user_song_list = {user: songs for user, songs in user_song_list.items() if user in sampled_users}

# Filter the behavior_df DataFrame to include only sampled users
behavior_df = behavior_df[behavior_df['user_id'].isin(sampled_users)]

print(f"Updated behavior_df shape: {behavior_df.shape}")
print(f"Updated user_song_list size: {len(user_song_list)}")

# Display sample of sampled data
print(f"\nSample of sampled user-song list (first 2 users):")
for i, (user_id, songs) in enumerate(list(user_song_list.items())[:2]):
    print(f"  User {i+1}: {user_id}")
    print(f"    Songs: {len(songs)} tracks")
    print(f"    Sample: {songs[:3]}...")
    print()



=== STEP 4: SAMPLE USERS (n = 5000) ===
Sampling 5000 users randomly with seed=42...
Total active users available: 23795
Sampled users: 5000
Updated behavior_df shape: (377381, 3)
Updated user_song_list size: 5000

Sample of sampled user-song list (first 2 users):
  User 1: 0030f00cd1d9ccbff086e4ee6541a599484df3b0
    Songs: 59 tracks
    Sample: [('TRLBHAD128F93519FC', 1), ('TRMFANB128F9356836', 1), ('TRWZJEM128F93501BF', 1)]...

  User 2: 00441d21d173bb83e7eae898313e377655ba91b6
    Songs: 59 tracks
    Sample: [('TRUWDZO128F9339024', 1), ('TRNYCAH12903CB19F2', 4), ('TREAQSX128E07818CA', 1)]...



## Step 3: Feature Engineering / User-Item Matrix Creation

Create the user-item interaction matrix that will serve as the foundation for calculating item-to-item similarities.


In [9]:
# Configuration and utility functions
class Config:
    """Configuration parameters for the recommendation system"""
    MEMORY_THRESHOLD_GB = 4.0
    TOP_K_NEIGHBORS = 10
    BATCH_SIZE = 4096
    DIVERSITY_WEIGHT = 0.1
    NOVELTY_WEIGHT = 0.1

def calculate_memory_usage(matrix_shape):
    """Calculate memory usage in GB for a matrix"""
    return (matrix_shape[0] * matrix_shape[1] * 8) / (1024**3)

def create_user_item_matrix(behavior_df, user_to_idx, item_to_idx, idx_to_user, idx_to_item):
    """Create user-item matrix with memory optimization"""
    try:
        print("Building sparse user-item matrix...")
        
        # Map user_id and track_id to indices
        behavior_df['user_idx'] = behavior_df['user_id'].map(user_to_idx)
        behavior_df['item_idx'] = behavior_df['track_id'].map(item_to_idx)
        
        # Check for unmapped entries
        unmapped_users = behavior_df['user_idx'].isna().sum()
        unmapped_items = behavior_df['item_idx'].isna().sum()
        if unmapped_users > 0 or unmapped_items > 0:
            print(f"Warning: {unmapped_users} users and {unmapped_items} items could not be mapped")
            behavior_df = behavior_df.dropna(subset=['user_idx', 'item_idx'])
        
        # Create sparse matrix using scipy
        from scipy.sparse import csr_matrix
        
        # Extract coordinates and values
        rows = behavior_df['user_idx'].astype(int).values
        cols = behavior_df['item_idx'].astype(int).values  
        data = behavior_df['playcount'].values
        
        # Create sparse matrix
        user_item_sparse = csr_matrix((data, (rows, cols)), 
                                     shape=(len(user_to_idx), len(item_to_idx)))
        
        print(f"Sparse matrix shape: {user_item_sparse.shape}")
        print(f"Matrix density: {user_item_sparse.nnz / (user_item_sparse.shape[0] * user_item_sparse.shape[1]):.6f}")
        print(f"Non-zero elements: {user_item_sparse.nnz:,}")
        
        # Check memory usage
        memory_gb = calculate_memory_usage(user_item_sparse.shape)
        print(f"Dense matrix would need: {memory_gb:.2f} GB")

        # VVV FIX STARTS HERE VVV
        # The entire if/else block was removed and replaced with this:
        print("Always working with sparse matrix to ensure performance.")
        return user_item_sparse, user_item_sparse # Return the sparse matrix for both variables
        # ^^^ FIX ENDS HERE ^^^

    except Exception as e:
        print(f"Error creating user-item matrix: {e}")
        raise

print("Configuration and utility functions defined!")


Configuration and utility functions defined!


In [10]:
# Step 5: Filter behavior by sampled users (Filter Behavior by Sampled Users)
print("\n=== STEP 5: FILTER BEHAVIOR BY SAMPLED USERS ===")
print("Filtering behavior data to include only sampled users...")

# The behavior_df is already filtered to include only sampled users
# This step is already completed in Step 4, but let's verify
print(f"Final behavior dataset shape: {behavior_df.shape}")
print(f"Unique users in behavior data: {behavior_df['user_id'].nunique()}")
print(f"Unique tracks in behavior data: {behavior_df['track_id'].nunique()}")

# Step 6: 80/20 Train-Test Split (Train-Test Split)
print("\n=== STEP 6: 80/20 TRAIN-TEST SPLIT ===")
print("Performing train-test split with seed=42...")

# Perform train-test split on a per-user basis (80% train, 20% test)
train_df = behavior_df.groupby('user_id', group_keys=False).apply(
    lambda x: x.sample(frac=0.8, random_state=42)
)
test_df = behavior_df.drop(train_df.index)

print(f"Train set shape: {train_df.shape}")
print(f"Test set shape: {test_df.shape}")
print(f"Train users: {train_df['user_id'].nunique()}")
print(f"Test users: {test_df['user_id'].nunique()}")

# Verify that all users in test set are also in train set
train_users = set(train_df['user_id'].unique())
test_users = set(test_df['user_id'].unique())
common_users = train_users.intersection(test_users)
print(f"Users in both train and test sets: {len(common_users)}")

# Display summary statistics
print(f"\nSummary statistics:")
print(f"  Total interactions: {len(behavior_df):,}")
print(f"  Train interactions: {len(train_df):,} ({len(train_df)/len(behavior_df)*100:.1f}%)")
print(f"  Test interactions: {len(test_df):,} ({len(test_df)/len(behavior_df)*100:.1f}%)")
print(f"  Unique users: {behavior_df['user_id'].nunique():,}")
print(f"  Unique tracks: {behavior_df['track_id'].nunique():,}")

# Final behavior dataset ready for model building
print(f"\nFinal behavior dataset ready for model building: {behavior_df.shape}")



=== STEP 5: FILTER BEHAVIOR BY SAMPLED USERS ===
Filtering behavior data to include only sampled users...
Final behavior dataset shape: (377381, 3)
Unique users in behavior data: 5000
Unique tracks in behavior data: 21087

=== STEP 6: 80/20 TRAIN-TEST SPLIT ===
Performing train-test split with seed=42...
Train set shape: (301961, 3)
Test set shape: (75420, 3)
Train users: 5000
Test users: 5000
Users in both train and test sets: 5000

Summary statistics:
  Total interactions: 377,381
  Train interactions: 301,961 (80.0%)
  Test interactions: 75,420 (20.0%)
  Unique users: 5,000
  Unique tracks: 21,087

Final behavior dataset ready for model building: (377381, 3)


In [11]:
# Step 7: Create User-Item Matrix (Feature Engineering)
print("\n=== STEP 7: CREATE USER-ITEM MATRIX ===")
print("Creating user-item interaction matrix for collaborative filtering...")

# Create mappings for users and items to indices
unique_users = behavior_df['user_id'].unique()
unique_items = behavior_df['track_id'].unique()

user_to_idx = {user: idx for idx, user in enumerate(unique_users)}
item_to_idx = {item: idx for idx, item in enumerate(unique_items)}
idx_to_user = {idx: user for user, idx in user_to_idx.items()}
idx_to_item = {idx: item for item, idx in item_to_idx.items()}

print(f"Created mappings for {len(user_to_idx)} users and {len(item_to_idx)} items")

# Use the utility function to create the matrix
user_item_matrix, user_item_array = create_user_item_matrix(
    behavior_df, user_to_idx, item_to_idx, idx_to_user, idx_to_item
)

print("User-item matrix ready for collaborative filtering")


=== STEP 7: CREATE USER-ITEM MATRIX ===
Creating user-item interaction matrix for collaborative filtering...
Created mappings for 5000 users and 21087 items
Building sparse user-item matrix...
Sparse matrix shape: (5000, 21087)
Matrix density: 0.003579
Non-zero elements: 377,381
Dense matrix would need: 0.79 GB
Always working with sparse matrix to ensure performance.
User-item matrix ready for collaborative filtering


In [12]:
# Step 8: Create Training Matrix (Create Training Matrix)
print("\n=== STEP 8: CREATE TRAINING MATRIX ===")
print("Creating training user-item matrix for model building...")

# Use the utility function to create the training matrix
train_matrix, train_array = create_user_item_matrix(
    train_df, user_to_idx, item_to_idx, idx_to_user, idx_to_item
)

print("Training matrix ready for model building")



=== STEP 8: CREATE TRAINING MATRIX ===
Creating training user-item matrix for model building...
Building sparse user-item matrix...
Sparse matrix shape: (5000, 21087)
Matrix density: 0.002864
Non-zero elements: 301,961
Dense matrix would need: 0.79 GB
Always working with sparse matrix to ensure performance.
Training matrix ready for model building


## Step 4: Item-Based Collaborative Filtering Model

Implement the core item-based collaborative filtering algorithm by calculating item-to-item similarities and building the recommendation logic.


In [13]:
# Calculate item-to-item similarity matrix (Top-K sparse, memory efficient)
print("=== CALCULATING ITEM SIMILARITY MATRIX (TOP-K SPARSE) ===")

TOP_K = 50  # neighbors per item
SIM_DTYPE = np.float32

if hasattr(train_array, 'toarray'):
    print("Working with sparse user-item training matrix")
    # Items as rows, users as columns
    item_matrix = train_array.T.tocsr()
    n_items, n_users = item_matrix.shape
    print(f"Item matrix shape for similarity calculation: ({n_items}, {n_users})")

    # Use NearestNeighbors with cosine distance on sparse matrix
    from sklearn.neighbors import NearestNeighbors
    nn = NearestNeighbors(n_neighbors=min(TOP_K + 1, n_items), metric='cosine', algorithm='brute', n_jobs=-1)
    nn.fit(item_matrix)

    print("Finding top-k nearest neighbors per item (cosine distance)...")
    distances, indices = nn.kneighbors(item_matrix, return_distance=True)

    # Convert distances to cosine similarity: sim = 1 - dist
    # Drop self neighbor (distance=0 at index 0)
    distances = distances[:, 1:].astype(SIM_DTYPE)
    indices = indices[:, 1:]
    similarities = (1.0 - distances).clip(min=0).astype(SIM_DTYPE)

    # Build sparse CSR similarity matrix
    indptr = np.arange(0, similarities.shape[0] * similarities.shape[1] + 1, similarities.shape[1])
    item_indices_flat = indices.ravel()
    sims_flat = similarities.ravel()

    from scipy.sparse import csr_matrix as _csr
    item_similarity_csr = _csr((sims_flat, item_indices_flat, indptr), shape=(n_items, n_items), dtype=SIM_DTYPE)

    # Symmetrize by taking max(sim(i,j), sim(j,i)) to improve recall
    item_similarity_csr = item_similarity_csr.maximum(item_similarity_csr.T)

    # Keep variable name used downstream (generic)
    item_similarity = item_similarity_csr

    print(f"Built sparse top-{TOP_K} similarity matrix: nnz={item_similarity.nnz:,}, shape={item_similarity.shape}")
else:
    print("Working with dense user-item training matrix")
    item_matrix = train_array.T
    from sklearn.neighbors import NearestNeighbors
    nn = NearestNeighbors(n_neighbors=min(TOP_K + 1, item_matrix.shape[0]), metric='cosine', algorithm='brute', n_jobs=-1)
    nn.fit(item_matrix)

    print("Finding top-k nearest neighbors per item (cosine distance, dense)...")
    distances, indices = nn.kneighbors(item_matrix, return_distance=True)

    distances = distances[:, 1:].astype(SIM_DTYPE)
    indices = indices[:, 1:]
    similarities = (1.0 - distances).clip(min=0).astype(SIM_DTYPE)

    # Build sparse matrix even for dense input to save memory
    from scipy.sparse import csr_matrix as _csr
    indptr = np.arange(0, similarities.shape[0] * similarities.shape[1] + 1, similarities.shape[1])
    item_similarity = _csr((similarities.ravel(), indices.ravel(), indptr), shape=(item_matrix.shape[0], item_matrix.shape[0]), dtype=SIM_DTYPE)
    item_similarity = item_similarity.maximum(item_similarity.T)

    print(f"Built sparse top-{TOP_K} similarity matrix: nnz={item_similarity.nnz:,}, shape={item_similarity.shape}")

print("Item similarity (Top-K sparse) computed successfully!")


=== CALCULATING ITEM SIMILARITY MATRIX (TOP-K SPARSE) ===
Working with sparse user-item training matrix
Item matrix shape for similarity calculation: (21087, 5000)
Finding top-k nearest neighbors per item (cosine distance)...
Built sparse top-50 similarity matrix: nnz=1,285,892, shape=(21087, 21087)
Item similarity (Top-K sparse) computed successfully!


In [14]:
# Analyze similarity distribution (supports sparse Top-K)
print("=== SIMILARITY ANALYSIS (TOP-K SPARSE) ===")

from scipy.sparse import coo_matrix

# item_similarity is a sparse CSR matrix produced above (or dense in rare cases)
if 'item_similarity' in globals() and hasattr(item_similarity, 'tocsr'):
    sim_csr = item_similarity.tocsr()
    sim_coo = sim_csr.tocoo(copy=False)

    # Exclude diagonal and keep upper triangle to avoid duplicates
    mask = (sim_coo.row < sim_coo.col)
    vals = sim_coo.data[mask]

    if vals.size == 0:
        print("No off-diagonal similarities found.")
    else:
        print("Similarity statistics:")
        print(f"- Mean:   {np.mean(vals):.4f}")
        print(f"- Median: {np.median(vals):.4f}")
        print(f"- Std:    {np.std(vals):.4f}")
        print(f"- Min:    {np.min(vals):.4f}")
        print(f"- Max:    {np.max(vals):.4f}")

        # Top-N pairs by similarity
        n_top_pairs = 5
        if vals.size <= n_top_pairs:
            top_idx = np.argsort(vals)[::-1]
        else:
            part = np.argpartition(vals, -n_top_pairs)[-n_top_pairs:]
            top_idx = part[np.argsort(vals[part])[::-1]]

        print(f"\nTop {n_top_pairs} most similar item pairs:")
        for i in top_idx:
            idx1 = int(sim_coo.row[mask][i])
            idx2 = int(sim_coo.col[mask][i])
            similarity = float(vals[i])

            item1 = idx_to_item[idx1] if 'idx_to_item' in globals() else idx1
            item2 = idx_to_item[idx2] if 'idx_to_item' in globals() else idx2

            item1_name = item1
            item2_name = item2
            try:
                if item1 in music_df['track_id'].values:
                    info1 = music_df[music_df['track_id'] == item1].iloc[0]
                    item1_name = f"{info1['name']} - {info1['artist']}"
                if item2 in music_df['track_id'].values:
                    info2 = music_df[music_df['track_id'] == item2].iloc[0]
                    item2_name = f"{info2['name']} - {info2['artist']}"
            except Exception:
                pass

            print(f"- {item1_name}")
            print(f"  <-> {item2_name}")
            print(f"  Similarity: {similarity:.4f}")
            print()
else:
    # Dense fallback (unlikely after Top-K build)
    sim = item_similarity if 'item_similarity' in globals() else item_similarity_matrix
    upper_triangle = np.triu(sim, k=1)
    similarities = upper_triangle[upper_triangle > 0]

    print(f"Similarity statistics:")
    print(f"- Mean: {np.mean(similarities):.4f}")
    print(f"- Median: {np.median(similarities):.4f}")
    print(f"- Std: {np.std(similarities):.4f}")
    print(f"- Min: {np.min(similarities):.4f}")
    print(f"- Max: {np.max(similarities):.4f}")

    n_top_pairs = 5
    sim_no_diag = sim.copy()
    np.fill_diagonal(sim_no_diag, 0)
    top_pairs_idx = np.unravel_index(
        np.argpartition(sim_no_diag.ravel(), -n_top_pairs)[-n_top_pairs:],
        sim_no_diag.shape
    )

    print(f"\nTop {n_top_pairs} most similar item pairs:")
    for i in range(n_top_pairs):
        idx1, idx2 = top_pairs_idx[0][i], top_pairs_idx[1][i]
        similarity = sim_no_diag[idx1, idx2]
        item1 = idx_to_item[idx1] if 'idx_to_item' in globals() else idx1
        item2 = idx_to_item[idx2] if 'idx_to_item' in globals() else idx2
        print(f"- {item1} <-> {item2} | {similarity:.4f}")

=== SIMILARITY ANALYSIS (TOP-K SPARSE) ===
Similarity statistics:
- Mean:   0.2691
- Median: 0.1997
- Std:    0.2122
- Min:    0.0031
- Max:    1.0000

Top 5 most similar item pairs:
- Chillin' - Modjo
  <-> Greatest Story Ever Told - Grateful Dead
  Similarity: 1.0000

- Nightflight - Novaspace
  <-> Too Shy - Kajagoogoo
  Similarity: 1.0000

- Rainbow Box - Riverside
  <-> Vocari Dei - Pain of Salvation
  Similarity: 1.0000

- Rainbow Box - Riverside
  <-> Back To The River - Riverside
  Similarity: 1.0000

- Beni Beni - Niyaz
  <-> Gouge Away - Pixies
  Similarity: 1.0000



In [15]:
# Calculate Item Popularity for the new novelty evaluation
print("\n=== CALCULATING ITEM POPULARITY FOR NOVELTY SCORES ===")

# Popularity is defined as the proportion of users in the training set who have listened to an item
item_counts = np.array((train_matrix > 0).sum(axis=0)).flatten()
total_users = train_matrix.shape[0]
item_popularity = item_counts / total_users

# Create a dictionary mapping from an item's index to its popularity score
popularity_scores_dict = {i: pop for i, pop in enumerate(item_popularity)}
print(f"Popularity scores prepared for {len(popularity_scores_dict)} items.")


=== CALCULATING ITEM POPULARITY FOR NOVELTY SCORES ===
Popularity scores prepared for 21087 items.


## Step 5: Recommendation Function

Create functions to generate recommendations using the item-based collaborative filtering approach, with focus on user experience metrics.


In [16]:
class ItemBasedRecommender:
    """
    Item-Based Collaborative Filtering Recommender System (Memory Efficient)
    Supports sparse Top-K item similarity and UX-oriented reranking
    """
    
    def __init__(self, user_item_matrix, item_similarity, music_metadata, 
                 user_to_idx=None, item_to_idx=None, idx_to_user=None, idx_to_item=None):
        self.user_item_matrix = user_item_matrix
        self.item_similarity = item_similarity  # CSR sparse or dense
        self.music_metadata = music_metadata
        
        # Handle both sparse and dense user-item matrices
        if hasattr(user_item_matrix, 'toarray'):  # Sparse matrix
            self.is_sparse = True
            self.user_to_idx = user_to_idx
            self.item_to_idx = item_to_idx
            self.idx_to_user = idx_to_user
            self.idx_to_item = idx_to_item
            self.users = list(user_to_idx.keys()) if user_to_idx else []
            self.items = list(item_to_idx.keys()) if item_to_idx else []
        else:  # Dense matrix (DataFrame)
            self.is_sparse = False
            self.items = list(user_item_matrix.columns)
            self.users = list(user_item_matrix.index)
        
        # Create item metadata lookup
        self.item_metadata = music_metadata.set_index('track_id').to_dict('index')

        # GPU members
        self.gpu_enabled = False
        self.batch_size = 1024
        self._cp = None
        self._cupyx = None
        self.item_similarity_gpu = None
        self.user_item_T_gpu = None
    
    def enable_gpu(self, batch_size=2048):
        """Enable GPU acceleration using CuPy if available."""
        try:
            import cupy as cp
            import cupyx
            from cupyx.scipy.sparse import csr_matrix as gpu_csr
        except Exception as e:
            print(f"GPU not available (CuPy). Falling back to CPU. Reason: {e}")
            self.gpu_enabled = False
            return
        if not (hasattr(self.item_similarity, 'tocsr') and self.is_sparse):
            print("GPU path requires sparse user-item and sparse item similarity. Using CPU.")
            self.gpu_enabled = False
            return
        # Move matrices to GPU
        self._cp = cp
        self._cupyx = cupyx
        self.batch_size = int(batch_size)
        # item similarity (items x items)
        self.item_similarity_gpu = gpu_csr((self.item_similarity.data.astype(np.float32),
                                            self.item_similarity.indices,
                                            self.item_similarity.indptr),
                                           shape=self.item_similarity.shape)
        # user-item transpose (items x users)
        ui_T = self.user_item_matrix.T.tocsr()
        self.user_item_T_gpu = gpu_csr((ui_T.data.astype(np.float32), ui_T.indices, ui_T.indptr),
                                       shape=ui_T.shape)
        self.gpu_enabled = True
        print(f"GPU enabled. Batch size={self.batch_size}")
    
    def _get_similarity(self, item_a, item_b):
        """Return similarity between two items using sparse/dense similarity."""
        if item_a not in self.items or item_b not in self.items:
            return 0.0
        if hasattr(self.item_similarity, 'tocsr') and self.is_sparse:
            ia = self.item_to_idx[item_a]
            ib = self.item_to_idx[item_b]
            row = self.item_similarity.getrow(ia)
            cols = row.indices
            data = row.data
            pos = np.searchsorted(cols, ib)
            if pos < len(cols) and cols[pos] == ib:
                return float(data[pos])
            return 0.0
        else:
            if hasattr(self.item_similarity, 'loc'):
                return float(self.item_similarity.loc[item_a, item_b])
            else:
                ia = self.items.index(item_a)
                ib = self.items.index(item_b)
                return float(self.item_similarity[ia, ib])
        
    def get_user_interacted_items(self, user_id):
        """Get items that a user has interacted with"""
        if self.is_sparse:
            if user_id not in self.user_to_idx:
                return []
            user_idx = self.user_to_idx[user_id]
            user_row = self.user_item_matrix[user_idx]
            _, item_indices = user_row.nonzero()
            return [self.idx_to_item[idx] for idx in item_indices]
        else:
            if user_id not in self.user_item_matrix.index:
                return []
            user_interactions = self.user_item_matrix.loc[user_id]
            return user_interactions[user_interactions > 0].index.tolist()
    
    def predict_item_score(self, user_id, target_item, k_neighbors=50):
        """Predict score for a target item using item-based CF with Top-K sim."""
        if target_item not in self.items:
            return 0.0
        user_items = self.get_user_interacted_items(user_id)
        if not user_items:
            return 0.0
        
        target_similarities = []
        user_ratings = []
        for item in user_items:
            sim = self._get_similarity(target_item, item)
            if sim > 0:
                if self.is_sparse:
                    user_idx = self.user_to_idx[user_id]
                    item_idx = self.item_to_idx[item]
                    rating = self.user_item_matrix[user_idx, item_idx]
                else:
                    rating = self.user_item_matrix.loc[user_id, item]
                target_similarities.append(sim)
                user_ratings.append(float(rating))
        if not target_similarities:
            return 0.0
        
        if len(target_similarities) > k_neighbors:
            order = np.argsort(target_similarities)[-k_neighbors:]
            sims = np.array(target_similarities)[order]
            ratings = np.array(user_ratings)[order]
        else:
            sims = np.array(target_similarities)
            ratings = np.array(user_ratings)
        denom = sims.sum()
        if denom <= 0:
            return 0.0
        return float((sims * ratings).sum() / denom)
    
    def recommend_for_users_batch(self, user_ids, n_recommendations=10):
        """Batch recommend. Uses GPU sparse matmul if enabled; else CPU fallback."""
        if not user_ids:
            return {}
        if self.gpu_enabled:
            cp = self._cp
            # Map to indices that exist in training
            valid_ids = [u for u in user_ids if u in self.user_to_idx]
            if not valid_ids:
                return {}
            uidx = cp.asarray([self.user_to_idx[u] for u in valid_ids], dtype=cp.int32)
            # Build submatrix R = UI_T[:, uidx]  (items x batch)
            R_batch = self.user_item_T_gpu[:, uidx]
            # Compute scores: S * R  => (items x items) @ (items x batch) = items x batch
            scores = self.item_similarity_gpu @ R_batch
            # Mask already interacted items: set to -inf
            # Build mask from R_batch > 0
            mask = R_batch.copy()
            mask.data[:] = 1.0
            # Convert scores to dense per column only for topk selection
            # Use argpartition on GPU
            results = {}
            scores_csc = scores.tocsc()  # better column access
            for j, uid in enumerate(valid_ids):
                col = scores_csc.getcol(j).toarray().ravel()
                # zero out seen items
                seen = R_batch.getcol(j).toarray().ravel() > 0
                col[seen] = -np.inf
                k = min(n_recommendations * 2, len(col))
                topk_idx = cp.asnumpy(cp.argpartition(cp.asarray(col), -k)[-k:])
                topk_sorted = topk_idx[np.argsort(col[topk_idx])[::-1]][:n_recommendations]
                items = [self.idx_to_item[i] for i in topk_sorted]
                scores_vals = [float(col[i]) for i in topk_sorted]
                results[uid] = list(zip(items, scores_vals))
            return results
        else:
            # CPU sparse batch: S (csr) @ R (items x batch)
            if not (hasattr(self.item_similarity, 'tocsr') and self.is_sparse):
                # fallback to per-user path
                return {u: self.get_recommendations(u, n_recommendations) for u in user_ids}
            valid_ids = [u for u in user_ids if u in self.user_to_idx]
            if not valid_ids:
                return {}
            uidx = np.array([self.user_to_idx[u] for u in valid_ids], dtype=np.int32)
            R_batch = self.user_item_matrix.T.tocsr()[:, uidx]
            scores = self.item_similarity @ R_batch
            results = {}
            from numpy import argpartition
            scores_csc = scores.tocsc()
            for j, uid in enumerate(valid_ids):
                col = scores_csc.getcol(j).toarray().ravel()
                seen = R_batch.getcol(j).toarray().ravel() > 0
                col[seen] = -np.inf
                k = min(n_recommendations * 2, len(col))
                topk_idx = argpartition(col, -k)[-k:]
                topk_sorted = topk_idx[np.argsort(col[topk_idx])[::-1]][:n_recommendations]
                items = [self.idx_to_item[i] for i in topk_sorted]
                scores_vals = [float(col[i]) for i in topk_sorted]
                results[uid] = list(zip(items, scores_vals))
            return results
    
    def get_recommendations(self, user_id, n_recommendations=10, 
                            diversity_weight=0.1, novelty_weight=0.1):
        """Generate recommendations for a user with UX-aware reranking."""
        if user_id not in self.users:
            return []
        user_items = set(self.get_user_interacted_items(user_id))
        candidate_items = [item for item in self.items if item not in user_items]
        if not candidate_items:
            return []
        
        item_scores = []
        for item in candidate_items:
            score = self.predict_item_score(user_id, item)
            if score > 0:
                item_scores.append((item, score))
        item_scores.sort(key=lambda x: x[1], reverse=True)
        final_recommendations = self._apply_diversity_novelty(
            item_scores, user_items, n_recommendations, diversity_weight, novelty_weight
        )
        return final_recommendations[:n_recommendations]
    
    def _apply_diversity_novelty(self, scored_items, user_items, 
                                 n_recommendations, diversity_weight, novelty_weight):
        if not scored_items:
            return []
        recommendations = []
        considered_genres = set()
        considered_artists = set()
        user_genres = set()
        user_artists = set()
        for item in user_items:
            if item in self.item_metadata:
                meta = self.item_metadata[item]
                if 'genre' in meta and meta['genre']:
                    user_genres.add(meta['genre'])
                if 'artist' in meta:
                    user_artists.add(meta['artist'])
        for item, base_score in scored_items:
            if len(recommendations) >= n_recommendations:
                break
            diversity_bonus = 0
            novelty_bonus = 0
            if item in self.item_metadata:
                meta = self.item_metadata[item]
                g = meta.get('genre', '')
                a = meta.get('artist', '')
                if g and g not in considered_genres:
                    diversity_bonus += diversity_weight * 0.5
                if a and a not in considered_artists:
                    diversity_bonus += diversity_weight * 0.5
                if g and g not in user_genres:
                    novelty_bonus += novelty_weight * 0.5
                if a and a not in user_artists:
                    novelty_bonus += novelty_weight * 0.5
            final_score = base_score + diversity_bonus + novelty_bonus
            recommendations.append((item, final_score, base_score))
            if item in self.item_metadata:
                meta = self.item_metadata[item]
                if meta.get('genre'):
                    considered_genres.add(meta['genre'])
                if meta.get('artist'):
                    considered_artists.add(meta['artist'])
        recommendations.sort(key=lambda x: x[1], reverse=True)
        return [(item, s) for item, s, _ in recommendations]

# Initialize the recommender
print("=== INITIALIZING ITEM-BASED RECOMMENDER ===")

# Pass the appropriate mappings for sparse matrix support
if hasattr(train_matrix, 'toarray'):  # If sparse matrix
    recommender = ItemBasedRecommender(
        train_matrix, item_similarity, music_df,
        user_to_idx, item_to_idx, idx_to_user, idx_to_item
    )
else:  # Dense matrix
    recommender = ItemBasedRecommender(train_matrix, item_similarity, music_df)

# Try enable GPU for batched recommendation
recommender.enable_gpu(batch_size=4096)

print("Recommender initialized successfully!")


=== INITIALIZING ITEM-BASED RECOMMENDER ===


GPU not available (CuPy). Falling back to CPU. Reason: No module named 'cupy'
Recommender initialized successfully!


In [17]:
# # Test the recommendation function
# print("=== TESTING RECOMMENDATION FUNCTION ===")

# # Get a sample user for testing
# if hasattr(train_matrix, 'toarray'):  # Sparse matrix
#     sample_user = list(user_to_idx.keys())[0]
# else:  # Dense matrix
#     sample_user = train_matrix.index[0]

# print(f"Testing recommendations for user: {sample_user}")

# # Get user's interaction history
# user_history = recommender.get_user_interacted_items(sample_user)
# print(f"User has interacted with {len(user_history)} items")

# # Show some of their interactions with metadata
# print("\nUser's listening history (sample):")
# for i, track_id in enumerate(user_history[:5]):
#     if track_id in recommender.item_metadata:
#         track_info = recommender.item_metadata[track_id]
#         print(f"  {i+1}. {track_info.get('name', 'Unknown')} - {track_info.get('artist', 'Unknown Artist')}")

# # Generate recommendations
# print("Generating recommendations...")
# recommendations = recommender.get_recommendations(sample_user, n_recommendations=5)
# print(f"\nGenerated {len(recommendations)} recommendations:")

# for i, (track_id, score) in enumerate(recommendations):
#     if track_id in recommender.item_metadata:
#         track_info = recommender.item_metadata[track_id]
#         print(f"  {i+1}. {track_info.get('name', 'Unknown')} - {track_info.get('artist', 'Unknown Artist')} (Score: {score:.4f})")
#     else:
#         print(f"  {i+1}. {track_id} (Score: {score:.4f})")

# # Test memory usage
# import psutil
# import os
# process = psutil.Process(os.getpid())
# memory_gb = process.memory_info().rss / 1024 / 1024 / 1024
# print(f"\n💾 Current memory usage: {memory_gb:.2f} GB")


## Step 6: Evaluation

Implement comprehensive evaluation metrics focusing on both ranking accuracy and user experience quality.


In [18]:
# Import the standalone evaluation metrics
from standalone_evaluation_metrics import evaluate_recommendations, print_evaluation_summary

print("=== STANDALONE EVALUATION FRAMEWORK READY ===")
print("Standalone evaluation metrics imported successfully!")

=== STANDALONE EVALUATION FRAMEWORK READY ===
Standalone evaluation metrics imported successfully!


In [None]:
# Generate recommendations for evaluation using the FAST BATCH method
print("\n=== GENERATING RECOMMENDATIONS FOR EVALUATION (FAST BATCH METHOD) ===")

# Get all users that are in the test set
test_users_list = test_df['user_id'].unique().tolist()
print(f"Generating recommendations for {len(test_users_list)} test users...")

# Define a batch size to process users in chunks (avoids memory issues)
BATCH_SIZE = 512 
recommendations = {}
total_batches = (len(test_users_list) + BATCH_SIZE - 1) // BATCH_SIZE

for i in range(0, len(test_users_list), BATCH_SIZE):
    # Get a batch of user IDs
    batch_user_ids = test_users_list[i:i + BATCH_SIZE]
    
    # Use the fast batch recommendation function
    batch_recs = recommender.recommend_for_users_batch(
        user_ids=batch_user_ids, 
        n_recommendations=20
    )
    
    # Update the main recommendations dictionary with the batch results
    recommendations.update(batch_recs)
    
    # Print progress
    current_batch_num = (i // BATCH_SIZE) + 1
    print(f"  Processed batch {current_batch_num}/{total_batches}...")

print(f"\nGenerated recommendations for {len(recommendations)} users.")

# The evaluation script expects item INDICES, not item IDs.
# We need to convert the track_id recommendations into item_idx recommendations.
print("Converting recommendations to the required index format for evaluation...")
final_recommendations_indexed = {}
for user_id, rec_list in recommendations.items():
    if user_id in user_to_idx:
        user_idx = user_to_idx[user_id]
        
        # Convert track_ids to item_indices, filtering out any not in the mapping
        rec_indices = [item_to_idx[track_id] for track_id, score in rec_list if track_id in item_to_idx]
        
        if rec_indices:
            final_recommendations_indexed[user_idx] = np.array(rec_indices)

print(f"Successfully prepared recommendations for {len(final_recommendations_indexed)} users for evaluation.")

# IMPORTANT: The evaluation function below must use the NEW variable 'final_recommendations_indexed'
# We rename it here to 'recommendations' to match what the original code expects.
recommendations = final_recommendations_indexed


=== GENERATING RECOMMENDATIONS FOR EVALUATION (FAST BATCH METHOD) ===
Generating recommendations for 5000 test users...
  Processed batch 1/10...
  Processed batch 2/10...
  Processed batch 3/10...
  Processed batch 4/10...
  Processed batch 5/10...
  Processed batch 6/10...
  Processed batch 7/10...
  Processed batch 8/10...
  Processed batch 9/10...
  Processed batch 10/10...

Generated recommendations for 5000 users.
Converting recommendations to the required index format for evaluation...
Successfully prepared recommendations for 5000 users for evaluation.


In [22]:
# Prepare item content features for diversity and serendipity calculations
print("\n=== PREPARING ITEM CONTENT FEATURES ===")

# Use the item-user matrix (transpose of train_matrix) as the feature representation
# Convert to dense array and L2-normalize for proper cosine similarity calculations
item_content = train_matrix.T.toarray().astype(np.float32)
norms = np.linalg.norm(item_content, axis=1, keepdims=True) + 1e-12
item_content = item_content / norms

print(f"Item content features prepared with shape: {item_content.shape}")

# Run evaluation using standalone metrics
print("\n=== RUNNING EVALUATION WITH STANDALONE METRICS ===")

evaluation_results = evaluate_recommendations(
    recommendations=recommendations, # This uses the variable from the code I provided
    train_df=train_df,
    test_df=test_df,
    user_to_idx=user_to_idx,
    item_to_idx=item_to_idx,
    item_content=item_content,
    k_values=[5, 10, 15, 20],
    metrics=['ndcg', 'novelty', 'diversity', 'serendipity', 'catalog_coverage', 'user_coverage']
)

# Print results using the standalone metrics summary function
print_evaluation_summary(evaluation_results)


=== PREPARING ITEM CONTENT FEATURES ===
Item content features prepared with shape: (21087, 5000)

=== RUNNING EVALUATION WITH STANDALONE METRICS ===
Evaluating 5000 users across 4 k values...
  Calculating ndcg...
    ndcg@5: 0.2170
    ndcg@10: 0.1747
    ndcg@15: 0.1672
    ndcg@20: 0.1729
  Calculating novelty...
    novelty@5: 7.8330
    novelty@10: 7.9205
    novelty@15: 7.9586
    novelty@20: 7.9954
  Calculating diversity...
    diversity@5: 0.5642
    diversity@10: 0.6494
    diversity@15: 0.7094
    diversity@20: 0.7529
  Calculating serendipity...
    serendipity@5: 0.6888
    serendipity@10: 0.7063
    serendipity@15: 0.7181
    serendipity@20: 0.7273
  Calculating catalog_coverage...
    catalog_coverage@5: 33.0535
    catalog_coverage@10: 48.3284
    catalog_coverage@15: 58.3061
    catalog_coverage@20: 65.3199
  Calculating user_coverage...
    user_coverage@5: 100.0000
    user_coverage@10: 100.0000
    user_coverage@15: 100.0000
    user_coverage@20: 100.0000

RECOMMEN