# Popularity Baseline Evaluation

This notebook evaluates a popularity-based recommendation system using comprehensive evaluation metrics at k values of 5, 10, 15, and 20.

## Overview
- **Baseline**: Popularity-based recommendations (the same top-k tracks, ranked by total playcount in the training data, are recommended to every user)
- **Evaluation Metrics**: NDCG, Novelty, Diversity, Serendipity, Coverage
- **K Values**: 5, 10, 15, 20


## 1. Setup and Imports


In [1]:
# Import required libraries
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
warnings.filterwarnings('ignore')

# Seed NumPy's legacy global RNG so the np.random.* draws made later in the
# notebook (e.g. the synthetic-data fallback) are reproducible on a fresh run
np.random.seed(42)

print("✅ Libraries imported successfully!")


✅ Libraries imported successfully!


In [2]:
# Import the standalone evaluation metrics
# Make sure standalone_evaluation_metrics.py is in the same directory
from standalone_evaluation_metrics import (
    quick_evaluate,
    evaluate_recommendations,
    print_evaluation_summary,
    ndcg_at_k,
    novelty_at_k,
    diversity_ild_at_k,
    serendipity_at_k,
    catalog_coverage_at_k,
    user_coverage_at_k
)

print("✅ Evaluation metrics imported successfully!")


✅ Evaluation metrics imported successfully!


## 2. Data Loading and Preparation


In [9]:
# Load the datasets
# Update the path to your data files
dataset_path = ''  # Update this path to your data directory

# Join with pathlib so the directory works with or without a trailing
# separator ('data' and 'data/' both resolve to data/music_list.csv).
data_dir = Path(dataset_path)

try:
    music_list = pd.read_csv(data_dir / 'music_list.csv')
    user_behavior_list = pd.read_csv(data_dir / 'user_behavior_list.csv')
    print("✅ Data loaded successfully!")
    print(f"Music list shape: {music_list.shape}")
    print(f"User behavior list shape: {user_behavior_list.shape}")
except FileNotFoundError as e:
    # Only a missing file triggers the fallback; any other read/parse error
    # still propagates so a corrupt CSV is not silently replaced by fake data.
    print(f"❌ Error loading data: {e}")
    print("Please update the dataset_path variable with the correct path to your data files.")
    # For demonstration, we'll create synthetic data
    print("Creating synthetic data for demonstration...")

    # Create synthetic data
    n_users, n_items = 1000, 500
    user_ids = [f"user_{i:04d}" for i in range(n_users)]
    item_ids = [f"item_{i:04d}" for i in range(n_items)]

    # Create synthetic interactions (draws come from the global NumPy RNG)
    interactions = []
    for user_id in user_ids:
        n_user_items = np.random.poisson(20)  # Average 20 items per user
        # Cap at the catalogue size: sampling without replacement cannot
        # return more items than exist.
        user_items = np.random.choice(item_ids, size=min(n_user_items, n_items), replace=False)

        for item_id in user_items:
            playcount = np.random.poisson(3) + 1  # +1 keeps every playcount >= 1
            interactions.append({
                'user_id': user_id,
                'track_id': item_id,
                'playcount': playcount
            })

    # Build the frame once from the collected records
    user_behavior_list = pd.DataFrame(interactions)

    # Create synthetic music list
    music_list = pd.DataFrame({
        'track_id': item_ids,
        'name': [f"Song_{i}" for i in range(n_items)],
        'artist': [f"Artist_{i%50}" for i in range(n_items)],
        'genre': [f"Genre_{i%10}" for i in range(n_items)]
    })

    print("✅ Synthetic data created!")
    print(f"Music list shape: {music_list.shape}")
    print(f"User behavior list shape: {user_behavior_list.shape}")


✅ Data loaded successfully!
Music list shape: (50683, 21)
User behavior list shape: (9711301, 3)


In [10]:
# Data preprocessing following the popularity baseline approach
print("📊 Preprocessing data...")

# NOTE: every stage below gets its own name and `user_behavior_list` is left
# untouched. Rebinding the raw frame made this cell non-idempotent: a second
# run would shuffle the already-shuffled sample and produce a different split.
# The trade-off is that the raw frame stays in memory.

# Filter users with at least 10 interactions (reduced from 50 for faster processing)
user_counts = user_behavior_list['user_id'].value_counts()
active_users = user_counts[user_counts >= 10].index
behavior_active = user_behavior_list[user_behavior_list['user_id'].isin(active_users)]

print(f"Active users (≥10 interactions): {len(active_users)}")

# Sample users for faster evaluation (optional)
behavior_eval = behavior_active
if len(active_users) > 5000:
    rng = np.random.default_rng(seed=42)
    sampled_users = rng.choice(active_users, size=5000, replace=False)
    behavior_eval = behavior_active[behavior_active['user_id'].isin(sampled_users)]
    print(f"Sampled users for evaluation: {len(sampled_users)}")

# Shuffle the data
behavior_shuffled = shuffle(behavior_eval, random_state=42)

# Train-test split (80-20), drawn independently per user.
# Sampling a single column and selecting rows by index label avoids relying on
# DataFrameGroupBy.apply passing the grouping column into the function, which
# newer pandas releases deprecate (the result would then lack 'user_id').
# With a fixed random_state the sampled positions depend only on group size,
# so this should select the same rows as sampling each whole sub-frame.
train_index = (
    behavior_shuffled.groupby('user_id', group_keys=False)['track_id']
    .apply(lambda s: s.sample(frac=0.8, random_state=42))
    .index
)
train_df = behavior_shuffled.loc[train_index]
test_df = behavior_shuffled.drop(train_index)

# The label-based split is only a partition when index labels are unique
assert len(train_df) + len(test_df) == len(behavior_shuffled), "train/test split is not a partition"

print(f"Train interactions: {len(train_df)}")
print(f"Test interactions: {len(test_df)}")
print(f"Unique users in train: {train_df['user_id'].nunique()}")
print(f"Unique users in test: {test_df['user_id'].nunique()}")
print(f"Unique items in train: {train_df['track_id'].nunique()}")
print(f"Unique items in test: {test_df['track_id'].nunique()}")


📊 Preprocessing data...
Active users (≥10 interactions): 290898
Sampled users for evaluation: 5000
Train interactions: 96748
Test interactions: 24073
Unique users in train: 5000
Unique users in test: 5000
Unique items in train: 15247
Unique items in test: 8777


## 3. Popularity Baseline Implementation


In [11]:
# Calculate track popularity from training data
print("🎵 Calculating track popularity...")

# Total plays per track, aggregated over the training interactions only
track_popularity = (
    train_df.groupby('track_id')['playcount']
    .sum()
    .rename('total_playcount')
    .reset_index()
)

# Most-played tracks first
popularity_sorted = track_popularity.sort_values(by='total_playcount', ascending=False)

print(f"Total unique tracks: {len(track_popularity)}")
print(f"Tracks with playcount > 0: {track_popularity['total_playcount'].gt(0).sum()}")

# Display top 10 most popular tracks
print("\nTop 10 Most Popular Tracks:")
print(popularity_sorted.head(10))

# Contiguous integer ids for every track and user seen in training,
# assigned in sorted order of the raw identifiers
all_items = sorted(train_df['track_id'].unique())
all_users = sorted(train_df['user_id'].unique())

item_to_idx = dict(zip(all_items, range(len(all_items))))
user_to_idx = dict(zip(all_users, range(len(all_users))))

print(f"\nItem mapping created: {len(item_to_idx)} items")
print(f"User mapping created: {len(user_to_idx)} users")


🎵 Calculating track popularity...
Total unique tracks: 15247
Tracks with playcount > 0: 15247

Top 10 Most Popular Tracks:
                 track_id  total_playcount
8632   TRONYHY128F92C9D11             2589
399    TRAOIAH128F92F707B              923
11855  TRUFTBY128F93450B8              787
11471  TRTNFRQ12903CB6360              757
14994  TRZNAHL128F9327D5A              701
21     TRAALAH128E078234A              674
14077  TRXWAZC128F9314B3E              673
3747   TRGCHLH12903CB7352              600
9102   TRPFYYL128F92F7144              599
8586   TROMKCG128F9320C09              598

Item mapping created: 15247 items
User mapping created: 5000 users


In [12]:
# Create popularity-based recommendations
def create_popularity_recommendations(user_to_idx, item_to_idx, popularity_sorted, k=20):
    """
    Create popularity-based recommendations for all users.

    Every user receives the same list: the first k tracks of
    `popularity_sorted` that have an entry in `item_to_idx`, translated to
    item indices and kept in popularity order.

    Args:
        user_to_idx: User ID to index mapping
        item_to_idx: Item ID to index mapping
        popularity_sorted: DataFrame with a 'track_id' column, already sorted
            from most to least popular
        k: Number of recommendations per user (must be non-negative)

    Returns:
        Dictionary mapping user indices to integer NumPy arrays of item
        indices. Each array has k entries, or fewer only when fewer than k
        mapped tracks exist. Every user gets an independent array.

    Raises:
        ValueError: If k is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    # Drop unmapped tracks BEFORE truncating. Truncating first would return
    # short lists whenever one of the top-k tracks has no index, even though
    # further mapped tracks are available lower in the ranking.
    ranked_indices = [
        item_to_idx[item]
        for item in popularity_sorted['track_id']
        if item in item_to_idx
    ]

    # Explicit integer dtype: an empty list would otherwise become float64,
    # which cannot be used to index item arrays.
    top_k_indices = np.asarray(ranked_indices[:k], dtype=np.intp)

    # Same recommendations for everyone, but one copy per user so a caller
    # mutating one user's array cannot affect the others.
    return {user_idx: top_k_indices.copy() for user_idx in user_to_idx.values()}

# Build one recommendation dictionary per cutoff, keyed by k
k_values = [5, 10, 15, 20]
all_recommendations = {}

for k in k_values:
    all_recommendations[k] = recommendations = create_popularity_recommendations(
        user_to_idx, item_to_idx, popularity_sorted, k
    )
    print(f"✅ Generated popularity recommendations for k={k}")

# Every user shares the same list, so the first user's length is representative
print(f"\nRecommendation summary:")
for k, recs in all_recommendations.items():
    print(f"  k={k}: {len(recs)} users, {len(recs[list(recs)[0]])} items per user")


✅ Generated popularity recommendations for k=5
✅ Generated popularity recommendations for k=10
✅ Generated popularity recommendations for k=15
✅ Generated popularity recommendations for k=20

Recommendation summary:
  k=5: 5000 users, 5 items per user
  k=10: 5000 users, 10 items per user
  k=15: 5000 users, 15 items per user
  k=20: 5000 users, 20 items per user


## 4. Content Features Creation


In [None]:
# Build placeholder content features for the content-aware metrics.
# These are random vectors, not real audio/metadata features - swap in real
# features before interpreting the diversity or serendipity numbers.
print("🎼 Creating content features...")

n_items = len(item_to_idx)
n_features = 50  # Number of content features

# Re-seed the global RNG so the feature matrix is the same on every run
np.random.seed(42)
item_content = np.random.standard_normal((n_items, n_features)).astype(np.float32)

# Scale every row to unit L2 length; the epsilon guards against a zero row
norms = 1e-12 + np.linalg.norm(item_content, axis=1, keepdims=True)
item_content = item_content / norms

print(f"Content features created: {item_content.shape}")
print(f"Features are L2-normalized: {np.allclose(np.linalg.norm(item_content, axis=1), 1.0)}")

# Display feature statistics
print(f"\nFeature statistics:")
for label, stat in (
    ("Mean", item_content.mean()),
    ("Std", item_content.std()),
    ("Min", item_content.min()),
    ("Max", item_content.max()),
):
    print(f"  {label}: {stat:.4f}")


## 5. Individual Metric Evaluation


In [None]:
# Score every cutoff with each metric function, one call per metric
print("🔍 Evaluating individual metrics...")

# Per-cutoff metric scores, keyed by k
individual_results = {}

for k in k_values:
    print(f"\n--- Evaluating k={k} ---")
    recommendations = all_recommendations[k]

    # Metric calls are kept in this fixed order
    ndcg_score = ndcg_at_k(recommendations, test_df, user_to_idx, item_to_idx, k)
    novelty_score = novelty_at_k(recommendations, train_df, item_to_idx, k)
    diversity_score = diversity_ild_at_k(recommendations, item_content, k)
    serendipity_score = serendipity_at_k(
        recommendations, train_df, test_df, user_to_idx, item_to_idx, item_content, k
    )
    catalog_coverage = catalog_coverage_at_k(recommendations, len(item_to_idx), k)
    user_coverage = user_coverage_at_k(recommendations, len(user_to_idx))

    individual_results[k] = dict(
        ndcg=ndcg_score,
        novelty=novelty_score,
        diversity=diversity_score,
        serendipity=serendipity_score,
        catalog_coverage=catalog_coverage,
        user_coverage=user_coverage,
    )

    # One report per cutoff; the coverage values are printed as percentages
    report_lines = [
        f"  NDCG@{k}: {ndcg_score:.4f}",
        f"  Novelty@{k}: {novelty_score:.4f}",
        f"  Diversity@{k}: {diversity_score:.4f}",
        f"  Serendipity@{k}: {serendipity_score:.4f}",
        f"  Catalog Coverage@{k}: {catalog_coverage:.2f}%",
        f"  User Coverage: {user_coverage:.2f}%",
    ]
    print("\n".join(report_lines))

print("\n✅ Individual metric evaluation completed!")


## 6. Comprehensive Evaluation


In [None]:
# Run the all-in-one evaluation helper over every cutoff
print("📊 Running comprehensive evaluation...")

# Only the k=20 lists are passed in, together with the full set of cutoffs.
# NOTE(review): presumably evaluate_recommendations truncates the lists to
# each k in k_values - confirm against standalone_evaluation_metrics.
evaluation_kwargs = dict(
    recommendations=all_recommendations[20],
    train_df=train_df,
    test_df=test_df,
    user_to_idx=user_to_idx,
    item_to_idx=item_to_idx,
    item_content=item_content,
    k_values=k_values,
    metrics=['ndcg', 'novelty', 'diversity', 'serendipity', 'catalog_coverage', 'user_coverage'],
)
comprehensive_results = evaluate_recommendations(**evaluation_kwargs)

# Print formatted results
print_evaluation_summary(comprehensive_results)


## 7. Results Visualization


In [None]:
# Plot each metric against k, one panel per metric in a 2x3 grid
plt.style.use('default')
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Popularity Baseline Evaluation Results', fontsize=16, fontweight='bold')

# Keys into individual_results and the matching panel titles
metrics = ['ndcg', 'novelty', 'diversity', 'serendipity', 'catalog_coverage', 'user_coverage']
metric_titles = ['NDCG@k', 'Novelty@k', 'Diversity@k', 'Serendipity@k', 'Catalog Coverage@k (%)', 'User Coverage (%)']

# axes.flat walks the grid row by row: first three metrics on the top row,
# last three on the bottom row
for ax, metric, title in zip(axes.flat, metrics, metric_titles):
    scores = [individual_results[k][metric] for k in k_values]

    ax.plot(k_values, scores, 'o-', linewidth=2, markersize=8, color='steelblue')
    ax.set_title(title, fontweight='bold')
    ax.set(xlabel='k', ylabel='Score')
    ax.grid(True, alpha=0.3)
    ax.set_xticks(k_values)

    # Label every point with its value, 10 points above the marker
    for x, y in zip(k_values, scores):
        ax.annotate(f'{y:.3f}', (x, y), textcoords="offset points", xytext=(0,10), ha='center')

plt.tight_layout()
plt.show()

# Tabulate the same numbers: one row per k, one column per metric
print("\n📋 Summary Table:")
summary_df = pd.DataFrame(individual_results).T.rename_axis('k')
print(summary_df.round(4))


## 8. Analysis and Insights


In [None]:
# Analyze the results and provide insights
print("🔍 Analysis and Insights:")
print("=" * 50)


def _describe_trend(scores):
    """Label how a metric moves from the smallest to the largest k.

    Args:
        scores: Metric values ordered like k_values (smallest k first).

    Returns:
        'Increasing', 'Decreasing', 'Flat' when first and last values are
        numerically equal, or 'Undefined' when either of them is NaN.
    """
    first, last = scores[0], scores[-1]
    if np.isnan(first) or np.isnan(last):
        return 'Undefined'
    # A plain `last > first` test would report equal scores as 'Decreasing'
    if np.isclose(last, first):
        return 'Flat'
    return 'Increasing' if last > first else 'Decreasing'


# (section heading, key in individual_results, interpretation text)
metric_sections = [
    ("NDCG@k (Ranking Quality)", 'ndcg',
     "Higher is better. Measures how well the ranking matches user preferences."),
    ("Novelty@k (Item Unpopularity)", 'novelty',
     "Higher is better. Measures how 'unpopular' recommended items are."),
    ("Diversity@k (List Variety)", 'diversity',
     "Higher is better. Measures variety within recommendation lists."),
    ("Serendipity@k (Surprising Relevance)", 'serendipity',
     "Higher is better. Measures surprising but relevant recommendations."),
]

# Same three-line report for every ranking/beyond-accuracy metric
for section_number, (heading, metric, interpretation) in enumerate(metric_sections, start=1):
    scores = [individual_results[k][metric] for k in k_values]
    print(f"\n{section_number}. {heading}:")
    print(f"   Range: {min(scores):.4f} - {max(scores):.4f}")
    print(f"   Trend: {_describe_trend(scores)}")
    print(f"   Interpretation: {interpretation}")

# Coverage Analysis
print("\n5. Coverage Analysis:")
catalog_coverage_scores = [individual_results[k]['catalog_coverage'] for k in k_values]
# User coverage is computed without a cutoff, so the first k's entry is used
user_coverage = individual_results[k_values[0]]['user_coverage']
print(f"   Catalog Coverage Range: {min(catalog_coverage_scores):.2f}% - {max(catalog_coverage_scores):.2f}%")
print(f"   User Coverage: {user_coverage:.2f}%")
print("   Interpretation: Higher is better. Measures how much of the catalog/users the system can recommend to.")

# Overall Assessment
# NOTE(review): the statements below are hard-coded expectations for a
# popularity baseline; they are not derived from individual_results. Check
# them against the numbers printed above (in particular the diversity claim,
# since the content features in this notebook are random placeholders).
print("\n6. Overall Assessment:")
print("   Popularity baseline characteristics:")
print("   ✅ High user coverage (100%) - all users get recommendations")
print("   ✅ Consistent recommendations across users")
print("   ❌ Low novelty - only recommends popular items")
print("   ❌ Low diversity - same items for all users")
print("   ❌ Low serendipity - no personalization")
print("   ❌ Limited catalog coverage - only top-k popular items")

print("\n7. Recommendations for Improvement:")
print("   - Implement collaborative filtering for personalization")
print("   - Add content-based filtering for diversity")
print("   - Use hybrid approaches to balance popularity and personalization")
print("   - Consider user history for serendipity")
