## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict

## Load Data and Models

In [None]:
# Raw CSV tables, read in the order: users, programs, interactions.
users, programs, interactions = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/raw/users.csv",
        "../data/raw/programs.csv",
        "../data/raw/interactions.csv",
    )
)

# Persisted model artifacts.
# NOTE(review): joblib.load unpickles the file contents, so these artifacts
# should only ever come from a trusted source.
tfidf_vectorizer, tfidf_matrix, cf_model = (
    joblib.load(artifact_path)
    for artifact_path in (
        "../models/tfidf.pkl",
        "../models/program_tfidf.pkl",
        "../models/cf_svd.pkl",
    )
)

# Unpack the pieces of the collaborative-filtering bundle used below.
predicted_scores, user_id_map, reverse_item_map, interaction_matrix = (
    cf_model[component]
    for component in (
        "predicted_scores",
        "user_id_map",
        "reverse_item_map",
        "interaction_matrix",
    )
)

print(f"Data loaded: {len(users)} users, {len(programs)} programs, {len(interactions)} interactions")

## Create Ground Truth

For evaluation, we need to know which programs each user actually liked (interacted with).

In [None]:
# Create ground truth: the set of program_ids each user positively interacted with.
#
# A boolean mask plus a zip over the two ID columns replaces DataFrame.iterrows(),
# which is slow and coerces every row to a single dtype (so integer IDs can be
# turned into floats when the frame also holds float columns).
positive_interactions = interactions[interactions["interaction"] == 1]

ground_truth = defaultdict(set)
for uid, pid in zip(positive_interactions["user_id"], positive_interactions["program_id"]):
    ground_truth[uid].add(pid)

# np.mean([]) returns NaN and emits a RuntimeWarning, so report 0 when no user
# has any positive interaction.
interactions_per_user = [len(program_ids) for program_ids in ground_truth.values()]
avg_interactions = np.mean(interactions_per_user) if interactions_per_user else 0.0

print(f"Ground truth created for {len(ground_truth)} users")
print(f"Average interactions per user: {avg_interactions:.2f}")

## Recommendation Functions

Define the three recommendation approaches for evaluation.

In [None]:
def recommend_content_based(user_interests, k=5):
    """Rank programs by TF-IDF cosine similarity to a user's interests text.

    The text is vectorized with the module-level ``tfidf_vectorizer`` and
    compared against every row of ``tfidf_matrix``. Row positions are turned
    into ids by positional lookup in ``programs`` (this presumes the matrix
    rows are in the same order as ``programs`` -- confirm against the
    training notebook).

    Args:
        user_interests: Free-text description of the user's interests.
        k: Maximum number of program ids to return.

    Returns:
        List of ``program_id`` values, most similar first.
    """
    query_vec = tfidf_vectorizer.transform([user_interests])
    sims = cosine_similarity(query_vec, tfidf_matrix).ravel()

    # Ascending argsort reversed -> descending similarity.
    ranked_rows = np.argsort(sims)[::-1]

    picked = []
    for row_pos in ranked_rows[:k]:
        picked.append(programs.iloc[row_pos]["program_id"])
    return picked

def recommend_cf(user_id, k=5):
    """Rank programs for a user from the precomputed CF score matrix.

    Args:
        user_id: Identifier looked up in the module-level ``user_id_map``.
        k: Maximum number of program ids to return.

    Returns:
        List of program ids (via ``reverse_item_map``), highest predicted
        score first; an empty list when the user is not in ``user_id_map``.
    """
    if user_id in user_id_map:
        row = user_id_map[user_id]

        # Work on a copy so the shared score matrix is never modified.
        masked = predicted_scores[row].copy()

        # Columns the user already interacted with are pushed to the bottom
        # of the ranking.
        seen_cols = interaction_matrix[row].nonzero()[1]
        masked[seen_cols] = -np.inf

        ranking = np.argsort(masked)[::-1]
        return [reverse_item_map[col] for col in ranking[:k]]

    # Cold-start: no CF row exists for this user.
    return []

def recommend_hybrid(user_id, user_interests, k=5, content_weight=0.6, cf_weight=0.4):
    """Blend content-based and collaborative-filtering scores into a top-k list.

    Content scores are TF-IDF cosine similarities divided by their maximum
    (left unscaled when the maximum is not positive). CF scores are the user's
    row of ``predicted_scores``, min-max scaled using the smallest and largest
    entries that are greater than ``-inf``. For a user missing from
    ``user_id_map`` (cold-start) only the unweighted content scores are used.

    Args:
        user_id: Identifier looked up in the module-level ``user_id_map``.
        user_interests: Free-text description of the user's interests.
        k: Maximum number of program ids to return.
        content_weight: Multiplier applied to the scaled content score.
        cf_weight: Multiplier applied to the scaled CF score.

    Returns:
        List of ``program_id`` values, highest combined score first.
    """
    # Content-based scores, scaled by their maximum.
    user_vector = tfidf_vectorizer.transform([user_interests])
    content_scores = cosine_similarity(user_vector, tfidf_matrix).flatten()
    max_content = content_scores.max()
    content_scores = content_scores / (max_content if max_content > 0 else 1)

    program_ids = programs["program_id"]
    combined_scores = {}

    if user_id in user_id_map:
        cf_scores = predicted_scores[user_id_map[user_id]]

        # Min-max scale the CF scores, ignoring -inf entries when picking
        # the bounds.
        valid_scores = cf_scores[cf_scores > -np.inf]
        if len(valid_scores) > 0:
            min_score, max_score = valid_scores.min(), valid_scores.max()
            score_range = max_score - min_score if max_score > min_score else 1
            cf_scores_norm = (cf_scores - min_score) / score_range
        else:
            cf_scores_norm = cf_scores

        # Invert reverse_item_map once instead of scanning it for every
        # program. setdefault keeps the first index seen for a program id,
        # which is what a front-to-back linear search would return.
        item_index_of = {}
        for item_idx, mapped_program_id in reverse_item_map.items():
            item_index_of.setdefault(mapped_program_id, item_idx)

        for i, program_id in enumerate(program_ids):
            score = content_weight * content_scores[i]
            item_idx = item_index_of.get(program_id)
            if item_idx is not None:
                # Program is known to the CF model: add its weighted CF score.
                score = score + cf_weight * cf_scores_norm[item_idx]
            combined_scores[program_id] = score
    else:
        # Cold-start: use only the (unweighted) content-based scores.
        for i, program_id in enumerate(program_ids):
            combined_scores[program_id] = content_scores[i]

    # Stable sort, highest score first; ties keep their order in ``programs``.
    sorted_programs = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:k]
    return [p[0] for p in sorted_programs]

# Cell-level confirmation that the three recommend_* functions are now defined.
print("âœ“ Recommendation functions defined")

## Evaluation Metrics Implementation

In [None]:
def precision_at_k(recommended, relevant, k):
    """Precision@k: share of the k recommendation slots filled by relevant items.

    Args:
        recommended: Ranked list of recommended item ids.
        relevant: Set of item ids considered relevant for the user.
        k: Cut-off rank; the denominator is always ``k``, even when fewer
            than ``k`` items were recommended.

    Returns:
        Number of distinct relevant items in the top ``k`` divided by ``k``,
        or 0 when ``k`` is not positive.
    """
    hits = len(set(recommended[:k]) & relevant)
    if k > 0:
        return hits / k
    return 0

def recall_at_k(recommended, relevant, k):
    """Recall@k: share of the relevant items that appear in the top k.

    Args:
        recommended: Ranked list of recommended item ids.
        relevant: Set of item ids considered relevant for the user.
        k: Cut-off rank applied to ``recommended``.

    Returns:
        Number of distinct relevant items in the top ``k`` divided by the
        number of relevant items, or 0 when ``relevant`` is empty.
    """
    hits = len(set(recommended[:k]) & relevant)
    total_relevant = len(relevant)
    if total_relevant > 0:
        return hits / total_relevant
    return 0

def ndcg_at_k(recommended, relevant, k):
    """NDCG@k with binary relevance.

    Args:
        recommended: Ranked list of recommended item ids.
        relevant: Collection of item ids considered relevant (membership is
            tested with ``in``).
        k: Cut-off rank applied to ``recommended``.

    Returns:
        DCG of the top ``k`` divided by the ideal DCG, or 0 when the ideal
        DCG is zero (no relevant items, or ``k`` not positive).
    """
    # DCG: each hit at 0-based rank r contributes 1 / log2(r + 2), i.e.
    # 1 / log2(position + 1) with 1-based positions.
    dcg = sum(
        1 / np.log2(rank + 2)
        for rank, item in enumerate(recommended[:k])
        if item in relevant
    )

    # IDCG: the best case, with every relevant item packed at the top.
    ideal_hits = min(len(relevant), k)
    idcg = sum(1 / np.log2(rank + 2) for rank in range(ideal_hits))

    if idcg > 0:
        return dcg / idcg
    return 0

# Cell-level confirmation that the metric helpers are now defined.
print("âœ“ Metrics implemented")

## Evaluate All Three Approaches

In [None]:
# Evaluation parameters
K_VALUES = [1, 3, 5]
N_TEST_USERS = 50  # Only the first N users are evaluated, for speed
APPROACHES = ("content_based", "collaborative", "hybrid")
METRIC_FUNCS = {
    "precision": precision_at_k,
    "recall": recall_at_k,
    "ndcg": ndcg_at_k,
}

# Per-user scores: results[approach][k][metric] -> list of floats
results = {
    approach: {k: {metric: [] for metric in METRIC_FUNCS} for k in K_VALUES}
    for approach in APPROACHES
}

# Track coverage (which programs get recommended by each approach)
coverage = {approach: set() for approach in APPROACHES}

test_users = users.head(N_TEST_USERS)
max_k = max(K_VALUES)
evaluated_users = 0  # users that actually contributed metrics

for _, user in test_users.iterrows():
    user_id = user["user_id"]
    user_interests = user["interests_text"]

    # Ground truth for this user; users with none cannot be scored.
    relevant = ground_truth.get(user_id, set())
    if not relevant:
        continue
    evaluated_users += 1

    # Ask every approach for max(K_VALUES) items once; each smaller k is
    # evaluated on a prefix of the same list.
    recs_by_approach = {
        "content_based": recommend_content_based(user_interests, k=max_k),
        "collaborative": recommend_cf(user_id, k=max_k),
        "hybrid": recommend_hybrid(user_id, user_interests, k=max_k),
    }

    for approach, recs in recs_by_approach.items():
        coverage[approach].update(recs)

        # CF returns nothing for cold-start users; those users are left out
        # of the CF averages rather than counted as zeros.
        if approach == "collaborative" and not recs:
            continue

        for k in K_VALUES:
            for metric, metric_func in METRIC_FUNCS.items():
                results[approach][k][metric].append(metric_func(recs, relevant, k))

# Report how many users were really evaluated; users without ground truth
# were skipped above and must not be counted.
skipped_users = len(test_users) - evaluated_users
print(f"âœ“ Evaluation completed for {evaluated_users} users ({skipped_users} skipped: no positive interactions)")

## Results Summary

In [None]:
# Average every metric over users, one summary row per (approach, k).
summary = []

for approach in ("content_based", "collaborative", "hybrid"):
    approach_label = approach.replace("_", " ").title()
    per_k_scores = results[approach]

    for k in K_VALUES:
        avg_precision, avg_recall, avg_ndcg = (
            np.mean(per_k_scores[k][metric_key])
            for metric_key in ("precision", "recall", "ndcg")
        )

        # Values are stored pre-formatted to three decimals, as strings.
        row = {"Approach": approach_label, "k": k}
        row["Precision@k"] = f"{avg_precision:.3f}"
        row["Recall@k"] = f"{avg_recall:.3f}"
        row["NDCG@k"] = f"{avg_ndcg:.3f}"
        summary.append(row)

results_df = pd.DataFrame(summary)

# Banner, table, closing rule.
print("\n" + "="*70)
print("RECOMMENDATION SYSTEM EVALUATION RESULTS")
print("="*70 + "\n")
print(results_df.to_string(index=False))
print("\n" + "="*70)

## Coverage Analysis

Coverage measures what percentage of programs get recommended across all users.

In [None]:
# Catalogue size used as the denominator for coverage percentages.
total_programs = len(programs)

print("\nCOVERAGE ANALYSIS")
print("="*50)
for approach in coverage:
    # Distinct programs this approach recommended to any evaluated user.
    n_recommended = len(coverage[approach])
    coverage_pct = n_recommended / total_programs * 100
    approach_label = approach.replace('_', ' ').title()
    print(f"{approach_label:20s}: {n_recommended}/{total_programs} programs ({coverage_pct:.1f}%)")
print("="*50)

## Visualization: Metrics Comparison

In [None]:
# Axis titles and legend labels for the three-panel line chart.
metrics = ["Precision@k", "Recall@k", "NDCG@k"]
approaches = ["Content Based", "Collaborative", "Hybrid"]

fig, axes = plt.subplots(1, 3, figsize=(15, 4))
fig.suptitle("Recommendation System Performance Comparison", fontsize=14, fontweight='bold')

# (results key, legend label) pairs, in plotting order.
approach_series = list(zip(["content_based", "collaborative", "hybrid"], approaches))

# One panel per metric: mean score as a function of k, one line per approach.
for ax, metric, panel_title in zip(axes, ["precision", "recall", "ndcg"], metrics):
    for approach_key, approach_name in approach_series:
        mean_by_k = [np.mean(results[approach_key][k][metric]) for k in K_VALUES]
        ax.plot(K_VALUES, mean_by_k, marker='o', label=approach_name, linewidth=2)

    ax.set_xlabel('k (Number of Recommendations)', fontsize=10)
    ax.set_ylabel(panel_title, fontsize=10)
    ax.set_title(panel_title, fontsize=11, fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3)
    ax.set_xticks(K_VALUES)

plt.tight_layout()
plt.savefig("../models/evaluation_metrics.png", dpi=300, bbox_inches='tight')
plt.show()

print("âœ“ Visualization saved to ../models/evaluation_metrics.png")

## Bar Chart: Approach Comparison at k=3

In [None]:
# Grouped bar chart comparing the three approaches at a single cut-off.
k = 3
metric_names = ['Precision', 'Recall', 'NDCG']
metric_keys = ['precision', 'recall', 'ndcg']

# Mean of every metric at this k, per approach.
mean_scores = {
    approach_key: [np.mean(results[approach_key][k][m]) for m in metric_keys]
    for approach_key in ("content_based", "collaborative", "hybrid")
}

x = np.arange(len(metric_names))
width = 0.25

# (results key, bar positions, legend label, colour), drawn left to right.
bar_specs = [
    ("content_based", x - width, 'Content-Based', '#3498db'),
    ("collaborative", x, 'Collaborative', '#e74c3c'),
    ("hybrid", x + width, 'Hybrid', '#2ecc71'),
]

fig, ax = plt.subplots(figsize=(10, 6))
bar_groups = [
    ax.bar(positions, mean_scores[approach_key], width, label=legend_label, color=bar_color)
    for approach_key, positions, legend_label, bar_color in bar_specs
]

ax.set_xlabel('Metrics', fontsize=12)
ax.set_ylabel('Score', fontsize=12)
ax.set_title(f'Recommendation System Performance Comparison (k={k})', fontsize=14, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels([f"{m}@{k}" for m in metric_names])
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

# Write each bar's value just above it, centred horizontally.
for group in bar_groups:
    for rect in group:
        bar_height = rect.get_height()
        label_x = rect.get_x() + rect.get_width()/2.
        ax.text(label_x, bar_height, f'{bar_height:.3f}',
                ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.savefig("../models/evaluation_comparison.png", dpi=300, bbox_inches='tight')
plt.show()

print("âœ“ Comparison chart saved to ../models/evaluation_comparison.png")

## Key Findings and Analysis

In [None]:
# Pick the approach with the highest mean score for each metric at k=3.
k = 3
best_approaches = {}

# results key -> display label; dict order decides which approach wins a tie.
display_labels = {
    "content_based": "Content-Based",
    "collaborative": "Collaborative",
    "hybrid": "Hybrid",
}

for metric in ("precision", "recall", "ndcg"):
    scores = {
        label: np.mean(results[approach_key][k][metric])
        for approach_key, label in display_labels.items()
    }
    best_approaches[metric] = max(scores, key=scores.get)

print("\n" + "="*70)
print("KEY FINDINGS")
print("="*70)
print(f"\nðŸ“Š Best Approach by Metric (k={k}):")
for metric_key, metric_title in (("precision", "Precision"), ("recall", "Recall"), ("ndcg", "NDCG")):
    print(f"   â€¢ {metric_title}@{k}: {best_approaches[metric_key]}")

print(f"\nðŸ“ˆ Coverage:")
for approach in ("content_based", "collaborative", "hybrid"):
    pct = len(coverage[approach]) / total_programs * 100
    approach_title = approach.replace('_', ' ').title()
    print(f"   â€¢ {approach_title}: {pct:.1f}% of programs recommended")

print("\nðŸ’¡ Interpretation:")
for interpretation_line in (
    "   â€¢ Higher precision = More accurate recommendations",
    "   â€¢ Higher recall = Better at finding all relevant programs",
    "   â€¢ Higher NDCG = Better ranking quality (relevant items at top)",
    "   â€¢ Higher coverage = More diversity in recommendations",
):
    print(interpretation_line)
print("\n" + "="*70)

## Save Evaluation Results

In [None]:
# Summary table -> CSV (without the DataFrame index).
results_df.to_csv("../models/evaluation_results.csv", index=False)
print("âœ“ Results saved to ../models/evaluation_results.csv")

# Full per-user metrics plus metadata -> joblib file.
# Coverage sets are converted to lists before being stored.
evaluation_summary = dict(
    results=results,
    coverage={approach_name: list(program_ids) for approach_name, program_ids in coverage.items()},
    best_approaches=best_approaches,
    k_values=K_VALUES,
)

joblib.dump(evaluation_summary, "../models/evaluation_summary.pkl")
print("âœ“ Detailed evaluation saved to ../models/evaluation_summary.pkl")

## Summary

**Evaluation Complete! ✅**

**Metrics Evaluated:**
- ✅ Precision@k (1, 3, 5)
- ✅ Recall@k (1, 3, 5)
- ✅ NDCG@k (1, 3, 5)
- ✅ Coverage analysis

**Approaches Compared:**
- ✅ Content-Based Filtering
- ✅ Collaborative Filtering (SVD)
- ✅ Hybrid (60% content + 40% CF)

**Outputs Generated:**
- Results table with all metrics
- Line charts showing metric trends
- Bar chart comparison at k=3
- Coverage analysis
- CSV export for report

**Next Steps:**
1. Build FastAPI application
2. Create UI for user interaction
3. Add feedback logging
4. Write final report with these results