In [None]:
!pip install sqlalchemy psycopg2-binary nest_asyncio asyncpg tenacity scikit-learn matplotlib seaborn pandas numpy scipy tabulate tqdm

In [None]:
# === CELL 1: IMPORTS & ENVIRONMENT SETUP ===
import sys
import os
import asyncio
import nest_asyncio
nest_asyncio.apply()

# Data & ML
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Production Backend
sys.path.append('../pariwisata-recommender/backend')
from app.models.rating import Rating
from app.models.destinations import Destination  
from app.models.user import User
from app.services.hybrid_recommender import HybridRecommender
from app.services.content_based_recommender import ContentBasedRecommender
from app.services.collaborative_recommender import CollaborativeRecommender
from app.services.mab_optimizer import MABOptimizer
from app.services.real_time_data import RealTimeContextService

# Database
from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession
from sqlalchemy.orm import sessionmaker
from sqlalchemy.future import select

# === EVALUATION IMPORTS (from Deepseek) ===
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import pickle
import json
from datetime import datetime
from collections import defaultdict

# === CONFIGURATION ===
# Connection string for the async PostgreSQL engine. It can be overridden with
# the EVAL_DATABASE_URL environment variable so credentials do not have to be
# edited into the notebook; the literal below stays as the fallback so an
# existing local setup keeps working unchanged.
DATABASE_URL = os.environ.get(
    "EVAL_DATABASE_URL",
    "postgresql+asyncpg://user:rekompari@localhost:5432/pariwisata",
)
engine = create_async_engine(DATABASE_URL, echo=False)
# Factory producing AsyncSession objects bound to the engine above.
AsyncSessionLocal = sessionmaker(engine, class_=AsyncSession)

# Visualization settings
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print("‚úÖ Environment setup complete!")
print(f"üìÖ Evaluation timestamp: {datetime.now()}")


In [None]:
# === CELL 2: DATA LOADING ===
async def load_production_data():
    """Load every ``Rating`` row from PostgreSQL into a DataFrame.

    Returns:
        pandas.DataFrame with the columns ``user_id``, ``item_id`` (taken from
        the rating's ``destination_id``), ``rating`` and ``created_at``, one
        row per rating. The columns are present even when no rating exists,
        so callers can always index them.
    """
    columns = ['user_id', 'item_id', 'rating', 'created_at']

    print("üìä Loading production data from PostgreSQL...")

    async with AsyncSessionLocal() as db_session:
        result = await db_session.execute(select(Rating))
        ratings = result.scalars().all()

        # Copy the plain values out of the ORM objects while the session is
        # still open.
        rows = [
            {
                'user_id': rating.user_id,
                'item_id': rating.destination_id,
                'rating': rating.rating,
                'created_at': rating.created_at,
            }
            for rating in ratings
        ]

    # Passing `columns` keeps the schema stable for an empty table; without it
    # an empty list yields a frame with no columns at all.
    df = pd.DataFrame(rows, columns=columns)

    n_users = df['user_id'].nunique()
    n_items = df['item_id'].nunique()

    print(f"‚úÖ Dataset loaded:")
    print(f"   Total ratings: {len(df):,}")
    print(f"   Total users: {n_users:,}")
    print(f"   Total items: {n_items:,}")

    if df.empty:
        # Rating range and sparsity are undefined without data (the sparsity
        # formula would divide by zero), so report that instead of crashing.
        print("   No ratings found - rating range and sparsity are undefined")
        return df

    print(f"   Rating range: {df['rating'].min():.1f} - {df['rating'].max():.1f}")
    print(f"   Sparsity: {(1 - len(df)/(n_users*n_items))*100:.2f}%")

    return df

# Execute: load all ratings into the module-level `ratings_df`.
# This is a top-level await, so it needs an interpreter that supports it
# (e.g. a notebook kernel); a plain Python script would reject this line.
ratings_df = await load_production_data()


In [None]:
# === CELL 3: STRATIFIED TRAIN/TEST SPLIT ===
def create_stratified_split(df, test_size=0.2, min_ratings=3, random_state=42):
    """Split ratings into train/test sets with a per-user temporal hold-out.

    Users with fewer than ``min_ratings`` ratings are dropped. For every
    remaining user the ratings are ordered by ``created_at``; the earliest
    ``1 - test_size`` fraction (always at least one rating) goes to the train
    set and the remainder to the test set. Users are additionally bucketed by
    activity level (high: >= 10 ratings, medium: >= 5, otherwise low). The
    buckets are only reported; the split itself does not depend on them.

    Args:
        df: Ratings with at least the columns ``user_id`` and ``created_at``;
            all other columns are carried through unchanged.
        test_size: Fraction of each user's most recent ratings to hold out.
        min_ratings: Minimum number of ratings a user needs to be kept.
        random_state: Unused. The split is fully deterministic; the parameter
            is kept so existing calls stay valid.

    Returns:
        Tuple ``(train_df, test_df, activity_df)``. ``activity_df`` has the
        columns ``user_id``, ``count`` and ``level``. When no user qualifies,
        all three frames are empty but keep their columns.
    """

    print(f"\n‚úÇÔ∏è Creating stratified train/test split...")

    # Filter users with sufficient ratings
    user_rating_counts = df.groupby('user_id').size()
    valid_counts = user_rating_counts[user_rating_counts >= min_ratings]
    valid_users = valid_counts.index
    df_filtered = df[df['user_id'].isin(valid_users)].copy()

    print(f"   Users with ‚â•{min_ratings} ratings: {len(valid_users):,}")

    def _activity_level(count):
        """Map a rating count to its activity bucket."""
        if count >= 10:
            return 'high'
        if count >= 5:
            return 'medium'
        return 'low'

    # Explicit columns keep the frame usable when no user qualifies.
    activity_df = pd.DataFrame(
        [
            {'user_id': user_id, 'count': count, 'level': _activity_level(count)}
            for user_id, count in valid_counts.items()
        ],
        columns=['user_id', 'count', 'level'],
    )

    print(f"\nüìä User Activity Distribution:")
    for level in ['high', 'medium', 'low']:
        count = len(activity_df[activity_df['level'] == level])
        print(f"   {level.capitalize()}: {count:,} users")

    # Temporal split per user. groupby hands over each user's rows in a single
    # pass instead of filtering the whole frame once per user; groups come out
    # sorted by user_id, the same order as `valid_users`.
    train_parts = []
    test_parts = []

    user_groups = df_filtered.groupby('user_id')
    for _, user_ratings in tqdm(user_groups, total=len(valid_users), desc="Splitting data"):
        user_ratings = user_ratings.sort_values('created_at')

        # At least 1 rating always stays in train
        split_idx = max(1, int(len(user_ratings) * (1 - test_size)))

        train_parts.append(user_ratings.iloc[:split_idx])
        if len(user_ratings) > split_idx:
            test_parts.append(user_ratings.iloc[split_idx:])

    # pd.concat raises on an empty list, so fall back to an empty frame that
    # still carries the input columns.
    empty = df_filtered.iloc[0:0].reset_index(drop=True)
    train_df = pd.concat(train_parts, ignore_index=True) if train_parts else empty.copy()
    test_df = pd.concat(test_parts, ignore_index=True) if test_parts else empty.copy()

    total = len(df_filtered)
    train_pct = len(train_df) / total * 100 if total else 0.0
    test_pct = len(test_df) / total * 100 if total else 0.0

    print(f"\n‚úÖ Split completed:")
    print(f"   Train: {len(train_df):,} ratings ({train_pct:.1f}%)")
    print(f"   Test: {len(test_df):,} ratings ({test_pct:.1f}%)")
    print(f"   Train users: {train_df['user_id'].nunique():,}")
    print(f"   Test users: {test_df['user_id'].nunique():,}")

    return train_df, test_df, activity_df

# Execute: hold out the most recent 20% of each user's ratings (ordered by
# created_at) as the test set, keeping only users with at least 3 ratings.
train_df, test_df, user_activity_df = create_stratified_split(ratings_df, test_size=0.2, min_ratings=3)


In [None]:
# === CELL 4: MODEL INITIALIZATION  ===
class EvaluationFramework:
    """Container for the recommender variants being evaluated.

    ``initialize_models`` creates and trains the models; one collaborative
    filtering instance is shared between the stand-alone CF entry and all
    hybrid variants.
    """
    
    def __init__(self):
        # Model name -> recommender instance, filled by initialize_models().
        self.models = {}
        self.results = defaultdict(dict)
        self.context_service = RealTimeContextService()
        self._shared_cf_model = None
        # Created by initialize_models(). Declared here so that reading the
        # attribute before initialisation yields None instead of raising
        # AttributeError.
        self.mab_optimizer = None
        
    async def initialize_models(self, db_session, train_df=None):
        """Create and train every model variant.

        Args:
            db_session: Database session handed to each model's ``train``.
            train_df: Training ratings (columns ``user_id`` and ``item_id``
                are read). Required; the default of ``None`` only exists so
                the omission produces an explicit error.

        Raises:
            ValueError: If ``train_df`` is None.

        NOTE(review): ``train_df`` only restricts collaborative filtering
        when the CF model exposes ``train_on_subset``. Otherwise CF trains
        through ``train(db_session)`` and a warning is printed - the
        evaluation then presumably sees test ratings during training;
        confirm against CollaborativeRecommender.
        """
        
        print("\nü§ñ Initializing models with TRAIN data...")
        
        if train_df is None:
            raise ValueError("‚ùå train_df required! Models must train on train set only!")
        
        # Users and items that occur in the training split.
        train_user_ids = train_df['user_id'].unique()
        train_item_ids = train_df['item_id'].unique()
        
        print(f"   üìä Train set: {len(train_user_ids):,} users, {len(train_item_ids):,} items")
        print(f"   üìä Train ratings: {len(train_df):,}")
        
        # 1. Content-Based: trained through the session, not from train_df.
        print("\n   üìö Training Content-Based...")
        self.models['CB'] = ContentBasedRecommender()
        await self.models['CB'].train(db_session)
        
        # 2. Collaborative Filtering: one instance, reused by the hybrids.
        print("   ü§ù Training Collaborative Filtering...")
        self._shared_cf_model = CollaborativeRecommender()
        
        if hasattr(self._shared_cf_model, 'train_on_subset'):
            # Preferred path: restrict training to the train-split users.
            await self._shared_cf_model.train_on_subset(db_session, train_user_ids)
        else:
            # Fallback: regular training through the session.
            await self._shared_cf_model.train(db_session)
            print("      ‚ö†Ô∏è CF trained on ALL data - test users may work by similarity")
        
        self.models['CF'] = self._shared_cf_model
        
        # 3. Hybrid variants: identical construction, they are told apart
        #    only by the arguments passed at prediction time.
        # NOTE(review): each variant trains its own ContentBasedRecommender,
        # so content-based training runs four times in total; sharing one
        # instance (as done for CF) may be possible - verify it is safe.
        print("   üîó Setting up Hybrid variants...")
        for variant_name in ['Hybrid', 'Hybrid+MMR_Static', 'Hybrid+MAB_MMR']:
            hybrid = HybridRecommender()
            hybrid.collaborative_recommender = self._shared_cf_model
            hybrid.content_recommender = ContentBasedRecommender()
            await hybrid.content_recommender.train(db_session)
            # Components were trained individually, so mark the hybrid ready.
            hybrid.is_trained = True
            self.models[variant_name] = hybrid
        
        # Setup MAB
        self.mab_optimizer = MABOptimizer(n_arms=11, exploration_param=2.0)
        
        print("\n‚úÖ All models initialized with train data!")
        print(f"üíæ Memory optimization: Shared CF model")
        
# Initialize WITH TRAIN DATA: build the framework, then train all models
# inside a session that is closed again as soon as training finishes.
evaluator = EvaluationFramework()

async with AsyncSessionLocal() as db_session:
    await evaluator.initialize_models(db_session, train_df=train_df)  # train_df is mandatory: initialize_models raises ValueError without it



In [None]:
# === CELL 5: BASELINE EVALUATION FUNCTION  ===
def calculate_metrics(recommendations, ground_truth, k=10):
    """Compute top-k ranking metrics for one user's recommendation list.

    Args:
        recommendations: Ranked sequence of hashable item ids, best first.
            Lists and NumPy arrays are both accepted.
        ground_truth: Iterable of relevant item ids. Duplicates are ignored.
        k: Cut-off; only the first ``k`` recommendations are scored.

    Returns:
        dict with the keys
        ``precision`` - distinct relevant items / number of scored items,
        ``recall``    - distinct relevant items / distinct ground-truth items,
        ``ndcg``      - binary-relevance NDCG@k, always within [0, 1],
        ``diversity`` - share of distinct items in the scored list (0.0 for
                        lists shorter than two items),
        ``hits``      - number of distinct relevant items in the scored list.
    """
    # Materialise as a list so the truthiness checks below also work for
    # NumPy arrays.
    rec_items = list(recommendations[:k])
    # A set gives O(1) membership tests and removes duplicate relevant ids,
    # which would otherwise deflate recall and inflate the ideal DCG.
    relevant = set(ground_truth)

    # Relevance-based metrics
    hits = len(set(rec_items) & relevant)

    precision = hits / len(rec_items) if rec_items else 0.0
    recall = hits / len(relevant) if relevant else 0.0

    # NDCG: each relevant item earns its gain once, at its first position.
    # Counting a repeated item again would let DCG exceed the ideal DCG.
    dcg = 0.0
    credited = set()
    for rank, item in enumerate(rec_items):
        if item in relevant and item not in credited:
            dcg += 1 / np.log2(rank + 2)
            credited.add(item)
    idcg = sum(1 / np.log2(i + 2) for i in range(min(k, len(relevant))))
    ndcg = dcg / idcg if idcg > 0 else 0.0

    # Diversity proxy: fraction of distinct item ids in the list. This is
    # not a category- or similarity-based measure.
    if len(rec_items) > 1:
        diversity = len(set(rec_items)) / len(rec_items)
    else:
        diversity = 0.0

    return {
        'precision': precision,
        'recall': recall,
        'ndcg': ndcg,
        'diversity': diversity,
        'hits': hits
    }

async def evaluate_model(model, model_name, test_df, db_session, 
                        use_mmr=False, lambda_mmr=None, use_mab=False, 
                        context_service=None, mab_optimizer=None):
    """Score one model on (at most) the first 100 users of the test set.

    For each user the items rated >= 4.0 in ``test_df`` form the ground
    truth; users without such items are skipped. Ten recommendations are
    requested and scored with ``calculate_metrics``. Users whose prediction
    raises, or who receive an empty list, do not contribute to the averages.

    The prediction call depends on the model:
      * 'Content-Based' / 'Collaborative Filtering': ``predict(user_id,
        num_recommendations)`` returning a list of ids or of dicts.
      * everything else: ``predict(..., db, lambda_mmr, mab_optimizer,
        context)`` returning a 2-tuple whose first element is a list of
        dicts with a 'destination_id' key. The second element is ignored.

    Returns:
        dict with 'precision@10', 'recall@10', 'ndcg@10', 'diversity' and
        'n_users', or None when no user could be evaluated.
    """

    print(f"\nüîç Evaluating {model_name}...")

    # These two models use the older predict() signature without a session.
    legacy_api = model_name in ['Content-Based', 'Collaborative Filtering']

    metrics_list = []
    candidate_users = test_df['user_id'].unique()[:100]

    for user_id in tqdm(candidate_users, desc=f"  {model_name}"):
        try:
            # Ground truth: items this user rated 4.0 or higher in the test set
            user_rows = test_df[test_df['user_id'] == user_id]
            ground_truth = user_rows.loc[user_rows['rating'] >= 4.0, 'item_id'].tolist()

            if not ground_truth:
                continue

            rec_items = []

            if legacy_api:
                try:
                    recommendations = await model.predict(
                        user_id=user_id,
                        num_recommendations=10
                    )
                    # Normalise the return value to a flat list of item ids
                    if not isinstance(recommendations, list):
                        rec_items = []
                    elif len(recommendations) > 0 and isinstance(recommendations[0], dict):
                        rec_items = [r.get('destination_id') or r.get('item_id') for r in recommendations]
                    else:
                        rec_items = recommendations
                except Exception as e:
                    print(f"      ‚ö†Ô∏è User {user_id} prediction failed: {str(e)[:50]}")
                    continue
            else:
                # The hybrid variants differ only in their re-ranking arguments
                if use_mab and context_service and mab_optimizer:
                    # MAB-MMR: lambda is chosen by the optimizer from the context
                    context = await context_service.get_current_context()
                    rerank_args = {
                        'lambda_mmr': None,
                        'mab_optimizer': mab_optimizer,
                        'context': context,
                    }
                elif use_mmr and lambda_mmr is not None:
                    # Static MMR with the caller-supplied lambda
                    rerank_args = {
                        'lambda_mmr': lambda_mmr,
                        'mab_optimizer': None,
                        'context': {},
                    }
                else:
                    # Plain hybrid baseline, default lambda
                    rerank_args = {
                        'lambda_mmr': 0.7,
                        'mab_optimizer': None,
                        'context': None,
                    }

                recommendations, _ = await model.predict(
                    user_id=user_id,
                    num_recommendations=10,
                    db=db_session,
                    **rerank_args
                )
                rec_items = [r['destination_id'] for r in recommendations]

            # Only non-empty recommendation lists are scored
            if rec_items:
                metrics_list.append(calculate_metrics(rec_items, ground_truth, k=10))

        except Exception as e:
            print(f"      ‚ö†Ô∏è User {user_id} failed: {str(e)[:50]}")
            continue

    # Nothing to aggregate
    if not metrics_list:
        print(f"‚ö†Ô∏è {model_name}: No valid evaluations!")
        return None

    # Mean of every per-user metric
    results = {
        f'{name}@10': np.mean([m[name] for m in metrics_list])
        for name in ('precision', 'recall', 'ndcg')
    }
    results['diversity'] = np.mean([m['diversity'] for m in metrics_list])
    results['n_users'] = len(metrics_list)

    print(f"‚úÖ {model_name} evaluated: {results['n_users']} users")
    print(f"   Precision@10: {results['precision@10']:.4f}")
    print(f"   Diversity: {results['diversity']:.4f}")

    return results

In [None]:
# === CHECK TRAINING STATUS ===
# Prints, for every model held by `evaluator`, its `is_trained` flag and -
# for the hybrid variants - the flags of the CF and CB components.
print("üîç CHECKING MODEL TRAINING STATUS")
print("=" * 80)

for model_name, model in evaluator.models.items():
    print(f"\n{model_name}:")

    # Report the model's own flag, or that it has none
    if not hasattr(model, 'is_trained'):
        print(f"   ‚ö†Ô∏è No is_trained attribute")
    else:
        print(f"   is_trained flag: {model.is_trained}")

    # Only hybrid models wrap component recommenders
    if not model_name.startswith('Hybrid'):
        continue

    if hasattr(model, 'collaborative_recommender'):
        # A missing flag on the component counts as "not trained"
        cf_trained = getattr(model.collaborative_recommender, 'is_trained', False)
        print(f"   CF component trained: {cf_trained}")

    if hasattr(model, 'content_recommender'):
        cb_trained = getattr(model.content_recommender, 'is_trained', False)
        print(f"   CB component trained: {cb_trained}")

print("\n" + "=" * 80)


In [None]:
# === CELL 6: RUN ALL EVALUATIONS  ===
async def run_comprehensive_evaluation():
    """Evaluate all five model variants on the test set, in a fixed order.

    Uses the module-level ``evaluator`` and ``test_df`` and a single database
    session for the whole run.

    Returns:
        dict mapping the result key of each variant to whatever
        ``evaluate_model`` returned for it.
    """
    banner = "=" * 80
    print("\n" + banner)
    print("üéØ COMPREHENSIVE BASELINE EVALUATION")
    print(banner)

    # (key in the returned dict, key in evaluator.models, display name,
    #  extra keyword arguments for evaluate_model)
    evaluation_plan = [
        # 1. Content-Based
        ('Content-Based', 'CB', 'Content-Based', {}),
        # 2. Collaborative Filtering
        ('Collaborative Filtering', 'CF', 'Collaborative Filtering', {}),
        # 3. Hybrid (no reranking)
        ('Hybrid', 'Hybrid', 'Hybrid', {}),
        # 4. Hybrid + Static MMR (lambda = 0.5)
        ('Hybrid+MMR_Static', 'Hybrid+MMR_Static', 'Hybrid+MMR(Œª=0.5)',
         {'use_mmr': True, 'lambda_mmr': 0.5}),
        # 5. Hybrid + MAB-MMR (dynamic lambda) - the proposed method
        ('Hybrid+MAB_MMR', 'Hybrid+MAB_MMR', 'Hybrid+MAB-MMR',
         {'use_mmr': True, 'use_mab': True}),
    ]

    all_results = {}

    async with AsyncSessionLocal() as db_session:
        for result_key, model_key, display_name, options in evaluation_plan:
            model = evaluator.models[model_key]
            if options.get('use_mab'):
                # The bandit variant also needs the context service and the
                # optimizer; they are looked up only when that run starts.
                options = {
                    **options,
                    'context_service': evaluator.context_service,
                    'mab_optimizer': evaluator.mab_optimizer,
                }
            all_results[result_key] = await evaluate_model(
                model, display_name, test_df, db_session, **options
            )

    return all_results

# Execute comprehensive evaluation
evaluation_results = await run_comprehensive_evaluation()

# Display results table
results_df = pd.DataFrame(evaluation_results).T
print("\n" + "="*80)
print("üìä EVALUATION RESULTS SUMMARY")
print("="*80)
print(results_df.to_string())


In [None]:
# === DIAGNOSTIC CELL ===
# Reports whether the evaluation results and the evaluator object exist in
# the notebook namespace, and how many users each model was evaluated on.
print("üîç CHECKING EVALUATION STATUS")
print("=" * 80)

# Evaluation results: present at all, and per-model outcome
if 'evaluation_results' not in globals():
    print("‚ùå evaluation_results variable NOT FOUND!")
    print("   Did Cell 6 complete successfully?")
else:
    print("‚úÖ evaluation_results variable exists")
    print(f"   Models evaluated: {list(evaluation_results)}")

    for model, results in evaluation_results.items():
        if results is not None:
            print(f"   ‚úÖ {model}: {results.get('n_users', 0)} users")
            continue
        # None marks a model whose evaluation produced nothing
        print(f"   ‚ùå {model}: None (FAILED or NOT RUN)")

# Evaluator: present, and which models it holds
if 'evaluator' not in globals():
    print("\n‚ùå evaluator NOT FOUND!")
else:
    print("\n‚úÖ evaluator exists")
    print(f"   Models loaded: {list(evaluator.models)}")

print("=" * 80)


In [None]:
# === CELL 7: STATISTICAL SIGNIFICANCE TESTS  ===
def perform_statistical_tests(results_dict):
    """Compare the proposed model with each baseline by relative improvement.

    NOTE: despite the name, no hypothesis test is performed. Only the
    aggregated means are available here, so the function reports the
    percentage change of the proposed model ('Hybrid+MAB_MMR') over each
    baseline for precision@10 and diversity. A paired test would need the
    per-user metric values.

    Args:
        results_dict: Maps model name to a metrics dict containing
            'precision@10' and 'diversity', or to None for a model whose
            evaluation failed.

    Returns:
        pandas.DataFrame with the columns 'comparison',
        'precision_improvement' and 'diversity_improvement' (percent), one
        row per baseline that has results. Empty (columns still present)
        when the proposed model has no results. An improvement is reported
        as 0 when the baseline value is not positive.
    """
    
    print("\n" + "="*80)
    print("üìä STATISTICAL SIGNIFICANCE TESTS")
    print("="*80)
    
    # Baseline comparison reference
    baseline_models = ['Content-Based', 'Collaborative Filtering', 'Hybrid', 'Hybrid+MMR_Static']
    proposed_model = 'Hybrid+MAB_MMR'
    columns = ['comparison', 'precision_improvement', 'diversity_improvement']

    def _relative_improvement(new, old):
        """Percentage change from old to new; 0 when old is not positive."""
        return ((new - old) / old * 100) if old > 0 else 0
    
    significance_results = []

    # A missing or failed (None) proposed model leaves nothing to compare.
    proposed = results_dict.get(proposed_model)
    if proposed is None:
        return pd.DataFrame(significance_results, columns=columns)
    
    for baseline in baseline_models:
        baseline_metrics = results_dict.get(baseline)
        # Skip baselines that were not run or whose evaluation failed;
        # subscripting None would raise TypeError.
        if baseline_metrics is None:
            continue
        
        precision_improvement = _relative_improvement(
            proposed['precision@10'], baseline_metrics['precision@10']
        )
        diversity_improvement = _relative_improvement(
            proposed['diversity'], baseline_metrics['diversity']
        )
        
        print(f"\nüîç {proposed_model} vs {baseline}:")
        print(f"   Precision improvement: {precision_improvement:+.2f}%")
        print(f"   Diversity improvement: {diversity_improvement:+.2f}%")
        
        significance_results.append({
            'comparison': f'{proposed_model} vs {baseline}',
            'precision_improvement': precision_improvement,
            'diversity_improvement': diversity_improvement
        })
    
    return pd.DataFrame(significance_results, columns=columns)

# Run statistical tests: build the proposed-vs-baseline comparison table
# (relative improvement in precision and diversity) and print it without the
# row index.
significance_df = perform_statistical_tests(evaluation_results)
print("\nüìä Statistical Significance Summary:")
print(significance_df.to_string(index=False))


In [None]:
# === CELL 8: VISUALIZATIONS  ===
def create_evaluation_visualizations(results_df, output_dir='evaluation_plots'):
    """Render, save and show the comparison figures for the evaluated models.

    Three PNG files are written to ``output_dir`` (created when missing):
    ``metrics_comparison.png`` (one bar chart per metric),
    ``precision_diversity_tradeoff.png`` (scatter plot) and
    ``radar_chart.png``. Any number of models is supported.

    Args:
        results_df: One row per model (index = model name) with the columns
            'precision@10', 'recall@10', 'ndcg@10' and 'diversity'.
        output_dir: Target directory for the image files.

    Side effects:
        Sets the global matplotlib rcParams for figure/savefig dpi (300) and
        font size (10).
    """
    
    os.makedirs(output_dir, exist_ok=True)
    
    # Set publication quality
    plt.rcParams['figure.dpi'] = 300
    plt.rcParams['savefig.dpi'] = 300
    plt.rcParams['font.size'] = 10

    metrics = ['precision@10', 'recall@10', 'ndcg@10', 'diversity']
    titles = ['Precision@10', 'Recall@10', 'NDCG@10', 'Diversity']
    model_names = [str(name) for name in results_df.index]

    def _scatter_style(name):
        """Colour and marker size by role, derived from the model name."""
        if 'MAB' in name:
            return 'red', 200      # proposed method
        if 'MMR' in name:
            return 'green', 150    # static re-ranking
        return 'blue', 100         # plain baseline
    
    # 1. Metrics Comparison Bar Chart
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    
    for ax, metric, title in zip(axes.flat, metrics, titles):
        results_df[metric].plot(kind='bar', ax=ax, color=sns.color_palette("husl", len(results_df)))
        ax.set_title(f'{title} Comparison', fontsize=12, fontweight='bold')
        ax.set_ylabel(title)
        ax.set_xlabel('Model')
        ax.grid(axis='y', alpha=0.3)
        ax.set_xticklabels(results_df.index, rotation=45, ha='right')
        
        # Highlight the proposed method wherever it sits in the table, rather
        # than assuming it is the last row.
        for patch, name in zip(ax.patches, model_names):
            if 'MAB' in name:
                patch.set_facecolor('red')
                patch.set_alpha(0.8)
    
    plt.tight_layout()
    plt.savefig(f'{output_dir}/metrics_comparison.png', bbox_inches='tight')
    print(f"‚úÖ Saved: {output_dir}/metrics_comparison.png")
    plt.show()
    # Release the figure; harmless if the backend already closed it.
    plt.close(fig)
    
    # 2. Precision vs Diversity Scatter
    fig, ax = plt.subplots(figsize=(10, 6))
    
    for model, name in zip(results_df.index, model_names):
        color, size = _scatter_style(name)
        x = results_df.loc[model, 'precision@10']
        y = results_df.loc[model, 'diversity']
        ax.scatter(x, y, s=size, c=color, alpha=0.6, edgecolors='black', linewidth=2)
        ax.annotate(model, (x, y),
                   xytext=(5, 5), textcoords='offset points', fontsize=9)
    
    ax.set_xlabel('Precision@10', fontsize=12, fontweight='bold')
    ax.set_ylabel('Diversity', fontsize=12, fontweight='bold')
    ax.set_title('Precision vs Diversity Trade-off', fontsize=14, fontweight='bold')
    ax.grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.savefig(f'{output_dir}/precision_diversity_tradeoff.png', bbox_inches='tight')
    print(f"‚úÖ Saved: {output_dir}/precision_diversity_tradeoff.png")
    plt.show()
    plt.close(fig)
    
    # 3. Radar Chart
    from math import pi
    
    fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(projection='polar'))
    
    categories = ['Precision@10', 'Recall@10', 'NDCG@10', 'Diversity']
    N = len(categories)
    
    # One spoke per metric; the first angle is repeated to close the polygon.
    angles = [n / float(N) * 2 * pi for n in range(N)]
    angles += angles[:1]
    
    ax.set_theta_offset(pi / 2)
    ax.set_theta_direction(-1)
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(categories, fontsize=11)
    
    # Plot each model; the palette repeats when there are more models than
    # colours instead of running out of entries.
    colors_radar = ['blue', 'orange', 'green', 'purple', 'red']
    for i, model in enumerate(results_df.index):
        color = colors_radar[i % len(colors_radar)]
        values = [results_df.loc[model, metric] for metric in metrics]
        values += values[:1]
        
        ax.plot(angles, values, 'o-', linewidth=2, label=model, color=color)
        ax.fill(angles, values, alpha=0.15, color=color)
    
    ax.set_ylim(0, max(results_df[metrics].max()) * 1.1)
    ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1), fontsize=10)
    ax.set_title('Multi-Metric Performance Comparison', size=14, fontweight='bold', pad=20)
    ax.grid(True)
    
    plt.tight_layout()
    plt.savefig(f'{output_dir}/radar_chart.png', bbox_inches='tight')
    print(f"‚úÖ Saved: {output_dir}/radar_chart.png")
    plt.show()
    plt.close(fig)
    
    print(f"\n‚úÖ All visualizations saved to {output_dir}/")

# Create visualizations: writes the three PNG figures into the default
# 'evaluation_plots' directory and displays them.
create_evaluation_visualizations(results_df)


In [None]:
# === CELL 9: EXPORT RESULTS ===
def export_final_results(evaluation_results, results_df, significance_df, output_prefix='final_evaluation'):
    """Write the evaluation outcome to JSON, CSV, pickle and LaTeX files.

    All four files share the name ``<output_prefix>_<timestamp>`` and differ
    only in their extension. Besides its arguments the function reads the
    module-level ``ratings_df``, ``train_df``, ``test_df`` and
    ``user_activity_df``.

    Returns:
        Tuple ``(json_file, csv_file, pkl_file, latex_file)`` of file names.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    stem = f'{output_prefix}_{timestamp}'
    json_file, csv_file, pkl_file, latex_file = (
        f'{stem}.{extension}' for extension in ('json', 'csv', 'pkl', 'tex')
    )

    # 1. JSON: dataset summary plus per-model results and comparison rows
    dataset_summary = {
        'total_ratings': len(ratings_df),
        'total_users': ratings_df['user_id'].nunique(),
        'total_items': ratings_df['item_id'].nunique(),
        'train_size': len(train_df),
        'test_size': len(test_df)
    }
    json_output = {
        'timestamp': timestamp,
        'dataset': dataset_summary,
        'models_evaluated': list(evaluation_results.keys()),
        'results': evaluation_results,
        'statistical_significance': significance_df.to_dict('records')
    }
    with open(json_file, 'w') as f:
        json.dump(json_output, f, indent=2)
    print(f"‚úÖ Saved JSON: {json_file}")

    # 2. CSV: the results table as is
    results_df.to_csv(csv_file)
    print(f"‚úÖ Saved CSV: {csv_file}")

    # 3. Pickle: the Python objects themselves, for later re-analysis
    snapshot = {
        'results': evaluation_results,
        'results_df': results_df,
        'significance_df': significance_df,
        'user_activity_df': user_activity_df,
        'timestamp': timestamp
    }
    with open(pkl_file, 'wb') as f:
        pickle.dump(snapshot, f)
    print(f"‚úÖ Saved Pickle: {pkl_file}")

    # 4. LaTeX: results table wrapped in a table environment
    table_header = (
        "% Evaluation Results Table for Thesis\n",
        "\\begin{table}[htbp]\n",
        "\\centering\n",
        "\\caption{Comparative Evaluation Results}\n",
        "\\label{tab:evaluation_results}\n",
    )
    with open(latex_file, 'w') as f:
        for line in table_header:
            f.write(line)
        f.write(results_df.to_latex(float_format="%.4f"))
        f.write("\\end{table}\n")
    print(f"‚úÖ Saved LaTeX: {latex_file}")

    print(f"\nüéâ All results exported successfully!")
    return json_file, csv_file, pkl_file, latex_file

# Export all results: writes the JSON, CSV, pickle and LaTeX files with the
# default 'final_evaluation' prefix and keeps the four file names.
json_file, csv_file, pkl_file, latex_file = export_final_results(
    evaluation_results, results_df, significance_df
)


In [None]:
# === CELL 10: SUMMARY REPORT ===
def print_final_summary(results_df):
    """Print the best model per metric and the proposed method's results.

    Args:
        results_df: One row per model (index = model name) with the columns
            'precision@10', 'recall@10', 'ndcg@10', 'diversity' and
            'n_users'.

    The relative precision improvement of 'Hybrid+MAB_MMR' over the best
    other model is only printed as a percentage when that baseline precision
    is positive; otherwise it is reported as not available. An empty table
    produces a short notice instead of a summary.
    """
    
    print("\n" + "="*80)
    print("üéì FINAL EVALUATION SUMMARY")
    print("Academic Paper: Proving MAB-MMR Superiority")
    print("="*80)

    # idxmax cannot pick a winner from an empty table.
    if results_df.empty:
        print("\nNo evaluation results available - nothing to summarise.")
        print("="*80)
        return
    
    # Best performing model
    best_precision = results_df['precision@10'].idxmax()
    best_diversity = results_df['diversity'].idxmax()
    best_ndcg = results_df['ndcg@10'].idxmax()
    
    print(f"\nüèÜ BEST PERFORMERS:")
    print(f"   Precision@10: {best_precision} ({results_df.loc[best_precision, 'precision@10']:.4f})")
    print(f"   Diversity: {best_diversity} ({results_df.loc[best_diversity, 'diversity']:.4f})")
    print(f"   NDCG@10: {best_ndcg} ({results_df.loc[best_ndcg, 'ndcg@10']:.4f})")
    
    # MAB-MMR performance
    if 'Hybrid+MAB_MMR' in results_df.index:
        mab_results = results_df.loc['Hybrid+MAB_MMR']
        print(f"\nüéØ PROPOSED METHOD (Hybrid+MAB-MMR) PERFORMANCE:")
        print(f"   Precision@10: {mab_results['precision@10']:.4f}")
        print(f"   Recall@10: {mab_results['recall@10']:.4f}")
        print(f"   NDCG@10: {mab_results['ndcg@10']:.4f}")
        print(f"   Diversity: {mab_results['diversity']:.4f}")
        print(f"   Users evaluated: {int(mab_results['n_users'])}")
        
        # Compare to best baseline. max() of an empty selection is NaN, and
        # NaN > 0 is False, so "no baseline" takes the same branch as
        # "baseline precision is 0".
        best_baseline_precision = results_df.drop('Hybrid+MAB_MMR')['precision@10'].max()
        
        print(f"\nüìà IMPROVEMENT OVER BEST BASELINE:")
        if best_baseline_precision > 0:
            improvement = ((mab_results['precision@10'] - best_baseline_precision) / best_baseline_precision * 100)
            print(f"   Precision improvement: {improvement:+.2f}%")
        else:
            # Dividing by a zero baseline would print inf/nan.
            print("   Precision improvement: n/a (no baseline with precision > 0)")
    
    print(f"\n‚úÖ Evaluation completed successfully!")
    print(f"üìÅ Results saved in multiple formats (JSON, CSV, Pickle, LaTeX)")
    print("="*80)

# Print final summary: best model per metric plus the proposed method's
# figures, taken from the results table.
print_final_summary(results_df)
