In [None]:
# RAG Evaluation Analysis for Research Paper

This notebook provides comprehensive analysis and visualization of RAG evaluation results for academic research.

## Table of Contents
1. Load evaluation results
2. Summary statistics and performance overview
3. Performance visualizations
4. Statistical significance testing
5. Export tables for academic paper (LaTeX)


In [None]:
# Import required libraries
import glob
import itertools
import json
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from scipy.stats import friedmanchisquare, wilcoxon

# Plot styling: matplotlib's paper-oriented seaborn style sheet, with "husl"
# installed as seaborn's default colour palette.
plt.style.use('seaborn-v0_8-paper')
sns.set_palette("husl")

# pandas display options: show every column and three decimal places.
pd.options.display.max_columns = None
pd.options.display.precision = 3

# Banner marking the start of the notebook output.
print("📊 RAG Evaluation Analysis Notebook", "=" * 50, sep="\n")


In [None]:
## 1. Load Evaluation Results


In [None]:
# Load the most recent evaluation results
def load_latest_results(pattern="evaluation_results_*"):
    """Load the most recently modified evaluation-results directory.

    Parameters
    ----------
    pattern : str
        Glob pattern matching candidate result directories. Only matches
        that are directories are considered.

    Returns
    -------
    tuple
        ``(results_df, analysis_report)``: the DataFrame read from
        ``full_results.csv`` and the object parsed from
        ``analysis_report.json`` inside the newest matching directory.
        ``(None, None)`` when no directory matches ``pattern``.

    Raises
    ------
    FileNotFoundError
        If the selected directory lacks ``full_results.csv`` or
        ``analysis_report.json``.
    """
    # The pattern can also match stray files (archives, logs, ...); a newer
    # file must not be mistaken for the latest results directory.
    result_dirs = [path for path in glob.glob(pattern) if os.path.isdir(path)]
    if not result_dirs:
        print("❌ No evaluation results found!")
        return None, None

    # Most recent directory by modification time
    latest_dir = max(result_dirs, key=os.path.getmtime)
    print(f"📁 Loading results from: {latest_dir}")

    # Per-question results table
    results_df = pd.read_csv(os.path.join(latest_dir, "full_results.csv"))

    # JSON analysis report; read as UTF-8 regardless of the platform default
    report_path = os.path.join(latest_dir, "analysis_report.json")
    with open(report_path, 'r', encoding='utf-8') as f:
        analysis_report = json.load(f)

    return results_df, analysis_report

# Fetch the newest run; both names are None when nothing matched.
results_df, analysis_report = load_latest_results()

if results_df is not None:
    # Headline counts: rows loaded, distinct approaches, distinct questions.
    print(f"\n✅ Loaded {len(results_df)} evaluation results")
    evaluated_approaches = results_df['approach'].unique()
    print(f"📊 Approaches evaluated: {evaluated_approaches}")
    distinct_questions = results_df['question'].nunique()
    print(f"❓ Questions: {distinct_questions}")

    # Column inventory, so later cells can be checked against the schema.
    print("\n📈 Columns in dataset:")
    column_names = results_df.columns.tolist()
    print(column_names)


In [None]:
## 2. Summary Statistics and Performance Overview


In [None]:
# Create comprehensive summary statistics
if results_df is not None:
    # Metric columns analysed throughout the notebook; later cells reuse
    # `quality_metrics`.
    quality_metrics = ['answer_relevancy', 'faithfulness', 'context_recall',
                       'context_precision', 'answer_correctness']
    performance_metrics = ['latency_ms', 'retrieval_time_ms', 'generation_time_ms']

    by_approach = results_df.groupby('approach')

    # Mean and standard deviation of every metric per approach, rounded for display
    summary_stats = (
        by_approach[quality_metrics + performance_metrics]
        .agg(['mean', 'std'])
        .round(3)
    )

    print("📊 SUMMARY STATISTICS BY APPROACH")
    print("=" * 80)
    display(summary_stats)

    # Unrounded per-approach means, computed once and reused by both rankings
    # below instead of re-grouping the frame for every metric.
    metric_means = by_approach[quality_metrics + ['latency_ms']].mean()

    # Best performing approach for each metric
    print("\n🏆 BEST PERFORMING APPROACHES")
    print("-" * 50)
    for metric in quality_metrics:
        per_approach = metric_means[metric]
        if per_approach.isna().all():
            # idxmax has no valid answer when no approach has a score
            print(f"{metric:20s}: no scores available")
            continue
        best_approach = per_approach.idxmax()
        best_score = per_approach.max()
        # str() keeps the :20s format spec valid for non-string labels
        print(f"{metric:20s}: {str(best_approach):20s} (score: {best_score:.3f})")

    print("\n⚡ FASTEST APPROACHES")
    print("-" * 50)
    fastest = metric_means['latency_ms'].sort_values()
    for approach, latency in fastest.head(3).items():
        print(f"{str(approach):20s}: {latency:.1f} ms")


In [None]:
## 3. Performance Visualizations


In [None]:
# Create comprehensive visualizations for paper
if results_df is not None:
    # Thresholds for the shaded regions of the trade-off plot
    HIGH_QUALITY_MIN = 0.8      # lower edge of the "High Quality" band
    LOW_LATENCY_MAX_MS = 200    # upper edge of the "Low Latency" band

    # Fix the approach order once so boxes, colours and tick labels line up
    approach_order = results_df['approach'].unique()
    box_colors = sns.color_palette("husl", len(approach_order))

    # 1. Overall Quality Metrics Comparison: one box-plot panel per metric.
    # The grid grows with the number of metrics (3 per row); five metrics
    # give the 2x3 grid at 15x10 inches.
    n_cols = 3
    n_rows = max(1, -(-len(quality_metrics) // n_cols))  # ceiling division
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
    axes = axes.ravel()

    for ax, metric in zip(axes, quality_metrics):
        # One sample per approach, missing scores dropped
        data_to_plot = [
            results_df.loc[results_df['approach'] == approach, metric].dropna()
            for approach in approach_order
        ]

        bp = ax.boxplot(data_to_plot, patch_artist=True, showmeans=True)

        # Colour each box with its approach's palette entry
        for patch, color in zip(bp['boxes'], box_colors):
            patch.set_facecolor(color)
            patch.set_alpha(0.7)

        ax.set_xticklabels(approach_order, rotation=45, ha='right')
        ax.set_title(metric.replace('_', ' ').title(), fontsize=12)
        ax.set_ylabel('Score')
        ax.grid(True, alpha=0.3)

    # Remove every panel that did not receive a metric, not just the last one
    for unused_ax in axes[len(quality_metrics):]:
        fig.delaxes(unused_ax)

    fig.suptitle('Quality Metrics Distribution by RAG Approach', fontsize=16)
    fig.tight_layout()
    fig.savefig('quality_metrics_comparison.png', dpi=300, bbox_inches='tight')
    plt.show()

    # 2. Performance vs Quality Trade-off
    fig, ax = plt.subplots(figsize=(10, 6))

    # Mean correctness and latency per approach
    approach_means = results_df.groupby('approach').agg({
        'answer_correctness': 'mean',
        'latency_ms': 'mean'
    }).reset_index()

    ax.scatter(approach_means['latency_ms'],
               approach_means['answer_correctness'],
               s=200, alpha=0.7)

    # Label each point with its approach name
    for row in approach_means.itertuples(index=False):
        ax.annotate(row.approach,
                    (row.latency_ms, row.answer_correctness),
                    xytext=(5, 5), textcoords='offset points',
                    fontsize=10)

    ax.set_xlabel('Average Latency (ms)', fontsize=12)
    ax.set_ylabel('Average Answer Correctness', fontsize=12)
    ax.set_title('Performance vs Quality Trade-off', fontsize=14)
    ax.grid(True, alpha=0.3)

    # Shade the ideal region: high quality (horizontal band) and low latency
    # (vertical band)
    ax.axhspan(HIGH_QUALITY_MIN, 1.0, alpha=0.1, color='green', label='High Quality')
    ax.axvspan(0, LOW_LATENCY_MAX_MS, alpha=0.1, color='blue', label='Low Latency')
    ax.legend()

    fig.tight_layout()
    fig.savefig('performance_quality_tradeoff.png', dpi=300, bbox_inches='tight')
    plt.show()


In [None]:
## 4. Statistical Significance Testing


In [None]:
# Statistical significance testing between approaches.
# `itertools`, `friedmanchisquare` and `wilcoxon` are imported in the
# notebook's import cell.

ALPHA = 0.05               # family-wise significance level
MIN_PAIRED_SAMPLES = 5     # a test is run only with MORE than this many matched questions


def _scores_by_question(df, metric):
    """Return a question x approach table of `metric`.

    Each cell holds the first non-null value recorded for that
    (question, approach) combination; combinations that were never scored
    are NaN. Working from this table keeps all tests matched by question.
    """
    return df.groupby(['question', 'approach'])[metric].first().unstack('approach')


def _paired_wilcoxon(scores, approach1, approach2):
    """Wilcoxon signed-rank test on questions scored under both approaches.

    Parameters
    ----------
    scores : DataFrame
        Question x approach table as built by `_scores_by_question`.
    approach1, approach2 : column labels of `scores`.

    Returns
    -------
    (p_value, mean_diff) or None
        `mean_diff` is mean(approach1) - mean(approach2) over the matched
        questions. None when MIN_PAIRED_SAMPLES or fewer questions have a
        score for both approaches.
    """
    paired = scores[[approach1, approach2]].dropna()
    if len(paired) <= MIN_PAIRED_SAMPLES:
        return None

    first = paired[approach1].to_numpy(dtype=float)
    second = paired[approach2].to_numpy(dtype=float)
    mean_diff = first.mean() - second.mean()

    if np.all(first == second):
        # The signed-rank statistic is undefined when every difference is
        # zero; identical scores are no evidence of a difference.
        return 1.0, mean_diff

    _, p_value = wilcoxon(first, second)
    return p_value, mean_diff


if results_df is not None:
    print("📊 STATISTICAL SIGNIFICANCE TESTING")
    print("=" * 80)

    approaches = results_df['approach'].unique()

    # Test each quality metric
    for metric in quality_metrics:
        print(f"\n📈 Testing metric: {metric}")
        print("-" * 50)

        scores = _scores_by_question(results_df, metric).reindex(columns=approaches)
        valid_approaches = [a for a in approaches if scores[a].notna().any()]

        if len(valid_approaches) < 3:
            print("⚠️ Friedman test needs at least 3 approaches with scores; skipping.")
            continue

        # Friedman is a repeated-measures test: every sample must hold the
        # same questions in the same order, so keep only questions that were
        # scored under every approach.
        complete = scores[valid_approaches].dropna()
        if len(complete) <= MIN_PAIRED_SAMPLES:
            print(f"⚠️ Only {len(complete)} questions are scored for every approach; skipping.")
            continue

        stat, p_value = friedmanchisquare(
            *(complete[a].to_numpy(dtype=float) for a in valid_approaches)
        )
        print(f"Friedman test: χ² = {stat:.3f}, p = {p_value:.4f}")

        # `not <` (rather than `>=`) so a NaN p-value counts as not significant
        if not p_value < ALPHA:
            print("❌ No significant differences found.")
            continue

        print("✅ Significant differences found! Performing pairwise comparisons...")

        # Pairwise Wilcoxon signed-rank tests with Bonferroni correction
        pairs = list(itertools.combinations(valid_approaches, 2))
        n_comparisons = len(pairs)
        bonferroni_alpha = ALPHA / n_comparisons

        print(f"\nPairwise comparisons (Bonferroni corrected α = {bonferroni_alpha:.4f}):")

        for approach1, approach2 in pairs:
            outcome = _paired_wilcoxon(scores, approach1, approach2)
            if outcome is None:
                continue  # too few matched questions for this pair
            p, mean_diff = outcome
            sig = "***" if p < bonferroni_alpha else ""
            print(f"  {approach1} vs {approach2}: p={p:.4f} {sig}, mean_diff={mean_diff:.3f}")

    # Create significance matrix for answer_correctness
    print("\n📊 SIGNIFICANCE MATRIX FOR ANSWER CORRECTNESS")
    print("-" * 80)

    sig_matrix = pd.DataFrame(index=approaches, columns=approaches, data="-")

    correctness = _scores_by_question(results_df, 'answer_correctness').reindex(columns=approaches)
    matrix_pairs = list(itertools.combinations(approaches, 2))

    for approach1, approach2 in matrix_pairs:
        outcome = _paired_wilcoxon(correctness, approach1, approach2)
        if outcome is None:
            continue

        # Bonferroni-adjust the p-value so the stars agree with the corrected
        # pairwise comparisons printed above.
        adjusted_p = min(1.0, outcome[0] * len(matrix_pairs))

        if adjusted_p < 0.001:
            stars = "***"
        elif adjusted_p < 0.01:
            stars = "**"
        elif adjusted_p < 0.05:
            stars = "*"
        else:
            continue

        # The matrix is symmetric
        sig_matrix.loc[approach1, approach2] = stars
        sig_matrix.loc[approach2, approach1] = stars

    display(sig_matrix)
    print("\n* p < 0.05, ** p < 0.01, *** p < 0.001 (Bonferroni-adjusted)")


In [None]:
## 5. Export Tables for Academic Paper


In [None]:
# Export tables for LaTeX

def _write_latex_table(table, path, caption, label, float_format, column_format=None):
    """Render `table` as a LaTeX table, write it to `path`, return the text.

    `escape=True` is passed explicitly because metric and approach names
    contain underscores, which must be escaped for the .tex file to compile;
    pandas 2.x no longer escapes special characters by default.
    """
    latex = table.to_latex(
        caption=caption,
        label=label,
        column_format=column_format,
        float_format=float_format,
        escape=True,
    )
    with open(path, 'w', encoding='utf-8') as f:
        f.write(latex)
    return latex


if results_df is not None:
    print("📄 EXPORTING TABLES FOR LATEX")
    print("=" * 80)

    by_approach = results_df.groupby('approach')

    # 1. Main results table: mean and std of the headline metrics
    main_results = by_approach[
        ['answer_correctness', 'answer_relevancy', 'faithfulness', 'latency_ms']
    ].agg(['mean', 'std']).round(3)

    # Flatten the (metric, stat) column MultiIndex to "metric_stat"
    main_results.columns = [f'{metric}_{stat}' for metric, stat in main_results.columns]

    latex_table = _write_latex_table(
        main_results,
        'main_results_table.tex',
        caption="Performance comparison of different RAG approaches",
        label="tab:main_results",
        float_format="%.3f",
        column_format="l" + "r" * len(main_results.columns),
    )
    print("✅ Main results table saved to: main_results_table.tex")

    # 2. Performance by question type (needs a 'question_type' column)
    if 'question_type' in results_df.columns:
        type_performance = (
            results_df.groupby(['approach', 'question_type'])['answer_correctness']
            .mean()
            .unstack()
            .round(3)
        )

        latex_table_type = _write_latex_table(
            type_performance,
            'performance_by_type_table.tex',
            caption="Performance comparison by question type",
            label="tab:performance_by_type",
            float_format="%.3f",
        )
        print("✅ Performance by type table saved to: performance_by_type_table.tex")
    else:
        print("⚠️ Skipping performance by type table: no 'question_type' column in results")

    # 3. Latency breakdown table
    latency_breakdown = by_approach[
        ['retrieval_time_ms', 'generation_time_ms', 'latency_ms']
    ].mean().round(1)

    latex_table_latency = _write_latex_table(
        latency_breakdown,
        'latency_breakdown_table.tex',
        caption="Latency breakdown by RAG approach (in milliseconds)",
        label="tab:latency_breakdown",
        float_format="%.1f",
    )
    print("✅ Latency breakdown table saved to: latency_breakdown_table.tex")

    # 4. Create a summary for the paper
    print("\n📝 SUMMARY FOR PAPER")
    print("-" * 80)

    # Per-approach means, computed once and reused below
    mean_correctness = by_approach['answer_correctness'].mean()
    mean_latency = by_approach['latency_ms'].mean()

    best_quality = mean_correctness.idxmax()
    best_quality_score = mean_correctness.max()

    fastest = mean_latency.idxmin()
    fastest_time = mean_latency.min()

    # Best trade-off: average of quality relative to the best approach and
    # speed relative to the slowest approach (the slowest scores 0 on speed).
    normalized_quality = mean_correctness / mean_correctness.max()
    normalized_speed = 1 - (mean_latency / mean_latency.max())

    trade_off_score = (normalized_quality + normalized_speed) / 2
    best_tradeoff = trade_off_score.idxmax()

    print(f"🏆 Best Quality: {best_quality} (score: {best_quality_score:.3f})")
    print(f"⚡ Fastest: {fastest} (latency: {fastest_time:.1f}ms)")
    print(f"⚖️  Best Trade-off: {best_tradeoff} (combined score: {trade_off_score[best_tradeoff]:.3f})")

    # Relative improvement of 'hybrid_rag' over 'baseline'. Both approaches
    # must be present with a usable score, otherwise the figure is meaningless.
    baseline_score = mean_correctness.get('baseline', np.nan)
    hybrid_score = mean_correctness.get('hybrid_rag', np.nan)
    improvement = np.nan

    if pd.isna(baseline_score) or pd.isna(hybrid_score) or baseline_score == 0:
        print("\n⚠️ Cannot compute improvement: need scores for both 'baseline' "
              "and 'hybrid_rag', with a non-zero baseline")
    else:
        improvement = ((hybrid_score - baseline_score) / baseline_score) * 100
        print(f"\n📈 Hybrid RAG shows {improvement:.1f}% improvement over baseline")
