# YouTube RAG System Evaluation

This notebook evaluates the RAG system using multiple metrics:
1. Retrieval Metrics
   - Retrieval Precision (not currently computed by the code below)
   - Context Relevance
2. Generation Metrics
   - Answer Relevance
   - Factual Consistency
3. Overall Metrics
   - ROUGE Scores
   - BERTScore
   - Response Time

All metrics are tracked using MLflow for experiment monitoring.

In [None]:
import os
import json
import time
import mlflow
import numpy as np
import pandas as pd
from typing import List, Dict
from datetime import datetime
from rouge_score import rouge_scorer
from bert_score import score as bert_score
import nltk
from nltk.translate.bleu_score import sentence_bleu
from ragas import evaluate
from ragas.metrics import (
    context_relevancy,
    faithfulness,
    answer_relevancy
)

# Download required NLTK data: the 'punkt' and 'averaged_perceptron_tagger' packages.
# NOTE(review): no visible cell calls an NLTK tokenizer or tagger directly —
# confirm these downloads are still needed (e.g. by an imported dependency).
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

In [None]:
def load_qa_samples() -> List[Dict]:
    """Return the hand-written QA records used to evaluate the RAG system.

    Each record is a dict with the keys ``question``, ``expected_answer`` and
    ``context``. The values below are placeholders to be replaced with real
    data; a fresh list is built on every call.
    """
    placeholder_sample = dict(
        question="What are the main topics discussed in the video?",
        expected_answer="The video discusses...",  # Replace with actual expected answer
        context="Full transcript section...",  # Replace with actual context
    )
    samples = [placeholder_sample]
    # Add more QA pairs as needed
    return samples

def calculate_rouge_scores(prediction: str, reference: str) -> Dict:
    """Compute ROUGE-1, ROUGE-2 and ROUGE-L F1 for a prediction against a reference.

    Args:
        prediction: Text produced by the system under evaluation.
        reference: Expected (ground-truth) text.

    Returns:
        A dict with the keys ``rouge1_f1``, ``rouge2_f1`` and ``rougeL_f1``,
        each holding the F-measure of the corresponding ROUGE variant.
    """
    rouge_types = ('rouge1', 'rouge2', 'rougeL')
    scorer = rouge_scorer.RougeScorer(list(rouge_types), use_stemmer=True)
    # RougeScorer.score is declared as score(target, prediction): the reference
    # must come first. F1 is symmetric, so the reported values are unchanged, but
    # the correct order keeps precision/recall meaningful if they are ever used.
    scores = scorer.score(reference, prediction)
    return {f'{rouge_type}_f1': scores[rouge_type].fmeasure for rouge_type in rouge_types}

def calculate_bert_scores(prediction: str, reference: str) -> Dict:
    """Compute BERTScore precision, recall and F1 for a single prediction.

    The prediction and reference are each wrapped in a one-element list and
    scored with ``lang='en'``; every returned value is converted to a Python
    scalar with ``.item()``.

    Returns:
        A dict with the keys ``bert_precision``, ``bert_recall`` and ``bert_f1``.
    """
    precision, recall, f1 = bert_score([prediction], [reference], lang='en')
    raw_scores = {
        'bert_precision': precision,
        'bert_recall': recall,
        'bert_f1': f1,
    }
    return {name: value.item() for name, value in raw_scores.items()}

def evaluate_retrieval(retrieved_contexts: List[str], relevant_context: str) -> Dict:
    """Score the retrieved passages with the RAGAS ``context_relevancy`` metric.

    Args:
        retrieved_contexts: Text of each passage returned by the retriever.
        relevant_context: The context the QA sample marks as relevant.

    Returns:
        A dict with the single key ``context_relevancy``, read from the value
        returned by ``evaluate``.
    """
    # Use RAGAS metrics for context evaluation
    # NOTE(review): two lists are passed positionally here. RAGAS documents
    # ``evaluate`` as taking one dataset object (with columns such as
    # question/contexts) as its first argument — confirm this call matches the
    # installed RAGAS version; if the second positional parameter is ``metrics``
    # this raises TypeError because ``metrics`` is also given by keyword.
    context_scores = evaluate(
        retrieved_contexts,
        [relevant_context],
        metrics=[context_relevancy]
    )
    
    return {
        'context_relevancy': context_scores['context_relevancy']
    }

def evaluate_generation(prediction: str, reference: str, context: str) -> Dict:
    """Score a generated answer with the RAGAS ``faithfulness`` and ``answer_relevancy`` metrics.

    Args:
        prediction: Answer produced by the RAG system.
        reference: Expected answer for the sample.
        context: Context text the answer is judged against. The caller in this
            file passes the sample's reference context, not the retrieved passages.

    Returns:
        A dict with the keys ``faithfulness`` and ``answer_relevancy``, read
        from the value returned by ``evaluate``.
    """
    # Use RAGAS metrics for answer evaluation
    # NOTE(review): three lists are passed positionally here. RAGAS documents
    # ``evaluate`` as taking one dataset object (question/answer/contexts columns)
    # as its first argument — confirm this call matches the installed RAGAS
    # version, otherwise it will fail at runtime.
    # NOTE(review): no question is supplied, although answer relevancy is
    # presumably measured relative to the question — verify.
    generation_scores = evaluate(
        [prediction],
        [reference],
        [context],
        metrics=[faithfulness, answer_relevancy]
    )
    
    return {
        'faithfulness': generation_scores['faithfulness'],
        'answer_relevancy': generation_scores['answer_relevancy']
    }

In [None]:
# Load the RAG system from the previous notebook
from youtube_rag_mlflow import qa_chain, vectorstore

def evaluate_rag_system(qa_samples: List[Dict]) -> pd.DataFrame:
    """Run every QA sample through the RAG chain and log its metrics to MLflow.

    For each sample the chain is queried and timed, then ROUGE, BERTScore,
    retrieval and generation metrics are computed. Per-sample values are logged
    as ``<metric>_<sample index>`` and column means as ``avg_<metric>``, all
    inside one MLflow run named ``rag_evaluation_<timestamp>``.

    Args:
        qa_samples: Records with the keys ``question``, ``expected_answer``
            and ``context``.

    Returns:
        A DataFrame with one row per sample: ``sample_id``, ``response_time``
        and every computed metric.

    Side effects:
        Writes ``evaluation_results.csv`` to the current working directory and
        logs it as an artifact of the MLflow run.
    """
    run_name = f"rag_evaluation_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    with mlflow.start_run(run_name=run_name):
        all_metrics = []

        for i, sample in enumerate(qa_samples):
            # Time the chain with a monotonic clock: time.time() follows the
            # wall clock and can jump (NTP sync, manual change), which would
            # corrupt the measured latency.
            start_time = time.perf_counter()
            result = qa_chain({"query": sample['question']})
            response_time = time.perf_counter() - start_time

            prediction = result['result']
            retrieved_contexts = [doc.page_content for doc in result['source_documents']]

            # Calculate metrics
            rouge_scores = calculate_rouge_scores(prediction, sample['expected_answer'])
            bert_scores = calculate_bert_scores(prediction, sample['expected_answer'])
            retrieval_scores = evaluate_retrieval(retrieved_contexts, sample['context'])
            generation_scores = evaluate_generation(
                prediction,
                sample['expected_answer'],
                sample['context']
            )

            # Combine all metrics into one flat record for this sample
            metrics = {
                'sample_id': i,
                'response_time': response_time,
                **rouge_scores,
                **bert_scores,
                **retrieval_scores,
                **generation_scores
            }

            # Log per-sample metrics to MLflow; sample_id is an index, not a metric
            for metric_name, value in metrics.items():
                if metric_name != 'sample_id':
                    mlflow.log_metric(f"{metric_name}_{i}", value)

            all_metrics.append(metrics)

        # Calculate and log average metrics across all samples
        metrics_df = pd.DataFrame(all_metrics)
        avg_metrics = metrics_df.mean(numeric_only=True)

        for metric_name, value in avg_metrics.items():
            if metric_name != 'sample_id':
                mlflow.log_metric(f"avg_{metric_name}", value)

        # Save detailed results and attach them to the run
        metrics_df.to_csv('evaluation_results.csv', index=False)
        mlflow.log_artifact('evaluation_results.csv')

        return metrics_df

In [None]:
# Evaluate the RAG system on the sample QA pairs
qa_samples = load_qa_samples()
results_df = evaluate_rag_system(qa_samples)

# Report the per-column averages, then pandas' descriptive statistics
print(
    "\nEvaluation Results Summary:",
    "============================",
    "\nAverage Metrics:",
    sep="\n",
)
print(results_df.mean(numeric_only=True))

print("\nMetric Distributions:")
print(results_df.describe())

In [None]:
# Visualize results: one histogram per key metric, saved and logged to MLflow
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(15, 10))

# Plot distribution of key metrics (six panels laid out on a 2x3 grid)
metrics_to_plot = [
    'rouge1_f1', 'bert_f1', 'context_relevancy',
    'faithfulness', 'answer_relevancy', 'response_time'
]

for i, metric in enumerate(metrics_to_plot, 1):
    plt.subplot(2, 3, i)
    sns.histplot(results_df[metric], kde=True)
    plt.title(f'Distribution of {metric}')
    plt.xlabel(metric)
    plt.ylabel('Count')

plt.tight_layout()
plt.savefig('metric_distributions.png')

# The evaluation run is closed once evaluate_rag_system returns, so a bare
# mlflow.log_artifact here would auto-start a new, unrelated run (and leave it
# open). Attach the figure to the most recent run instead, so it is stored next
# to the metrics it visualizes. Assumes the evaluation run is the latest run
# started in this process.
last_run = None if mlflow.active_run() is not None else mlflow.last_active_run()
if last_run is None:
    # A run is currently active, or no earlier run exists: keep the original behavior.
    mlflow.log_artifact('metric_distributions.png')
else:
    mlflow.tracking.MlflowClient().log_artifact(
        last_run.info.run_id, 'metric_distributions.png'
    )
plt.show()