In [1]:
# Title: RAG System Evaluation Framework
# Description: Comprehensive evaluation framework for RAG (Retrieval-Augmented Generation) systems

# Install required packages
!pip install -q langchain chromadb sentence-transformers rouge-score bert-score trulens-eval datasets evaluate nltk

# Imports and Dependencies
import numpy as np
import pandas as pd
from typing import List, Dict
from dataclasses import dataclass
import torch
from tqdm.notebook import tqdm
from sentence_transformers import SentenceTransformer, CrossEncoder
from sklearn.metrics.pairwise import cosine_similarity
from rouge_score import rouge_scorer
import evaluate
from datasets import load_dataset
from nltk.tokenize import sent_tokenize
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import single_meteor_score


  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m615.5/615.5 kB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m63.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

  from tqdm.autonotebook import tqdm, trange


In [2]:

# Metric Classes
@dataclass
class RetrievalMetrics:
    precision_at_k: float
    recall_at_k: float
    mrr: float
    ndcg: float
    semantic_similarity: float
    reranking_score: float

@dataclass
class GenerationMetrics:
    rouge_scores: Dict[str, float]
    bert_score: float
    faithfulness_score: float
    context_relevance: float
    answer_consistency: float
    bleu_score: float
    meteor_score: float


In [3]:

# Main Evaluator Class
class RAGEvaluator:
    def __init__(self, embedding_model: str = "all-mpnet-base-v2", reranker_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"):
        self.embedding_model = SentenceTransformer(embedding_model)
        self.reranker = CrossEncoder(reranker_model)
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        self.bert_scorer = evaluate.load("bertscore")

    def compute_precision_at_k(self, relevant_docs: List[int], retrieved_docs: List[int], k: int) -> float:
        retrieved_k = retrieved_docs[:k]
        relevant_retrieved = len(set(retrieved_k) & set(relevant_docs))
        return relevant_retrieved / k if k > 0 else 0.0

    def compute_recall_at_k(self, relevant_docs: List[int], retrieved_docs: List[int], k: int) -> float:
        retrieved_k = retrieved_docs[:k]
        relevant_retrieved = len(set(retrieved_k) & set(relevant_docs))
        return relevant_retrieved / len(relevant_docs) if relevant_docs else 0.0

    def compute_mrr(self, relevant_docs: List[int], retrieved_docs: List[int]) -> float:
        for i, doc_id in enumerate(retrieved_docs, 1):
            if doc_id in relevant_docs:
                return 1.0 / i
        return 0.0

    def compute_ndcg(self, relevant_docs: List[int], retrieved_docs: List[int], k: int) -> float:
        dcg = 0
        idcg = 0
        retrieved_k = retrieved_docs[:k]

        for i, doc_id in enumerate(retrieved_k):
            rel = 1 if doc_id in relevant_docs else 0
            dcg += rel / np.log2(i + 2)

        for i in range(min(k, len(relevant_docs))):
            idcg += 1 / np.log2(i + 2)

        return dcg / idcg if idcg > 0 else 0.0

    def compute_semantic_similarity(self, query: str, retrieved_docs: List[str]) -> float:
        query_embedding = self.embedding_model.encode([query])
        doc_embeddings = self.embedding_model.encode(retrieved_docs)
        similarities = cosine_similarity(query_embedding, doc_embeddings)
        return float(similarities.mean())

    def rerank_documents(self, query: str, documents: List[str]) -> float:
        pairs = [[query, doc] for doc in documents]
        scores = self.reranker.predict(pairs)
        return float(np.mean(scores))

    def compute_faithfulness(self, generated_text: str, source_documents: List[str]) -> float:
        sentences = sent_tokenize(generated_text)
        if not sentences:
            return 0.0

        scores = []
        for sent in sentences:
            sent_embedding = self.embedding_model.encode([sent])[0]
            src_embeddings = self.embedding_model.encode(source_documents)
            max_similarity = cosine_similarity([sent_embedding], src_embeddings)[0].max()
            scores.append(max_similarity)

        return np.mean(scores)

    def compute_answer_consistency(self, generated_text: str, reference_text: str) -> float:
        gen_embedding = self.embedding_model.encode([generated_text])[0]
        ref_embedding = self.embedding_model.encode([reference_text])[0]
        return float(cosine_similarity([gen_embedding], [ref_embedding])[0][0])

    def compute_bleu_score(self, generated_text: str, reference_text: str) -> float:
        reference_tokens = [reference_text.split()]
        generated_tokens = generated_text.split()
        return sentence_bleu(reference_tokens, generated_tokens)

    def compute_meteor_score(self, generated_text: str, reference_text: str) -> float:
        return single_meteor_score(reference_text, generated_text)

    def evaluate_retrieval(self, query: str, retrieved_docs: List[str], relevant_doc_ids: List[int], retrieved_doc_ids: List[int], k: int = 5) -> RetrievalMetrics:
        precision = self.compute_precision_at_k(relevant_doc_ids, retrieved_doc_ids, k)
        recall = self.compute_recall_at_k(relevant_doc_ids, retrieved_doc_ids, k)
        mrr = self.compute_mrr(relevant_doc_ids, retrieved_doc_ids)
        ndcg = self.compute_ndcg(relevant_doc_ids, retrieved_doc_ids, k)
        semantic_sim = self.compute_semantic_similarity(query, retrieved_docs)
        reranking_score = self.rerank_documents(query, retrieved_docs)

        return RetrievalMetrics(
            precision_at_k=precision,
            recall_at_k=recall,
            mrr=mrr,
            ndcg=ndcg,
            semantic_similarity=semantic_sim,
            reranking_score=reranking_score
        )

    def evaluate_generation(self, generated_text: str, reference_text: str, source_documents: List[str]) -> GenerationMetrics:
        rouge_scores = self.rouge_scorer.score(reference_text, generated_text)
        rouge_dict = {
            'rouge1': rouge_scores['rouge1'].fmeasure,
            'rouge2': rouge_scores['rouge2'].fmeasure,
            'rougeL': rouge_scores['rougeL'].fmeasure
        }

        bert_scores = self.bert_scorer.compute(predictions=[generated_text], references=[reference_text], lang="en")
        bert_f1 = bert_scores['f1'][0]

        faithfulness = self.compute_faithfulness(generated_text, source_documents)
        context_relevance = self.compute_semantic_similarity(generated_text, source_documents)
        answer_consistency = self.compute_answer_consistency(generated_text, reference_text)

        bleu_score = self.compute_bleu_score(generated_text, reference_text)
        meteor_score = self.compute_meteor_score(generated_text, reference_text)

        return GenerationMetrics(
            rouge_scores=rouge_dict,
            bert_score=bert_f1,
            faithfulness_score=faithfulness,
            context_relevance=context_relevance,
            answer_consistency=answer_consistency,
            bleu_score=bleu_score,
            meteor_score=meteor_score
        )


In [4]:
from nltk.tokenize import word_tokenize

def compute_meteor_score(self, generated_text: str, reference_text: str) -> float:
    # Tokenize the text
    gen_tokens = word_tokenize(generated_text)
    ref_tokens = word_tokenize(reference_text)

    # Compute the METEOR score using the tokenized version of the text
    return single_meteor_score(ref_tokens, gen_tokens)


In [5]:
import nltk
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [6]:
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer, CrossEncoder
from rouge_score import rouge_scorer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import evaluate
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import single_meteor_score
from typing import List, Dict
from dataclasses import dataclass

# Metric Classes
@dataclass
class RetrievalMetrics:
    precision_at_k: float
    recall_at_k: float
    mrr: float
    ndcg: float
    semantic_similarity: float
    reranking_score: float

@dataclass
class GenerationMetrics:
    rouge_scores: Dict[str, float]
    bert_score: float
    faithfulness_score: float
    context_relevance: float
    answer_consistency: float
    bleu_score: float
    meteor_score: float

# Main Evaluator Class
class RAGEvaluator:
    def __init__(self, embedding_model: str = "all-mpnet-base-v2", reranker_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"):
        self.embedding_model = SentenceTransformer(embedding_model)
        self.reranker = CrossEncoder(reranker_model)
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        self.bert_scorer = evaluate.load("bertscore")

    # METEOR Score Fix
    def compute_meteor_score(self, generated_text: str, reference_text: str) -> float:
        # Tokenize the text
        gen_tokens = word_tokenize(generated_text)
        ref_tokens = word_tokenize(reference_text)

        # Compute the METEOR score using the tokenized version of the text
        return single_meteor_score(ref_tokens, gen_tokens)

    def compute_precision_at_k(self, relevant_docs: List[int], retrieved_docs: List[int], k: int) -> float:
        retrieved_k = retrieved_docs[:k]
        relevant_retrieved = len(set(retrieved_k) & set(relevant_docs))
        return relevant_retrieved / k if k > 0 else 0.0

    def compute_recall_at_k(self, relevant_docs: List[int], retrieved_docs: List[int], k: int) -> float:
        retrieved_k = retrieved_docs[:k]
        relevant_retrieved = len(set(retrieved_k) & set(relevant_docs))
        return relevant_retrieved / len(relevant_docs) if relevant_docs else 0.0

    def compute_mrr(self, relevant_docs: List[int], retrieved_docs: List[int]) -> float:
        for i, doc_id in enumerate(retrieved_docs, 1):
            if doc_id in relevant_docs:
                return 1.0 / i
        return 0.0

    def compute_ndcg(self, relevant_docs: List[int], retrieved_docs: List[int], k: int) -> float:
        dcg = 0
        idcg = 0
        retrieved_k = retrieved_docs[:k]

        for i, doc_id in enumerate(retrieved_k):
            rel = 1 if doc_id in relevant_docs else 0
            dcg += rel / np.log2(i + 2)

        for i in range(min(k, len(relevant_docs))):
            idcg += 1 / np.log2(i + 2)

        return dcg / idcg if idcg > 0 else 0.0

    def compute_semantic_similarity(self, query: str, retrieved_docs: List[str]) -> float:
        query_embedding = self.embedding_model.encode([query])
        doc_embeddings = self.embedding_model.encode(retrieved_docs)
        similarities = cosine_similarity(query_embedding, doc_embeddings)
        return float(similarities.mean())

    def rerank_documents(self, query: str, documents: List[str]) -> float:
        pairs = [[query, doc] for doc in documents]
        scores = self.reranker.predict(pairs)
        return float(np.mean(scores))

    def compute_faithfulness(self, generated_text: str, source_documents: List[str]) -> float:
        sentences = word_tokenize(generated_text)
        if not sentences:
            return 0.0

        scores = []
        for sent in sentences:
            sent_embedding = self.embedding_model.encode([sent])[0]
            src_embeddings = self.embedding_model.encode(source_documents)
            max_similarity = cosine_similarity([sent_embedding], src_embeddings)[0].max()
            scores.append(max_similarity)

        return np.mean(scores)

    def compute_answer_consistency(self, generated_text: str, reference_text: str) -> float:
        gen_embedding = self.embedding_model.encode([generated_text])[0]
        ref_embedding = self.embedding_model.encode([reference_text])[0]
        return float(cosine_similarity([gen_embedding], [ref_embedding])[0][0])

    def compute_bleu_score(self, generated_text: str, reference_text: str) -> float:
        reference_tokens = [reference_text.split()]
        generated_tokens = generated_text.split()
        return sentence_bleu(reference_tokens, generated_tokens)

    def evaluate_retrieval(self, query: str, retrieved_docs: List[str], relevant_doc_ids: List[int], retrieved_doc_ids: List[int], k: int = 5) -> RetrievalMetrics:
        precision = self.compute_precision_at_k(relevant_doc_ids, retrieved_doc_ids, k)
        recall = self.compute_recall_at_k(relevant_doc_ids, retrieved_doc_ids, k)
        mrr = self.compute_mrr(relevant_doc_ids, retrieved_doc_ids)
        ndcg = self.compute_ndcg(relevant_doc_ids, retrieved_doc_ids, k)
        semantic_sim = self.compute_semantic_similarity(query, retrieved_docs)
        reranking_score = self.rerank_documents(query, retrieved_docs)

        return RetrievalMetrics(
            precision_at_k=precision,
            recall_at_k=recall,
            mrr=mrr,
            ndcg=ndcg,
            semantic_similarity=semantic_sim,
            reranking_score=reranking_score
        )

    def evaluate_generation(self, generated_text: str, reference_text: str, source_documents: List[str]) -> GenerationMetrics:
        rouge_scores = self.rouge_scorer.score(reference_text, generated_text)
        rouge_dict = {
            'rouge1': rouge_scores['rouge1'].fmeasure,
            'rouge2': rouge_scores['rouge2'].fmeasure,
            'rougeL': rouge_scores['rougeL'].fmeasure
        }

        bert_scores = self.bert_scorer.compute(predictions=[generated_text], references=[reference_text], lang="en")
        bert_f1 = bert_scores['f1'][0]

        faithfulness = self.compute_faithfulness(generated_text, source_documents)
        context_relevance = self.compute_semantic_similarity(generated_text, source_documents)
        answer_consistency = self.compute_answer_consistency(generated_text, reference_text)

        bleu_score = self.compute_bleu_score(generated_text, reference_text)
        meteor_score = self.compute_meteor_score(generated_text, reference_text)

        return GenerationMetrics(
            rouge_scores=rouge_dict,
            bert_score=bert_f1,
            faithfulness_score=faithfulness,
            context_relevance=context_relevance,
            answer_consistency=answer_consistency,
            bleu_score=bleu_score,
            meteor_score=meteor_score
        )

# Example usage (evaluating generation)
evaluator = RAGEvaluator()

generated_text = "Paris is the capital city of France."
reference_text = "The capital of France is Paris."
retrieved_docs = ["Paris is the capital of France.", "France is known for its landmarks."]

# Evaluate Generation
generation_metrics = evaluator.evaluate_generation(
    generated_text=generated_text,
    reference_text=reference_text,
    source_documents=retrieved_docs
)

# Print results for Generation Metrics
print("\nGeneration Metrics:")
print(f"ROUGE-1: {generation_metrics.rouge_scores['rouge1']:.3f}")
print(f"ROUGE-2: {generation_metrics.rouge_scores['rouge2']:.3f}")
print(f"ROUGE-L: {generation_metrics.rouge_scores['rougeL']:.3f}")
print(f"BERTScore: {generation_metrics.bert_score:.3f}")
print(f"Faithfulness: {generation_metrics.faithfulness_score:.3f}")
print(f"Context Relevance: {generation_metrics.context_relevance:.3f}")
print(f"Answer Consistency: {generation_metrics.answer_consistency:.3f}")
print(f"BLEU Score: {generation_metrics.bleu_score:.3f}")
print(f"METEOR Score: {generation_metrics.meteor_score:.3f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [8]:
# Install required packages
!pip install -q langchain chromadb sentence-transformers rouge-score bert-score trulens-eval datasets evaluate nltk

# Imports and Dependencies
import numpy as np
import pandas as pd
from typing import List, Dict
from dataclasses import dataclass
import torch
from tqdm.notebook import tqdm
from sentence_transformers import SentenceTransformer, CrossEncoder
from sklearn.metrics.pairwise import cosine_similarity
from rouge_score import rouge_scorer
import evaluate
from datasets import load_dataset
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import single_meteor_score

import nltk
nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Metric Classes
@dataclass
class RetrievalMetrics:
    precision_at_k: float
    recall_at_k: float
    mrr: float
    ndcg: float
    semantic_similarity: float
    reranking_score: float

@dataclass
class GenerationMetrics:
    rouge_scores: Dict[str, float]
    bert_score: float
    faithfulness_score: float
    context_relevance: float
    answer_consistency: float
    bleu_score: float
    meteor_score: float

# Main Evaluator Class
class RAGEvaluator:
    def __init__(self, embedding_model: str = "all-mpnet-base-v2", reranker_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"):
        self.embedding_model = SentenceTransformer(embedding_model)
        self.reranker = CrossEncoder(reranker_model)
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        self.bert_scorer = evaluate.load("bertscore")

    def compute_precision_at_k(self, relevant_docs: List[int], retrieved_docs: List[int], k: int) -> float:
        retrieved_k = retrieved_docs[:k]
        relevant_retrieved = len(set(retrieved_k) & set(relevant_docs))
        return relevant_retrieved / k if k > 0 else 0.0

    def compute_recall_at_k(self, relevant_docs: List[int], retrieved_docs: List[int], k: int) -> float:
        retrieved_k = retrieved_docs[:k]
        relevant_retrieved = len(set(retrieved_k) & set(relevant_docs))
        return relevant_retrieved / len(relevant_docs) if relevant_docs else 0.0

    def compute_mrr(self, relevant_docs: List[int], retrieved_docs: List[int]) -> float:
        for i, doc_id in enumerate(retrieved_docs, 1):
            if doc_id in relevant_docs:
                return 1.0 / i
        return 0.0

    def compute_ndcg(self, relevant_docs: List[int], retrieved_docs: List[int], k: int) -> float:
        dcg = 0
        idcg = 0
        retrieved_k = retrieved_docs[:k]

        for i, doc_id in enumerate(retrieved_k):
            rel = 1 if doc_id in relevant_docs else 0
            dcg += rel / np.log2(i + 2)

        for i in range(min(k, len(relevant_docs))):
            idcg += 1 / np.log2(i + 2)

        return dcg / idcg if idcg > 0 else 0.0

    def compute_semantic_similarity(self, query: str, retrieved_docs: List[str]) -> float:
        query_embedding = self.embedding_model.encode([query])
        doc_embeddings = self.embedding_model.encode(retrieved_docs)
        similarities = cosine_similarity(query_embedding, doc_embeddings)
        return float(similarities.mean())

    def rerank_documents(self, query: str, documents: List[str]) -> float:
        pairs = [[query, doc] for doc in documents]
        scores = self.reranker.predict(pairs)
        return float(np.mean(scores))

    def compute_faithfulness(self, generated_text: str, source_documents: List[str]) -> float:
        sentences = sent_tokenize(generated_text)
        if not sentences:
            return 0.0

        scores = []
        for sent in sentences:
            sent_embedding = self.embedding_model.encode([sent])[0]
            src_embeddings = self.embedding_model.encode(source_documents)
            max_similarity = cosine_similarity([sent_embedding], src_embeddings)[0].max()
            scores.append(max_similarity)

        return np.mean(scores)

    def compute_answer_consistency(self, generated_text: str, reference_text: str) -> float:
        gen_embedding = self.embedding_model.encode([generated_text])[0]
        ref_embedding = self.embedding_model.encode([reference_text])[0]
        return float(cosine_similarity([gen_embedding], [ref_embedding])[0][0])

    def compute_bleu_score(self, generated_text: str, reference_text: str) -> float:
        reference_tokens = [reference_text.split()]
        generated_tokens = generated_text.split()
        smoothing = SmoothingFunction().method1  # Apply smoothing to avoid zero n-gram counts
        bleu = sentence_bleu(reference_tokens, generated_tokens, smoothing_function=smoothing)
        return bleu

    def compute_meteor_score(self, generated_text: str, reference_text: str) -> float:
        # Tokenize the text
        gen_tokens = word_tokenize(generated_text)
        ref_tokens = word_tokenize(reference_text)

        # Compute the METEOR score using the tokenized version of the text
        return single_meteor_score(ref_tokens, gen_tokens)

    def evaluate_retrieval(self, query: str, retrieved_docs: List[str], relevant_doc_ids: List[int], retrieved_doc_ids: List[int], k: int = 5) -> RetrievalMetrics:
        precision = self.compute_precision_at_k(relevant_doc_ids, retrieved_doc_ids, k)
        recall = self.compute_recall_at_k(relevant_doc_ids, retrieved_doc_ids, k)
        mrr = self.compute_mrr(relevant_doc_ids, retrieved_doc_ids)
        ndcg = self.compute_ndcg(relevant_doc_ids, retrieved_doc_ids, k)
        semantic_sim = self.compute_semantic_similarity(query, retrieved_docs)
        reranking_score = self.rerank_documents(query, retrieved_docs)

        return RetrievalMetrics(
            precision_at_k=precision,
            recall_at_k=recall,
            mrr=mrr,
            ndcg=ndcg,
            semantic_similarity=semantic_sim,
            reranking_score=reranking_score
        )

    def evaluate_generation(self, generated_text: str, reference_text: str, source_documents: List[str]) -> GenerationMetrics:
        rouge_scores = self.rouge_scorer.score(reference_text, generated_text)
        rouge_dict = {
            'rouge1': rouge_scores['rouge1'].fmeasure,
            'rouge2': rouge_scores['rouge2'].fmeasure,
            'rougeL': rouge_scores['rougeL'].fmeasure
        }

        bert_scores = self.bert_scorer.compute(predictions=[generated_text], references=[reference_text], lang="en")
        bert_f1 = bert_scores['f1'][0]

        faithfulness = self.compute_faithfulness(generated_text, source_documents)
        context_relevance = self.compute_semantic_similarity(generated_text, source_documents)
        answer_consistency = self.compute_answer_consistency(generated_text, reference_text)

        bleu_score = self.compute_bleu_score(generated_text, reference_text)
        meteor_score = self.compute_meteor_score(generated_text, reference_text)

        return GenerationMetrics(
            rouge_scores=rouge_dict,
            bert_score=bert_f1,
            faithfulness_score=faithfulness,
            context_relevance=context_relevance,
            answer_consistency=answer_consistency,
            bleu_score=bleu_score,
            meteor_score=meteor_score
        )


# Example usage

# Instantiate the evaluator
evaluator = RAGEvaluator()

# Sample data
query = "What is the capital of France?"
retrieved_docs = ["Paris is the capital of France.", "Berlin is the capital of Germany.", "Madrid is the capital of Spain."]
relevant_doc_ids = [0]  # Document 0 is relevant
retrieved_doc_ids = [0, 1, 2]

# Evaluate Retrieval
retrieval_metrics = evaluator.evaluate_retrieval(query, retrieved_docs, relevant_doc_ids, retrieved_doc_ids)

# Sample generation data
generated_text = "Paris is the capital city of France."
reference_text = "The capital of France is Paris."
source_documents = ["Paris is the capital of France.", "France is known for its landmarks."]

# Evaluate Generation
generation_metrics = evaluator.evaluate_generation(generated_text, reference_text, source_documents)

# Print results for Retrieval Metrics
print("\nRetrieval Metrics:")
print(f"Precision@5: {retrieval_metrics.precision_at_k:.3f}")
print(f"Recall@5: {retrieval_metrics.recall_at_k:.3f}")
print(f"MRR: {retrieval_metrics.mrr:.3f}")
print(f"NDCG: {retrieval_metrics.ndcg:.3f}")
print(f"Semantic Similarity: {retrieval_metrics.semantic_similarity:.3f}")
print(f"Reranking Score: {retrieval_metrics.reranking_score:.3f}")

# Print results for Generation Metrics
print("\nGeneration Metrics:")
print(f"ROUGE-1: {generation_metrics.rouge_scores['rouge1']:.3f}")
print(f"ROUGE-2: {generation_metrics.rouge_scores['rouge2']:.3f}")
print(f"ROUGE-L: {generation_metrics.rouge_scores['rougeL']:.3f}")
print(f"BERTScore: {generation_metrics.bert_score:.3f}")
print(f"Faithfulness Score: {generation_metrics.faithfulness_score:.3f}")
print(f"Context Relevance: {generation_metrics.context_relevance:.3f}")
print(f"Answer Consistency: {generation_metrics.answer_consistency:.3f}")
print(f"BLEU Score: {generation_metrics.bleu_score:.3f}")
print(f"METEOR Score: {generation_metrics.meteor_score:.3f}")


[0m

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Retrieval Metrics:
Precision@5: 0.200
Recall@5: 1.000
MRR: 1.000
NDCG: 1.000
Semantic Similarity: 0.557
Reranking Score: 1.077

Generation Metrics:
ROUGE-1: 0.923
ROUGE-2: 0.364
ROUGE-L: 0.615
BERTScore: 0.925
Faithfulness Score: 0.994
Context Relevance: 0.824
Answer Consistency: 0.893
BLEU Score: 0.043
METEOR Score: 0.806
