In [1]:
# Download the en_core_web_sm spaCy model that a later cell loads via spacy.load("en_core_web_sm").
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m72.1 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
# Cell 1: Import required libraries
# Standard library
import logging
from typing import Dict, Any

# Third-party
import matplotlib.pyplot as plt
import numpy as np  # used by the RAGEvaluator type hints; imported here so it sits with the other imports
import pandas as pd
import seaborn as sns
import spacy
from IPython.display import display
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load the small English spaCy pipeline (the model must already be downloaded).
nlp = spacy.load("en_core_web_sm")
print("SpaCy model loaded successfully!")


# Set up logging
# force=True removes handlers that the notebook host may already have attached
# to the root logger. Without it basicConfig() does nothing when a handler
# exists, so neither the INFO level nor the format below would be applied.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True
)
logger = logging.getLogger(__name__)



SpaCy model loaded successfully!


In [3]:
# %pip installs into the environment of the running kernel, whereas !pip uses
# whichever pip is first on the shell PATH.
# NOTE: seaborn is imported by an earlier cell, so on a fresh kernel this
# install has to run before that cell.
%pip install seaborn




In [4]:
# %pip installs into the environment of the running kernel, whereas !pip uses
# whichever pip is first on the shell PATH.
# NOTE: spacy is imported by an earlier cell, so on a fresh kernel this
# install has to run before that cell.
%pip install spacy




In [5]:
import numpy as np

In [6]:
# Cell 2: Define the RAGEvaluator class
class RAGEvaluator:
    """Scores a RAG result with embedding similarity and named-entity overlap."""

    def __init__(self, spacy_model: str = "en_core_web_sm", sentence_transformer: str = 'all-MiniLM-L6-v2'):
        """
        Load the two models every metric relies on.

        Args:
            spacy_model: Name handed to ``spacy.load``; the resulting pipeline's
                ``ents`` drive the entity-overlap metrics.
            sentence_transformer: Name handed to ``SentenceTransformer``; its
                ``encode`` output drives the similarity metrics.
        """
        logger.info("Initializing RAG Evaluator...")
        self.nlp = spacy.load(spacy_model)
        self.model = SentenceTransformer(sentence_transformer)

    def _embed(self, text: str):
        """Encode one text as a single-item batch and return its embedding."""
        batch = self.model.encode([text])
        return batch[0]

    def _entity_set(self, text: str) -> set:
        """Return the lower-cased text of every entity found in ``text``."""
        return {entity.text.lower() for entity in self.nlp(text).ents}

    def compute_cosine_similarity(self, vec1: np.ndarray, vec2: np.ndarray) -> float:
        """Return the cosine similarity of two vectors as a plain float."""
        # cosine_similarity works on row matrices, so each vector becomes a
        # one-row batch and the single cell of the result is the answer.
        similarity_matrix = cosine_similarity([vec1], [vec2])
        return float(similarity_matrix[0][0])

    def evaluate_faithfulness(self, generated_answer: str, context: str) -> float:
        """Cosine similarity between the context embedding and the answer embedding."""
        context_vec = self._embed(context)
        answer_vec = self._embed(generated_answer)
        return self.compute_cosine_similarity(context_vec, answer_vec)

    def evaluate_answer_relevancy(self, answer: str, query: str) -> float:
        """Cosine similarity between the query embedding and the answer embedding."""
        query_vec = self._embed(query)
        answer_vec = self._embed(answer)
        return self.compute_cosine_similarity(query_vec, answer_vec)

    def evaluate_context_metrics(self, query: str, context: str) -> Dict[str, float]:
        """
        Entity-overlap recall and precision of the context with respect to the query.

        Returns:
            ``{"recall": ..., "precision": ...}`` where recall is the share of
            query entities present in the context and precision is the share of
            context entities present in the query. Both are 0.0 when the query
            has no entities; precision is 0.0 when the context has none.
        """
        # Both texts are parsed before any early exit.
        asked = self._entity_set(query)
        found = self._entity_set(context)

        if len(asked) == 0:
            return {"recall": 0.0, "precision": 0.0}

        shared = len(asked & found)
        recall = shared / len(asked)
        if found:
            precision = shared / len(found)
        else:
            precision = 0.0

        return {"recall": recall, "precision": precision}

    def evaluate_entity_recall(self, retrieved_context: str, ground_truth: str) -> float:
        """Share of ground-truth entities that also occur in the retrieved context.

        Returns 0.0 when the ground truth contains no entities.
        """
        retrieved = self._entity_set(retrieved_context)
        expected = self._entity_set(ground_truth)

        if not expected:
            return 0.0
        return len(retrieved & expected) / len(expected)

    def evaluate_approach(self, query: str, retrieved_context: str,
                         generated_answer: str, ground_truth: str) -> Dict[str, float]:
        """
        Compute every metric for one (query, context, answer, ground truth) sample.

        Any exception raised while scoring is logged and converted into a
        result whose five metrics are all 0.0, so one bad sample does not
        abort the caller's loop.
        """
        try:
            overlap = self.evaluate_context_metrics(query, retrieved_context)
            faithfulness = self.evaluate_faithfulness(generated_answer, retrieved_context)
            relevancy = self.evaluate_answer_relevancy(generated_answer, query)
            entity_recall = self.evaluate_entity_recall(retrieved_context, ground_truth)
        except Exception as err:
            logger.error(f"Error evaluating approach: {str(err)}")
            metric_names = ("faithfulness", "answer_relevancy", "context_recall",
                            "context_precision", "entity_recall")
            return {name: 0.0 for name in metric_names}

        return {
            "faithfulness": faithfulness,
            "answer_relevancy": relevancy,
            "context_recall": overlap["recall"],
            "context_precision": overlap["precision"],
            "entity_recall": entity_recall
        }


In [7]:
# Cell 3: Define visualization functions
# (display label, column prefix) pairs tried, in this order, when the caller
# does not name the approaches to plot. The first two are the prefixes these
# plots originally expected; the last three are the prefixes that
# evaluate_rag_approaches writes into its results DataFrame.
_KNOWN_APPROACHES = (
    ('Traditional RAG', 'traditional'),
    ('Modified RAG', 'modified'),
    ('Simple RAG', 'simpleRAG'),
    ('LQR Method', 'LQR'),
    ('Modified LQR Method', 'modLQR'),
)


def _metric_frame(results_df: pd.DataFrame, metric_name: str, approaches=None) -> pd.DataFrame:
    """Build a label -> scores frame for one metric, ready for a boxplot.

    Args:
        results_df: Frame holding ``<prefix>_<metric_name>`` columns.
        metric_name: Metric suffix, e.g. ``'faithfulness'``.
        approaches: Optional ``{display label: column prefix}`` mapping. When
            omitted or empty, every known prefix that has a column for this
            metric is used.

    Raises:
        KeyError: If no column (auto-detect) or a requested column (explicit
            mapping) for this metric exists in ``results_df``.
    """
    if approaches:
        pairs = list(approaches.items())
        missing = [f'{prefix}_{metric_name}' for _, prefix in pairs
                   if f'{prefix}_{metric_name}' not in results_df.columns]
        if missing:
            raise KeyError(f"Columns not found in results: {missing}")
    else:
        pairs = [(label, prefix) for label, prefix in _KNOWN_APPROACHES
                 if f'{prefix}_{metric_name}' in results_df.columns]
        if not pairs:
            tried = [f'{prefix}_{metric_name}' for _, prefix in _KNOWN_APPROACHES]
            raise KeyError(f"No column for metric '{metric_name}' found; tried {tried}")

    return pd.DataFrame({
        label: results_df[f'{prefix}_{metric_name}'] for label, prefix in pairs
    })


def plot_metric_comparison(results_df: pd.DataFrame, metric_name: str,
                           approaches: "Optional[Dict[str, str]]" = None):
    """Box-plot one metric across the evaluated RAG approaches.

    Args:
        results_df: Per-row metrics with ``<prefix>_<metric_name>`` columns.
        metric_name: Metric suffix to plot, e.g. ``'entity_recall'``.
        approaches: Optional ``{display label: column prefix}`` mapping. By
            default the approaches are detected from the columns present, so
            both ``traditional_``/``modified_`` frames and the
            ``simpleRAG_``/``LQR_``/``modLQR_`` frames work.

    Raises:
        KeyError: If the needed columns are not in ``results_df``.
    """
    # Resolve the columns first so a bad metric name does not leave an empty figure behind.
    data = _metric_frame(results_df, metric_name, approaches)

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(data=data, ax=ax)
    ax.set_title(f'{metric_name.replace("_", " ").title()} Comparison')
    ax.set_ylabel('Score')
    ax.tick_params(axis='x', labelrotation=0)
    fig.tight_layout()
    plt.show()

def plot_all_metrics(results_df: pd.DataFrame,
                     approaches: "Optional[Dict[str, str]]" = None):
    """Box-plot all five metrics, one stacked subplot per metric.

    Args:
        results_df: Per-row metrics with ``<prefix>_<metric>`` columns.
        approaches: Optional ``{display label: column prefix}`` mapping; see
            ``plot_metric_comparison``.

    Raises:
        KeyError: If any metric has no usable column in ``results_df``.
    """
    metrics = ['faithfulness', 'answer_relevancy', 'context_recall',
              'context_precision', 'entity_recall']

    # Build every frame before creating the figure so a missing column fails fast.
    frames = [_metric_frame(results_df, metric, approaches) for metric in metrics]

    fig, axes = plt.subplots(len(metrics), 1, figsize=(12, 4*len(metrics)))
    fig.suptitle('RAG Evaluation Metrics Comparison', y=1.02, fontsize=16)

    for ax, metric, data in zip(axes, metrics, frames):
        sns.boxplot(data=data, ax=ax)
        ax.set_title(f'{metric.replace("_", " ").title()}')
        ax.set_ylabel('Score')

    plt.tight_layout()
    plt.show()

In [10]:
# Cell 4: Main evaluation function
# One entry per evaluated approach:
# (result-column prefix, retrieved-context CSV column, generated-answer CSV column).
_APPROACH_COLUMNS = (
    ('simpleRAG', 'simple_rag_retrived_context', 'simple_rag_answer'),
    ('LQR', 'LQR_retrived_context', 'rag_answer_with_LQR'),
    ('modLQR', 'modLQR_retrived_context', 'rag_answer_with_modLQR'),
)


def _format_improvement(candidate_avg, baseline_avg) -> str:
    """Format the relative change of a candidate over the baseline as a signed percentage.

    Returns ``"n/a"`` when the change is undefined: the baseline is zero or
    either average is NaN.
    """
    if pd.isna(baseline_avg) or pd.isna(candidate_avg) or baseline_avg == 0:
        return "n/a"
    change = (candidate_avg - baseline_avg) / baseline_avg * 100
    return f"{change:+.1f}%"


def evaluate_rag_approaches(input_csv: str) -> pd.DataFrame:
    """
    Evaluate the simple RAG, LQR and modified-LQR approaches using data from a CSV file.

    For every row, each approach's retrieved context and generated answer are
    scored against the question and the ground-truth answer. A summary table
    of per-metric averages, with relative change versus simple RAG, is
    printed and displayed.

    Args:
        input_csv: Path to a CSV containing 'question', 'answer' and, for each
            approach, its retrieved-context and generated-answer columns.

    Returns:
        One row per input row with 'query', 'ground_truth_answer' and the
        metrics of every approach as '<prefix>_<metric>' columns. An empty
        DataFrame is returned (and no summary shown) when the CSV has no rows.

    Raises:
        ValueError: If any required column is missing from the CSV.
        Exception: Any other error (e.g. file not found) is logged and re-raised.
    """
    try:
        # Load data
        logger.info(f"Loading data from {input_csv}")
        df = pd.read_csv(input_csv)
        required_columns = ['question', 'answer']
        for _, context_col, answer_col in _APPROACH_COLUMNS:
            required_columns += [context_col, answer_col]

        # Validate columns
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            raise ValueError(f"Missing required columns: {missing_columns}")

        # Initialize evaluator
        evaluator = RAGEvaluator()
        results = []

        # Evaluate each row; the counter is positional so the log stays
        # correct regardless of the DataFrame's index labels.
        for position, (_, row) in enumerate(df.iterrows(), start=1):
            logger.info(f"Evaluating row {position}/{len(df)}")

            # 'answer' is the validated ground-truth column; it is both scored
            # against and echoed into the results.
            result = {
                'query': row['question'],
                'ground_truth_answer': row['answer'],
            }
            for prefix, context_col, answer_col in _APPROACH_COLUMNS:
                approach_metrics = evaluator.evaluate_approach(
                    row['question'],
                    row[context_col],
                    row[answer_col],
                    row['answer']
                )
                result.update({f'{prefix}_{k}': v for k, v in approach_metrics.items()})
            results.append(result)

        # Create results DataFrame
        results_df = pd.DataFrame(results)

        if results_df.empty:
            # No rows means no metric columns exist, so there is nothing to summarise.
            logger.warning(f"No rows found in {input_csv}; nothing to evaluate")
            return results_df

        # Display summary statistics
        metrics = ['faithfulness', 'answer_relevancy', 'context_recall',
                  'context_precision', 'entity_recall']

        print("\nAverage Metrics:")
        print("-" * 50)
        summary_data = []

        for metric in metrics:
            averages = {
                prefix: results_df[f'{prefix}_{metric}'].mean()
                for prefix, _, _ in _APPROACH_COLUMNS
            }
            baseline_avg = averages['simpleRAG']

            # Both "improvement" columns are measured relative to simple RAG.
            summary_data.append({
                'Metric': metric.replace('_', ' ').title(),
                'Simple RAG': f"{baseline_avg:.3f}",
                'LQR Method': f"{averages['LQR']:.3f}",
                'Modified LQR Method': f"{averages['modLQR']:.3f}",
                'Improvement with respect to LQR': _format_improvement(averages['LQR'], baseline_avg),
                'Improvement with respect to Modified LQR': _format_improvement(averages['modLQR'], baseline_avg)
            })

        summary_df = pd.DataFrame(summary_data)
        display(summary_df)

        return results_df

    except Exception as e:
        logger.error(f"Error in evaluation process: {str(e)}")
        raise

In [11]:
# Location of the evaluation CSV; it must provide the columns that
# evaluate_rag_approaches validates before scoring.
# NOTE(review): the /content/drive prefix looks like a mounted Google Drive in
# Colab — confirm the drive is mounted before running this cell.
input_csv = '/content/drive/MyDrive/RAGAS Evaluation of LQR/updated_results_with_all_data.csv'
# Scores every row, shows the summary table, and keeps the per-row metrics for later cells.
results_df = evaluate_rag_approaches(input_csv)

ERROR:__main__:Error in evaluation process: [Errno 2] No such file or directory: 'updated_results_with_all_data.csv'


FileNotFoundError: [Errno 2] No such file or directory: 'updated_results_with_all_data.csv'