In [None]:
import pandas as pd
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_recall, context_precision
from tqdm import tqdm
import os, sys

# Make the parent of the current working directory importable, so the
# `src` package import that follows can be resolved from this notebook.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

from src.rag_pipeline import RAGPipeline

# Configuration: paths are relative to the notebook's working directory.
DATA_PATH = "../data/filtered_and_cleaned_complaints.csv"
VECTOR_STORE_PATH = "../vector_store/"

# Construct the pipeline object only; the models and index are loaded
# explicitly in the next cell.
rag_pipeline = RAGPipeline(
    vector_store_path=VECTOR_STORE_PATH,
    embedding_model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_name="google/gemma-2b-it",
)


In [None]:
# Load models and FAISS index
# The setup steps run in a fixed order: embedding model, FAISS index, LLM,
# and finally the RAG chain.
# NOTE(review): presumably setup_rag_chain() requires the three loads above
# to have completed — confirm against src.rag_pipeline before reordering.
rag_pipeline.load_embedding_model()
rag_pipeline.load_faiss_index()
rag_pipeline.load_llm()
rag_pipeline.setup_rag_chain()

In [None]:
# Sample test questions paired with their reference ("ground truth") answers.
_QA_PAIRS = (
    (
        "Why are customers unhappy with BNPL services?",
        "Customers are unhappy with BNPL due to unclear terms, high interest rates, and issues with payment disputes.",
    ),
    (
        "What issues are reported with credit card billing?",
        "Common issues include unauthorized charges, billing disputes, and delays in resolving complaints.",
    ),
    (
        "What problems occur with money transfers?",
        "Money transfer issues include delayed transfers, incorrect recipient details, and poor customer service.",
    ),
    (
        "How do customers describe savings account issues?",
        "Customers report frozen accounts, unexpected fees, and difficulty accessing funds.",
    ),
    (
        "What are common personal loan complaints?",
        "Personal loan complaints often involve high interest rates, misleading terms, and slow approval processes.",
    ),
)

# Each record starts with its own empty "contexts" list; the contexts used
# for evaluation are collected later from the pipeline's retrieved documents.
test_data = [
    {"question": question, "ground_truth": reference, "contexts": []}
    for question, reference in _QA_PAIRS
]


In [None]:

# Generate RAG responses
def _answer_with_contexts(sample):
    """Query the pipeline for one test record and build its evaluation row.

    Args:
        sample: Mapping with at least "question" and "ground_truth" keys.

    Returns:
        dict with "question", "answer", "ground_truth" and "contexts", where
        "contexts" holds the 'text_content' entry of every retrieved document.
    """
    # query() yields three values; the middle one is not needed here.
    generated, _unused, docs = rag_pipeline.query(sample["question"])
    return {
        "question": sample["question"],
        "answer": generated,
        "ground_truth": sample["ground_truth"],
        "contexts": [doc['text_content'] for doc in docs],
    }


# Append row by row so that already-computed rows survive a failure part-way through.
rag_responses = []
for sample in tqdm(test_data, desc="Generating RAG responses"):
    rag_responses.append(_answer_with_contexts(sample))

In [None]:

# Convert to Dataset for evaluation
# Build the DataFrame once and reuse it for both the Dataset conversion and
# the preview, instead of constructing it twice from the same records.
responses_df = pd.DataFrame(rag_responses)
eval_dataset = Dataset.from_pandas(responses_df)
print("Evaluation dataset prepared:")
print(responses_df.head())

In [None]:
# Ragas evaluation
class RagasLocalLLM:
    """Thin adapter that exposes a locally loaded LLM through ``generate``.

    The wrapped object only needs an ``invoke(prompt)`` method; chat-style
    messages are flattened into a single text prompt before invocation.
    """

    def __init__(self, llm):
        """Store the wrapped model.

        Args:
            llm: Object providing ``invoke(prompt)``; its return value is used
                directly as the generated text.
        """
        self.llm = llm

    def generate(self, messages, **kwargs):
        """Flatten ``messages`` into one prompt, run the LLM, wrap the output.

        Args:
            messages: Iterable of message objects exposing ``.content``.
                ``HumanMessage`` instances are labelled "Human"; every other
                message is labelled "AI".
            **kwargs: Accepted for call compatibility and ignored.

        Returns:
            An ``LLMResult`` holding a single ``Generation`` with the LLM output.
        """
        # These names are not imported by the notebook's import cell, so they
        # are brought into scope here; without them the first call raised
        # NameError. langchain_core is installed as a dependency of ragas.
        from langchain_core.messages import HumanMessage
        from langchain_core.outputs import Generation, LLMResult

        prompt = "".join([f"{'Human' if isinstance(m, HumanMessage) else 'AI'}: {m.content}\n" for m in messages])
        response = self.llm.invoke(prompt)
        return LLMResult(generations=[[Generation(text=response)]])

# Wrap the pipeline's LLM so the same local model acts as the Ragas judge.
# NOTE(review): RagasLocalLLM is a plain class exposing only `generate`;
# confirm that the installed ragas version accepts such an object as `llm`.
ragas_llm = RagasLocalLLM(rag_pipeline.llm)
result = evaluate(
    eval_dataset,
    metrics=[faithfulness, answer_relevancy, context_recall, context_precision],
    llm=ragas_llm,
    embeddings=rag_pipeline.embeddings
)

# Convert the result once and reuse the frame for both the per-sample table
# and the column means, rather than calling to_pandas() twice.
scores_df = result.to_pandas()
print("Ragas Evaluation Results:")
print(scores_df)
print("Mean Scores:")
print(scores_df.mean(numeric_only=True))