### 1. Imports

In [None]:
import copy
import json
import os
from typing import List

from langchain_openai import ChatOpenAI
from ragas import evaluate, EvaluationDataset
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import MultiModalRelevance, MultiModalFaithfulness

from multimodal_rag import MultimodalRAG
from multimodal_embedder import create_embedder
from pdf_to_qa import generate_qa_for_pdf

### 2. Configuration

In [2]:
# PDF the RAG pipeline is built from, and the name of the embedder to create.
PDF_FILE = "knowledge/subset_riksbanken.pdf"
EMBEDDER = "siglip"

# RAG pipeline over PDF_FILE, using the embedder selected above.
rag = MultimodalRAG(
    embedder=create_embedder(EMBEDDER),
    pdf_file=PDF_FILE,
)

# LLM handed to Ragas as the judge during evaluation.
evaluator_llm = LangchainLLMWrapper(
    ChatOpenAI(model="gpt-4o-mini")
)

FAISS Input Embeddings Shape: (11, 768)


### Generate Dataset

In [3]:
# The QA pairs are cached on disk as "QA_<pdf name>.json" in the working
# directory; only generate them when that file is missing.
# NOTE(review): assumes generate_qa_for_pdf writes to this same file name,
# otherwise the cache check below never hits -- confirm against pdf_to_qa.
qa_filepath = "QA_" + os.path.basename(PDF_FILE).replace('.pdf', '.json')

if not os.path.exists(qa_filepath):
    # No cached file: build the QA set and use whatever path the generator returns.
    qa_path = generate_qa_for_pdf(PDF_FILE)
    print(f"Generated new QA file: {qa_path}")
else:
    qa_path = qa_filepath
    print(f"Using existing QA file: {qa_path}")

Using existing QA file: QA_subset_riksbanken.json


### 3. Answering the QA

In [4]:
# Load the question/answer pairs that drive the evaluation.
with open(qa_path, 'r', encoding='utf-8') as f:
    qa_data = json.load(f)

# Generated answers are cached in the working directory, keyed by the QA file
# name. This single path is used for both reading and writing the cache so the
# two can never drift apart.
rag_answers_path = "rag_generated_answers_" + os.path.basename(qa_path)

if os.path.exists(rag_answers_path):
    # Use a context manager so the file handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(rag_answers_path, 'r', encoding='utf-8') as f:
        rag_generated_answers = json.load(f)
    print(f"Using existing RAG generated answers file: {rag_answers_path}")
else:
    rag_generated_answers = []

    for qa in qa_data:
        query = qa["question"]
        reference = qa["answer"]

        # Retrieve first, then answer from the retrieved documents only.
        relevant_docs = rag.get_most_relevant_docs(query)
        response = rag.generate_answer(query, relevant_docs)
        rag_generated_answers.append(
            {
                "user_input": query,
                "retrieved_contexts": relevant_docs,
                "response": response,
                "reference": reference,
            }
        )

    # Persist the records so later runs can skip retrieval and generation.
    with open(rag_answers_path, 'w', encoding='utf-8') as f:
        json.dump(rag_generated_answers, f, ensure_ascii=False, indent=4)
    print(f"Generated new RAG generated answers file: {rag_answers_path}")

Using existing RAG generated answers file: rag_generated_answers_QA_subset_riksbanken.json


In [5]:
# Print every evaluation record (1-based numbering) for manual inspection.
separator = "-" * 40
printed_fields = (
    ("User Input", "user_input"),
    ("Retrieved Contexts", "retrieved_contexts"),
    ("Response", "response"),
    ("Reference", "reference"),
)

for entry_number, entry in enumerate(rag_generated_answers, start=1):
    print(f"Entry {entry_number}:")
    for label, key in printed_fields:
        print(f"  {label}: {entry[key]}")
    print(separator)

Entry 1:
  User Input: What was the total amount of interest-bearing debt for Swedish non-financial companies at the end of 2023, and how is this amount divided between loans from banks and other lenders and issued debt securities?
  Retrieved Contexts: [{'type': 'text', 'content': 'The real economy’s need for financial services \n23 unrated companies has also increased significantly. One contributing factor may be the low interest rates that followed the financial crisis, which lowered absolute re-turns and increased investor demand for riskier assets, as well as making the price of securities financing more favourable compared with the price of bank loans.  It is mainly foreign actors who invest in the debt securities issued by companies, fol-lowed by Swedish funds and Swedish insurance and pension companies, see Figure 12. The Riksbank owned 0.4 percent of the securities at the end of 2023, having pur-chased corporate bonds and commercial papers between September 2020 and De-cember 

### Retrieval Evaluation (MRR@K)

In [None]:
def mrr_at_k(
    all_retrieved_pages: List[List[int]],
    all_real_pages: List[int],
    k: int
) -> float:
    """
    Compute MRR@K (Mean Reciprocal Rank at K).

    For every query, the reciprocal rank is 1 / (1-based position of the
    correct page within the first `k` retrieved pages), or 0.0 when the
    correct page is not among them. The result is the mean over all queries.

    Parameters:
        all_retrieved_pages (List[List[int]]): A list of lists, where each sub-list
            is a ranked list of retrieved page numbers for a single query.
        all_real_pages (List[int]): A list of correct answer page numbers, aligned
            by index with `all_retrieved_pages`.
        k (int): The cutoff for top retrieved contexts to consider (e.g., 3 or 10).
            Values <= 0 mean no retrieved page is considered, so every query
            scores 0.0.

    Returns:
        float: The overall MRR@K score (mean of each query's reciprocal rank),
            or 0.0 when there are no queries.

    Raises:
        ValueError: If the two input lists differ in length. Pairing them
            anyway would silently score only a subset of the queries.
    """
    if len(all_retrieved_pages) != len(all_real_pages):
        raise ValueError(
            "all_retrieved_pages and all_real_pages must have the same length "
            f"(got {len(all_retrieved_pages)} and {len(all_real_pages)})"
        )

    # A negative k would make the slice below drop items from the END of the
    # ranking instead of yielding an empty top-K, so clamp it to zero.
    cutoff = max(k, 0)

    # Store the reciprocal rank for each query
    reciprocal_ranks = []

    for retrieved_pages, real_page in zip(all_retrieved_pages, all_real_pages):
        # Only consider the top-K retrieved pages
        top_k_pages = retrieved_pages[:cutoff]

        # A single scan: .index() raises ValueError when the page is absent.
        try:
            rank = top_k_pages.index(real_page) + 1
        except ValueError:
            reciprocal_ranks.append(0.0)
        else:
            reciprocal_ranks.append(1.0 / rank)

    # If no queries, return 0.0 to avoid division by zero
    if not reciprocal_ranks:
        return 0.0

    # Mean Reciprocal Rank across all queries
    return sum(reciprocal_ranks) / len(reciprocal_ranks)

# Index the QA entries by question once, instead of rescanning qa_data for
# every generated answer. setdefault keeps the FIRST entry for a repeated
# question, which matches a first-match scan over qa_data.
qa_by_question = {}
for item in qa_data:
    qa_by_question.setdefault(item["question"], item)

all_retrieved_pages = []
all_real_pages = []

# Gather, per query, the page of the reference answer and the ranked pages
# of the retrieved contexts.
for rag_answer in rag_generated_answers:
    question = rag_answer["user_input"]
    if question not in qa_by_question:
        # Fail with the offending question rather than an opaque StopIteration;
        # this happens when the cached answers were built from a different QA file.
        raise KeyError(f"No QA entry found for question: {question!r}")

    real_page = qa_by_question[question]["page_number"]
    retrieved_pages = [doc["page_number"] for doc in rag_answer["retrieved_contexts"]]

    all_real_pages.append(real_page)
    all_retrieved_pages.append(retrieved_pages)

# Now compute MRR@K for various values of k
for k in [3, 5]:
    mean_mrr_k = mrr_at_k(all_retrieved_pages, all_real_pages, k)
    print(f"Overall MRR@{k}: {mean_mrr_k:.4f}")


Overall MRR@3: 0.3667
Overall MRR@5: 0.4467


### Generation Evaluation

In [None]:
# Work on a deep copy: the loop below overwrites "retrieved_contexts", while
# rag_generated_answers must keep the full context dicts (the retrieval
# evaluation reads their "page_number").
# NOTE: the variable keeps its original spelling ("anwers") so that any other
# cell referring to it continues to work.
evaluation_rag_generated_anwers = copy.deepcopy(rag_generated_answers)

# Reduce every retrieved context dict to just its "content" value.
# NOTE(review): presumably because EvaluationDataset expects plain strings
# as contexts -- confirm against the Ragas documentation.
for entry in evaluation_rag_generated_anwers:
    entry["retrieved_contexts"] = [context["content"] for context in entry["retrieved_contexts"]]

evaluation_dataset = EvaluationDataset.from_list(evaluation_rag_generated_anwers)

# Score the generated answers with the LLM judge configured earlier.
result = evaluate(
    dataset=evaluation_dataset,
    metrics=[MultiModalFaithfulness(), MultiModalRelevance()],
    llm=evaluator_llm,
)
result

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

{'faithful_rate': 1.0000, 'relevance_rate': 1.0000}