### 1. Imports

In [3]:
import os
import json

from langchain_openai import ChatOpenAI
from ragas import evaluate, EvaluationDataset
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import MultiModalRelevance, MultiModalFaithfulness

from multimodal_rag import MultimodalRAG
from multimodal_embedder import create_embedder
from pdf_to_qa import generate_qa_for_pdf

### 2. Configuration

In [4]:
# --- Tunable inputs for this run -------------------------------------------
PDF_FILE = "knowledge/subset_monetary_policy_report.pdf"
EMBEDDER = "siglip"

# RAG pipeline under test: indexes PDF_FILE with the embedder named above.
rag = MultimodalRAG(
    pdf_file=PDF_FILE,
    embedder=create_embedder(EMBEDDER),
    page_mode="image_only",
)

# Judge model handed to Ragas for scoring the generated answers.
evaluator_llm = LangchainLLMWrapper(
    ChatOpenAI(model="gpt-4o-mini")
)

FAISS Input Embeddings Shape: (5, 768)


### Generate Dataset

In [5]:
# The QA set is cached in the working directory as "QA_<pdf name>.json".
qa_filepath = f"QA_{os.path.basename(PDF_FILE).replace('.pdf', '.json')}"

if not os.path.exists(qa_filepath):
    # Cache miss: build the question/answer pairs from the PDF.
    qa_path = generate_qa_for_pdf(PDF_FILE)
    print(f"Generated new QA file: {qa_path}")
else:
    # Cache hit: reuse the file produced by an earlier run.
    qa_path = qa_filepath
    print(f"Using existing QA file: {qa_path}")

Using existing QA file: QA_subset_monetary_policy_report.json


### 3. Answering the QA Questions

In [6]:
# Load the question/answer pairs that drive this evaluation run.
with open(qa_path, 'r', encoding='utf-8') as f:
    qa_data = json.load(f)

# Generated answers are cached on disk, keyed only by the QA file name.
# NOTE(review): the cache key does not include EMBEDDER or the page mode used
# to build `rag`, so a stale file is reused after changing the configuration;
# delete the file to force regeneration.
generated_answer_filepath = "rag_generated_answers_" + os.path.basename(qa_path)

if os.path.exists(generated_answer_filepath):
    # Cache hit: read through a context manager so the handle is always closed.
    with open(generated_answer_filepath, 'r', encoding='utf-8') as f:
        rag_generated_answers = json.load(f)
else:
    # Cache miss: run retrieval + generation for every question.
    rag_generated_answers = []
    for qa in qa_data:
        query = qa["question"]
        reference = qa["answer"]

        relevant_docs = rag.get_most_relevant_docs(query)
        response = rag.generate_answer(query, relevant_docs)
        rag_generated_answers.append(
            {
                "user_input": query,
                "retrieved_contexts": relevant_docs,
                "response": response,
                "reference": reference,
            }
        )

    # Persist to the same path the cache check reads, so the two cannot drift.
    # `output_dataset_file` is kept as a name for any later cell that uses it.
    output_dataset_file = generated_answer_filepath

    with open(output_dataset_file, 'w', encoding='utf-8') as f:
        json.dump(rag_generated_answers, f, ensure_ascii=False, indent=4)

In [7]:
# Human-readable dump of every sample, one block per question.
separator = "-" * 40

for position, record in enumerate(rag_generated_answers, start=1):
    print(f"Entry {position}:")
    print(f"  User Input: {record['user_input']}")
    print(f"  Retrieved Contexts: {record['retrieved_contexts']}")
    print(f"  Response: {record['response']}")
    print(f"  Reference: {record['reference']}")
    print(separator)

Entry 1:
  User Input: What was the total amount of interest-bearing debt for Swedish non-financial companies at the end of 2023, and how is this divided between loans from banks and debt securities?
  Retrieved Contexts: [{'type': 'text', 'content': 'The real economy’s need for financial services \n23 unrated companies has also increased significantly. One contributing factor may be the low interest rates that followed the financial crisis, which lowered absolute re-turns and increased investor demand for riskier assets, as well as making the price of securities financing more favourable compared with the price of bank loans.  It is mainly foreign actors who invest in the debt securities issued by companies, fol-lowed by Swedish funds and Swedish insurance and pension companies, see Figure 12. The Riksbank owned 0.4 percent of the securities at the end of 2023, having pur-chased corporate bonds and commercial papers between September 2020 and De-cember 2022 to mitigate the effects of 

### Generation Evaluation

In [8]:
# Build the evaluation inputs: each sample is copied with its
# "retrieved_contexts" replaced by the list of the contexts' "content" values.
# The original samples are left unmodified.
processed_rag_generated_answers = [
    {
        **sample,
        "retrieved_contexts": [
            context["content"] for context in sample["retrieved_contexts"]
        ],
    }
    for sample in rag_generated_answers
]

evaluation_dataset = EvaluationDataset.from_list(processed_rag_generated_answers)

# Score the generated answers with the two multimodal metrics, judged by evaluator_llm.
result = evaluate(
    dataset=evaluation_dataset,
    metrics=[MultiModalFaithfulness(), MultiModalRelevance()],
    llm=evaluator_llm,
)
result

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

{'faithful_rate': 1.0000, 'relevance_rate': 0.8000}