### 1. Import

In [1]:
import os
import json

from langchain_openai import ChatOpenAI
from ragas.llms import LangchainLLMWrapper

from document_processor import TextProcessor, ImageProcessor, PageImageProcessor, TextAndInlineImageProcessor
from multimodal_rag import MultimodalRAG
from embedder import SigLIPEmbedder, VisRAGEmbedder, OpenAIEmbedder, ColPaliEmbedder #, SFREmbedder
from pdf_to_qa import generate_qa_for_pdf
from evaluation import evaluate_generation, compute_mrr_at_k, compute_recall_at_k, compute_precision_at_k, compute_f1_score
from datasets import load_dataset
from chartQAloader import generate_chartQA_pdf_and_json

#### 1.2 Load ChartQA
Load the ChartQA test split from Hugging Face and keep its first 10 examples as a small subset.

In [2]:
# Full ChartQA test split, fetched through the `datasets` library.
dataset = load_dataset("lmms-lab/ChartQA", split="test")

# Keep only the first 10 rows so the rest of the notebook runs quickly.
subset_dataset = dataset.select(range(10))

Create the knowledge-base PDF and the accompanying JSON question–answer mapping.

In [3]:
# Build the PDF knowledge base and its QA JSON from the 10-example subset.
# To process the full test split instead, pass `dataset` with these outputs:
#   pdf_output_path='knowledge/ChartQA_Evaluation_Set.pdf'
#   json_output_path='json_files/QA_ChartQA_Evaluation_Set.json'
generate_chartQA_pdf_and_json(
    subset_dataset,
    pdf_output_path='knowledge/subset_ChartQA_Evaluation_Set.pdf',
    json_output_path='json_files/QA_subset_ChartQA_Evaluation_Set.json',
)


Processing Charts: 100%|██████████| 10/10 [00:01<00:00,  7.87chart/s]

PDF saved as knowledge/subset_ChartQA_Evaluation_Set.pdf
JSON file saved as json_files/QA_subset_ChartQA_Evaluation_Set.json





### 2. Configuration

In [4]:
# Knowledge-base PDF to index. Other candidates that have been used:
#   "knowledge/subset_riksbanken.pdf"
#   "knowledge/ChartQA_Evaluation_Set.pdf"
#   "knowledge/total_recall.pdf"
PDF_FILE = "knowledge/subset_ChartQA_Evaluation_Set.pdf"

# Alternative processors that can be swapped in (SFREmbedder is commented out
# in the import cell and must be re-enabled there before use):
#   text_processor = TextProcessor(SFREmbedder())
#   image_processor = ImageProcessor(SigLIPEmbedder())
#   page_image_processor = PageImageProcessor(VisRAGEmbedder())
#   text_inline_processor = TextAndInlineImageProcessor(SFREmbedder())
# Select no=3 for the ChartQA embeddings; use 1 or 2 for the other embeddings.
text_inline_processor = TextAndInlineImageProcessor(OpenAIEmbedder(), no=3)

# RAG pipeline over the selected PDF, using the processor(s) listed here.
rag = MultimodalRAG([text_inline_processor], PDF_FILE)

# LLM wrapper used for the Ragas evaluation.
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))

### 3. Generate Dataset - for CoinQA

In [5]:
# Expected location of the QA file that belongs to the configured PDF.
pdf_name = os.path.basename(PDF_FILE)
qa_filepath = "json_files/QA_" + pdf_name.replace('.pdf', '.json')

# Generate the QA pairs only when no file is present yet; otherwise reuse it.
if not os.path.exists(qa_filepath):
    qa_path = generate_qa_for_pdf(PDF_FILE)
    print(f"Generated new QA file: {qa_path}")
else:
    qa_path = qa_filepath
    print(f"Using existing QA file: {qa_path}")

# Read the QA entries into memory for the following cells.
with open(qa_path, 'r', encoding='utf-8') as qa_file:
    qa_data = json.load(qa_file)
    

Using existing QA file: json_files/QA_subset_ChartQA_Evaluation_Set.json


### 4. Answering the QA

In [6]:
# Reload the QA entries so this cell can be re-run on its own once `qa_path` is set.
with open(qa_path, 'r', encoding='utf-8') as f:
    qa_data = json.load(f)

# Cache location for the generated answers, derived from the QA file name.
rag_answers_path = "json_files/rag_generated_answers_" + os.path.basename(qa_path)

if os.path.exists(rag_answers_path):
    # Reuse previously generated answers. The context manager closes the file
    # handle, which the former `json.load(open(...))` call left open.
    with open(rag_answers_path, 'r', encoding='utf-8') as f:
        rag_generated_answers = json.load(f)
    print(f"Using existing RAG generated answers file: {rag_answers_path}")
else:
    # Answer every question with the RAG pipeline, in the same order as `qa_data`.
    rag_generated_answers = []
    for qa in qa_data:
        query = qa["question"]
        reference = qa["answer"]

        relevant_docs = rag.get_most_relevant_docs(query, top_k=5)
        response = rag.generate_answer(query, relevant_docs)
        rag_generated_answers.append(
            {
                "query": query,
                "retrieved_contexts": relevant_docs,
                "generated_answer": response,
                "true_answer": reference,
            }
        )

    # Save to the same path that is checked above, so the next run finds the cache.
    output_dataset_file = rag_answers_path
    # Make sure the target directory exists before writing.
    os.makedirs(os.path.dirname(output_dataset_file), exist_ok=True)

    with open(output_dataset_file, 'w', encoding='utf-8') as f:
        json.dump(rag_generated_answers, f, ensure_ascii=False, indent=4)
    print(f"Generated new RAG generated answers file: {output_dataset_file}")

Generated new RAG generated answers file: json_files/rag_generated_answers_QA_subset_ChartQA_Evaluation_Set.json


### 5. Evaluate Retrieval

In [None]:
# Ground-truth pages and retrieved pages, one entry per generated answer.
all_real_pages, all_retrieved_pages = [], []

# First QA entry for each question text; used as a fallback lookup so the
# matching does not rescan `qa_data` for every answer.
first_item_by_question = {}
for item in qa_data:
    first_item_by_question.setdefault(item["question"], item)

for position, rag_answer in enumerate(rag_generated_answers):
    question = rag_answer["query"]

    # Answers are generated in `qa_data` order, so prefer the entry at the same
    # position. This keeps the correct page when the same question text occurs
    # more than once; a text-only lookup would always return the first match.
    if position < len(qa_data) and qa_data[position]["question"] == question:
        matched_item = qa_data[position]
    elif question in first_item_by_question:
        matched_item = first_item_by_question[question]
    else:
        # Fail with an explicit message instead of a bare StopIteration.
        raise KeyError(f"No QA entry found for query: {question!r}")

    real_page = matched_item["page_number"]
    retrieved_pages = [doc["page_number"] for doc in rag_answer["retrieved_contexts"]]

    # Store ground truth as a list: a single int page is wrapped, anything else
    # is kept as it is.
    all_real_pages.append([real_page] if isinstance(real_page, int) else real_page)
    all_retrieved_pages.append(retrieved_pages)

# Function to test a specific question by index
def test_question(index):
    if index < 1 or index > len(rag_generated_answers):
        print("Invalid index. Please select a number between 1 and 5.")
        return

    rag_answer = rag_generated_answers[index - 1]
    real_page = all_real_pages[index - 1]
    retrieved_pages = all_retrieved_pages[index - 1]

    print(f"Question: {rag_answer['query']}")
    print(f"True Answer: {rag_answer['true_answer']}")
    print(f"Generated Answer: {rag_answer['generated_answer']}")
    print(f"Real Page(s): {real_page}")
    print(f"Retrieved Pages: {retrieved_pages}")
    return real_page, retrieved_pages

# Example usage: inspect a single question by its 1-based index (here the fifth)
# real_page, retrieved_pages = test_question(5)

# Or evaluate over every question at once
real_page, retrieved_pages = all_real_pages, all_retrieved_pages

# (label, scoring function) pairs, listed in the order they are reported.
retrieval_metrics = [
    ("MRR", compute_mrr_at_k),
    ("Recall", compute_recall_at_k),
    ("Precision", compute_precision_at_k),
    ("F1 Score", compute_f1_score),
]

# Print each metric at both cut-offs before moving on to the next metric.
for label, scorer in retrieval_metrics:
    for k in [3, 5]:
        print(f"{label}@{k}: {scorer(retrieved_pages, real_page, k):.2f}")

Question: What's the value of the lowest bar?
True Answer: 23
Generated Answer: The value of the lowest bar in the first chart, which represents the long-term price index in food commodities from 1850 to 2015, is **18.81**, corresponding to cocoa.
Real Page(s): [3]
Retrieved Pages: [1, 2, 5, 4, 3]
MRR@3: 0.00
MRR@5: 0.20
Recall@3: 0.00
Recall@5: 1.00
Precision@3: 0.00
Precision@5: 0.20
F1 Score@3: 0.00
F1 Score@5: 0.33


### 6. Evaluate Generation

In [None]:
# Score the generated answers with the evaluator LLM and show the result.
# NOTE(review): judging by the variable name, the result presumably holds
# faithfulness and relevance scores; confirm against `evaluation.evaluate_generation`.
faithfulness_and_relevance = evaluate_generation(
    rag_generated_answers,
    evaluator_llm,
)
print(faithfulness_and_relevance)