## Evaluation notebook

### 1. Install / Import

In [3]:
import os
import json
import asyncio
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from dotenv import load_dotenv

from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.dataset_schema import SingleTurnSample 
from ragas.metrics import MultiModalRelevance
from ragas import EvaluationDataset

from common_utils import encode_image_to_base64, call_gpt_4, extract_images_from_pdf, extract_text_from_pdf
from multimodal_rag import MultimodalRAG
from config import OPENAI_API_KEY
from multimodal_embedder import *

# Load variables from a local .env file (if one is present) into the process environment.
load_dotenv()

ModuleNotFoundError: No module named 'common_utils'

### 4. Generate questions

In [None]:
# Source PDF used for Q&A generation and for building the RAG pipeline.
# The path is relative to the notebook's working directory.
PDF_FILE = "../knowledge/subset_monetary_policy_report.pdf"

def _build_qa_prompt(subject, payload_part):
    """Return the JSON-encoded user prompt asking for one Q&A pair about ``subject``.

    Args:
        subject: Word used in the instructions ("context" or "image").
        payload_part: Message part carrying the page text or the image data URL.
    """
    return json.dumps([
        {"type": "text", "text": (
            f"Your task is to formulate a question from the given {subject} while following these rules:\n"
            f"1. The question must be answerable using the provided {subject}.\n"
            "2. It should be based on non-trivial information.\n"
            "3. The answer must not contain any links.\n"
            "4. The question should be of moderate difficulty.\n"
            f"5. Avoid phrases like 'provided {subject}'.\n"
            "6. The response must be in valid JSON format as follows:\n"
            "{'question': 'Generated question here', 'answer': 'Generated answer here'}"
        )},
        payload_part,
    ])


def _parse_qa_response(response_text, source, page_number):
    """Extract ``(question, answer)`` from a model reply, or return ``None`` after logging why not.

    Args:
        response_text: Raw reply returned by ``call_gpt_4``.
        source: "text" or "image"; only used in the diagnostic messages.
        page_number: Page the reply belongs to; only used in the diagnostic messages.
    """
    # Local import keeps this block self-contained (ast is standard library).
    import ast

    if not isinstance(response_text, str):
        print(f"Warning: Missing Q&A data for {source} on page {page_number}")
        return None

    # Remove an optional Markdown code fence. str.strip("```json") would strip a
    # *set of characters* and does nothing when whitespace precedes the fence,
    # so the prefix and suffix are removed explicitly instead.
    cleaned = response_text.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned[3:]
        if cleaned[:4].lower() == "json":
            cleaned = cleaned[4:]
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]
    cleaned = cleaned.strip()

    try:
        parsed = json.loads(cleaned)
    except json.JSONDecodeError as e:
        # The prompt's example uses single quotes, so the model may answer with a
        # Python-style dict literal; accept that form before giving up.
        try:
            parsed = ast.literal_eval(cleaned)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            print(f"Error decoding JSON response for {source} on page {page_number}: {e}")
            print("Response text:", response_text)
            return None

    # A syntactically valid reply that is not an object (e.g. a list) carries no Q&A pair.
    if not isinstance(parsed, dict):
        print(f"Warning: Missing Q&A data for {source} on page {page_number}")
        return None

    question = parsed.get("question")
    answer = parsed.get("answer")
    if question and answer:
        return question, answer

    print(f"Warning: Missing Q&A data for {source} on page {page_number}")
    return None


def generate_qa_for_pdf(pdf_path):
    """Generate one question/answer pair per text entry and per image of a PDF.

    Each text entry and each extracted image is sent to ``call_gpt_4`` with a
    prompt asking for a single Q&A pair. Replies that cannot be parsed or that
    lack a question or an answer are reported and skipped. The collected pairs
    are written to ``QA_<pdf name>.json`` in the current working directory.

    Args:
        pdf_path: Path to the PDF file to process.

    Returns:
        A tuple ``(sample_queries, expected_responses, qa_data)`` where the first
        two are parallel lists of questions and answers, and ``qa_data`` is a
        list of ``{"page_number", "question", "answer"}`` dicts (text pages
        first, then images).
    """
    # splitext swaps only the real extension; str.replace would also rewrite a
    # ".pdf" occurring elsewhere in the file name.
    pdf_stem = os.path.splitext(os.path.basename(pdf_path))[0]
    output_json = "QA_" + pdf_stem + ".json"

    # Extract images and text from PDF
    image_data = extract_images_from_pdf(pdf_path)
    text_data = extract_text_from_pdf(pdf_path)

    qa_data = []

    # Text entries: the page text itself is the context.
    for text_info in text_data:
        page_number = text_info["page_number"]
        user_prompt = _build_qa_prompt(
            "context", {"type": "text", "text": text_info["text"]}
        )
        pair = _parse_qa_response(call_gpt_4(user_prompt), "text", page_number)
        if pair is not None:
            qa_data.append({"page_number": page_number, "question": pair[0], "answer": pair[1]})

    # Images: sent inline as a base64 PNG data URL.
    for data in image_data:
        page_number = data["page_number"]
        base64_str = encode_image_to_base64(data["pil_image"])
        user_prompt = _build_qa_prompt(
            "image",
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_str}"}},
        )
        pair = _parse_qa_response(call_gpt_4(user_prompt), "image", page_number)
        if pair is not None:
            qa_data.append({"page_number": page_number, "question": pair[0], "answer": pair[1]})

    sample_queries = [item["question"] for item in qa_data]
    expected_responses = [item["answer"] for item in qa_data]

    # Output structured data
    output = {
        "sample_queries": sample_queries,
        "expected_responses": expected_responses,
        "qa_data": qa_data
    }

    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=4)

    return sample_queries, expected_responses, qa_data

# Run the generation for PDF_FILE. This calls the LLM once per text entry and per image
# and writes the resulting QA_*.json file to the working directory.
sample_queries, expected_responses, qa_data = generate_qa_for_pdf(pdf_path=PDF_FILE)

#### 4.1 (Load from file instead)

In [5]:
# Reuse the Q&A pairs saved by an earlier generation run instead of regenerating them.
file_path = "QA_subset_monetary_policy_report.json"

with open(file_path, mode="r", encoding="utf-8") as qa_file:
    data = json.loads(qa_file.read())

# Unpack the three top-level sections of the saved file.
sample_queries, expected_responses, qa_data = (
    data[key] for key in ("sample_queries", "expected_responses", "qa_data")
)

### 5.1 Initialize RAG

In [None]:
# Build the multimodal RAG pipeline for the source PDF.
# NOTE(review): PDF_FILE is assigned in the section 4 generation cell; when the Q&A is
# loaded from file instead, PDF_FILE must still be defined before this cell runs.
rag = MultimodalRAG(PDF_FILE)

### 5.2 Generate dataset

In [7]:
# Generate dataset: one record per (question, reference answer) pair, holding the
# retrieved contexts and the answer the RAG pipeline produced for that question.
dataset = []

for query, reference in zip(sample_queries, expected_responses):
    relevant_docs = rag.get_most_relevant_docs(query)
    response = rag.generate_answer(query, relevant_docs)
    dataset.append(
        {
            "user_input": query,
            # Store only each document's content string, as the preparation step in
            # section 6.1 does, so the saved records hold plain text contexts that
            # are JSON-serialisable and match the list-of-strings shape used there.
            "retrieved_contexts": [doc["content"] for doc in relevant_docs],
            "response": response,
            "reference": reference,
        }
    )

# Save the dataset to a JSON file
output_dataset_file = "generated_dataset.json"
with open(output_dataset_file, 'w', encoding='utf-8') as f:
    json.dump(dataset, f, ensure_ascii=False, indent=4)

#### 5.2.1 (Load the saved dataset instead, to save time)

In [8]:
# Read back the dataset written by the generation step so the RAG calls need not be repeated.
output_dataset_file = "generated_dataset.json"
with open(output_dataset_file, mode="r", encoding="utf-8") as dataset_file:
    dataset = json.loads(dataset_file.read())

### 6. Evaluate using RAGAs
#### 6.1 Prepare the data for testing

In [None]:
# Judge model and embeddings for the RAGAS evaluation. Both receive the API key
# explicitly so neither depends on OPENAI_API_KEY already being set in the environment.
llm = ChatOpenAI(model="gpt-4o-mini", openai_api_key=OPENAI_API_KEY)
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Rebuild the records with plain-text contexts (each retrieved document's 'content').
dataset = []

for query, reference in zip(sample_queries, expected_responses):
    relevant_docs = rag.get_most_relevant_docs(query)
    response = rag.generate_answer(query, relevant_docs)
    dataset.append(
        {
            "user_input": query,
            "retrieved_contexts": [doc['content'] for doc in relevant_docs],
            "response": response,
            "reference": reference,
        }
    )

# Print every record for a manual sanity check before evaluation.
for i, entry in enumerate(dataset):
    print(f"Entry {i+1}:")
    print(f"  User Input: {entry['user_input']}")
    print(f"  Retrieved Contexts: {entry['retrieved_contexts']}")
    print(f"  Response: {entry['response']}")
    print(f"  Reference: {entry['reference']}")
    print("-" * 40)


# Convert the list of dicts into the dataset object consumed by ragas.evaluate.
evaluation_dataset = EvaluationDataset.from_list(dataset)


Entry 1:
  User Input: What was the total amount of interest-bearing debt for Swedish non-financial companies at the end of 2023, and how is it distributed between loans and issued debt securities?
  Retrieved Contexts: ['The real economy’s need for financial services \n24 Companies also use equity capital to finance themselves. At the end of 2023, the mar-ket value of companies’ outstanding equities amounted to just over SEK 21,000 bil-lion, equivalent to around 340 percent of GDP. Of this, just over a third were listed eq-uities and the rest unlisted.26 Foreign investors account for the largest share of invest-ment in listed equities. This is followed by Swedish funds, non-financial companies and households, see Figure 12. There are also venture capital companies that invest in companies’ equity capital, but this is mainly in unlisted companies. Read more in the section Private equity firms. Companies are linked to different actors in more ways than through their savings and financin

#### 6.2 Evaluate

In [12]:
# Wrap the LangChain chat model so RAGAS can use it as the judge LLM.
evaluator_llm = LangchainLLMWrapper(llm)

# Score the prepared dataset on context recall, faithfulness and factual correctness.
result = evaluate(
    dataset=evaluation_dataset,
    metrics=[
        LLMContextRecall(),
        Faithfulness(),
        FactualCorrectness(),
    ],
    llm=evaluator_llm,
)
result

Evaluating:   0%|          | 0/33 [00:00<?, ?it/s]

{'context_recall': 0.4545, 'faithfulness': 0.1667, 'factual_correctness': 0.0000}

### 7. Multimodal RAGAs
#### 7.1 Data preparation (not necessary)

In [13]:
# Load the hand-prepared multimodal dataset and print every record for inspection.
output_dataset_file = "multimodal_data2.json"
with open(output_dataset_file, mode="r", encoding="utf-8") as dataset_file:
    dataset = json.loads(dataset_file.read())

for i, entry in enumerate(dataset):
    # One print call per record; sep="\n" puts each field on its own line.
    print(
        f"Entry {i+1}:",
        f"  User Input: {entry['user_input']}",
        f"  Retrieved Contexts: {entry['retrieved_contexts']}",
        f"  Response: {entry['response']}",
        f"  Reference: {entry['reference']}",
        "-" * 40,
        sep="\n",
    )

Entry 1:
  User Input: What was the total amount of interest-bearing debt for Swedish non-financial companies at the end of 2023, and how does it compare to Sweden's annual GDP?
  Retrieved Contexts: ['The type of financing on which they have to pay interest is called interest-bearing debt and amounted to just over SEK 5,500 billion at the end of 2023.']
  Response: The total amount of interest-bearing debt for Swedish non-financial companies at the end of 2023 was just over SEK 5,500 billion, which is equivalent to just under Sweden's annual GDP.
  Reference: The total amount of interest-bearing debt for Swedish non-financial companies at the end of 2023 was just over SEK 5,500 billion, which is equivalent to just under Sweden's annual GDP.
----------------------------------------
Entry 2:
  User Input: How have the financial assets of Swedish non-financial companies evolved between 2018 and 2023?
  Retrieved Contexts: ['/extracted_data/page_1_image_1.png']
  Response: The financial a

#### 7.2 Run sample test

In [4]:
# Set OpenAI API key
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# Initialize LLM and embeddings (wrapped for use by RAGAS metrics)
llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

# Define a valid sample: the retrieved context is the path of an extracted page image.
# NOTE(review): this is an absolute, machine-specific path; the sample can only be
# scored where that file exists. Consider pointing it at a path relative to the project.
sample = SingleTurnSample(
    user_input="How have the financial assets of Swedish non-financial companies evolved between 2018 and 2023?",
    response="Swedish non-financial companies experienced a steady increase in financial assets over this period, driven by...",
    retrieved_contexts=["/Users/olofswedberg/Documents/thesis/masters-thesis/prototype/extracted_data/page_1_image_1.png"]
)

# Pass LLM to MultiModalRelevance
scorer = MultiModalRelevance(llm=llm)


async def score_sample():
    """Score ``sample`` with the multimodal relevance metric and print the result.

    Deliberately not named ``evaluate``: that would overwrite the ``evaluate``
    function imported from ragas at the top of the notebook, which the
    evaluation cell in section 6.2 calls.
    """
    score = await scorer.single_turn_ascore(sample)
    print("Relevance Score:", score)


# Run the event loop
asyncio.run(score_sample())


Relevance Score: 0.0
