## Evaluation notebook

### 1. Install / Import

In [1]:
import json
from pdf2image import convert_from_path
from common_utils import embed_texts, embed_images, encode_image_to_base64, search_index, retrieve_context, call_gpt_4, extract_figures_from_pdf
from ragas import EvaluationDataset, evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness
from ipynb.fs.full.classRag import RAG
from dotenv import load_dotenv
from PIL import Image
load_dotenv()
from PyPDF2 import PdfReader

  from .autonotebook import tqdm as notebook_tqdm


### 2. EXTRACT IMAGES

In [2]:
file ="../knowledge/subset_monetary_policy_report.pdf"

import cv2
import numpy as np
from PIL import Image
import os
import fitz  # PyMuPDF

def extract_images_from_pdf(pdf_path, output_folder="extracted_data", padding=300, xpadding = 300):
    os.makedirs(output_folder, exist_ok=True)  # Create output folder
    image_paths = []
    
    # Open the PDF document
    doc = fitz.open(pdf_path)
    
    # Loop through all pages in the PDF
    for page_index in range(len(doc)):
        page = doc.load_page(page_index)  # Load the page
        
        # Rasterize the page to an image
        pix = page.get_pixmap(dpi=300)  # Convert to image with high DPI
        full_image_path = os.path.join(output_folder, f"full_page_{page_index + 1}.png")
        pix.save(full_image_path)
        
        # Convert the image to OpenCV format (numpy array)
        full_image = cv2.imread(full_image_path)
        
        # Convert to grayscale
        gray_image = cv2.cvtColor(full_image, cv2.COLOR_BGR2GRAY)
        
        # Threshold to create a binary image (to highlight potential image areas)
        _, thresh = cv2.threshold(gray_image, 240, 255, cv2.THRESH_BINARY_INV)
        
        # Find contours (regions that are "boxes" in the image)
        contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        
        # Loop through contours and crop the image regions
        img_index = 0
        for contour in contours:
            # Get the bounding box of each contour (x, y, width, height)
            x, y, w, h = cv2.boundingRect(contour)
            
            # Ignore small areas (you can adjust the threshold for min area size)
            if w > 50 and h > 50:
                # Add padding to the bounding box
                x_padded = max(x - padding, 0)  # Ensure x doesn't go below 0
                y_padded = max(y - padding, 0)  # Ensure y doesn't go below 0
                w_padded = min(w + 2 * xpadding, full_image.shape[1] - x_padded)  # Ensure width doesn't exceed image
                h_padded = min(h + 2 * padding, full_image.shape[0] - y_padded)  # Ensure height doesn't exceed image
                
                # Crop the image with padding
                cropped_image = full_image[y_padded:y_padded + h_padded, x_padded:x_padded + w_padded]
                
                # Convert cropped image to PIL format to save it as PNG
                pil_image = Image.fromarray(cv2.cvtColor(cropped_image, cv2.COLOR_BGR2RGB))
                img_filename = f"page_{page_index + 1}_image_{img_index + 1}.png"
                img_path = os.path.join(output_folder, img_filename)
                pil_image.save(img_path, "PNG")
                
                image_paths.append({"image_path": img_path})
                img_index += 1
    
    doc.close()
    return image_paths

# image_paths = extract_images_from_pdf(file)


### 3. EXTRACT TEXT

In [3]:
def extract_text_from_pdf(pdf_path):
    text_data = []
    # Extract text
    reader = PdfReader(file)
    num_pages = len(reader.pages)

    for page_i in range(num_pages):
        page = reader.pages[page_i]
        page_text = page.extract_text()
        
        if page_text and page_text.strip():
            text_data.append({
                "text": page_text.strip(),
                "page_number": page_i + 1
            })
    return text_data

#text_data = extract_text_from_pdf(file)
#print(text_data)



In [5]:
def generate_qa_for_pdf(pdf_path):
    output_json = "QA_" + os.path.basename(pdf_path).replace('.pdf', '.json')

    # Extract images and text from PDF
    image_data = extract_images_from_pdf(pdf_path)
    text_data = extract_text_from_pdf(pdf_path)

    sample_queries = []
    expected_responses = []
    qa_data = []

    # Iterate over text data to generate questions
    for text_info in text_data:
        page_number = text_info["page_number"]
        page_text = text_info["text"]

        user_prompt = json.dumps([
            {"type": "text", "text": (
                "Your task is to formulate a question from the given context while following these rules:\n"
                "1. The question must be answerable using the provided context.\n"
                "2. It should be based on non-trivial information.\n"
                "3. The answer must not contain any links.\n"
                "4. The question should be of moderate difficulty.\n"
                "5. Avoid phrases like 'provided context'.\n"
                "6. The response must be in valid JSON format as follows:\n"
                "{'question': 'Generated question here', 'answer': 'Generated answer here'}"
            )},
            {"type": "text", "text": page_text}
        ])
        
        response_text = call_gpt_4(user_prompt)
        #print(f"Raw API response for text (page {page_number}):", response_text)

        try:
            cleaned_response_text = response_text.strip("```json").strip("```")  # Remove surrounding backticks
            response_data = json.loads(cleaned_response_text)
            question = response_data.get("question")
            answer = response_data.get("answer")

            if question and answer:
                sample_queries.append(question)
                expected_responses.append(answer)
                qa_data.append({"page_number": page_number, "question": question, "answer": answer})
            else:
                print(f"Warning: Missing Q&A data for text on page {page_number}")

        except json.JSONDecodeError as e:
            print(f"Error decoding JSON response for text on page {page_number}: {e}")
            print("Response text:", response_text)
            continue

    # Iterate over image data to generate questions
    for image_info in image_data:
        page_number = image_info["image_path"].split("_")[2]  # Extract page number from image path
        image_path = image_info["image_path"]

        # Convert the image to base64 for processing
        image = Image.open(image_path)
        base64_str = encode_image_to_base64(image)

        user_prompt = json.dumps([
            {"type": "text", "text": (
                "Your task is to formulate a question from the given image while following these rules:\n"
                "1. The question must be answerable using the provided image.\n"
                "2. It should be based on non-trivial information.\n"
                "3. The answer must not contain any links.\n"
                "4. The question should be of moderate difficulty.\n"
                "5. Avoid phrases like 'provided image'.\n"
                "6. The response must be in valid JSON format as follows:\n"
                "{'question': 'Generated question here', 'answer': 'Generated answer here'}"
            )},
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_str}"}}
        ])

        response_text = call_gpt_4(user_prompt)
        #print(f"Raw API response for image (page {page_number}):", response_text)

        try:
            cleaned_response_text = response_text.strip("```json").strip("```")  # Remove surrounding backticks
            response_data = json.loads(cleaned_response_text)
            question = response_data.get("question")
            answer = response_data.get("answer")

            if question and answer:
                sample_queries.append(question)
                expected_responses.append(answer)
                qa_data.append({"page_number": page_number, "question": question, "answer": answer})
            else:
                print(f"Warning: Missing Q&A data for image on page {page_number}")

        except json.JSONDecodeError as e:
            print(f"Error decoding JSON response for image on page {page_number}: {e}")
            print("Response text:", response_text)
            continue

    # Output structured data
    output = {
        "sample_queries": sample_queries,
        "expected_responses": expected_responses,
        "qa_data": qa_data
    }

    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=4)

    #print("Final Q&A data saved to", output_json)
    return sample_queries, expected_responses, qa_data

In [6]:
if __name__ == "__main__":
    PDF_FILE = "../knowledge/subset_monetary_policy_report.pdf"
    sample_queries, expected_responses, qa_data = generate_qa_for_pdf(pdf_path=PDF_FILE)
    print("Q&A extraction complete. Results saved to qa_results.json")


Q&A extraction complete. Results saved to qa_results.json


### 2. Initialize RAG

In [None]:
from RAGasClass import RAG
# Initialize RAG instance
from config import OPENAI_API_KEY as openai_api_key

rag = RAG(openai_api_key)
doc = rag.load_documents([PDF_FILE])  # Load documents
rag.load_documents(PDF_FILE)

# Query and retrieve the most relevant document
query = "What element in the image indicates a significant architectural feature, and how does it contribute to the overall aesthetic?"
relevant_doc = rag.get_most_relevant_docs(query)

# Generate an answer
answer = rag.generate_answer(query, relevant_doc)

print(f"Query: {query}")
print(f"Relevant Document: {relevant_doc}")
print(f"Answer: {answer}")



### 3. Collect Evaluation Data

In [7]:
dataset = []

for query,reference in zip(sample_queries,expected_responses):

    relevant_docs = rag.get_most_relevant_docs(query)
    response = rag.generate_answer(query, relevant_docs)
    dataset.append(
        {
            "user_input":query,
            "retrieved_contexts":relevant_docs,
            "response":response,
            "reference":reference
        }
    )




In [12]:
for i, entry in enumerate(dataset):
    print(f"Entry {i+1}:")
    print(f"  User Input: {entry['user_input']}")
    print(f"  Retrieved Contexts: {entry['retrieved_contexts']}")
    print(f"  Response: {entry['response']}")
    print(f"  Reference: {entry['reference']}")
    print("-" * 40)



Entry 1:
  User Input: What was the total amount of interest-bearing debt for Swedish non-financial companies at the end of 2023, and how does this compare to Sweden's annual GDP?
  Retrieved Contexts: [{'type': 'text', 'content': 'The real economy’s need for financial services \n24 Companies also use equity capital to finance themselves. At the end of 2023, the mar-ket value of companies’ outstanding equities amounted to just over SEK 21,000 bil-lion, equivalent to around 340 percent of GDP. Of this, just over a third were listed eq-uities and the rest unlisted.26 Foreign investors account for the largest share of invest-ment in listed equities. This is followed by Swedish funds, non-financial companies and households, see Figure 12. There are also venture capital companies that invest in companies’ equity capital, but this is mainly in unlisted companies. Read more in the section Private equity firms. Companies are linked to different actors in more ways than through their savings an

### 4. Evaluate

In [None]:
from ragas.dataset_schema import SingleTurnSample 
from ragas.metrics import MultiModalRelevance


sample = dataset[0]

# sample = SingleTurnSample(
#         user_input="What was the total amount of interest-bearing debt held by Swedish non-financial companies at the end of 2023, and how does it relate to Sweden's annual GDP?",
#         response="Cats are cute.",
#         retrieved_contexts=[
#             "custom_eval/multimodal/images/tesla.jpg"
#         ]
#     )
scorer = MultiModalRelevance()
await scorer.single_turn_ascore(sample)

In [None]:
from ragas.dataset_schema import SingleTurnSample 
from ragas.metrics import MultiModalRelevance

sample = SingleTurnSample(
        user_input="What was the total amount of interest-bearing debt for non-financial companies in Sweden at the end of 2023, and how was this amount divided between loans and issued debt securities?",
        response="Cats are cute.",
        retrieved_contexts=[
            "custom_eval/multimodal/images/tesla.jpg"
        ]
    )
scorer = MultiModalRelevance()

await scorer.single_turn_ascore(sample)