## Evaluation notebook

### 1. Install / Import

In [14]:
import json
from pdf2image import convert_from_path
from common_utils import embed_texts, embed_images, encode_image_to_base64, search_index, retrieve_context, call_gpt_4, extract_figures_from_pdf
from ragas import EvaluationDataset, evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness
from ipynb.fs.full.classRag import RAG
from dotenv import load_dotenv
from PIL import Image
load_dotenv()
from PyPDF2 import PdfReader

### 2. EXTRACT IMAGES

In [15]:
# Path of the PDF report processed in this notebook.
file ="../knowledge/subset_monetary_policy_report.pdf"

import cv2
import numpy as np
from PIL import Image
import os
import fitz  # PyMuPDF

def extract_images_from_pdf(pdf_path, output_folder="extracted_data", padding=300, xpadding=300):
    """Rasterize each PDF page and save padded crops of its non-white regions.

    Every page is rendered at 300 DPI and written to
    ``<output_folder>/full_page_<n>.png``. The page is then thresholded
    (pixels with intensity <= 240 become foreground), external contours are
    detected, and each contour whose bounding box is wider AND taller than
    50 pixels is cropped with padding and saved as
    ``<output_folder>/page_<n>_image_<k>.png``.

    Args:
        pdf_path: Path of the PDF to process.
        output_folder: Directory receiving the PNG files; created if missing.
        padding: Vertical padding, in pixels, added above and below each box.
        xpadding: Horizontal padding, in pixels, added left and right of each box.

    Returns:
        A list of ``{"image_path": <path>}`` dicts, one per saved crop.
    """
    os.makedirs(output_folder, exist_ok=True)
    image_paths = []

    doc = fitz.open(pdf_path)
    try:
        for page_index in range(len(doc)):
            page_number = page_index + 1
            page = doc.load_page(page_index)

            # Render the page to disk, then load it back as an OpenCV array.
            pix = page.get_pixmap(dpi=300)
            full_image_path = os.path.join(output_folder, f"full_page_{page_number}.png")
            pix.save(full_image_path)
            full_image = cv2.imread(full_image_path)
            page_height, page_width = full_image.shape[:2]

            # Inverted binary threshold: anything that is not near-white
            # becomes foreground so contours outline the page content.
            gray_image = cv2.cvtColor(full_image, cv2.COLOR_BGR2GRAY)
            _, thresh = cv2.threshold(gray_image, 240, 255, cv2.THRESH_BINARY_INV)
            contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

            img_index = 0
            for contour in contours:
                x, y, w, h = cv2.boundingRect(contour)

                # Ignore small regions (text fragments, specks).
                if w <= 50 or h <= 50:
                    continue

                # Pad each edge independently and clamp to the page. Working
                # with edges (not origin + size) keeps the right/bottom edge
                # from over-extending when the left/top edge is clamped to 0,
                # and applies xpadding to both horizontal edges.
                left = max(x - xpadding, 0)
                top = max(y - padding, 0)
                right = min(x + w + xpadding, page_width)
                bottom = min(y + h + padding, page_height)
                cropped_image = full_image[top:bottom, left:right]

                # OpenCV arrays are BGR; PIL expects RGB.
                pil_image = Image.fromarray(cv2.cvtColor(cropped_image, cv2.COLOR_BGR2RGB))
                img_filename = f"page_{page_number}_image_{img_index + 1}.png"
                img_path = os.path.join(output_folder, img_filename)
                pil_image.save(img_path, "PNG")

                image_paths.append({"image_path": img_path})
                img_index += 1
    finally:
        # Release the document even if rendering or cropping a page fails.
        doc.close()

    return image_paths

# image_paths = extract_images_from_pdf(file)


### 3. EXTRACT TEXT

In [17]:
def extract_text_from_pdf(pdf_path):
    """Extract the text of every non-empty page of a PDF.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        A list of ``{"text": <stripped page text>, "page_number": <1-based>}``
        dicts. Pages whose extracted text is empty or whitespace-only are
        omitted.
    """
    # Read the PDF that was passed in, not the notebook-level ``file`` global.
    reader = PdfReader(pdf_path)

    text_data = []
    for page_number, page in enumerate(reader.pages, start=1):
        page_text = page.extract_text()
        if page_text and page_text.strip():
            text_data.append({
                "text": page_text.strip(),
                "page_number": page_number
            })
    return text_data

#text_data = extract_text_from_pdf(file)
#print(text_data)



In [23]:
def _strip_code_fence(response_text):
    """Return the reply without a surrounding Markdown ``` / ```json fence."""
    cleaned = (response_text or "").strip()
    # str.strip("```json") would strip a *set of characters*, not a prefix,
    # so the fence is removed explicitly instead.
    if cleaned.startswith("```"):
        cleaned = cleaned[3:]
        if cleaned.startswith("json"):
            cleaned = cleaned[4:]
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]
    return cleaned.strip()


def _parse_qa_response(response_text, source_label):
    """Parse a model reply into ``(question, answer)``; return None on failure.

    ``source_label`` (e.g. ``"text on page 3"``) is only used in the printed
    diagnostics.
    """
    try:
        response_data = json.loads(_strip_code_fence(response_text))
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON response for {source_label}: {e}")
        print("Response text:", response_text)
        return None

    # A reply that is valid JSON but not an object has no question/answer.
    if not isinstance(response_data, dict):
        response_data = {}
    question = response_data.get("question")
    answer = response_data.get("answer")
    if question and answer:
        return question, answer

    print(f"Warning: Missing Q&A data for {source_label}")
    return None


def _page_number_from_image_path(image_path):
    """Return the page number from a ``page_<n>_image_<k>.png`` file name.

    Only the file name is inspected, so underscores in the directory part of
    the path do not matter. Returns None when the name has another shape.
    """
    name_parts = os.path.basename(image_path).split("_")
    try:
        return int(name_parts[1])
    except (IndexError, ValueError):
        return None


def _qa_instructions(subject):
    """Build the question-generation instructions for ``subject`` ("context" or "image")."""
    return (
        f"Your task is to formulate a question from the given {subject} while following these rules:\n"
        f"1. The question must be answerable using the provided {subject}.\n"
        "2. It should be based on non-trivial information.\n"
        "3. The answer must not contain any links.\n"
        "4. The question should be of moderate difficulty.\n"
        f"5. Avoid phrases like 'provided {subject}'.\n"
        "6. The response must be in valid JSON format as follows:\n"
        "{'question': 'Generated question here', 'answer': 'Generated answer here'}"
    )


def generate_qa_for_pdf(pdf_path):
    """Generate one question/answer pair per text page and per extracted image.

    Text comes from ``extract_text_from_pdf`` and image crops from
    ``extract_images_from_pdf``. Each is sent to ``call_gpt_4`` with
    instructions to reply with a JSON object holding ``question`` and
    ``answer``. Replies that cannot be parsed, or that lack either field,
    are reported with ``print`` and skipped.

    The collected data is also written to ``QA_<pdf base name>.json`` in the
    current working directory.

    Args:
        pdf_path: Path of the PDF to process.

    Returns:
        A tuple ``(sample_queries, expected_responses, qa_data)`` where the
        first two are parallel lists of questions and answers and ``qa_data``
        is a list of ``{"page_number", "question", "answer"}`` dicts.
    """
    output_json = "QA_" + os.path.basename(pdf_path).replace('.pdf', '.json')

    # Extract images and text from PDF
    image_data = extract_images_from_pdf(pdf_path)
    text_data = extract_text_from_pdf(pdf_path)

    sample_queries = []
    expected_responses = []
    qa_data = []

    def record(page_number, qa_pair):
        """Store a parsed (question, answer) pair; ignore failed parses."""
        if qa_pair is None:
            return
        question, answer = qa_pair
        sample_queries.append(question)
        expected_responses.append(answer)
        qa_data.append({"page_number": page_number, "question": question, "answer": answer})

    # Questions from page text
    for text_info in text_data:
        page_number = text_info["page_number"]
        user_prompt = json.dumps([
            {"type": "text", "text": _qa_instructions("context")},
            {"type": "text", "text": text_info["text"]}
        ])
        response_text = call_gpt_4(user_prompt)
        record(page_number, _parse_qa_response(response_text, f"text on page {page_number}"))

    # Questions from extracted image crops
    for image_info in image_data:
        image_path = image_info["image_path"]
        # Integer page number, consistent with the text entries above.
        page_number = _page_number_from_image_path(image_path)

        # Close the image file as soon as it has been encoded.
        with Image.open(image_path) as image:
            base64_str = encode_image_to_base64(image)

        user_prompt = json.dumps([
            {"type": "text", "text": _qa_instructions("image")},
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_str}"}}
        ])
        response_text = call_gpt_4(user_prompt)
        record(page_number, _parse_qa_response(response_text, f"image on page {page_number}"))

    # Output structured data
    output = {
        "sample_queries": sample_queries,
        "expected_responses": expected_responses,
        "qa_data": qa_data
    }

    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=4)

    return sample_queries, expected_responses, qa_data

# Cell entry point: generate the Q&A set for the PDF at PDF_FILE and print
# the generated questions followed by the expected answers.
if __name__ == "__main__":
    PDF_FILE = "../knowledge/subset_monetary_policy_report.pdf"
    # The results are kept as notebook-level variables.
    sample_queries, expected_responses, qa_data = generate_qa_for_pdf(pdf_path=PDF_FILE)
    print(sample_queries)
    print(expected_responses)

['What was the total amount of interest-bearing debt for Swedish non-financial companies at the end of 2023?', 'What classification system is used to determine the size of non-financial companies in Sweden, and what factors influence this classification?', 'Which sectors primarily rely on loans to finance their activities, and which sector has the highest share of securities borrowing?', 'What was the percentage of corporate debt securities held by the Riksbank at the end of 2023?', "What was the approximate market value of outstanding equities at the end of 2023, and how does this value relate to Sweden's GDP?", 'What is the main focus of the image that reflects a specific theme or message?', 'What visual elements in the image suggest a theme of urban decay or neglect?', 'What is the main theme or subject depicted in the image provided, and how does it relate to the overall mood or message conveyed?', 'Based on the image, what are the key features of the graph in terms of its axes and

In [114]:
def generate_qa_from_text(text_data):
    """Generate one question/answer pair per page of extracted text.

    For every page, the first 1000 characters of its text are sent to
    ``call_gpt_4`` with instructions to reply with a JSON object containing
    ``question`` and ``answer``. Pages with blank text, empty replies, or
    replies that are not a JSON object are reported with ``print`` and
    skipped.

    Args:
        text_data: Iterable of dicts with ``"page_number"`` and ``"text"``
            keys.

    Returns:
        A list of ``{"page_number", "question", "answer"}`` dicts, one per
        page whose reply could be parsed.
    """
    qa_data = []

    for page in text_data:
        page_number = page["page_number"]
        text_content = page["text"]

        if not text_content.strip():
            print(f"Skipping empty text on page {page_number}")
            continue

        user_prompt = json.dumps([
            {"type": "text", "text": (
                "Generate a question and answer from the provided text. "
                "Ensure the question is non-trivial and answerable from the text. "
                "Return the response in JSON format: {'question': '...', 'answer': '...'}"
            )},
            {"type": "text", "text": text_content[:1000]}  # Limit to 1000 chars for token efficiency
        ])

        response_text = call_gpt_4(user_prompt)
        if not response_text or not response_text.strip():  # Empty or missing response
            print(f"Warning: Empty response for page {page_number}")
            continue

        try:
            response_data = json.loads(response_text)
        except json.JSONDecodeError:
            print(f"Error decoding JSON response on page {page_number}. Response received: {response_text}")
            continue

        # Valid JSON that is not an object carries no question/answer fields.
        if not isinstance(response_data, dict):
            print(f"Warning: Unexpected JSON structure on page {page_number}. Response received: {response_text}")
            continue

        # Record the parsed pair; without this the function always returned
        # an empty list.
        qa_data.append({
            "page_number": page_number,
            "question": response_data.get("question"),
            "answer": response_data.get("answer")
        })

    return qa_data

# Example usage
#qa_results = generate_qa_from_text(text_data)


In [115]:
# Cell entry point: extract text and images from the PDF, generate Q&A pairs
# from the text, and save them to qa_results.json.
if __name__ == "__main__":
    pdf_path = "../knowledge/subset_monetary_policy_report.pdf"

    # Step 1: Extract text and images
    text_data = extract_text_from_pdf(pdf_path)
    # NOTE: image_paths is not used again in this cell; only the text feeds
    # the Q&A generation below.
    image_paths = extract_images_from_pdf(pdf_path)

    # Step 2: Generate questions and answers
    qa_results = generate_qa_from_text(text_data)

    # Save final Q&A to JSON file
    with open("qa_results.json", "w", encoding="utf-8") as f:
        json.dump(qa_results, f, ensure_ascii=False, indent=4)

    print("Q&A extraction complete. Results saved to qa_results.json")


[{"type": "text", "text": "Generate a question and answer from the provided text. Ensure the question is non-trivial and answerable from the text. Return the response in JSON format: {'question': '...', 'answer': '...'}"}, {"type": "text", "text": "The real economy\u2019s need for financial services \n20 Figure 8. Financial assets of non-financial companies SEK billion \n  Note. Refers to Swedish non-financial companies, including tenant-owner housing associations. \u201cLoans\u201d include net group loans. Excluding equity capital. Source: Statistics Sweden.  Companies need financing to make investments and run their operations. The type of financing on which they have to pay interest is called interest-bearing debt and amounted to just over SEK 5,500 billion at the end of 2023. This is equivalent to just under Sweden\u2019s annual GDP. Of this, SEK 4,000 billion were loans from banks and other lenders and SEK 1,500 billion were issued debt securities, see Figure 9.  01 0002 0003 0004

### 4. Generate QA Set in the correct format

In [None]:
def generate_qa_for_pdf(pdf_path, dpi=100, poppler_path='/opt/homebrew/bin'):
    """Generate one question/answer pair per PDF page from its rendered image.

    Each page is rasterized with ``convert_from_path``, base64-encoded and
    sent to ``call_gpt_4`` with instructions to reply with a JSON object
    holding ``question`` and ``answer``. Replies that cannot be parsed, or
    that lack either field, are reported with ``print`` and skipped.

    The questions and answers are also written to
    ``QA_<pdf base name>.json`` in the current working directory.

    Args:
        pdf_path: Path of the PDF to process.
        dpi: Resolution used to rasterize the pages.
        poppler_path: Directory containing the poppler binaries, forwarded to
            ``convert_from_path``. The default is the previously hard-coded
            Homebrew location.

    Returns:
        A tuple ``(sample_queries, expected_responses)`` of parallel lists.
    """
    output_json = "QA_" + os.path.basename(pdf_path).replace('.pdf', '.json')
    pages = convert_from_path(pdf_path, dpi=dpi, poppler_path=poppler_path)

    sample_queries = []
    expected_responses = []

    def unfence(reply):
        """Drop a surrounding Markdown ``` / ```json fence, if present."""
        cleaned = (reply or "").strip()
        if cleaned.startswith("```"):
            cleaned = cleaned[3:]
            if cleaned.startswith("json"):
                cleaned = cleaned[4:]
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]
        return cleaned.strip()

    for i, page_image in enumerate(pages, start=1):
        base64_str = encode_image_to_base64(page_image)

        user_prompt = json.dumps([
            {"type": "text", "text": "Your task is to formulate a question from given context satisfying the rules given below: 1. The question should be fully answered from the given context. 2. The question should be framed from a part that contains non-trivial information. 3. The answer should not contain any links. 4. The question should be of moderate difficulty. 5. The question must be reasonable and must be understood and responded to by humans. 6. Do not use phrases that ’provided context’, etc in the question context: 7. The response must be in valid JSON format: {'question': 'Generated question here','answer': 'Generated answer here'}"},
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_str}"}}
        ])

        response_text = call_gpt_4(user_prompt)
        print(f"Raw API response (page {i}):", response_text)

        try:
            response_data = json.loads(unfence(response_text))
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON response for page {i}: {e}")
            print("Response text:", response_text)
            continue

        # Valid JSON that is not an object carries no question/answer fields.
        if not isinstance(response_data, dict):
            response_data = {}

        question = response_data.get("question")
        answer = response_data.get("answer")
        if question and answer:
            sample_queries.append(question)
            expected_responses.append(answer)
        else:
            print(f"Warning: Missing Q&A data on page {i}")

    output = {
        "sample_queries": sample_queries,
        "expected_responses": expected_responses
    }

    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=4)

    print("Final Q&A data saved to", output_json)
    return sample_queries, expected_responses

# Cell entry point: generate the Q&A set for the PDF at PDF_FILE and keep the
# questions and expected answers as notebook-level variables.
if __name__ == "__main__":
    PDF_FILE = "../knowledge/subset_monetary_policy_report.pdf"
    sample_queries, expected_responses = generate_qa_for_pdf(pdf_path=PDF_FILE)


In [None]:
def generate_qa_for_pdf(pdf_path, dpi=150, poppler_path='/opt/homebrew/bin'):
    """Generate two question/answer pairs per PDF page from its rendered image.

    Each page is rasterized with ``convert_from_path``, base64-encoded and
    sent to ``call_gpt_4`` with instructions to reply with a JSON object
    holding parallel ``questions`` and ``answers`` lists. Replies that cannot
    be parsed, or whose lists differ in length, are reported with ``print``
    and skipped.

    The questions and answers are also written to
    ``QA_<pdf base name>.json`` in the current working directory.

    Args:
        pdf_path: Path of the PDF to process.
        dpi: Resolution used to rasterize the pages (kept moderate to limit
            the size of the encoded image).
        poppler_path: Directory containing the poppler binaries, forwarded to
            ``convert_from_path``. The default is the previously hard-coded
            Homebrew location.

    Returns:
        A tuple ``(sample_queries, expected_responses)`` of parallel lists.
    """
    output_json = "QA_" + os.path.basename(pdf_path).replace('.pdf', '.json')
    pages = convert_from_path(pdf_path, dpi=dpi, poppler_path=poppler_path)

    sample_queries = []
    expected_responses = []

    for i, page_image in enumerate(pages, start=1):
        # Send the page inline as a base64 data URL, as the other cells do.
        # A relative file:// URL is not readable by a remote API, and writing
        # temp_page_<i>.png left stray files in the working directory.
        base64_str = encode_image_to_base64(page_image)

        # The prompt is JSON-encoded like every other call_gpt_4 call site in
        # this notebook, and it spells out the keys the parser below expects.
        user_prompt = json.dumps([
            {"type": "text", "text": (
                "Generate 2 question and answer pairs based on the content of this page. "
                "Return the response in valid JSON format: "
                '{"questions": ["...", "..."], "answers": ["...", "..."]}'
            )},
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_str}"}}
        ])

        response_text = call_gpt_4(user_prompt)
        print(f"Raw API response (page {i}):", response_text)

        try:
            response_data = json.loads(response_text)
        except (json.JSONDecodeError, TypeError):
            # TypeError covers a missing (None) reply.
            print(f"Error decoding JSON response for page {i}")
            continue

        # Valid JSON that is not an object carries no questions/answers.
        if not isinstance(response_data, dict):
            response_data = {}
        questions = response_data.get("questions", [])
        answers = response_data.get("answers", [])

        if (
            isinstance(questions, list)
            and isinstance(answers, list)
            and len(questions) == len(answers)
        ):
            sample_queries.extend(questions)
            expected_responses.extend(answers)
        else:
            print(f"Warning: Mismatched Q&A pairs on page {i}")

    output = {
        "sample_queries": sample_queries,
        "expected_responses": expected_responses
    }

    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=4)

    print("Final Q&A data saved to", output_json)
    return sample_queries, expected_responses

# Cell entry point: generate the Q&A set for the PDF at PDF_FILE and keep the
# questions and expected answers as notebook-level variables.
if __name__ == "__main__":
    PDF_FILE = "../knowledge/subset_monetary_policy_report.pdf"
    sample_queries, expected_responses = generate_qa_for_pdf(pdf_path=PDF_FILE)


In [None]:
def generate_qa_for_pdf(pdf_path, dpi=100, poppler_path='/opt/homebrew/bin'):
    """Generate one question/answer pair per PDF page from its rendered image.

    Each page is rasterized with ``convert_from_path``, base64-encoded and
    sent to ``call_gpt_4`` with instructions to reply with a JSON object
    holding ``question`` and ``answer``. Replies that cannot be parsed, or
    that lack either field, are reported with ``print`` and skipped.

    The collected data is also written to ``QA_<pdf base name>.json`` in the
    current working directory.

    Args:
        pdf_path: Path of the PDF to process.
        dpi: Resolution used to rasterize the pages.
        poppler_path: Directory containing the poppler binaries, forwarded to
            ``convert_from_path``. The default is the previously hard-coded
            Homebrew location.

    Returns:
        A tuple ``(sample_queries, expected_responses, qa_data)`` where the
        first two are parallel lists of questions and answers and ``qa_data``
        is a list of ``{"page_number", "question", "answer"}`` dicts.
    """
    output_json = "QA_" + os.path.basename(pdf_path).replace('.pdf', '.json')
    pages = convert_from_path(pdf_path, dpi=dpi, poppler_path=poppler_path)

    sample_queries = []
    expected_responses = []
    qa_data = []

    def unfence(reply):
        """Drop a surrounding Markdown ``` / ```json fence, if present."""
        cleaned = (reply or "").strip()
        if cleaned.startswith("```"):
            cleaned = cleaned[3:]
            if cleaned.startswith("json"):
                cleaned = cleaned[4:]
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]
        return cleaned.strip()

    for i, page_image in enumerate(pages, start=1):
        base64_str = encode_image_to_base64(page_image)

        user_prompt = json.dumps([
            {"type": "text", "text": (
                "Your task is to formulate a question from the given context while following these rules:\n"
                "1. The question must be answerable using the provided context.\n"
                "2. It should be based on non-trivial information.\n"
                "3. The answer must not contain any links.\n"
                "4. The question should be of moderate difficulty.\n"
                "5. Avoid phrases like 'provided context'.\n"
                "6. The response must be in valid JSON format as follows:\n"
                "{'question': 'Generated question here', 'answer': 'Generated answer here'}"
            )},
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_str}"}}
        ])

        response_text = call_gpt_4(user_prompt)
        print(f"Raw API response (page {i}):", response_text)

        try:
            # Tolerate replies wrapped in a Markdown code fence.
            response_data = json.loads(unfence(response_text))
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON response for page {i}: {e}")
            print("Response text:", response_text)
            continue

        # Valid JSON that is not an object carries no question/answer fields.
        if not isinstance(response_data, dict):
            response_data = {}

        question = response_data.get("question")
        answer = response_data.get("answer")
        if question and answer:
            sample_queries.append(question)
            expected_responses.append(answer)
            qa_data.append({"page_number": i, "question": question, "answer": answer})
        else:
            print(f"Warning: Missing Q&A data on page {i}")

    # Output structured data
    output = {
        "sample_queries": sample_queries,
        "expected_responses": expected_responses,
        "qa_data": qa_data
    }

    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=4)

    print("Final Q&A data saved to", output_json)
    return sample_queries, expected_responses, qa_data

# Cell entry point: generate the Q&A set for the PDF at PDF_FILE and keep the
# questions, expected answers and per-page records as notebook-level variables.
if __name__ == "__main__":
    PDF_FILE = "../knowledge/subset_monetary_policy_report.pdf"
    sample_queries, expected_responses, qa_data = generate_qa_for_pdf(pdf_path=PDF_FILE)
