### 1. INSTALL / IMPORT LIBRARIES

In [1]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" # TODO: This can introduce miscalculations. Opt for just CPU or GPU.
# os.environ["OMP_NUM_THREADS"] = "1" # This is to avoid conflicts with Faiss (for MAC users)

import numpy as np
import faiss
import torch

from PyPDF2 import PdfReader
from PIL import Image

# In your notebook (example):
from common_utils import encode_image_to_base64, search_index, retrieve_context, call_gpt_4
from multimodalembedder import create_embedder
from ipynb.fs.full.evalbook import extract_images_from_pdf


## Configure Embedder and PDF

In [2]:
# Pick the torch device: CUDA when torch reports a usable GPU, CPU otherwise.
# NOTE(review): `device` is not passed to create_embedder in the cells shown
# here -- confirm whether the embedder selects its own device.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Build the multimodal embedder used for both text and image chunks.
embedder = create_embedder("CLIP")

# PDF that is parsed, embedded and indexed by the following cells.
PDF_FILE = "../knowledge/subset_monetary_policy_report.pdf"

preprocessor_config.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer_config.json:   0%|          | 0.00/1.51k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/263 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/554 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.69G [00:00<?, ?B/s]

### 4. PROCESS THE PDF (TEXT + IMAGES)

In [3]:
# Per-page text records: {"text": str, "page_number": int (1-based)}.
text_data = []
# Per-image records: {"image": PIL.Image, "image_number": int (1-based)}.
image_data = []

# Extract text: keep one record for every page that yields non-blank text.
reader = PdfReader(PDF_FILE)
num_pages = len(reader.pages)

for page_number, page in enumerate(reader.pages, start=1):
    page_text = page.extract_text()

    # extract_text() may return None/"" (e.g. image-only pages); skip those.
    if page_text and page_text.strip():
        text_data.append({
            "text": page_text.strip(),
            "page_number": page_number
        })

# Extract images: extract_images_from_pdf returns records carrying an
# "image_path" key pointing at the extracted image file.
all_image_paths = extract_images_from_pdf(PDF_FILE)
all_images = []
for image_number, image_info in enumerate(all_image_paths, start=1):
    # Image.open is lazy and keeps the file handle open. Force the pixel data
    # into memory and close the file right away so handles are not leaked
    # (one per extracted image); the loaded image stays usable afterwards.
    with Image.open(image_info["image_path"]) as image:
        image.load()

    all_images.append(image)
    image_data.append({
        "image": image,
        "image_number": image_number
    })

### 5. CREATE EMBEDDINGS FOR TEXT AND IMAGE CHUNKS

In [4]:
# Parallel accumulators filled by the embedding cells below: position k of
# all_metadata describes the item whose vector sits at position k of
# all_embeddings.
all_metadata, all_embeddings = [], []

#### 5A. Embed all text chunks

In [5]:
# Embed every page's text in a single batch, then record one metadata entry
# and one vector per page, in the same order.
texts_list = [entry["text"] for entry in text_data]
if texts_list:
    text_embeddings = embedder.embed_text(texts_list)
    for idx, vector in enumerate(text_embeddings):
        source = text_data[idx]
        all_metadata.append(dict(
            type="text",
            content=source["text"],
            page_number=source["page_number"],
        ))
        all_embeddings.append(vector)

Expanding inputs for image tokens in BLIP-2 should be done in processing. Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. Using processors without these attributes in the config is deprecated and will throw an error in v4.50.


TypeError: Blip2ForImageTextRetrieval.forward() missing 1 required positional argument: 'pixel_values'

#### 5B. Embed all images

In [6]:
# Embed every extracted image in a single batch and record one metadata entry
# and one vector per image, appended after the text entries.
pil_images_list = [id_["image"] for id_ in image_data]
if len(pil_images_list) > 0:
    image_embeddings = embedder.embed_images(pil_images_list)
    for i, emb in enumerate(image_embeddings):

        # Convert PIL image to base64 once (so we can send it to GPT-4 with Vision)
        base64_str = encode_image_to_base64(image_data[i]["image"])
        all_metadata.append({
            "type": "image",
            "content": base64_str,
            "image_number": image_data[i]["image_number"]
        })
        all_embeddings.append(emb)

# Fail with a clear message when nothing was embedded. Without this guard the
# conversion below yields a 1-D empty array and `.shape[1]` raises an opaque
# IndexError. len() is used (not truthiness) so the check also works if
# all_embeddings is already an ndarray.
if len(all_embeddings) == 0:
    raise ValueError(
        "No embeddings were produced: the PDF yielded no text or image chunks, "
        "or the embedding cells above did not run successfully."
    )

# Convert to a float32 NumPy array (one row per item).
# Note: this rebinds all_embeddings from a list to an ndarray, so the
# accumulator-initialisation cell must be re-run before re-running the
# embedding cells (ndarray has no .append).
all_embeddings = np.array(all_embeddings).astype('float32')
embedding_dimension = all_embeddings.shape[1]

### 6. BUILD & POPULATE FAISS

In [7]:
# Create a flat Faiss index of the specified dimension. IndexFlatIP scores
# by inner product.
# NOTE(review): inner-product scores only equal cosine similarity when the
# vectors are L2-normalised -- confirm whether the embedder normalises them.
index = faiss.IndexFlatIP(embedding_dimension)

# Add all embeddings to the index. Row k of all_embeddings gets Faiss id k,
# which is the position of the matching entry in all_metadata.
index.add(all_embeddings)

### 7. QUERY PIPELINE (RETRIEVAL + GENERATION)

In [8]:
def answer_query(user_query, top_k=3):
    """
    Answer a question about the indexed PDF via retrieval + GPT-4.

    1. Embed the user query (as text).
    2. Retrieve top_k similar items from the PDF (text or image).
    3. Build a ChatCompletion messages list with text + images.
    4. Return GPT-4's answer.

    Relies on the notebook-level `embedder`, `index` and `all_metadata`.

    Args:
        user_query: The question, as plain text.
        top_k: Number of items to request from the Faiss index.

    Returns:
        The value returned by `call_gpt_4` for the assembled content list.
    """
    max_snippet_chars = 500  # cap on the text sent per retrieved page

    # Step 1: Embed user query
    query_emb = embedder.embed_text([user_query])  # shape: (1, D)

    # Step 2: Retrieve from Faiss
    distances, faiss_indices = search_index(index, query_emb, top_k=top_k)
    retrieved_items = retrieve_context(faiss_indices, all_metadata)

    # Print a compact summary per hit. The full item is deliberately not
    # printed: image items carry their whole base64 payload in "content".
    for distance, item in zip(distances[0], retrieved_items):
        if item["type"] == "text":
            locator = f"page {item['page_number']}"
        else:
            locator = f"image {item.get('image_number')}"
        print(f"Distance: {distance}, Item: {item['type']} ({locator})\n")

    # Step 3: Build the messages payload
    # The user's question is the first part of the content, followed by each
    # retrieved item (text or image) as a separate part.
    user_content = [{"type": "text", "text": f"User query: {user_query}"}]

    for item in retrieved_items:
        if item["type"] == "text":
            # Provide a textual snippet; mark it with an ellipsis only when
            # it was actually truncated.
            snippet = item["content"]
            if len(snippet) > max_snippet_chars:
                snippet = snippet[:max_snippet_chars] + "..."
            user_content.append({
                "type": "text",
                "text": f"(page {item['page_number']}) {snippet}"
            })
        elif item["type"] == "image":
            # Provide the base64 image data as a data URI
            base64_str = item["content"]
            user_content.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/png;base64,{base64_str}"
                }
            })

    # Step 4: Call GPT-4 with the full message payload
    answer = call_gpt_4(user_content)
    return answer

### 8. EXAMPLE USAGE

In [None]:
# Query built from the caption text of a figure in the report.
user_query_1 = (
    "Figure 9. Non-financial companies' debts SEK billion 9000 "
    "Loans from banks, etc. Trade credits and advances debt securities other "
    "Note. Refers to Swedish non-financial companies, "
    "including tenant-owner housing associations."
)

# top_k=100 overrides answer_query's default of 3.
response_1 = answer_query(user_query_1, top_k=100)

print("\nQ:", user_query_1)
print("A:", response_1)