In [1]:
import fitz  # PyMuPDF
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
from io import BytesIO
import torch
import os
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the BLIP model
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

def extract_images_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    images = []
    for page_index in range(len(doc)):
        page = doc[page_index]
        for img_index, img in enumerate(page.get_images(full=True)):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
            images.append((page_index + 1, image_bytes, image_ext))  # page number is 1-based
    return images

def generate_caption(image_bytes):
    image = Image.open(BytesIO(image_bytes)).convert('RGB')
    inputs = processor(images=image, return_tensors="pt")
    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)
    return caption

def process_pdf_for_images(pdf_path, output_json="image_captions.json"):
    images = extract_images_from_pdf(pdf_path)
    results = []

    for page_num, image_bytes, image_ext in images:
        caption = generate_caption(image_bytes)
        result = {
            "type": "figure",
            "page": page_num,
            "caption": caption,
            "source_pdf": os.path.basename(pdf_path)
        }
        results.append(result)
        print(f"[Page {page_num}] Caption: {caption}")

    with open(output_json, "w") as f:
        json.dump(results, f, indent=2)

    return results


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [None]:
results = process_pdf_for_images("/workspace/OllamaGraphRAGPoC/input-dir/split_5.pdf")
print(json.dumps(results, indent=2))

[Page 1] Caption: the logo for the software company, which is headquartered by broad
[Page 1] Caption: a rainbow colored background
[Page 2] Caption: the logo for the software company, which is headquartered by broad
[Page 2] Caption: a rainbow colored background
[Page 2] Caption: the diagram of the solar energy system
[Page 3] Caption: the logo for the software company, which is headquartered by broad
[Page 3] Caption: a rainbow colored background
[Page 3] Caption: vm and vm - vm diagram
[Page 4] Caption: the logo for the software company, which is headquartered by broad
[Page 4] Caption: a rainbow colored background
[Page 4] Caption: vm - vm - vm - vm - vm - vm vm v
[Page 4] Caption: the vkna server and vkna server
[Page 5] Caption: the logo for the software company, which is headquartered by broad
[Page 5] Caption: a rainbow colored background
[Page 5] Caption: the same image of the same image of the same image of the same image
[Page 6] Caption: the logo for the software company, w

In [3]:
results = process_pdf_for_images("/workspace/OllamaGraphRAGPoC/input-dir/test_1.pdf")
print(json.dumps(results, indent=2))

[Page 1] Caption: a diagram of the different types of wind turbines
[Page 1] Caption: wind turbines are used to generate electricity from the wind
[Page 2] Caption: a diagram of wind turbines
[Page 2] Caption: a wind turbine on a blue sky
[Page 2] Caption: a wind turbine on a pole
[Page 2] Caption: a wind turbine
[
  {
    "type": "figure",
    "page": 1,
    "caption": "a diagram of the different types of wind turbines",
    "source_pdf": "test_1.pdf"
  },
  {
    "type": "figure",
    "page": 1,
    "caption": "wind turbines are used to generate electricity from the wind",
    "source_pdf": "test_1.pdf"
  },
  {
    "type": "figure",
    "page": 2,
    "caption": "a diagram of wind turbines",
    "source_pdf": "test_1.pdf"
  },
  {
    "type": "figure",
    "page": 2,
    "caption": "a wind turbine on a blue sky",
    "source_pdf": "test_1.pdf"
  },
  {
    "type": "figure",
    "page": 2,
    "caption": "a wind turbine on a pole",
    "source_pdf": "test_1.pdf"
  },
  {
    "type": 

In [4]:
from transformers import AutoProcessor, LlavaForConditionalGeneration
from PIL import Image
from io import BytesIO
import fitz  # PyMuPDF
import os
import json

In [32]:
# Load LLaVA model (7B version; use 13B if enough VRAM)
model_id = "llava-hf/llava-1.5-7b-hf"
processor = AutoProcessor.from_pretrained(model_id)
model = LlavaForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype="auto"
)

def extract_images_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    images = []
    for page_index in range(len(doc)):
        page = doc[page_index]
        for img_index, img in enumerate(page.get_images(full=True)):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
            images.append((page_index + 1, image_bytes, image_ext))
    return images

def generate_caption_llava(image_bytes, question="What does this diagram show?"):
    image = Image.open(BytesIO(image_bytes)).convert("RGB")
    prompt = f"USER: <image>\n{question}\nASSISTANT:"
    inputs = processor(prompt, image, return_tensors="pt").to(model.device)
    output = model.generate(**inputs, max_new_tokens=150)
    caption = processor.decode(output[0], skip_special_tokens=True)
    return caption

def process_pdf_with_llava(pdf_path, output_json="llava_image_captions.json"):
    images = extract_images_from_pdf(pdf_path)
    results = []

    for page_num, image_bytes, image_ext in images:
        caption = generate_caption_llava(image_bytes)
        result = {
            "type": "figure",
            "page": page_num,
            "caption": caption,
            "source_pdf": os.path.basename(pdf_path)
        }
        results.append(result)
        print(f"[Page {page_num}] Caption: {caption}")

    with open(output_json, "w") as f:
        json.dump(results, f, indent=2)

    return results


Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 17.60it/s]


In [6]:
results = process_pdf_with_llava("/workspace/OllamaGraphRAGPoC/input-dir/split_5.pdf")

You may have used the wrong order for inputs. `images` should be passed before `text`. The `images` and `text` inputs will be swapped. This behavior will be deprecated in transformers v4.47.


[Page 1] Caption: USER:  
What does this diagram show?
ASSISTANT: The diagram shows a computer network with multiple devices connected to it. There are two main devices in the image: a laptop and a desktop computer. The laptop is positioned on the left side of the image, while the desktop computer is located on the right side. 

In addition to the laptop and desktop computer, there are three other devices in the network: two cell phones and a mouse. The cell phones are placed near the laptop, while the mouse is situated closer to the desktop computer. This diagram illustrates a typical modern workspace with various devices connected to the network.
[Page 1] Caption: USER:  
What does this diagram show?
ASSISTANT: The diagram shows a blue sky with no clouds, providing a clear and bright atmosphere. The sky is a deep shade of blue, indicating a sunny day with good weather conditions.
[Page 2] Caption: USER:  
What does this diagram show?
ASSISTANT: The diagram shows a computer network wi

KeyboardInterrupt: 

In [35]:
results = process_pdf_with_llava("/workspace/OllamaGraphRAGPoC/input-dir/test_1.pdf")

[Page 1] Caption: USER:  
What does this diagram show?
ASSISTANT: The diagram shows a representation of a power plant, specifically focusing on the process of generating electricity. The image consists of three main components: a transformer, a power line, and a transformer. The transformer is connected to the power line, which is connected to the transformer. The image also includes a box labeled "transformers" and a box labeled "power lines." The arrangement of these components illustrates the flow of electricity from the power plant to the consumer.
[Page 1] Caption: USER:  
What does this diagram show?
ASSISTANT: The diagram shows a side-by-side comparison of two different views of wind turbines. In one view, the wind turbines are shown in a close-up perspective, while in the other view, they are shown from a distance, giving a broader perspective of the landscape. The two images are placed next to each other, allowing viewers to compare the size and scale of the wind turbines in b

In [29]:
# Install Poppler (required for pdf2image)
!apt-get update && apt-get install -y poppler-utils


Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy InRelease [270 kB]                
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1581 B]
0% [2 InRelease 19.3 kB/270 kB 7%] [1 InRelease 38.8 kB/129 kB 30%] [Waiting fo

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1683 kB]
Get:5 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]        
Get:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:8 http://security.ubuntu.com/ubuntu jammy-security/multiverse amd64 Packages [47.7 kB]
Get:9 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1245 kB]
Get:10 http://archive.ubuntu.com/ubuntu jammy/main amd64 Packages [1792 kB]
Get:11 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [2944 kB]
Get:12 http://security.ubuntu.com/ubuntu jammy-security/restricted amd64 Packages [4387 kB]
Get:13 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy/main amd64 Packages [34.2 kB]
Get:14 http://archive.ubuntu.com/ubuntu jammy/restricted amd64 Packages [164 kB]
Get:15 http://archive.ubuntu.com/ubuntu jammy

In [31]:
from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration
from pdf2image import convert_from_path
from PIL import Image
import torch

# Load model and processor
processor = Pix2StructProcessor.from_pretrained("google/pix2struct-base", token=token)
model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-base", token=token)

# Load your PDF and convert pages to images
pdf_path = "/workspace/OllamaGraphRAGPoC/input-dir/diagram.pdf"
images = convert_from_path(pdf_path, dpi=200)

# You can modify the question per your needs
question = "What does the diagram show?"

# Loop through pages
for idx, image in enumerate(images):
    print(f"\n--- Page {idx + 1} ---")
    # Convert image to RGB (in case it's grayscale)
    image = image.convert("RGB")

    # Preprocess and generate answer
    inputs = processor(images=image, text=question, return_tensors="pt").to(model.device)
    predictions = model.generate(**inputs)

    # Decode and print
    caption = processor.decode(predictions[0], skip_special_tokens=True)
    print("Answer:", caption)



--- Page 1 ---
Answer: What does the diagram show? diodes are used to generate the diodes. The diodes are connected to the AC and AC
