In [None]:
from pathlib import Path
from typing import List, Optional, Union, Literal, Tuple
from PIL import Image
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions, 
    PdfPipelineOptions,
    granite_picture_description,
    smolvlm_picture_description
)
from docling_core.types.doc import PictureItem

In [2]:
class DocumentProcessor:
    """
    Convert a PDF with docling into per-page markdown plus images.

    An instance holds a configured ``PdfPipelineOptions`` object; calling the
    instance with a file path runs the conversion and returns the markdown
    for every page, the images of the extracted pictures, and the rendered
    page images. When picture description is enabled, each generated
    description is inserted into the page markdown right after the
    corresponding image placeholder.
    """

    # Marker the exported page markdown is split on to locate pictures.
    _IMAGE_PLACEHOLDER = "<!-- image -->"

    def __init__(
        self,
        device: Optional[str] = None,
        num_threads: int = 8,
        picture_description: Literal["none", "smolVLM", "granite"] = "none",
        images_scale: float = 300/72.0,
    ):
        """
        Initialize the processor

        Args:
            device: Device for processing ('cuda', 'mps', 'cpu', or 'auto'). 
                If None, will use 'auto'.
            num_threads: Number of threads to use for processing
            picture_description: Type of picture description to use:
                - 'none': No picture description
                - 'smolVLM': Lightweight vision-language model
                - 'granite': Advanced vision-language model
            images_scale: Scale factor for images (default: 300/72.0)

        Raises:
            ValueError: If ``device`` or ``picture_description`` is not one
                of the supported values.
        """
        device_map = {
            "cuda": AcceleratorDevice.CUDA,
            "mps": AcceleratorDevice.MPS,
            "cpu": AcceleratorDevice.CPU,
            "auto": AcceleratorDevice.AUTO,
        }

        self.device = device or "auto"
        if self.device not in device_map:
            msg = f"Invalid device '{device}'. Must be one of: {list(device_map.keys())}"
            raise ValueError(msg)

        # mode name -> (library preset, max_new_tokens used for generation)
        description_presets = {
            "smolVLM": (smolvlm_picture_description, 500),
            "granite": (granite_picture_description, 800),
        }
        if picture_description != "none" and picture_description not in description_presets:
            # Reject typos explicitly instead of silently falling back to granite.
            valid = ["none", *description_presets]
            msg = f"Invalid picture_description '{picture_description}'. Must be one of: {valid}"
            raise ValueError(msg)

        self.pipeline_options = PdfPipelineOptions()
        self.pipeline_options.images_scale = images_scale
        self.pipeline_options.generate_page_images = True
        self.pipeline_options.generate_picture_images = True
        self.pipeline_options.accelerator_options = AcceleratorOptions(
            num_threads=num_threads,
            device=device_map[self.device]
        )

        if picture_description != "none":
            preset, max_new_tokens = description_presets[picture_description]

            # The presets are module-level objects shared by everyone who
            # imports them; customise a private copy so the prompt and
            # generation settings below do not leak into other users.
            description_options = preset.model_copy(deep=True)
            description_options.prompt = (
                "Describe what you can see in this image. "
                "Focus only on what is visually present. "
                "Be concise and accurate in three sentences maximum."
            )
            description_options.generation_config = {
                "max_new_tokens": max_new_tokens,
                "do_sample": False,
            }

            self.pipeline_options.do_picture_description = True
            self.pipeline_options.picture_description_options = description_options

    @staticmethod
    def _format_picture_description(element) -> Optional[str]:
        """Return the formatted description of a picture, or None if it has none.

        The first annotation exposing a ``text`` attribute is used; annotations
        without one are skipped rather than raising ``AttributeError``.
        """
        for annotation in element.annotations or []:
            text = getattr(annotation, "text", None)
            if text is not None:
                return f"**AI-Generated Image Description:** {text}\n<!-- end image description -->"
        return None

    @staticmethod
    def _insert_picture_descriptions(
        page_md: str,
        descriptions: List[Optional[str]],
    ) -> str:
        """Insert each description after the image placeholder at the same index.

        ``descriptions[i]`` belongs to the i-th placeholder in ``page_md``;
        a ``None`` entry, or a placeholder beyond the end of the list, leaves
        that placeholder untouched.
        """
        placeholder = DocumentProcessor._IMAGE_PLACEHOLDER
        head, *tail = page_md.split(placeholder)

        pieces = [head]
        for idx, part in enumerate(tail):
            description = descriptions[idx] if idx < len(descriptions) else None
            if description is None:
                pieces.append(f"{placeholder}{part}")
            else:
                pieces.append(f"{placeholder}\n{description}\n{part}")
        return "".join(pieces)

    def __call__(
        self,
        file_path: Union[str, Path]
    ) -> "Tuple[List[str], List[Image.Image], List[Image.Image]]":
        """
        Process a document and return markdown pages and images

        Args:
            file_path: Path to the PDF file

        Returns:
            Tuple containing:
            - List of markdown strings (one per page)
            - List of extracted images (figures/graphs)
            - List of page images
        """
        file_path = Path(file_path)

        # Page images are part of the return value, so make sure they are
        # generated even if the options were changed after construction.
        self.pipeline_options.generate_page_images = True

        converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=self.pipeline_options
                )
            }
        )
        document = converter.convert(file_path).document

        # Rendered page images, skipping pages that carry no image.
        page_images = [
            page.image.pil_image
            for page in document.pages.values()
            if getattr(page, "image", None) is not None
        ]

        extracted_images = []
        # page_no -> one entry per picture on that page, in iteration order.
        # Pictures without a description get a None slot so that later
        # descriptions stay aligned with their own image placeholder.
        picture_descriptions = {}

        for element, _level in document.iterate_items():
            if not isinstance(element, PictureItem):
                continue

            if getattr(element, "image", None) is not None:
                extracted_images.append(element.image.pil_image)

            if not element.prov:
                # No provenance means the picture cannot be tied to a page.
                continue

            page_no = element.prov[0].page_no
            picture_descriptions.setdefault(page_no, []).append(
                self._format_picture_description(element)
            )

        # Pages are addressed as 1..num_pages() when exporting markdown.
        markdown_pages = []
        for page_no in range(1, document.num_pages() + 1):
            page_md = document.export_to_markdown(page_no=page_no)
            if page_no in picture_descriptions:
                page_md = self._insert_picture_descriptions(
                    page_md, picture_descriptions[page_no]
                )
            markdown_pages.append(page_md)

        return markdown_pages, extracted_images, page_images

In [3]:
# Build the processor used by the cells below: CUDA device, 8 threads,
# SmolVLM picture descriptions, and an image scale of 300/72
# (presumably 300 DPI relative to 72-per-inch PDF units -- confirm).
doc_processor = DocumentProcessor(
    picture_description="smolVLM",
    images_scale=300 / 72.0,
    device="cuda",
    num_threads=8,
)

In [4]:
# Convert the sample paper. The call returns, in order: the markdown of each
# page, the images of the extracted pictures, and the rendered page images.
markdown_pages, extracted_images, page_images = doc_processor(
    "samples/[research-paper] 1312.6114v11.pdf"
)

In [None]:
# Inspect the list of picture images returned by the conversion above.
extracted_images

In [None]:
# Inspect the markdown produced for the first page (including any inserted
# image descriptions).
markdown_pages[0]