In [1]:
from typing import List, Tuple
from docling.document_converter import (
    DocumentConverter,
    PdfFormatOption,
    WordFormatOption,
    PowerpointFormatOption
)
from docling.datamodel.base_models import InputFormat, DocumentStream
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions, 
    PdfPipelineOptions,
)
from docling_core.types.doc import PictureItem
from docling.pipeline.simple_pipeline import SimplePipeline
import io
import base64

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class DocumentProcessor:
    def __init__(self):
        self.pipeline_options = PdfPipelineOptions()
        self.pipeline_options.images_scale = 150/72.0
        self.pipeline_options.generate_page_images = True
        self.pipeline_options.generate_picture_images = True
        self.pipeline_options.do_formula_enrichment = True
        self.pipeline_options.accelerator_options = AcceleratorOptions(
            num_threads=8,
            device=AcceleratorDevice.CUDA
        )
        self.pipeline_options.do_picture_description = False
        self.converter = DocumentConverter(
            allowed_formats=[
                InputFormat.PDF,
                InputFormat.PPTX,
                InputFormat.DOCX
            ],
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=self.pipeline_options
                ),
                InputFormat.PPTX: PowerpointFormatOption(
                    pipeline_cls=SimplePipeline
                ),
                InputFormat.DOCX: WordFormatOption(
                    pipeline_cls=SimplePipeline
                )
            }
        )

    def __call__(
        self, 
        base64_content: str
    ) -> Tuple[List[str], List[str], List[int]]:
        # Decode base64 content to bytes
        doc_content = base64.b64decode(base64_content)

        # Create BytesIO object and DocumentStream
        doc_stream = io.BytesIO(doc_content)
        source = DocumentStream(name="doc", stream=doc_stream)

        # Convert using DocumentStream
        result = self.converter.convert(source)

        extracted_images = []
        markdown_pages = []
        pages_with_images = []
        # Extract images
        for element, _level in result.document.iterate_items():
            if isinstance(element, PictureItem):
                # For DOCX files, we'll use a default page number of 1 since the number of pages aren't properly registered (Docling bug)
                page_no = element.prov[0].page_no if element.prov else 1
                if hasattr(element, 'image') and element.image is not None:
                    extracted_images.append(str(element.image.uri))
                    pages_with_images.append(page_no)
        # For DOCX files, we need to handle the case where num_pages() returns 0
        if result.document.num_pages() == 0:
            # Get markdown for the entire document as a single page
            full_md = result.document.export_to_markdown()
            markdown_pages.append(full_md)
        else:
            # Process markdown pages normally for PDF and PPTX
            for page in range(result.document.num_pages()):
                page_no = page + 1
                page_md = result.document.export_to_markdown(page_no=page_no)
                markdown_pages.append(page_md)
        return markdown_pages, extracted_images, pages_with_images

In [3]:
doc_processor = DocumentProcessor()

In [4]:
import subprocess
import os

def convert_file(input_path, output_format):
    output_dir = os.path.dirname(input_path)
    command = [
        "soffice",
        "--headless",
        "--convert-to", output_format,
        input_path,
        "--outdir", output_dir
    ]
    subprocess.run(command, check=True)

# Convert a .doc file to .docx
convert_file("samples/sample.doc", "docx")

Error: source file could not be loaded


In [None]:
import base64
import mimetypes

file_path = "samples/sample.pptx"

# Determine MIME type based on the file extension
mime_type, _ = mimetypes.guess_type(file_path)

with open(file_path, 'rb') as file:
    encoded_string = base64.b64encode(file.read()).decode('utf-8')

# Add the MIME type prefix
base64_string = f"data:{mime_type};base64,{encoded_string}"

print(base64_string[:100])

prefix = base64_string.split(',')[0]

if prefix == "data:application/vnd.openxmlformats-officedocument.presentationml.presentation;base64":
    convert_file(file_path, "pptx")
    

data:application/vnd.openxmlformats-officedocument.presentationml.presentation;base64,UEsDBBQABgAIAA


'data:application/vnd.openxmlformats-officedocument.presentationml.presentation;base64'

In [None]:
markdown_pages, extracted_images, pages_with_images = doc_processor(base64_string)

In [None]:
extracted_images

In [None]:
print("\n\n".join(markdown_pages))

In [None]:
from PIL import Image
import hashlib

def get_image_hash(image):
    """Generate a hash for a PIL image."""
    return hashlib.md5(image.tobytes()).hexdigest()

def get_unique_images_with_indices(images):
    """Return unique images and their original indices."""
    seen_hashes = {}
    unique_images = []
    unique_indices = []
    
    for idx, img in enumerate(images):
        img_hash = get_image_hash(img)
        if img_hash not in seen_hashes:
            seen_hashes[img_hash] = len(unique_images)
            unique_images.append(img)
        unique_indices.append(seen_hashes[img_hash])
    
    return unique_images, unique_indices

# Example usage:
# list_of_pil_images = [...]
unique_images, unique_indices = get_unique_images_with_indices(extracted_images)

# Process the unique images
processed_results = [img.height for img in unique_images]  # Replace with your actual processing

# Map the results back to the original list
original_results = [processed_results[i] for i in unique_indices]

In [None]:
original_results