In [1]:
import base64
import io
import mimetypes
import shutil
import subprocess
import tempfile
from pathlib import Path
from typing import List, Tuple

from docling.datamodel.base_models import InputFormat, DocumentStream
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions, 
    PdfPipelineOptions,
)
from docling.document_converter import (
    DocumentConverter,
    PdfFormatOption,
    WordFormatOption,
    PowerpointFormatOption
)
from docling.pipeline.simple_pipeline import SimplePipeline
from docling_core.types.doc import PictureItem

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class DocumentProcessor:
    """Convert a base64-encoded PDF, PPTX or DOCX document with Docling.

    Calling an instance returns the document's markdown (one string per
    page), the URIs of the embedded pictures, and the page number each
    picture was found on.
    """

    def __init__(self, num_threads: int = 8, device=None):
        """Configure the PDF pipeline and build the Docling converter.

        Args:
            num_threads: Thread count handed to ``AcceleratorOptions``.
            device: ``AcceleratorDevice`` to run on. ``None`` keeps the
                previous hard-coded choice, ``AcceleratorDevice.CUDA``.
        """
        if device is None:
            device = AcceleratorDevice.CUDA

        self.pipeline_options = PdfPipelineOptions()
        # 150/72 — presumably renders images at 150 DPI relative to Docling's
        # 72 DPI baseline; confirm against the Docling pipeline options docs.
        self.pipeline_options.images_scale = 150/72.0
        self.pipeline_options.generate_page_images = True
        self.pipeline_options.generate_picture_images = True
        self.pipeline_options.do_formula_enrichment = True
        self.pipeline_options.accelerator_options = AcceleratorOptions(
            num_threads=num_threads,
            device=device
        )
        self.pipeline_options.do_picture_description = False
        self.converter = DocumentConverter(
            allowed_formats=[
                InputFormat.PDF,
                InputFormat.PPTX,
                InputFormat.DOCX
            ],
            format_options={
                # Only the PDF pipeline receives the options configured above.
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=self.pipeline_options
                ),
                InputFormat.PPTX: PowerpointFormatOption(
                    pipeline_cls=SimplePipeline
                ),
                InputFormat.DOCX: WordFormatOption(
                    pipeline_cls=SimplePipeline
                )
            }
        )

    @staticmethod
    def _decode_payload(base64_content: str) -> bytes:
        """Decode base64 text, tolerating a leading ``data:<mime>;base64,`` header.

        ``base64.b64decode`` silently discards characters outside the base64
        alphabet, so decoding a full data URI would yield corrupt bytes rather
        than an error. The header is therefore removed before decoding.
        """
        if base64_content.startswith("data:") and "," in base64_content:
            base64_content = base64_content.split(",", 1)[1]
        return base64.b64decode(base64_content)

    def __call__(
        self, 
        base64_content: str
    ) -> Tuple[List[str], List[str], List[int]]:
        """Convert one document and extract its markdown and pictures.

        Args:
            base64_content: Base64-encoded document bytes, either as a bare
                payload or as a full ``data:<mime>;base64,<payload>`` string.

        Returns:
            A tuple ``(markdown_pages, extracted_images, pages_with_images)``:
            the markdown of each page, the URI of every picture that carries
            image data, and the page number of each of those pictures (same
            order and length as ``extracted_images``).
        """
        doc_content = self._decode_payload(base64_content)

        # Docling reads the document from an in-memory stream.
        source = DocumentStream(name="doc", stream=io.BytesIO(doc_content))
        document = self.converter.convert(source).document

        extracted_images = []
        pages_with_images = []
        for element, _level in document.iterate_items():
            if not isinstance(element, PictureItem):
                continue
            image = getattr(element, 'image', None)
            if image is None:
                continue
            # For DOCX files, we'll use a default page number of 1 since the
            # number of pages isn't properly registered (Docling bug).
            page_no = element.prov[0].page_no if element.prov else 1
            extracted_images.append(str(image.uri))
            pages_with_images.append(page_no)

        page_count = document.num_pages()
        if page_count == 0:
            # DOCX files report zero pages: export the whole document as a
            # single markdown page.
            markdown_pages = [document.export_to_markdown()]
        else:
            # PDF and PPTX: page numbers are 1-based.
            markdown_pages = [
                document.export_to_markdown(page_no=page_no)
                for page_no in range(1, page_count + 1)
            ]
        return markdown_pages, extracted_images, pages_with_images

In [3]:
# Build one processor instance; its constructor sets up the Docling converter
# for PDF, PPTX and DOCX input.
doc_processor = DocumentProcessor()

In [4]:
def convert_file(input_path: str, output_format: str) -> Path:
    """
    Convert a file to the specified format using LibreOffice.

    The converted file is written next to the input file, with the same
    stem and the new extension.

    Args:
        input_path: Path to input file
        output_format: Target format (e.g., 'docx', 'pptx'). A LibreOffice
            filter suffix ('docx:Some Filter Name') is passed through to
            soffice; only the part before the first ':' is used as the
            output file extension.

    Returns:
        Path to converted file

    Raises:
        RuntimeError: If soffice exits with an error, or exits successfully
            without producing the expected output file.
    """
    print(f"Converting {input_path} to {output_format}")
    input_path = Path(input_path)
    output_dir = input_path.parent

    command = [
        "soffice",
        "--headless",
        "--convert-to", output_format,
        str(input_path),
        "--outdir", str(output_dir)
    ]

    # The extension is the part of --convert-to before any filter name.
    extension = output_format.split(":", 1)[0]
    output_path = output_dir / f"{input_path.stem}.{extension}"

    try:
        subprocess.run(command, check=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        # stderr may not be valid UTF-8; never let decoding mask the real error.
        stderr = e.stderr.decode(errors="replace") if e.stderr else ""
        raise RuntimeError(f"Conversion failed: {stderr}") from e

    # soffice can exit with status 0 without writing anything, so verify the
    # result instead of returning a path that does not exist.
    if not output_path.exists():
        raise RuntimeError(
            f"Conversion failed: soffice did not produce {output_path}"
        )
    return output_path

def get_base64_content(file_path: str) -> str:
    """
    Convert file to base64 with proper MIME type prefix.

    The MIME type is guessed from the file name only. When it cannot be
    guessed, the generic 'application/octet-stream' is used so that the
    prefix is always well-formed (never 'data:None;base64').

    Args:
        file_path: Path to the file

    Returns:
        Base64 encoded string with MIME type prefix, in the form
        'data:<mime type>;base64,<payload>'
    """
    file_path = Path(file_path)
    mime_type, _ = mimetypes.guess_type(str(file_path))
    if mime_type is None:
        mime_type = "application/octet-stream"

    encoded_string = base64.b64encode(file_path.read_bytes()).decode('utf-8')

    return f"data:{mime_type};base64,{encoded_string}"

def process_document(file_path: str) -> str:
    """
    Process document file, converting if necessary, and return base64 content.

    Legacy PowerPoint (.ppt) and Word (.doc) files are converted to
    .pptx / .docx first. The conversion runs on a copy inside a temporary
    directory, so files that sit next to the input are never overwritten
    or deleted, and nothing is left behind if a step fails.

    Args:
        file_path: Path to the document file

    Returns:
        Base64 encoded content with MIME type prefix

    Raises:
        RuntimeError: If the conversion or the re-encoding of the converted
            file fails.
    """
    file_path = Path(file_path)
    base64_string = get_base64_content(file_path)
    prefix = base64_string.split(',')[0]

    # Data-URI prefixes of the legacy formats, mapped to their target format.
    format_conversions = {
        "data:application/vnd.ms-powerpoint;base64": "pptx",
        "data:application/msword;base64": "docx"
    }

    target_format = format_conversions.get(prefix)
    if target_format is None:
        # Nothing to convert: return the file's own encoding.
        return base64_string

    try:
        with tempfile.TemporaryDirectory() as scratch_dir:
            # convert_file writes its output beside its input, so stage a
            # copy in the scratch directory and convert that instead.
            staged_path = Path(scratch_dir) / file_path.name
            shutil.copy2(file_path, staged_path)
            converted_path = convert_file(staged_path, target_format)
            # Encode before the scratch directory is removed on exit.
            base64_string = get_base64_content(converted_path)
    except Exception as e:
        raise RuntimeError(f"Failed to process document: {str(e)}") from e

    return base64_string

In [11]:
# Encode the sample legacy .doc file; process_document converts .doc/.ppt
# input through LibreOffice before encoding.
base64_content = process_document("samples/file-sample_500kB.doc")
# Show only the data-URI header (the text before the first comma).
print(f"Processed content prefix: {base64_content.split(',')[0]}")

Converting samples/file-sample_500kB.doc to docx
Processed content prefix: data:application/vnd.openxmlformats-officedocument.wordprocessingml.document;base64


In [12]:
# Drop the data-URI header and hand only the base64 payload to the processor.
markdown_pages, extracted_images, pages_with_images = doc_processor(base64_content.split(',')[1])

In [21]:
# Inspect the data-URI header of the first extracted image.
extracted_images[0].split(',')[0]

'data:image/png;base64'

In [22]:
# Print every markdown page, separated by a blank line.
print("\n\n".join(markdown_pages))

# Lorem ipsum

## Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc ac faucibus odio.

Vestibulum neque massa, scelerisque sit amet ligula eu, congue molestie mi. Praesent ut varius sem. Nullam at porttitor arcu, nec lacinia nisi. Ut ac dolor vitae odio interdum condimentum. Vivamus dapibus sodales ex, vitae malesuada ipsum cursus convallis. Maecenas sed egestas nulla, ac condimentum orci. Mauris diam felis, vulputate ac suscipit et, iaculis non est. Curabitur semper arcu ac ligula semper, nec luctus nisl blandit. Integer lacinia ante ac libero lobortis imperdiet. Nullam mollis convallis ipsum, ac accumsan nunc vehicula vitae. Nulla eget justo in felis tristique fringilla. Morbi sit amet tortor quis risus auctor condimentum. Morbi in ullamcorper elit. Nulla iaculis tellus sit amet mauris tempus fringilla.

Maecenas mauris lectus, lobortis et purus mattis, blandit dictum tellus.

- Maecenas non lorem quis tellus placerat varius.
- Nulla facilisi.
- Aenean congue fringilla ju