In [1]:
from pathlib import Path
from typing import List, Optional, Union,Tuple
from PIL import Image
from docling.document_converter import (
    DocumentConverter,
    PdfFormatOption,
    WordFormatOption,
    PowerpointFormatOption
)
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions, 
    PdfPipelineOptions,
    PipelineOptions
)
from docling_core.types.doc import PictureItem
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class DocumentProcessor:
    """Convert PDF, PPTX and DOCX files into Markdown pages plus embedded pictures.

    Wraps a docling ``DocumentConverter``. PDFs are converted with the
    ``PdfPipelineOptions`` configured in ``__init__``; PPTX and DOCX files are
    converted with ``SimplePipeline``.
    """

    def __init__(
        self,
        device: Optional[str] = None,
        num_threads: int = 8
    ):
        """Configure the PDF pipeline and build the converter.

        Args:
            device: One of "cuda", "mps", "cpu" or "auto". ``None`` (or any
                other falsy value) is treated as "auto".
            num_threads: Thread count handed to ``AcceleratorOptions``.

        Raises:
            ValueError: If ``device`` is not one of the supported names.
        """
        device_map = {
            "cuda": AcceleratorDevice.CUDA,
            "mps": AcceleratorDevice.MPS,
            "cpu": AcceleratorDevice.CPU,
            "auto": AcceleratorDevice.AUTO,
        }
        self.device = device or "auto"
        if self.device not in device_map:
            msg = f"Invalid device '{device}'. Must be one of: {list(device_map.keys())}"
            raise ValueError(msg)

        # Configure pipeline options (applied to PDF input only, see format_options below).
        self.pipeline_options = PdfPipelineOptions()
        # Scale relative to a 72 DPI baseline, i.e. presumably ~150 DPI renders -- confirm.
        self.pipeline_options.images_scale = 150/72.0
        self.pipeline_options.generate_page_images = True
        # Picture images must be generated for __call__ to have anything to extract.
        self.pipeline_options.generate_picture_images = True
        self.pipeline_options.do_formula_enrichment = True
        self.pipeline_options.accelerator_options = AcceleratorOptions(
            num_threads=num_threads,
            device=device_map[self.device]
        )
        self.pipeline_options.do_picture_description = False

        self.converter = DocumentConverter(
            allowed_formats=[
                InputFormat.PDF,
                InputFormat.PPTX,
                InputFormat.DOCX
            ],
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=self.pipeline_options
                ),
                InputFormat.PPTX: PowerpointFormatOption(
                    pipeline_cls=SimplePipeline
                ),
                InputFormat.DOCX: WordFormatOption(
                    pipeline_cls=SimplePipeline
                )
            }
        )

    def __call__(
        self,
        file_path: Union[str, Path]
    ) -> Tuple[List[str], List[Image.Image], List[int]]:
        """Convert one document and return its Markdown and pictures.

        Args:
            file_path: Path to the document to convert.

        Returns:
            A tuple ``(markdown_pages, extracted_images, pages_with_images)``:

            * ``markdown_pages`` -- one Markdown string per page, or a single
              string for the whole document when no pages are reported.
            * ``extracted_images`` -- PIL images of the pictures found, in
              document iteration order.
            * ``pages_with_images`` -- page number of each extracted image,
              aligned index-for-index with ``extracted_images``.
        """
        file_path = Path(file_path) if isinstance(file_path, str) else file_path
        result = self.converter.convert(file_path)
        document = result.document

        # Extract images
        extracted_images = []
        pages_with_images = []
        for element, _level in document.iterate_items():
            if not isinstance(element, PictureItem):
                continue
            image_ref = getattr(element, 'image', None)
            pil_image = image_ref.pil_image if image_ref is not None else None
            if pil_image is None:
                # Skip pictures whose image data is unavailable so the returned
                # list only ever contains real PIL images.
                continue
            # For DOCX files, default to page 1 since page numbers aren't
            # properly registered (Docling bug) and prov may be empty.
            page_no = element.prov[0].page_no if element.prov else 1
            extracted_images.append(pil_image)
            pages_with_images.append(page_no)

        num_pages = document.num_pages()
        if num_pages == 0:
            # DOCX case: no pages reported, so export the entire document as a single page.
            markdown_pages = [document.export_to_markdown()]
        else:
            # PDF and PPTX: export page by page; page numbers are 1-based.
            markdown_pages = [
                document.export_to_markdown(page_no=page_no)
                for page_no in range(1, num_pages + 1)
            ]
        return markdown_pages, extracted_images, pages_with_images

In [3]:
# Shared processor instance used by the cells below: CUDA requested, 8 threads.
doc_processor = DocumentProcessor(device="cuda", num_threads=8)

In [4]:
import subprocess
import os

def convert_file(input_path, output_format):
    """Convert a document to another format using headless LibreOffice.

    The converted file is written into the same directory as the input file.

    Args:
        input_path: Path to the file to convert (relative or absolute).
        output_format: Target format passed to ``soffice --convert-to``,
            e.g. "docx".

    Raises:
        FileNotFoundError: If ``input_path`` does not point to an existing file.
        subprocess.CalledProcessError: If ``soffice`` exits with a non-zero status.
    """
    # Resolve to an absolute path so that a bare filename still yields a
    # non-empty output directory (os.path.dirname("file.doc") is "").
    source = os.path.abspath(input_path)

    # Fail loudly before launching soffice: check=True below only catches a
    # non-zero exit status, so a missing input would otherwise go unnoticed
    # by the caller.
    if not os.path.isfile(source):
        raise FileNotFoundError(f"Input file not found: {input_path}")

    output_dir = os.path.dirname(source)
    command = [
        "soffice",
        "--headless",
        "--convert-to", output_format,
        source,
        "--outdir", output_dir
    ]
    subprocess.run(command, check=True)

# Produce a .docx copy of the legacy Word sample, written next to the source file.
convert_file(input_path="samples/sample.doc", output_format="docx")

Error: source file could not be loaded


In [5]:
# Convert the sample slide deck: per-page Markdown, embedded pictures, and the
# page number each picture came from.
markdown_pages, extracted_images, pages_with_images = doc_processor(
    file_path="samples/sample.pptx"
)

In [6]:
# Show every page's Markdown, with a blank line between consecutive pages.
print(*markdown_pages, sep="\n\n")

Harnessing High Frequency Data To Inform Development and Humanitarian Interventions

Christopher B. Barrett

Keynote address to the World Bank conference

The Pulse of Progress: Harnessing High-Frequency Survey Data for Development Research in the Polycrisis Era

Washington, DC

December 17, 2024

<!-- image -->

Statistically representative observational data essential for accurate descriptive/predictive analysis. Often useful for inferential analysis.

World Bank established LSMS &gt;40 years ago for comparable measurement of living standards defined broadly. With improved measurement came improved analysis. 

				– Angus Deaton 1997 (and 2018)

“To direct scarce resources to where they can do the greatest good, actions must be guided by reliable information … Measurement drives diagnosis and response. ” 

				– Barrett (Science 2010)

<!-- image -->

Why national survey data?

<!-- image -->

<!-- image -->

Living standards are dynamic. A solid understanding of dynamic living stand

In [None]:
from PIL import Image
import hashlib

def get_image_hash(image):
    """Return an MD5 hex digest identifying a PIL image's content.

    The digest covers the image mode and size as well as the raw pixel bytes.
    Hashing the pixel bytes alone would give the same digest to images that
    share a byte stream but differ in dimensions or mode (for example a
    uniform 2x4 and a uniform 4x2 image), which would wrongly mark them as
    duplicates.

    Args:
        image: Object exposing ``mode``, ``size`` (width, height) and
            ``tobytes()``, such as a ``PIL.Image.Image``.

    Returns:
        A 32-character hexadecimal string. Used only as a deduplication key,
        not for any security purpose.
    """
    width, height = image.size
    digest = hashlib.md5()
    # Prefix with the geometry/mode header so identical byte streams from
    # differently shaped images do not collide.
    digest.update(f"{image.mode}|{width}x{height}|".encode("utf-8"))
    digest.update(image.tobytes())
    return digest.hexdigest()

def get_unique_images_with_indices(images, key=None):
    """Deduplicate images while remembering where each original maps to.

    Args:
        images: Iterable of images (any objects accepted by ``key``).
        key: Optional callable returning a hashable identity for an image.
            Images with equal keys are treated as duplicates. Defaults to
            ``get_image_hash``.

    Returns:
        A tuple ``(unique_images, unique_indices)``:

        * ``unique_images`` -- the first occurrence of each distinct image,
          in order of first appearance.
        * ``unique_indices`` -- one entry per input image, giving the position
          in ``unique_images`` of that image's representative. This allows
          results computed on ``unique_images`` to be mapped back with
          ``[results[i] for i in unique_indices]``.
    """
    if key is None:
        # Resolved at call time so the default follows the notebook's current
        # definition of get_image_hash.
        key = get_image_hash

    position_by_key = {}
    unique_images = []
    unique_indices = []

    for image in images:
        image_key = key(image)
        position = position_by_key.get(image_key)
        if position is None:
            # First time this content is seen: it becomes the representative.
            position = len(unique_images)
            position_by_key[image_key] = position
            unique_images.append(image)
        unique_indices.append(position)

    return unique_images, unique_indices

# Example: collapse duplicate pictures so each distinct image is processed once.
# (Any list of PIL images can be used in place of extracted_images.)
unique_images, unique_indices = get_unique_images_with_indices(extracted_images)

# Per-unique-image processing; image height is a placeholder for the real work.
processed_results = [picture.height for picture in unique_images]

# Expand the per-unique results back out, one entry per original image.
original_results = [processed_results[position] for position in unique_indices]

In [None]:
# Display the per-image results, one entry per image in extracted_images.
original_results