In [8]:
import logging
from pathlib import Path
from docling.datamodel.base_models import InputFormat
# from docling_core.types.doc.base import ImageRefMode

from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.chunking import HierarchicalChunker
from docling.document_converter import DocumentConverter, PdfFormatOption

# Logging setup: INFO level on the root logger, plus a logger for this notebook.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Where the PDF is read from and where the Markdown report is written.
input_pdf_path = Path("data/input/DocLayNet.pdf")
output_markdown_path = Path("output_with_metadata.md")

# Build a converter that handles PDF input with default pipeline options.
pdf_options = PdfPipelineOptions()
pdf_format_option = PdfFormatOption(pipeline_options=pdf_options)
converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format_option})

# Run the conversion and keep the resulting document object.
document_conversion_result = converter.convert(input_pdf_path)
docling_document = document_conversion_result.document

# Chunk the converted document (list items are merged).
chunker = HierarchicalChunker(merge_list_items=True)
chunk_iterator = chunker.chunk(dl_doc=docling_document)

# Emit one Markdown section per chunk: a metadata bullet list, then the text.
with output_markdown_path.open("w", encoding="utf-8") as md_file:
    md_file.write(f"# Extracted Chunks from {input_pdf_path.name}\n\n")

    for chunk_number, chunk in enumerate(chunk_iterator, start=1):
        # chunk.meta may be falsy; fall back to an empty mapping in that case.
        metadata = chunk.meta.model_dump() if chunk.meta else {}

        section = [f"## Chunk {chunk_number}\n\n", "### Metadata:\n"]
        section.extend(f"- **{key}**: {value}\n" for key, value in metadata.items())
        section.append(f"\n### Content:\n\n{chunk.text}\n\n")
        md_file.writelines(section)

logger.info("Chunks with metadata saved to: %s", output_markdown_path)


INFO:docling.document_converter:Going to convert document batch...
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
INFO:docling.pipeline.base_pipeline:Processing document DocLayNet.pdf
INFO:docling.document_converter:Finished converting document DocLayNet.pdf in 41.72 sec.
INFO:__main__:Chunks with metadata saved to: output_with_metadata.md


In [10]:
import logging
from pathlib import Path
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.chunking import HierarchicalChunker
from docling.document_converter import DocumentConverter, PdfFormatOption

# Configure logging.
# Note: logging.basicConfig does nothing if the root logger already has
# handlers, e.g. when an earlier cell in the same kernel configured it.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Input PDF and output paths
input_pdf_path = Path("data/input/DocLayNet.pdf")  # PDF file path
output_markdown_path = Path("data/output/DocLayNet_with_images_and_metadata.md")  # Output Markdown file

# Create the output directory up front. Without this, open(..., "w") below
# raises FileNotFoundError when data/output does not exist yet, and it would
# only do so after the (slow) conversion has already run.
output_markdown_path.parent.mkdir(parents=True, exist_ok=True)

# Initialize document converter with PDF options
pdf_options = PdfPipelineOptions()
pdf_options.images_scale = 2.0  # Adjust image resolution as needed
pdf_options.generate_page_images = True
pdf_options.generate_picture_images = True

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_options)
    }
)

# Convert the PDF to a DoclingDocument
document_conversion_result = converter.convert(input_pdf_path)
docling_document = document_conversion_result.document

# Initialize the HierarchicalChunker
chunker = HierarchicalChunker(merge_list_items=True)
chunk_iterator = chunker.chunk(dl_doc=docling_document)

# Write the chunks with metadata and images to Markdown
with open(output_markdown_path, "w", encoding="utf-8") as md_file:
    md_file.write(f"# Extracted Chunks with Images from {input_pdf_path.name}\n\n")

    for idx, chunk in enumerate(chunk_iterator):
        # Extract metadata from chunk.meta (empty mapping when meta is falsy)
        metadata = chunk.meta.model_dump() if chunk.meta else {}
        text = chunk.text

        md_file.write(f"## Chunk {idx + 1}\n\n")
        md_file.write("### Metadata:\n")
        for key, value in metadata.items():
            md_file.write(f"- **{key}**: {value}\n")

        md_file.write(f"\n### Content:\n\n{text}\n\n")

        # Embed images as Base64 if available.
        # getattr(...) or [] covers three cases without raising: meta is None,
        # meta has no `images` attribute, or `images` is present but None/empty.
        # NOTE(review): it is not confirmed that the chunk metadata exposes an
        # `images` attribute at all; if it does not, this loop never runs and
        # no images are embedded. Verify against the docling chunk metadata schema.
        images = getattr(chunk.meta, "images", None) or []
        for img_idx, image_data in enumerate(images):
            # Assumes each entry is a dict-like object with a `base64` key - TODO confirm.
            # .get avoids a KeyError when the key is missing; empty values are skipped.
            base64_image = image_data.get("base64", "")
            if base64_image:
                md_file.write(f"![Image {img_idx + 1}](data:image/png;base64,{base64_image})\n\n")

logger.info(f"Chunks with images and metadata saved to: {output_markdown_path}")

INFO:docling.document_converter:Going to convert document batch...
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
INFO:docling.pipeline.base_pipeline:Processing document DocLayNet.pdf
INFO:docling.document_converter:Finished converting document DocLayNet.pdf in 63.27 sec.
INFO:__main__:Chunks with images and metadata saved to: data\output\DocLayNet_with_images_and_metadata.md


In [15]:
import logging
from pathlib import Path
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.chunking import HierarchicalChunker
from docling.document_converter import DocumentConverter, PdfFormatOption

# Configure logging.
# Note: logging.basicConfig does nothing if the root logger already has
# handlers, e.g. when an earlier cell in the same kernel configured it.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Input PDF and output paths
input_pdf_path = Path("data/input/DocLayNet.pdf")  # PDF file path
output_markdown_path = Path("data/output/DocLayNet-chunked-with-images.md")  # Output Markdown file

# Ensure the destination directory exists before the expensive conversion
# starts; open(..., "w") does not create missing parent directories and would
# otherwise fail with FileNotFoundError once the conversion has finished.
output_markdown_path.parent.mkdir(parents=True, exist_ok=True)

# Initialize document converter with PDF options
pdf_options = PdfPipelineOptions()
pdf_options.images_scale = 2.0  # Adjust image resolution as needed
pdf_options.generate_page_images = True
pdf_options.generate_picture_images = True

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_options)
    }
)

# Convert the PDF to a DoclingDocument
document_conversion_result = converter.convert(input_pdf_path)
docling_document = document_conversion_result.document

# Initialize the HierarchicalChunker
chunker = HierarchicalChunker(merge_list_items=True)
chunk_iterator = chunker.chunk(dl_doc=docling_document)

# Write the chunks with metadata and images to Markdown
with open(output_markdown_path, "w", encoding="utf-8") as md_file:
    md_file.write(f"# Extracted Chunks with Images from {input_pdf_path.name}\n\n")

    for idx, chunk in enumerate(chunk_iterator):
        # Extract metadata from chunk.meta (empty mapping when meta is falsy)
        metadata = chunk.meta.model_dump() if chunk.meta else {}
        text = chunk.text

        md_file.write(f"## Chunk {idx + 1}\n\n")
        md_file.write("### Metadata:\n")
        for key, value in metadata.items():
            md_file.write(f"- **{key}**: {value}\n")

        md_file.write(f"\n### Content:\n\n{text}\n\n")

        # Embed images as Base64 if available.
        # A missing meta, a missing `images` attribute, and an `images` value of
        # None are all treated as "no images" instead of raising TypeError.
        # NOTE(review): whether chunk metadata actually carries an `images`
        # attribute is unverified here; if it does not, nothing is embedded.
        images = getattr(chunk.meta, "images", None) or []
        for img_idx, image_data in enumerate(images):
            # Presumably each entry is dict-like with a `base64` key - verify.
            base64_image = image_data.get('base64', '')  # Safely retrieve Base64 string
            if base64_image:
                md_file.write(f"![Image {img_idx + 1}](data:image/png;base64,{base64_image})\n\n")

logger.info(f"Chunks with images and metadata saved to: {output_markdown_path}")


INFO:docling.document_converter:Going to convert document batch...
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
INFO:docling.pipeline.base_pipeline:Processing document DocLayNet.pdf
INFO:docling.document_converter:Finished converting document DocLayNet.pdf in 68.50 sec.
INFO:__main__:Chunks with images and metadata saved to: data\output\DocLayNet-chunked-with-images.md
