This notebook covers the following:
1) Simple conversion to Markdown
2) Multiple PDF conversion
3) Making use of a locally downloaded model from Hugging Face
4) Accelerator option to switch between GPU and CPU

Simple conversion

In [None]:
from docling.document_converter import DocumentConverter
# Placeholder: set this to the local path of the document to convert.
source = " "

# Convert with the default settings, then print the Markdown rendering.
converter = DocumentConverter()
result = converter.convert(source)  # source can be anything
doc = result.document
print(doc.export_to_markdown())

Multiple documents (PDF)

In [None]:
from pathlib import Path
from docling.document_converter import InputFormat  
from docling.document_converter import DocumentConverter, PdfFormatOption

# Collect every PDF in the input folder.
# A plain relative path is used rather than a backslash-prefixed one: "\p" is
# not a valid string escape, and a backslash is only a path separator on Windows.
pdf_folder = Path("pdf_file")
pdf_files = list(pdf_folder.glob("*.pdf"))
converter = DocumentConverter(format_options={InputFormat.PDF: PdfFormatOption()})

# Create the destination folder up front so saving does not depend on it
# already existing.
output_folder = Path("output")
output_folder.mkdir(parents=True, exist_ok=True)

for pdf_file in pdf_files:
    result = converter.convert(pdf_file)
    # One Markdown file per PDF, named after the PDF's stem.
    result.document.save_as_markdown(output_folder / f"{pdf_file.stem}.md")

Downloading and running the model locally
1. Refer to the download_docling notebook to download the model to a local path

In [None]:
from pathlib import Path
from docling.document_converter import DocumentConverter, PdfFormatOption 
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions

from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
# Collect every PDF in the input folder.
# A plain relative path is used rather than a backslash-prefixed one: "\p" is
# not a valid string escape, and a backslash is only a path separator on Windows.
pdf_folder = Path("pdf_file")
pdf_files = list(pdf_folder.glob("*.pdf"))

# Point the pipeline at the locally downloaded model artifacts.
pipeline_options = PdfPipelineOptions(
    artifacts_path="C:/Users/Desktop/docling/models",  # local path
)

# Build a converter whose PDF handling uses the options above together with
# the V2 docling-parse backend.
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_options=pipeline_options,
            backend=DoclingParseV2DocumentBackend,
        )
    }
)

# Create the destination folder up front so saving does not depend on it
# already existing.
output_folder = Path("output")
output_folder.mkdir(parents=True, exist_ok=True)

for pdf_file in pdf_files:
    result = converter.convert(pdf_file)
    # One Markdown file per PDF, named after the PDF's stem.
    result.document.save_as_markdown(output_folder / f"{pdf_file.stem}.md")

Running the model over GPU

In [None]:
import os
from pathlib import Path
from docling.document_converter import DocumentConverter, PdfFormatOption 
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    AcceleratorOptions,
    AcceleratorDevice,
)
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
# Collect every PDF in the input folder.
# A plain relative path is used rather than a backslash-prefixed one: "\p" is
# not a valid string escape, and a backslash is only a path separator on Windows.
pdf_folder = Path("pdf_file")
pdf_files = list(pdf_folder.glob("*.pdf"))

# Alternative accelerator settings, kept for reference:
#
# Explicitly set the accelerator options
# accelerator_options = AcceleratorOptions(
#     num_threads=8, device=AcceleratorDevice.AUTO
# )
#
# easyocr doesn't support cuda:N allocation, defaults to cuda:0
# accelerator_options = AcceleratorOptions(num_threads=8, device="cuda:1")

# Change the accelerator options to explicitly make use of the GPU.
accelerator_options = AcceleratorOptions(
    num_threads=8,
    device=AcceleratorDevice.CUDA,
)

pipeline_options = PdfPipelineOptions(
    accelerator_options=accelerator_options,
    artifacts_path="C:/Users/Desktop/docling/models",  # local model path
    generate_picture_images=True,  # to make sure we download the images.
    images_scale=1.0,
)

# Build a converter whose PDF handling uses the options above together with
# the V2 docling-parse backend.
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_options=pipeline_options,
            backend=DoclingParseV2DocumentBackend,
        )
    }
)

# Create the destination folder up front so saving does not depend on it
# already existing.
output_folder = Path("output")
output_folder.mkdir(parents=True, exist_ok=True)

for pdf_file in pdf_files:
    result = converter.convert(pdf_file)
    # Per-PDF image folder: image/<pdf stem>/. The stem must be interpolated
    # (braces, not parentheses), otherwise every PDF shares one literal folder.
    pdf_image_output_dir = Path("image") / pdf_file.stem
    os.makedirs(pdf_image_output_dir, exist_ok=True)
    # NOTE(review): nothing in this cell writes into pdf_image_output_dir; if
    # the pictures should land there, the save call presumably needs to be
    # told about this folder — confirm against the docling-core API.
    result.document.save_as_markdown(output_folder / f"{pdf_file.stem}.md")

Simple approach to convert a single document 

In [None]:

import os
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions, AcceleratorOptions, AcceleratorDevice
)
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend

from pathlib import Path
# Folder holding the locally downloaded model artifacts.
artifacts_path = "C:/Users/z0047npb/Desktop/docling/models"

# Run on CUDA with 8 threads; flash-attention 2 is explicitly switched off.
accelerator_options = AcceleratorOptions(
    num_threads=8,
    device=AcceleratorDevice.CUDA,
    cuda_use_flash_attention2=False,
)

# All pipeline switches are passed in one place instead of being assigned
# after construction.
pipeline_options = PdfPipelineOptions(
    accelerator_options=accelerator_options,
    artifacts_path=artifacts_path,
    # generate_page_images=False,
    generate_picture_images=False,
    images_scale=1.0,
    # Bonus tip 01: do_ocr is flagged as a pipeline-speed knob.
    # NOTE(review): it is left enabled here; presumably disabling it is what
    # speeds the pipeline up — confirm.
    do_ocr=True,
    do_table_structure=True,
)

In [14]:
# Input PDF and the folder that will receive the generated Markdown.
pdf_path = r"D:\personal_projects\Docling_project\pdf\immulite_dummy.pdf"
markdown_folder = r"D:\personal_projects\Docling_project\output"
os.makedirs(markdown_folder, exist_ok=True)

# PDF handling: pipeline_options (defined in an earlier cell) combined with
# the V4 docling-parse backend.
pdf_format_option = PdfFormatOption(
    pipeline_options=pipeline_options,
    backend=DoclingParseV4DocumentBackend,
)
converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format_option})

In [15]:
# Derive the output name from the PDF's file name with its extension removed.
pdf_basename = os.path.basename(pdf_path)
pdf_name = os.path.splitext(pdf_basename)[0]
print(pdf_name)

# Convert the PDF and write <markdown_folder>/<pdf_name>.md.
md_filename = os.path.join(markdown_folder, f"{pdf_name}.md")
result = converter.convert(pdf_path)
result.document.save_as_markdown(md_filename)

immulite_dummy
