Bonus Tip 2: How to run multiple processes simultaneously across different GPUs

In [3]:
import os
from docling.datamodel.base_models import InputFormat
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions, AcceleratorOptions, AcceleratorDevice
)
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from pathlib import Path
import multiprocessing
# Request the "spawn" start method for the worker processes.
# set_start_method() raises RuntimeError when a start method has already been
# fixed (for example when this cell is executed a second time); that case is
# deliberately ignored.
try:
    multiprocessing.set_start_method("spawn")
except RuntimeError:
    pass

artifacts_path = "C:/Users/z0047npb/Desktop/docling/models"

num_gpus = 4
batch_size_per_gpu = 2  # number of batches mapped onto each GPU; tune per GPU

# Total number of batches across all GPUs.
num_batches = num_gpus * batch_size_per_gpu

# Every PDF located directly inside the input directory.
pdf_files = list(Path(r"D:\personal_projects\Docling_project\pdf").glob("*.pdf"))

# Round-robin distribution: batch k receives files k, k + num_batches,
# k + 2 * num_batches, ... Batches stay empty when there are fewer files
# than batches.
pdf_chunks = [pdf_files[k::num_batches] for k in range(num_batches)]

In [4]:
def process_pdf_chunk(gpu_id, pdf_file, batch_id):
    """Convert one batch of PDFs to Markdown on a single CUDA device.

    Args:
        gpu_id: Index of the CUDA device; used to build the ``cuda:<gpu_id>``
            device string handed to the accelerator options.
        pdf_file: The batch to convert. Normally an iterable of PDF paths;
            a single ``str``/path-like value is treated as a one-file batch.
            (The singular name is kept for compatibility with callers.)
        batch_id: Identifier of the batch. Currently unused inside the
            function; kept so existing call sites remain valid.

    Returns:
        None. For every PDF ``<name>.pdf`` a Markdown file is written to
        ``<output root>/<name>/<name>.md``.
    """
    # Only this worker's batch is processed. Iterating the module-level list
    # of all PDFs here would make every process convert every file.
    if isinstance(pdf_file, (str, os.PathLike)):
        batch = [pdf_file]
    else:
        batch = list(pdf_file)
    if not batch:
        # Nothing assigned to this worker: skip building the converter.
        return

    accelerator_options = AcceleratorOptions(
        num_threads=8,
        device=f"cuda:{gpu_id}",
        cuda_use_flash_attention2=False,
    )
    # Optional settings left disabled here: artifacts_path=<local model dir>,
    # generate_page_images=True.
    pipeline_options = PdfPipelineOptions(
        accelerator_options=accelerator_options,
        generate_picture_images=True,
        images_scale=3.0,
    )
    pipeline_options.do_ocr = False
    pipeline_options.do_table_structure = True
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=pipeline_options,
                backend=DoclingParseV4DocumentBackend,
            )
        }
    )

    output_root = Path(r"D:\personal_projects\Docling_project\output")
    for full_pdf_path in batch:
        base_name = Path(full_pdf_path).stem
        # Each document gets its own sub-folder named after the PDF.
        markdown_output_folder = output_root / base_name
        markdown_output_folder.mkdir(parents=True, exist_ok=True)

        result = converter.convert(str(full_pdf_path))
        result.document.save_as_markdown(markdown_output_folder / f"{base_name}.md")

In [None]:
# Launch one worker process per non-empty batch and wait for all of them.
#
# The launch code is guarded so it only runs in the main program: with the
# "spawn" start method a child process re-imports the main module, and
# unguarded process creation would be executed again there.
# NOTE(review): with "spawn" the target function must be importable by the
# children; when running interactively it may need to live in a separate
# module - confirm for your environment.
if __name__ == "__main__":
    processes = []
    for batch_id in range(num_batches):
        batch = pdf_chunks[batch_id]
        if not batch:
            # Fewer PDFs than batches: do not start a worker with no work.
            continue
        # Batches are spread over the GPUs in round-robin order.
        assigned_gpu_id = batch_id % num_gpus
        print(f"Starting process for batch {batch_id} on GPU {assigned_gpu_id}")
        process = multiprocessing.Process(
            target=process_pdf_chunk,
            args=(assigned_gpu_id, batch, batch_id),
            name=f"batch-{batch_id}",
        )
        process.start()
        processes.append(process)

    for process in processes:
        process.join()
        # A non-zero exit code means the worker terminated abnormally
        # (uncaught exception or signal); report it instead of ignoring it.
        if process.exitcode != 0:
            print(f"Process {process.name} exited with code {process.exitcode}")
    