In [1]:
import os
from pathlib import Path

# Document folder

In [None]:
# Folder containing the documents to inspect (relative to the working directory).
doc_dir = "doc_folder_2"

# glob() yields entries in a filesystem-dependent order; sort them so that
# `files[0]` refers to the same document on every run and platform.
files = sorted(Path(doc_dir).glob("*"))
print(files)

[PosixPath('doc_folder_1/Public003.pdf'), PosixPath('doc_folder_1/Public004.pdf')]


In [3]:
# Fail with an explicit message instead of a bare IndexError when the folder
# listing is empty (e.g. a wrong or missing document folder).
if not files:
    raise FileNotFoundError("No documents found: `files` is empty")
file = files[0]
print(file)

doc_folder_1/Public003.pdf


# Basic usage

Use docling's `DocumentConverter` to process a document.

In [1]:
import os
import sys

# Make the parent directory importable. Guarded so that re-running this cell
# does not keep appending duplicate entries to sys.path.
if ".." not in sys.path:
    sys.path.append("..")

In [2]:
import os
import logging
from io import BytesIO
from pathlib import Path
import subprocess

import docling
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline


from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat
from docling_core.types.io import DocumentStream
from docling.datamodel.pipeline_options import TableFormerMode
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions, ResponseFormat
from docling.datamodel.pipeline_options import (
    VlmPipelineOptions,
)
from docling.pipeline.vlm_pipeline import VlmPipeline

from PIL import Image
import torch
from transformers import AutoTokenizer, AutoProcessor, AutoModelForImageTextToText, BitsAndBytesConfig
import time
import socket

# Step 1 — Configure the Ollama server through environment variables.
# They are set before the server is launched, which hands os.environ to the
# child process.
os.environ.update(
    {
        "OLLAMA_FLASH_ATTENTION": "true",
        "OLLAMA_LOW_VRAM": "false",
        "OLLAMA_NUM_PARALLEL": "4",
        "OLLAMA_CONTEXT_LENGTH": "2048",
    }
)

# Step 2 — Start Ollama server with these variables

def is_ollama_running(host: str = "localhost", port: int = 11434, timeout: float = 2.0) -> bool:
    """Return True if a TCP connection to ``host:port`` succeeds.

    The defaults point at the local Ollama endpoint used by this notebook
    (``localhost:11434``). This is only a port probe: it does not verify that
    the listener is actually Ollama or that it is ready to serve requests.

    Args:
        host: Host name or IPv4 address to probe.
        port: TCP port to probe.
        timeout: Maximum number of seconds to wait for the connection attempt.

    Returns:
        True when the connection attempt succeeds, False otherwise (including
        when the attempt fails with an OS-level error such as a name
        resolution failure).
    """
    # The context manager closes the socket even if the probe raises.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        # Bound the wait so an unresponsive host cannot hang the notebook.
        sock.settimeout(timeout)
        try:
            # connect_ex reports connect() failures as an error code (0 == success).
            return sock.connect_ex((host, port)) == 0
        except OSError:
            # Errors outside connect() itself (e.g. host lookup) still raise;
            # for a reachability probe they simply mean "not running".
            return False

# Handle of the server process started by this notebook, or None when an
# already-running server is reused. Bound on both paths so that later code
# can test it instead of hitting a NameError.
ollama_proc = None
if not is_ollama_running():
    ollama_proc = subprocess.Popen(["ollama", "serve"], env=os.environ)
    time.sleep(5)  # wait for Ollama to be ready
else:
    print("Ollama is already running")


# Model tag requested from the Ollama endpoint.
# MODEL_PATH = "benhaotang/Nanonets-OCR-s:latest"  # local HF model
MODEL_PATH = "benhaotang/Nanonets-OCR-s:q4_k_m"
os.environ['TRANSFORMERS_VERBOSITY'] = 'info'


def create_vlm_options(model: str, prompt: str):
    """Build the API options for a VLM reached through the local Ollama endpoint.

    Args:
        model: Model name placed in the request ``params``.
        prompt: Prompt text passed through to ``ApiVlmOptions``.

    Returns:
        An ``ApiVlmOptions`` pointing at
        ``http://localhost:11434/v1/chat/completions`` with ``timeout=350``,
        ``scale=1.0`` and the Markdown response format.
    """
    # The default Ollama endpoint.
    endpoint = "http://localhost:11434/v1/chat/completions"
    request_params = {"model": model}

    return ApiVlmOptions(
        url=endpoint,  # type: ignore
        params=request_params,
        prompt=prompt,
        timeout=350,
        scale=1.0,
        response_format=ResponseFormat.MARKDOWN,
    )


# ===========================
# 2️⃣ Configure Docling
# ===========================
def doc_converter():
    """Create a ``DocumentConverter`` that routes PDFs through ``VlmPipeline``.

    Configures logging at INFO level via ``logging.basicConfig``, builds
    ``VlmPipelineOptions`` with remote services enabled, and registers the
    pipeline for ``InputFormat.PDF`` only. The model name comes from the
    module-level ``MODEL_PATH``.

    Returns:
        The configured ``DocumentConverter``.
    """
    logging.basicConfig(level=logging.INFO)

    # Extraction instructions sent to the model (written in Vietnamese).
    # Summary: extract all text as it would be read naturally; return tables
    # as HTML and equations as LaTeX; describe or caption images inside
    # <img></img>; wrap watermarks in <watermark> and page numbers in
    # <page_number>; prefer ☐/☑ for checkboxes; always answer in Vietnamese.
    extraction_prompt = """
        Hãy trích xuất toàn bộ văn bản từ tài liệu ở trên giống như cách bạn đọc nó một cách tự nhiên.
        Trả về các bảng dưới dạng mã HTML.
        Trả về các phương trình dưới dạng biểu diễn LaTeX.

        Nếu trong tài liệu có hình ảnh nhưng không có chú thích, hãy thêm một mô tả ngắn cho hình ảnh đó bên trong thẻ <img></img>;
        nếu hình ảnh đã có chú thích, hãy đặt chú thích đó bên trong thẻ <img></img>.

        Dấu watermark nên được đặt trong thẻ <watermark></watermark>.
        Số trang nên được đặt trong thẻ <page_number></page_number>.
        Ví dụ: <page_number>14</page_number> hoặc <page_number>9/22</page_number>.

        Ưu tiên sử dụng ký hiệu ☐ và ☑ cho các ô kiểm (checkbox).

        QUAN TRỌNG: Luôn luôn trả lời bằng **Tiếng Việt**.
    """

    # Remote services are required when calling remote VLM endpoints.
    vlm_pipeline_options = VlmPipelineOptions(enable_remote_services=True)
    vlm_pipeline_options.accelerator_options = AcceleratorOptions(
        device=AcceleratorDevice.AUTO,
        num_threads=12,
    )
    vlm_pipeline_options.vlm_options = create_vlm_options(
        prompt=extraction_prompt,
        model=MODEL_PATH,
    )

    # Only PDF gets a custom pipeline; `allowed_formats` is not passed, so the
    # converter keeps its own default for which formats it accepts.
    pdf_format_option = PdfFormatOption(
        pipeline_cls=VlmPipeline,
        pipeline_options=vlm_pipeline_options,
    )
    return DocumentConverter(format_options={InputFormat.PDF: pdf_format_option})


# PDF → Markdown pipeline
def extract_text_to_markdown(input_dir: str, output_dir: str, batch_size: int = 3):
    """Convert every file in ``input_dir`` to a Markdown file in ``output_dir``.

    Each input file ``<stem>.<ext>`` is converted with the converter returned
    by ``doc_converter()`` and written to ``<output_dir>/<stem>.md`` (UTF-8).
    Files are processed in sorted order; sub-directories are skipped.
    Conversion is best-effort: a failure on one file is printed and the
    remaining files are still processed.

    Args:
        input_dir: Directory containing the documents to convert.
        output_dir: Directory for the Markdown output; created if missing.
        batch_size: Number of files grouped under each "Processing batch"
            progress line. Grouping only affects the printed progress.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        FileNotFoundError: If neither ``input_dir`` nor the legacy fallback
            directory exists.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Machine-specific directory that this function previously always read
    # from, ignoring ``input_dir``. Kept only as a fallback so that existing
    # calls passing a non-existent ``input_dir`` keep working.
    legacy_input_dir = Path('D:\\VIETTEL_RACE\\qa-mcq-rag\\mcqaRAG_p2\\private-test-input')

    source_dir = Path(input_dir)
    if not source_dir.is_dir():
        if not legacy_input_dir.is_dir():
            raise FileNotFoundError(f"Input directory not found: {input_dir}")
        print(f"Input directory {input_dir!r} not found; falling back to {legacy_input_dir}")
        source_dir = legacy_input_dir

    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Directory listing order is unspecified, so sort for a reproducible run.
    file_paths = sorted(path for path in source_dir.iterdir() if path.is_file())

    converter = doc_converter()

    # Process files in batches
    for start in range(0, len(file_paths), batch_size):
        batch = file_paths[start:start + batch_size]
        batch_names = [path.name for path in batch]

        print(f"Processing batch {start // batch_size + 1}: {batch_names}")

        for file_path in batch:
            try:
                conv_result = converter.convert(file_path)
                # Export before opening the target so a failed conversion does
                # not leave an empty .md file behind.
                markdown = conv_result.document.export_to_markdown()
                # Path.stem drops the real extension, whatever its length.
                (out_dir / f"{file_path.stem}.md").write_text(markdown, encoding='utf-8')
                print(f"✓ Completed: {file_path.name}")
            except Exception as e:
                # Deliberate best-effort: report the failure and continue.
                print(f"❌ Error processing {file_path.name}: {e}")


Ollama is already running


In [None]:
if __name__ == '__main__':
    start_time = time.perf_counter()  # Use perf_counter for higher precision
    try:
        extract_text_to_markdown('private-test-input', '../scratch')
    finally:
        # Report timing and clean up even if the conversion run raised.
        elapsed_time = time.perf_counter() - start_time
        print(f"Elapsed time: {elapsed_time:.4f} seconds")

        # `ollama_proc` is only bound to a process when this notebook launched
        # the server itself. When an already-running server was reused there
        # is nothing to stop; the lookup avoids a NameError in that case.
        server_proc = globals().get("ollama_proc")
        if server_proc is not None:
            server_proc.terminate()

2025-10-19 01:34:24,219 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 01:34:24,259 - INFO - Going to convert document batch...
2025-10-19 01:34:24,260 - INFO - Initializing pipeline for VlmPipeline with options hash 309ed9ea03de69421ff1373882da7d51


Processing batch 1: ['Public001.pdf', 'Public002.pdf', 'Public003.pdf']


2025-10-19 01:34:24,710 - INFO - Loading plugin 'docling_defaults'
2025-10-19 01:34:24,710 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-10-19 01:34:24,710 - INFO - Processing document Public001.pdf
2025-10-19 01:36:04,880 - INFO - Finished converting document Public001.pdf in 100.66 sec.
2025-10-19 01:36:04,888 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 01:36:04,888 - INFO - Going to convert document batch...
2025-10-19 01:36:04,888 - INFO - Processing document Public002.pdf


✓ Completed: Public001.pdf


2025-10-19 01:36:56,096 - INFO - Finished converting document Public002.pdf in 51.20 sec.
2025-10-19 01:36:56,115 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 01:36:56,116 - INFO - Going to convert document batch...
2025-10-19 01:36:56,116 - INFO - Processing document Public003.pdf


✓ Completed: Public002.pdf


2025-10-19 01:43:59,731 - INFO - Finished converting document Public003.pdf in 423.62 sec.
2025-10-19 01:43:59,750 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 01:43:59,751 - INFO - Going to convert document batch...
2025-10-19 01:43:59,751 - INFO - Processing document Public004.pdf


✓ Completed: Public003.pdf
Processing batch 2: ['Public004.pdf', 'Public005.pdf', 'Public006.pdf']


2025-10-19 01:50:26,019 - INFO - Finished converting document Public004.pdf in 386.27 sec.
2025-10-19 01:50:26,126 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 01:50:26,134 - INFO - Going to convert document batch...
2025-10-19 01:50:26,134 - INFO - Processing document Public005.pdf


✓ Completed: Public004.pdf


2025-10-19 01:51:30,006 - INFO - Finished converting document Public005.pdf in 63.88 sec.
2025-10-19 01:51:30,029 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 01:51:30,032 - INFO - Going to convert document batch...
2025-10-19 01:51:30,033 - INFO - Processing document Public006.pdf


✓ Completed: Public005.pdf


2025-10-19 01:59:00,648 - INFO - Finished converting document Public006.pdf in 450.61 sec.
2025-10-19 01:59:00,784 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 01:59:00,793 - INFO - Going to convert document batch...
2025-10-19 01:59:00,793 - INFO - Processing document Public007.pdf


✓ Completed: Public006.pdf
Processing batch 3: ['Public007.pdf', 'Public008.pdf', 'Public009.pdf']


2025-10-19 02:00:07,630 - INFO - Finished converting document Public007.pdf in 66.84 sec.
2025-10-19 02:00:07,646 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 02:00:07,661 - INFO - Going to convert document batch...
2025-10-19 02:00:07,663 - INFO - Processing document Public008.pdf


✓ Completed: Public007.pdf


2025-10-19 02:01:34,219 - INFO - Finished converting document Public008.pdf in 86.58 sec.
2025-10-19 02:01:34,237 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 02:01:34,237 - INFO - Going to convert document batch...
2025-10-19 02:01:34,253 - INFO - Processing document Public009.pdf


✓ Completed: Public008.pdf


2025-10-19 02:02:36,668 - INFO - Finished converting document Public009.pdf in 62.44 sec.
2025-10-19 02:02:36,675 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 02:02:36,692 - INFO - Going to convert document batch...
2025-10-19 02:02:36,693 - INFO - Processing document Public010.pdf


✓ Completed: Public009.pdf
Processing batch 4: ['Public010.pdf', 'Public011.pdf', 'Public012.pdf']


2025-10-19 02:05:04,586 - INFO - Finished converting document Public010.pdf in 147.91 sec.
2025-10-19 02:05:04,648 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 02:05:04,650 - INFO - Going to convert document batch...
2025-10-19 02:05:04,651 - INFO - Processing document Public011.pdf


✓ Completed: Public010.pdf


2025-10-19 02:05:26,386 - INFO - Finished converting document Public011.pdf in 21.73 sec.
2025-10-19 02:05:26,386 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 02:05:26,386 - INFO - Going to convert document batch...
2025-10-19 02:05:26,386 - INFO - Processing document Public012.pdf


✓ Completed: Public011.pdf


2025-10-19 02:12:15,304 - INFO - Finished converting document Public012.pdf in 408.92 sec.
2025-10-19 02:12:15,597 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 02:12:15,597 - INFO - Going to convert document batch...
2025-10-19 02:12:15,597 - INFO - Processing document Public013.pdf


✓ Completed: Public012.pdf
Processing batch 5: ['Public013.pdf', 'Public014.pdf', 'Public015.pdf']


2025-10-19 02:12:42,052 - INFO - Finished converting document Public013.pdf in 26.45 sec.
2025-10-19 02:12:42,067 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 02:12:42,067 - INFO - Going to convert document batch...
2025-10-19 02:12:42,067 - INFO - Processing document Public014.pdf


✓ Completed: Public013.pdf


2025-10-19 02:14:24,326 - INFO - Finished converting document Public014.pdf in 102.26 sec.
2025-10-19 02:14:24,334 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 02:14:24,334 - INFO - Going to convert document batch...
2025-10-19 02:14:24,334 - INFO - Processing document Public015.pdf


✓ Completed: Public014.pdf


2025-10-19 02:18:17,834 - INFO - Finished converting document Public015.pdf in 233.50 sec.
2025-10-19 02:18:17,960 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 02:18:17,960 - INFO - Going to convert document batch...
2025-10-19 02:18:17,960 - INFO - Processing document Public016.pdf


✓ Completed: Public015.pdf
Processing batch 6: ['Public016.pdf', 'Public017.pdf', 'Public018.pdf']


2025-10-19 02:19:49,764 - INFO - Finished converting document Public016.pdf in 91.81 sec.
2025-10-19 02:19:49,789 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 02:19:49,789 - INFO - Going to convert document batch...
2025-10-19 02:19:49,798 - INFO - Processing document Public017.pdf


✓ Completed: Public016.pdf


2025-10-19 02:22:00,577 - INFO - Finished converting document Public017.pdf in 130.80 sec.
2025-10-19 02:22:00,577 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 02:22:00,595 - INFO - Going to convert document batch...
2025-10-19 02:22:00,596 - INFO - Processing document Public018.pdf


✓ Completed: Public017.pdf


2025-10-19 02:28:39,260 - INFO - Finished converting document Public018.pdf in 398.67 sec.
2025-10-19 02:28:39,397 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 02:28:39,404 - INFO - Going to convert document batch...
2025-10-19 02:28:39,405 - INFO - Processing document Public019.pdf


✓ Completed: Public018.pdf
Processing batch 7: ['Public019.pdf', 'Public020.pdf', 'Public021.pdf']


2025-10-19 02:36:23,992 - INFO - Finished converting document Public019.pdf in 464.59 sec.
2025-10-19 02:36:24,017 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 02:36:24,019 - INFO - Going to convert document batch...
2025-10-19 02:36:24,021 - INFO - Processing document Public020.pdf


✓ Completed: Public019.pdf


2025-10-19 02:36:48,066 - INFO - Finished converting document Public020.pdf in 24.05 sec.
2025-10-19 02:36:48,077 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 02:36:48,080 - INFO - Going to convert document batch...
2025-10-19 02:36:48,081 - INFO - Processing document Public021.pdf


✓ Completed: Public020.pdf


2025-10-19 02:43:49,273 - INFO - Finished converting document Public021.pdf in 421.20 sec.
2025-10-19 02:43:49,607 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 02:43:49,607 - INFO - Going to convert document batch...
2025-10-19 02:43:49,607 - INFO - Processing document Public022.pdf


✓ Completed: Public021.pdf
Processing batch 8: ['Public022.pdf', 'Public023.pdf', 'Public024.pdf']


2025-10-19 02:45:13,878 - INFO - Finished converting document Public022.pdf in 84.27 sec.
2025-10-19 02:45:13,898 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 02:45:13,898 - INFO - Going to convert document batch...
2025-10-19 02:45:13,898 - INFO - Processing document Public023.pdf


✓ Completed: Public022.pdf


2025-10-19 02:45:48,194 - INFO - Finished converting document Public023.pdf in 34.30 sec.
2025-10-19 02:45:48,202 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 02:45:48,211 - INFO - Going to convert document batch...
2025-10-19 02:45:48,211 - INFO - Processing document Public024.pdf


✓ Completed: Public023.pdf


2025-10-19 02:46:16,092 - INFO - Finished converting document Public024.pdf in 27.89 sec.
2025-10-19 02:46:16,107 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 02:46:16,107 - INFO - Going to convert document batch...
2025-10-19 02:46:16,107 - INFO - Processing document Public025.pdf


✓ Completed: Public024.pdf
Processing batch 9: ['Public025.pdf', 'Public026.pdf', 'Public027.pdf']


2025-10-19 02:46:41,996 - INFO - Finished converting document Public025.pdf in 25.89 sec.
2025-10-19 02:46:42,000 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 02:46:42,000 - INFO - Going to convert document batch...
2025-10-19 02:46:42,000 - INFO - Processing document Public026.pdf


✓ Completed: Public025.pdf


2025-10-19 02:47:02,658 - INFO - Finished converting document Public026.pdf in 20.66 sec.
2025-10-19 02:47:02,675 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 02:47:02,677 - INFO - Going to convert document batch...
2025-10-19 02:47:02,677 - INFO - Processing document Public027.pdf


✓ Completed: Public026.pdf


2025-10-19 02:47:24,786 - INFO - Finished converting document Public027.pdf in 22.11 sec.
2025-10-19 02:47:24,795 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 02:47:24,795 - INFO - Going to convert document batch...
2025-10-19 02:47:24,799 - INFO - Processing document Public028.pdf


✓ Completed: Public027.pdf
Processing batch 10: ['Public028.pdf', 'Public029.pdf', 'Public030.pdf']


2025-10-19 02:47:41,823 - INFO - Finished converting document Public028.pdf in 17.02 sec.
2025-10-19 02:47:41,823 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 02:47:41,823 - INFO - Going to convert document batch...
2025-10-19 02:47:41,839 - INFO - Processing document Public029.pdf


✓ Completed: Public028.pdf


2025-10-19 02:48:02,565 - INFO - Finished converting document Public029.pdf in 20.75 sec.
2025-10-19 02:48:02,573 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 02:48:02,581 - INFO - Going to convert document batch...
2025-10-19 02:48:02,583 - INFO - Processing document Public030.pdf


✓ Completed: Public029.pdf


2025-10-19 02:48:31,161 - INFO - Finished converting document Public030.pdf in 28.59 sec.
2025-10-19 02:48:31,163 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 02:48:31,163 - INFO - Going to convert document batch...
2025-10-19 02:48:31,163 - INFO - Processing document Public031.pdf


✓ Completed: Public030.pdf
Processing batch 11: ['Public031.pdf', 'Public032.pdf', 'Public033.pdf']


2025-10-19 02:49:37,911 - INFO - Finished converting document Public031.pdf in 66.75 sec.
2025-10-19 02:49:37,955 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 02:49:37,958 - INFO - Going to convert document batch...
2025-10-19 02:49:37,958 - INFO - Processing document Public032.pdf


✓ Completed: Public031.pdf


2025-10-19 02:49:59,894 - INFO - Finished converting document Public032.pdf in 21.94 sec.
2025-10-19 02:49:59,901 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 02:49:59,911 - INFO - Going to convert document batch...
2025-10-19 02:49:59,911 - INFO - Processing document Public033.pdf


✓ Completed: Public032.pdf


2025-10-19 02:50:27,595 - INFO - Finished converting document Public033.pdf in 27.70 sec.
2025-10-19 02:50:27,615 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 02:50:27,618 - INFO - Going to convert document batch...
2025-10-19 02:50:27,618 - INFO - Processing document Public034.pdf


✓ Completed: Public033.pdf
Processing batch 12: ['Public034.pdf', 'Public035.pdf', 'Public036.pdf']


2025-10-19 02:56:26,197 - INFO - Finished converting document Public034.pdf in 358.58 sec.
2025-10-19 02:56:26,206 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 02:56:26,206 - INFO - Going to convert document batch...
2025-10-19 02:56:26,206 - INFO - Processing document Public035.pdf


✓ Completed: Public034.pdf


2025-10-19 03:01:09,729 - INFO - Finished converting document Public035.pdf in 283.52 sec.
2025-10-19 03:01:10,082 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 03:01:10,082 - INFO - Going to convert document batch...
2025-10-19 03:01:10,098 - INFO - Processing document Public036.pdf


✓ Completed: Public035.pdf


2025-10-19 03:05:04,886 - INFO - Finished converting document Public036.pdf in 234.80 sec.
2025-10-19 03:05:05,006 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 03:05:05,010 - INFO - Going to convert document batch...
2025-10-19 03:05:05,010 - INFO - Processing document Public037.pdf


✓ Completed: Public036.pdf
Processing batch 13: ['Public037.pdf', 'Public038.pdf', 'Public039.pdf']


2025-10-19 03:18:35,710 - INFO - Finished converting document Public037.pdf in 810.70 sec.
2025-10-19 03:18:35,995 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 03:18:35,995 - INFO - Going to convert document batch...
2025-10-19 03:18:35,995 - INFO - Processing document Public038.pdf


✓ Completed: Public037.pdf


2025-10-19 03:24:31,606 - INFO - Finished converting document Public038.pdf in 355.62 sec.
2025-10-19 03:24:32,156 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 03:24:32,158 - INFO - Going to convert document batch...
2025-10-19 03:24:32,158 - INFO - Processing document Public039.pdf


✓ Completed: Public038.pdf


2025-10-19 03:25:55,185 - INFO - Finished converting document Public039.pdf in 83.05 sec.
2025-10-19 03:25:55,216 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 03:25:55,220 - INFO - Going to convert document batch...
2025-10-19 03:25:55,221 - INFO - Processing document Public040 copy.pdf


✓ Completed: Public039.pdf
Processing batch 14: ['Public040 copy.pdf', 'Public040.pdf', 'Public041.pdf']


2025-10-19 03:26:27,223 - INFO - Finished converting document Public040 copy.pdf in 32.00 sec.
2025-10-19 03:26:27,227 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 03:26:27,227 - INFO - Going to convert document batch...
2025-10-19 03:26:27,227 - INFO - Processing document Public040.pdf


✓ Completed: Public040 copy.pdf


2025-10-19 03:26:59,206 - INFO - Finished converting document Public040.pdf in 31.98 sec.
2025-10-19 03:26:59,218 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 03:26:59,218 - INFO - Going to convert document batch...
2025-10-19 03:26:59,218 - INFO - Processing document Public041.pdf


✓ Completed: Public040.pdf


2025-10-19 03:27:23,052 - INFO - Finished converting document Public041.pdf in 23.83 sec.
2025-10-19 03:27:23,060 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 03:27:23,060 - INFO - Going to convert document batch...
2025-10-19 03:27:23,060 - INFO - Processing document Public042.pdf


✓ Completed: Public041.pdf
Processing batch 15: ['Public042.pdf', 'Public043 copy.pdf', 'Public043.pdf']


2025-10-19 03:28:36,366 - INFO - Finished converting document Public042.pdf in 73.30 sec.
2025-10-19 03:28:36,381 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 03:28:36,386 - INFO - Going to convert document batch...
2025-10-19 03:28:36,386 - INFO - Processing document Public043 copy.pdf


✓ Completed: Public042.pdf


2025-10-19 03:32:08,566 - INFO - Finished converting document Public043 copy.pdf in 212.19 sec.
2025-10-19 03:32:08,815 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 03:32:08,824 - INFO - Going to convert document batch...
2025-10-19 03:32:08,824 - INFO - Processing document Public043.pdf


✓ Completed: Public043 copy.pdf


2025-10-19 03:38:51,662 - INFO - Finished converting document Public043.pdf in 402.84 sec.
2025-10-19 03:38:52,386 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 03:38:52,394 - INFO - Going to convert document batch...
2025-10-19 03:38:52,394 - INFO - Processing document Public044.pdf


✓ Completed: Public043.pdf
Processing batch 16: ['Public044.pdf', 'Public045.pdf', 'Public046.pdf']


2025-10-19 03:51:14,741 - INFO - Finished converting document Public044.pdf in 742.36 sec.
2025-10-19 03:51:15,233 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 03:51:15,242 - INFO - Going to convert document batch...
2025-10-19 03:51:15,242 - INFO - Processing document Public045.pdf


✓ Completed: Public044.pdf


2025-10-19 03:52:24,873 - INFO - Finished converting document Public045.pdf in 69.64 sec.
2025-10-19 03:52:24,891 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 03:52:24,908 - INFO - Going to convert document batch...
2025-10-19 03:52:24,909 - INFO - Processing document Public046.pdf


✓ Completed: Public045.pdf


2025-10-19 03:53:25,530 - INFO - Finished converting document Public046.pdf in 60.64 sec.
2025-10-19 03:53:25,563 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 03:53:25,566 - INFO - Going to convert document batch...
2025-10-19 03:53:25,568 - INFO - Processing document Public047.pdf


✓ Completed: Public046.pdf
Processing batch 17: ['Public047.pdf', 'Public048 copy.pdf', 'Public048.pdf']


2025-10-19 03:53:56,512 - INFO - Finished converting document Public047.pdf in 30.97 sec.
2025-10-19 03:53:56,528 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 03:53:56,534 - INFO - Going to convert document batch...
2025-10-19 03:53:56,534 - INFO - Processing document Public048 copy.pdf


✓ Completed: Public047.pdf


2025-10-19 03:54:22,525 - INFO - Finished converting document Public048 copy.pdf in 25.99 sec.
2025-10-19 03:54:22,538 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 03:54:22,538 - INFO - Going to convert document batch...
2025-10-19 03:54:22,545 - INFO - Processing document Public048.pdf


✓ Completed: Public048 copy.pdf


2025-10-19 03:54:48,436 - INFO - Finished converting document Public048.pdf in 25.91 sec.
2025-10-19 03:54:48,451 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 03:54:48,451 - INFO - Going to convert document batch...
2025-10-19 03:54:48,451 - INFO - Processing document Public049.pdf


✓ Completed: Public048.pdf
Processing batch 18: ['Public049.pdf', 'Public050.pdf', 'Public051 copy.pdf']


2025-10-19 03:56:09,150 - INFO - Finished converting document Public049.pdf in 80.69 sec.
2025-10-19 03:56:09,166 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 03:56:09,166 - INFO - Going to convert document batch...
2025-10-19 03:56:09,170 - INFO - Processing document Public050.pdf


✓ Completed: Public049.pdf


2025-10-19 04:06:53,444 - INFO - Finished converting document Public050.pdf in 644.30 sec.
2025-10-19 04:06:53,543 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 04:06:53,550 - INFO - Going to convert document batch...
2025-10-19 04:06:53,550 - INFO - Processing document Public051 copy.pdf


✓ Completed: Public050.pdf


2025-10-19 04:25:44,622 - INFO - Finished converting document Public051 copy.pdf in 1131.08 sec.
2025-10-19 04:25:45,583 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 04:25:45,589 - INFO - Going to convert document batch...
2025-10-19 04:25:45,589 - INFO - Processing document Public051.pdf


✓ Completed: Public051 copy.pdf
Processing batch 19: ['Public051.pdf', 'Public052.pdf', 'Public053.pdf']


2025-10-19 04:44:13,543 - INFO - Finished converting document Public051.pdf in 1107.97 sec.
2025-10-19 04:44:14,511 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 04:44:14,526 - INFO - Going to convert document batch...
2025-10-19 04:44:14,526 - INFO - Processing document Public052.pdf


✓ Completed: Public051.pdf


2025-10-19 04:45:10,599 - INFO - Finished converting document Public052.pdf in 56.09 sec.
2025-10-19 04:45:10,652 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 04:45:10,655 - INFO - Going to convert document batch...
2025-10-19 04:45:10,655 - INFO - Processing document Public053.pdf


✓ Completed: Public052.pdf


2025-10-19 04:47:19,467 - INFO - Finished converting document Public053.pdf in 128.83 sec.
2025-10-19 04:47:19,510 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 04:47:19,513 - INFO - Going to convert document batch...
2025-10-19 04:47:19,513 - INFO - Processing document Public054.pdf


✓ Completed: Public053.pdf
Processing batch 20: ['Public054.pdf', 'Public055.pdf', 'Public056.pdf']


2025-10-19 04:48:12,699 - INFO - Finished converting document Public054.pdf in 53.19 sec.
2025-10-19 04:48:12,717 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 04:48:12,722 - INFO - Going to convert document batch...
2025-10-19 04:48:12,722 - INFO - Processing document Public055.pdf


✓ Completed: Public054.pdf


2025-10-19 04:49:50,756 - INFO - Finished converting document Public055.pdf in 98.03 sec.
2025-10-19 04:49:50,787 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 04:49:50,789 - INFO - Going to convert document batch...
2025-10-19 04:49:50,790 - INFO - Processing document Public056.pdf


✓ Completed: Public055.pdf


2025-10-19 04:51:13,203 - INFO - Finished converting document Public056.pdf in 82.42 sec.
2025-10-19 04:51:13,238 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 04:51:13,240 - INFO - Going to convert document batch...
2025-10-19 04:51:13,241 - INFO - Processing document Public057.pdf


✓ Completed: Public056.pdf
Processing batch 21: ['Public057.pdf', 'Public058.pdf', 'Public059.pdf']


2025-10-19 04:52:28,234 - INFO - Finished converting document Public057.pdf in 75.00 sec.
2025-10-19 04:52:28,250 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 04:52:28,266 - INFO - Going to convert document batch...
2025-10-19 04:52:28,266 - INFO - Processing document Public058.pdf


✓ Completed: Public057.pdf


2025-10-19 04:53:05,933 - INFO - Finished converting document Public058.pdf in 37.67 sec.
2025-10-19 04:53:05,943 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 04:53:05,943 - INFO - Going to convert document batch...
2025-10-19 04:53:05,943 - INFO - Processing document Public059.pdf


✓ Completed: Public058.pdf


2025-10-19 04:53:37,300 - INFO - Finished converting document Public059.pdf in 31.36 sec.
2025-10-19 04:53:37,308 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 04:53:37,308 - INFO - Going to convert document batch...
2025-10-19 04:53:37,308 - INFO - Processing document Public060.pdf


✓ Completed: Public059.pdf
Processing batch 22: ['Public060.pdf', 'Public061.pdf', 'Public062.pdf']


2025-10-19 04:59:24,488 - INFO - Finished converting document Public060.pdf in 347.17 sec.
2025-10-19 04:59:24,504 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 04:59:24,508 - INFO - Going to convert document batch...
2025-10-19 04:59:24,508 - INFO - Processing document Public061.pdf


✓ Completed: Public060.pdf


2025-10-19 04:59:54,650 - INFO - Finished converting document Public061.pdf in 30.14 sec.
2025-10-19 04:59:54,663 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 04:59:54,663 - INFO - Going to convert document batch...
2025-10-19 04:59:54,663 - INFO - Processing document Public062.pdf


✓ Completed: Public061.pdf


2025-10-19 05:01:35,165 - INFO - Finished converting document Public062.pdf in 100.50 sec.
2025-10-19 05:01:35,186 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 05:01:35,187 - INFO - Going to convert document batch...
2025-10-19 05:01:35,190 - INFO - Processing document Public063.pdf


✓ Completed: Public062.pdf
Processing batch 23: ['Public063.pdf', 'Public064.pdf', 'Public065.pdf']


2025-10-19 05:13:24,737 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 05:13:24,741 - INFO - Going to convert document batch...
2025-10-19 05:13:24,742 - INFO - Processing document Public064.pdf


❌ Error processing Public063.pdf: maximum recursion depth exceeded


2025-10-19 05:20:00,261 - INFO - Finished converting document Public064.pdf in 395.52 sec.
2025-10-19 05:20:00,278 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 05:20:00,278 - INFO - Going to convert document batch...
2025-10-19 05:20:00,278 - INFO - Processing document Public065.pdf


✓ Completed: Public064.pdf


2025-10-19 05:20:21,502 - INFO - Finished converting document Public065.pdf in 21.22 sec.
2025-10-19 05:20:21,510 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 05:20:21,510 - INFO - Going to convert document batch...
2025-10-19 05:20:21,510 - INFO - Processing document Public066.pdf


✓ Completed: Public065.pdf
Processing batch 24: ['Public066.pdf', 'Public067.pdf', 'Public068.pdf']


2025-10-19 05:20:49,201 - INFO - Finished converting document Public066.pdf in 27.70 sec.
2025-10-19 05:20:49,201 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 05:20:49,201 - INFO - Going to convert document batch...
2025-10-19 05:20:49,201 - INFO - Processing document Public067.pdf


✓ Completed: Public066.pdf


2025-10-19 05:21:13,817 - INFO - Finished converting document Public067.pdf in 24.61 sec.
2025-10-19 05:21:13,833 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 05:21:13,833 - INFO - Going to convert document batch...
2025-10-19 05:21:13,833 - INFO - Processing document Public068.pdf


✓ Completed: Public067.pdf


2025-10-19 06:31:42,591 - INFO - Finished converting document Public068.pdf in 4228.78 sec.
2025-10-19 06:31:42,748 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-19 06:31:42,748 - INFO - Going to convert document batch...
2025-10-19 06:31:42,754 - INFO - Processing document Public069.pdf


✓ Completed: Public068.pdf
Processing batch 25: ['Public069.pdf', 'Public070.pdf', 'Public071.pdf']


# Annotate images with Ollama

In [None]:
import sys
from pathlib import Path

# Put the project root (the parent of the current working directory) at the
# front of sys.path, unless it is already listed there; the imports below
# include `src.config`.
project_root = Path.cwd().parent
if sys.path.count(str(project_root)) == 0:
    sys.path.insert(0, str(project_root))

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    PictureDescriptionApiOptions
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling_core.types.doc.base import ImageRefMode
from src.config import config

In [None]:
def _chat_completions_url(base_url):
    """Return the OpenAI-compatible chat-completions endpoint for ``base_url``.

    Trailing slashes are dropped first, so ``http://host:11434``,
    ``http://host:11434/`` , ``http://host:11434/v1`` and
    ``http://host:11434/v1/`` all resolve to
    ``http://host:11434/v1/chat/completions``. A URL that already ends in
    ``/chat/completions`` is returned as is (minus any trailing slash).
    """
    trimmed = base_url.rstrip("/")
    if trimmed.endswith("/chat/completions"):
        return trimmed
    if trimmed.endswith("/v1"):
        return f"{trimmed}/chat/completions"
    return f"{trimmed}/v1/chat/completions"


def convert_with_image_annotation(input_doc_path, timeout=60):
    """Convert a PDF with docling, optionally describing its pictures via an API.

    Settings are read from the project ``config`` object:

    * ``model.url`` - base URL of the OpenAI-compatible server
      (default ``http://localhost:11434/v1``).
    * ``model.vision_model`` / ``model.text_generation`` - model name; the
      vision model is preferred and the text model is the fallback.
    * ``document.picture_description.prompt_picture_description`` - prompt
      sent with each picture.
    * ``document.picture_description.enabled`` - turns picture description
      on or off (default on).
    * ``document.image_resolution_scale`` - value passed as ``images_scale``
      (default 2).

    Args:
        input_doc_path: Document handed to ``DocumentConverter.convert`` as
            ``source``.
        timeout: Value passed as ``timeout`` to ``PictureDescriptionApiOptions``
            (presumably seconds - confirm against the docling version in use).
            Defaults to 60, the value that was previously hard-coded.

    Returns:
        Whatever ``DocumentConverter.convert`` returns for ``input_doc_path``.
    """
    # Read the settings from the project configuration.
    base_url = config.get("model", "url", default="http://localhost:11434/v1")
    vision_model = config.get("model", "vision_model", default="")
    text_model = config.get("model", "text_generation", default="")
    picture_prompt = config.get("document", "picture_description", "prompt_picture_description",
                                default="Miêu tả chi tiết hình ảnh sau bằng một đoạn văn.")
    pd_enabled = config.get("document", "picture_description", "enabled", default=True)
    image_scale = config.get("document", "image_resolution_scale", default=2)

    if not pd_enabled:
        # Picture description is switched off: no remote API is used.
        # picture_description_options is deliberately left at its default
        # instead of being set to None.
        # NOTE(review): the field looks non-optional in docling's
        # PdfPipelineOptions, so None would presumably fail validation -
        # confirm against the installed docling version.
        pipeline_options = PdfPipelineOptions(
            images_scale=image_scale,
            generate_picture_images=False,
            do_picture_description=False,
            enable_remote_services=False,
        )
    else:
        # Prefer the vision model; fall back to the text model.
        # NOTE(review): if neither is configured, model_name is "" and the
        # request is sent with an empty model name.
        model_name = vision_model if vision_model else text_model

        picture_desc_api_option = PictureDescriptionApiOptions(
            url=_chat_completions_url(base_url),
            prompt=picture_prompt,
            params={"model": model_name},
            timeout=timeout,
        )

        pipeline_options = PdfPipelineOptions(
            images_scale=image_scale,
            generate_picture_images=True,
            do_picture_description=True,
            picture_description_options=picture_desc_api_option,
            enable_remote_services=True,
        )

    converter = DocumentConverter(
        format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
    )
    conv_res = converter.convert(source=input_doc_path)
    return conv_res

In [9]:
# `file` is the PDF path picked in the "Document folder" cells at the top of the notebook.
result = convert_with_image_annotation(input_doc_path=file)

2025-10-17 16:54:30,127 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-17 16:54:30,132 - INFO - Going to convert document batch...
2025-10-17 16:54:30,133 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 68e059f3e75bb40d4b4d82a1df76f3d7
2025-10-17 16:54:30,133 - INFO - Accelerator device: 'mps'


KeyboardInterrupt: 

In [None]:
# Print the converted document as Markdown, with picture annotations included
# and wrapped in annotation markers.
print(
    result.document.export_to_markdown(
        include_annotations=True,
        mark_annotations=True,
    )
)

<!-- image -->

## 1. L ờ i m ở đầ u

Bài toán nhận diện biển số xe Việt Nam là một bài toán không còn mới, đã được phát triển dựa trên các phương pháp xử lý ảnh truyền thống và cả những kỹ thuật mới sử dụng Deep Learning. Trong bài toán này tôi chỉ phát triển bài toán phát hiện biển số (một phần trong bài toán nhận diện biển số) dựa trên thuật toán YOLO -Tinyv4 với mục đích:

- Hướng dẫn chuẩn bị dữ liệu cho bài toán Object Detection.
- Hướng dẫn huấn luyện YOLO -TinyV4 dùng darknet trên Google Colab.

## 2. Chu ẩ n b ị d ữ li ệ u

## 2.1 Đánh giá bộ d ữ li ệ u

Trong bài viết tôi sử dụng bộ dữ liệu biển số xe máy Việt Nam chứa 1750 ảnh, bạn đọc có thể tải tại đây .

Hình 14.1: Ảnh biển số trong bộ dữ liệu

<!--<annotation kind="description">-->Bái có cảm ảng cấp trong mạnh trị của một vị trí chuyển lại phúc vào thành công tất cả.<!--<annotation/>-->

<!-- image -->

Ảnh biển số xe được trong bộ dữ liệu được chụp từ một camera tại vị trí kiểm soát xe ra vào trong hầm. Do vậy:

- Kích 

# Save to Markdown with externally referenced images

In [None]:
def export_function_md_with_image_ref(conv_res, output_path:str, replace_blank:str="_"):
    """Save a conversion result as Markdown with referenced (external) images.

    Args:
        conv_res: Conversion result; ``conv_res.input.file.stem`` names the
            output file and ``conv_res.document`` is what gets saved.
        output_path: Directory for the Markdown file. It is created, together
            with any missing parents, when it does not exist yet.
        replace_blank: Replacement for each space in the source file stem.

    The file is written to ``<output_path>/<stem>-with-image-refs.md`` using
    ``ImageRefMode.REFERENCED`` and with annotations included.
    """
    target_dir = Path(output_path)
    target_dir.mkdir(parents=True, exist_ok=True)

    # Swap every space in the source stem for `replace_blank`.
    safe_stem = replace_blank.join(conv_res.input.file.stem.split(" "))
    md_path = target_dir / (safe_stem + "-with-image-refs.md")

    conv_res.document.save_as_markdown(
        md_path,
        image_mode=ImageRefMode.REFERENCED,
        include_annotations=True,
    )

In [None]:
# Write the converted document as Markdown into the relative "outputs" directory.
export_function_md_with_image_ref(conv_res=result, output_path="outputs")