# Fine-tune opendatalab/MinerU2.5-2509-1.2B with LoRA on PDF→Markdown dataset

In [None]:
# Cell 1: Install dependencies
# %pip install --upgrade pip
# %pip install transformers peft accelerate datasets pdf2image pillow

In [1]:
# Cell 2: Imports và cấu hình chung
import os, json, base64
from pathlib import Path
from pdf2image import convert_from_path

import torch
from datasets import load_dataset
from transformers import (
    AutoProcessor,
    Qwen2VLForConditionalGeneration,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
)
from peft import LoraConfig, get_peft_model

# Thư mục dữ liệu
PDF_DIR      = Path("training2/training_input")     # chứa *.pdf
MD_DIR       = Path("training2/training_output")       # mỗi subfolder chứa main.md
PAGE_IMG_DIR = Path("pages")              # lưu ảnh full-page
TRAIN_JSON   = "train.json"

# Model & training config
MODEL_NAME   = "opendatalab/MinerU2.5-2509-1.2B"
BATCH_SIZE   = 2
ACCUM_STEPS  = 8
EPOCHS       = 3
LR           = 1e-4

PAGE_IMG_DIR.mkdir(exist_ok=True)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed

# samples = []
# pdf_paths = list(PDF_DIR.glob("*.pdf"))

# def process_pdf(pdf_path):
#     # Import modules inside the function for ProcessPoolExecutor
#     import os
#     import base64
#     from pathlib import Path
#     from pdf2image import convert_from_path

#     # Define paths inside the function (avoid global state)
#     MD_DIR = Path("training2/training_output")
#     PAGE_IMG_DIR = Path("pages")

#     stem = pdf_path.stem
#     print(f'Processing: {stem}')  # Use f-string for clarity
#     md_file = MD_DIR / stem / "main.md"
#     if not os.path.exists(md_file):
#         print(f"Missing markdown for {stem}, skipped")
#         return None

#     # Convert PDF → ảnh full-page
#     try:
#         pages = convert_from_path(str(pdf_path), dpi=300, thread_count=14)  # Reduced thread_count for stability
#     except Exception as e:
#         poppler_path = r"D:\\Utilities\\poppler-25.07.0\\Library\bin"  # Adjust this path if needed
#         if os.path.exists(poppler_path):
#             pages = convert_from_path(str(pdf_path), dpi=300, poppler_path=poppler_path, thread_count=14)
#         else:
#             print(f"Error: Poppler not found for {stem}. Install and add to PATH.")
#             return None  # Return None instead of raising to avoid pool breakage

#     page_paths = []
#     for idx, page in enumerate(pages, start=1):
#         img_path = PAGE_IMG_DIR / f"{stem}_page{idx}.png"
#         page.save(img_path)
#         page_paths.append(str(img_path))

#     # Đọc và encode Markdown gốc
#     md_bytes = md_file.read_bytes()
#     md_b64 = base64.b64encode(md_bytes).decode("utf-8")

#     # Tạo user_content (ảnh full-page + prompt)
#     user_content = [
#         {"type": "image", "image": p, "resized_height": 1036, "resized_width": 1036}
#         for p in page_paths
#     ] + [
#         {"type": "text", "text": "Convert to Markdown:"}
#     ]

#     return {
#         "conversations": [
#             {"role": "user", "content": user_content},
#             {"role": "assistant", "content_base64": md_b64}
#         ]
#     }

# # Replace the parallel processing block in Cell 3:
# with ThreadPoolExecutor(max_workers=10) as executor:  # Use threads instead of processes
#     futures = {executor.submit(process_pdf, pdf_path): pdf_path for pdf_path in pdf_paths}
#     for future in as_completed(futures):
#         pdf_path = futures[future]
#         try:
#             result = future.result()
#             if result:
#                 samples.append(result)
#         except Exception as exc:
#             print(f"PDF {pdf_path.stem} generated an exception: {exc}")

# with open(TRAIN_JSON, "w", encoding="utf-8") as f:
#     json.dump(samples, f, ensure_ascii=False, indent=2)

# print(f"Prepared {len(samples)} samples in {TRAIN_JSON}")

In [3]:
raw_ds = load_dataset("json", data_files=TRAIN_JSON, split="train")
processor = AutoProcessor.from_pretrained(MODEL_NAME)
tokenizer = processor.tokenizer

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


In [11]:


# Cell 4: Windows-compatible multiprocessing version
def preprocess_fn_batched(batch, model_name="opendatalab/MinerU2.5-2509-1.2B"):
    """Process multiple samples at once - Windows multiprocessing compatible"""
    import base64
    from transformers import AutoProcessor, DataCollatorForSeq2Seq  # ✅ Import inside function

    # ✅ Load processor inside function (each worker gets its own copy)
    processor = AutoProcessor.from_pretrained(model_name)
    tokenizer = processor.tokenizer
    batch_size = len(batch["conversations"])

    input_ids_list = []
    attention_mask_list = []
    pixel_values_list = []
    labels_list = []

    for i in range(batch_size):
        user = batch["conversations"][i][0]
        assistant_b64 = batch["conversations"][i][1]["content_base64"]

        assistant_text = base64.b64decode(assistant_b64).decode("utf-8")
        prompt = user["content"][-1]["text"]
        img_list = [c["image"] for c in user["content"] if c["type"]=="image"]

        inputs = processor(
            text=[prompt],
            images=img_list,
            padding=True,
            return_tensors="pt"
        )

        labels = tokenizer(
            assistant_text,
            return_tensors="pt",
            add_special_tokens=False
        ).input_ids

        # In preprocess_fn_batched, before appending
        if inputs.pixel_values is None:
            raise ValueError(f"Pixel values are None for sample {i}")

        print(f"Pixel values shape: {inputs.pixel_values.shape}")
        import traceback; traceback.print_exc()

        input_ids_list.append(inputs.input_ids[0])
        attention_mask_list.append(inputs.attention_mask[0])
        pixel_values_list.append(inputs.pixel_values[0])
        labels_list.append(labels[0])

    return {
        "input_ids": input_ids_list,
        "attention_mask": attention_mask_list,
        "pixel_values": pixel_values_list,
        "labels": labels_list,
    }

In [13]:
# Now multiprocessing will work!
ds = raw_ds.map(
    preprocess_fn_batched,
    batched=True,
    batch_size=4,
    remove_columns=raw_ds.column_names,
    num_proc=6,
    load_from_cache_file=True,
    desc="Processing samples",
    fn_kwargs={"model_name": MODEL_NAME}
)

Processing samples (num_proc=6): 100%|██████████| 100/100 [01:15<00:00,  1.33 examples/s]


In [14]:
device = "mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu"
print(device)
print(ds)

cuda
Dataset({
    features: ['input_ids', 'attention_mask', 'pixel_values', 'labels'],
    num_rows: 100
})


In [15]:
# Cell 5: Load model và cấu hình LoRA
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    device_map="auto",
    dtype=torch.bfloat16
)

lora_cfg = LoraConfig(
    task_type="CAUSAL_LM",
    inference_mode=False,
    r=32,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=[
        "q_proj","k_proj","v_proj","o_proj",
        "gate_proj","up_proj","down_proj"
    ]
)
model = get_peft_model(model, lora_cfg)

In [16]:
# Cell 6: Cấu hình Trainer
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True)

# Cell 6: Training configuration
training_args = TrainingArguments(
    output_dir="mineru_lora_finetuned",

    # GPU parameters (independent of CPU)
    per_device_train_batch_size=2,    # ← Based on GPU VRAM
    gradient_accumulation_steps=8,     # ← Effective batch = 16

    # CPU-related (data loading)
    dataloader_num_workers=4,          # ← 4 workers for data loading
    # ↑ Separate from preprocessing num_proc

    learning_rate=LR,
    num_train_epochs=EPOCHS,
    bf16=True,                         # ← Better than fp16 for your model
    save_steps=200,
    save_total_limit=2,
    logging_steps=20,
    remove_unused_columns=False,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds,
    data_collator=data_collator,
    processing_class=tokenizer
)

The model is already on multiple devices. Skipping the move to device specified in `args`.


In [13]:
# Cell 7: Bắt đầu fine-tuning
train_result = trainer.train()
trainer.save_model("mineru_lora_finetuned/final")
print(f"Training completed! Metrics: {train_result.metrics}")

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


TypeError: 'NoneType' object is not iterable

In [None]:
# # Cell 8: Ví dụ inference
# from peft import PeftModel

# base = Qwen2VLForConditionalGeneration.from_pretrained(
#     MODEL_NAME,
#     trust_remote_code=True,
#     device_map="auto",
#     torch_dtype=torch.bfloat16
# )
# model_ft = PeftModel.from_pretrained(base, "mineru_lora_finetuned/final")

# def inference(image_paths, prompt="Convert to Markdown:"):
#     inp = processor(text=[prompt], images=image_paths, return_tensors="pt").to("cuda")
#     out = model_ft.generate(**inp, max_new_tokens=1024)
#     return processor.batch_decode(out, skip_special_tokens=True)[0]

# # Test với Public001
# imgs = sorted([str(p) for p in PAGE_IMG_DIR.glob("Public001_page*.png")])
# print(inference(imgs))