In [None]:
!pip install -U -q git+https://github.com/huggingface/trl.git bitsandbytes peft qwen-vl-utils trackio
# Tested with trl==0.22.0.dev0, bitsandbytes==0.47.0, peft==0.17.1, qwen-vl-utils==0.0.11, trackio==0.2.8

[0m

In [None]:
!pip install -q ipywidgets

[0m

In [3]:
!pip install transformers==4.57.3
!pip install huggingface_hub==0.36.0
# For CPU + CUDA compatible
!pip install torchvision --upgrade

Collecting transformers==4.57.3
  Using cached transformers-4.57.3-py3-none-any.whl.metadata (43 kB)


Using cached transformers-4.57.3-py3-none-any.whl (12.0 MB)
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.57.6
    Uninstalling transformers-4.57.6:
      Successfully uninstalled transformers-4.57.6
Successfully installed transformers-4.57.3
[0mCollecting torchvision
  Using cached torchvision-0.24.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (5.9 kB)
Using cached torchvision-0.24.1-cp312-cp312-manylinux_2_28_x86_64.whl (8.0 MB)
Installing collected packages: torchvision
Successfully installed torchvision-0.24.1
[0m

In [4]:
!pip install flash-attn --no-build-isolation

Collecting flash-attn
  Downloading flash_attn-2.8.3.tar.gz (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting einops (from flash-attn)
  Downloading einops-0.8.1-py3-none-any.whl.metadata (13 kB)
Downloading einops-0.8.1-py3-none-any.whl (64 kB)
Building wheels for collected packages: flash-attn
  Building wheel for flash-attn (setup.py) ... [?25ldone
[?25h  Created wheel for flash-attn: filename=flash_attn-2.8.3-cp312-cp312-linux_x86_64.whl size=253780426 sha256=4e2f9e39313266b1544b68138b15b91ee6221eccf14f7902b7c6620351340810
  Stored in directory: /home/.cache/pip/wheels/3d/59/46/f282c12c73dd4bb3c2e3fe199f1a0d0f8cec06df0cccfeee27
Successfully built flash-attn
Installing collected packages: einops, flash-attn
Successfully installed einops-0.8.1 flash-attn-2.8.3
[0m

In [1]:
system_message = '''
You are an expert vision-language document extraction system specialized in industrial Bill of Materials (BOM).

Your ONLY task is to extract BOM data from images and return a STRICTLY VALID JSON object.

CORE OPERATIONAL RULES:
1. DYNAMIC EXTRACTION: Only include keys in the JSON output for columns that are physically visible in the provided image.
2. STRICT MAPPING: Map visual headers (even in other languages like Turkish 'Adet' or 'Parça No') to the 12 canonical keys provided in the user prompt.
3. CONSERVATIVE BIAS: NEVER guess or infer values. If a cell is blurry or a column is missing, omit the key or use an empty string "" only if the column header exists but the cell is blank.
4. UOM INTEGRITY: Never map Part Numbers/Codes to UOM. UOM must be a standard unit (EA, PC, SET, etc.). If a value looks like a part code (e.g., 'T250-01'), it belongs in 'Manufacturer’s real part No.'.
5. NO CHATTER: Output ONLY the JSON object. No explanations, markdown headers (unless requested), or comments.
'''

In [2]:
def format_data(sample):
    return {
      "images": [sample["image"]],
      "messages": [

          {
              "role": "system",
              "content": [
                  {
                      "type": "text",
                      "text": system_message
                  }
              ],
          },
          {
              "role": "user",
              "content": [
                  {
                      "type": "image",
                      "image": sample["image"]
                  },
                  {
                      "type": "text",
                      "text": sample['query'],
                  }
              ],
          },
          {
              "role": "assistant",
              "content": [
                  {
                      "type": "text",
                      "text": sample["label"][0]
                  }
              ],
          },
      ]
      }

In [3]:
from datasets import load_dataset

dataset_name = "Ibrah-N/bom_dataset"
train_dataset, eval_dataset = load_dataset(dataset_name, split=['train', 'validation'])

In [4]:
train_dataset = [format_data(sample) for sample in train_dataset]
eval_dataset = [format_data(sample) for sample in eval_dataset]

In [5]:
train_dataset[0]

{'images': [<PIL.PngImagePlugin.PngImageFile image mode=RGB size=2481x3509>],
 'messages': [{'role': 'system',
   'content': [{'type': 'text',
     'text': '\nYou are an expert vision-language document extraction system specialized in industrial Bill of Materials (BOM).\n\nYour ONLY task is to extract BOM data from images and return a STRICTLY VALID JSON object.\n\nCORE OPERATIONAL RULES:\n1. DYNAMIC EXTRACTION: Only include keys in the JSON output for columns that are physically visible in the provided image.\n2. STRICT MAPPING: Map visual headers (even in other languages like Turkish \'Adet\' or \'Parça No\') to the 12 canonical keys provided in the user prompt.\n3. CONSERVATIVE BIAS: NEVER guess or infer values. If a cell is blurry or a column is missing, omit the key or use an empty string "" only if the column header exists but the cell is blank.\n4. UOM INTEGRITY: Never map Part Numbers/Codes to UOM. UOM must be a standard unit (EA, PC, SET, etc.). If a value looks like a part co

In [None]:
import torch
from transformers import Qwen2VLForConditionalGeneration, Qwen2VLProcessor

In [None]:
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor

# Update to the 2.5 3B version
model_id = "Qwen/Qwen2.5-VL-3B-Instruct"

model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    # Flash Attention 2 is critical for high-resolution table images
    attn_implementation="flash_attention_2", 
)

# Use AutoProcessor for the 2.5 version
processor = AutoProcessor.from_pretrained(
    model_id,     
    min_pixels=512 * 28 * 28,
    max_pixels=1792 * 28 * 28
    )

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


In [6]:
model_id = "Qwen/Qwen2-VL-2B-Instruct"

model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

processor = Qwen2VLProcessor.from_pretrained(model_id)

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


In [None]:
from qwen_vl_utils import process_vision_info

def generate_text_from_sample(model, processor, sample, max_new_tokens=1536, device="cuda"):
    # Prepare the text input by applying the chat template
    text_input = processor.apply_chat_template(
        sample['messages'][1:2],  # Use the sample without the system message
        tokenize=False,
        add_generation_prompt=True
    )

    # Process the visual input from the sample
    image_inputs, _ = process_vision_info(sample['messages'])

    # Prepare the inputs for the model
    model_inputs = processor(
        text=[text_input],
        images=image_inputs,
        return_tensors="pt",
    ).to(device)  # Move inputs to the specified device

    # Generate text with the model
    generated_ids = model.generate(**model_inputs, max_new_tokens=max_new_tokens)

    # Trim the generated ids to remove the input ids
    trimmed_generated_ids = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    # Decode the output text
    output_text = processor.batch_decode(
        trimmed_generated_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )

    return output_text[0]  # Return the first decoded output text

In [8]:
# Example of how to call the method with sample:
output = generate_text_from_sample(model, processor, train_dataset[0])
output

'```json\n{\n  "rows": [\n    {\n      "Position Number": "1",\n      "Description of parts": "DRUM ASSEMBLY",\n      "Quantity": "1"\n    },\n    {\n      "Position Number": "2",\n      "Description of parts": "CLUTCH PISTON AND SEALS ASSEMBLY",\n      "Quantity": "1"\n    },\n    {\n      "Position Number": "3",\n      "Description of parts": "CLUTCH PISTON INNER SEAL",\n      "Quantity": "1"\n    },\n    {\n      "Position Number": "4",\n      "Description of parts": "CLUTCH PISTON OUTER SEAL",\n      "Quantity": "1"\n    },\n    {\n      "Position Number": "5",\n      "Description of parts": "DISC",\n      "Quantity": "4"\n    },\n    {\n      "Position Number": "6",\n      "Description of parts": "FRICTION PLATE",\n      "Quantity": "4"\n    },\n    {\n      "Position Number": "7",\n      "Description of parts": "END PLATE",\n      "Quantity": "1"\n    },\n    {\n      "Position Number": "11",\n      "Description of parts": "PISTON RING",\n      "Quantity": "2"\n    }\n  ]\n}\n```

In [None]:
train_dataset[0]['messages'][1]['content'][0]['image']

In [9]:
train_dataset[0]['messages']


[{'role': 'system',
  'content': [{'type': 'text',
    'text': '\nYou are an expert vision-language document extraction system specialized in industrial Bill of Materials (BOM).\n\nYour ONLY task is to extract BOM data from images and return a STRICTLY VALID JSON object.\n\nCORE OPERATIONAL RULES:\n1. DYNAMIC EXTRACTION: Only include keys in the JSON output for columns that are physically visible in the provided image.\n2. STRICT MAPPING: Map visual headers (even in other languages like Turkish \'Adet\' or \'Parça No\') to the 12 canonical keys provided in the user prompt.\n3. CONSERVATIVE BIAS: NEVER guess or infer values. If a cell is blurry or a column is missing, omit the key or use an empty string "" only if the column header exists but the cell is blank.\n4. UOM INTEGRITY: Never map Part Numbers/Codes to UOM. UOM must be a standard unit (EA, PC, SET, etc.). If a value looks like a part code (e.g., \'T250-01\'), it belongs in \'Manufacturer’s real part No.\'.\n5. NO CHATTER: Outpu

In [10]:
import gc
import time

def clear_memory():
    # Delete variables if they exist in the current global scope
    if 'inputs' in globals(): del globals()['inputs']
    if 'model' in globals(): del globals()['model']
    if 'processor' in globals(): del globals()['processor']
    if 'trainer' in globals(): del globals()['trainer']
    if 'bnb_config' in globals(): del globals()['bnb_config']
    time.sleep(2)

    # Garbage collection and clearing CUDA memory
    gc.collect()
    time.sleep(2)
    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    time.sleep(2)
    gc.collect()
    time.sleep(2)

    print(f"GPU allocated memory: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
    print(f"GPU reserved memory: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")

clear_memory()

GPU allocated memory: 0.01 GB
GPU reserved memory: 6.99 GB


In [None]:
from transformers import BitsAndBytesConfig, Qwen2_5_VLForConditionalGeneration, AutoProcessor
import torch

# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load Qwen2.5-VL-3B with 4-bit quantization
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_id, # Ensure this is "Qwen/Qwen2.5-VL-3B-Instruct"
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    # Flash Attention 2 is highly recommended for speed
    attn_implementation="flash_attention_2" 
)

# Use AutoProcessor for Qwen2.5
processor = AutoProcessor.from_pretrained(
    model_id,
    min_pixels=512 * 28 * 28,
    max_pixels=1792 * 28 * 28
    )

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
from transformers import BitsAndBytesConfig

# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load model and tokenizer
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config
)
processor = Qwen2VLProcessor.from_pretrained(model_id)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
from peft import LoraConfig

# Optimized LoRA for Qwen2.5-VL
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=16,  # Increased from 8 to 16 for better learning capacity
    bias="none",
    # Target all linear layers in the LLM and the Vision Tower
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj", "vision_proj"],
    task_type="CAUSAL_LM",
)

In [11]:
from peft import LoraConfig

# Configure LoRA
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=8,
    bias="none",
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
)

In [None]:
from trl import SFTConfig

training_args = SFTConfig(
    output_dir="qwen2_5-3b-bom-extraction",
    num_train_epochs=1, 
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,  # Increased for better stability with 3B model
    gradient_checkpointing=True,    # Required for 3B + QLoRA
    gradient_checkpointing_kwargs={"use_reentrant": False},
    
    # CRITICAL: Do not use None. Set a length that fits your longest JSON table.
    max_length=1536,
    
    dataset_text_field="",         # Required by SFTConfig but handled by processor
    dataset_kwargs={"skip_prepare_dataset": True}, # We handle processing ourselves
    
    optim="paged_adamw_8bit",      # Better for 4-bit/QLoRA training memory
    learning_rate=5e-5,            # Slightly lower for more stable convergence
    bf16=True,
    
    logging_steps=10, 
    eval_strategy="steps",
    eval_steps=20,
    save_strategy="steps",
    save_steps=50,
    
    max_grad_norm=0.3,
    warmup_ratio=0.1,               # Increased warmup for 200-sample dataset
    report_to="none"               # Or "wandb" if you use weights & biases
)

In [None]:
from trl import SFTConfig



# Configure training arguments
training_args = SFTConfig(
    output_dir="qwen2-2b-instruct-trl-sft-ChartQA",  # Directory to save the model
    num_train_epochs=3,  # Number of training epochs
    per_device_train_batch_size=1,  # Batch size for training
    per_device_eval_batch_size=1,  # Batch size for evaluation
    gradient_accumulation_steps=4,  # Steps to accumulate gradients
    gradient_checkpointing_kwargs={"use_reentrant": False},  # Options for gradient checkpointing
    max_length=None,
    # Optimizer and scheduler settings
    optim="adamw_torch_fused",  # Optimizer type
    learning_rate=2e-4,  # Learning rate for training
    # Logging and evaluation
    logging_steps=40,  # Steps interval for logging
    eval_steps=40,  # Steps interval for evaluation
    eval_strategy="steps",  # Strategy for evaluation
    save_strategy="steps",  # Strategy for saving the model
    save_steps=65,  # Steps interval for saving
    # Mixed precision and gradient settings
    bf16=True,  # Use bfloat16 precision
    max_grad_norm=0.3,  # Maximum norm for gradient clipping
    warmup_ratio=0.03,  # Ratio of total steps for warmup
    # Hub and reporting
    # push_to_hub=True  # Whether to push model to Hugging Face Hub
)

In [None]:
from huggingface_hub import login

# Replace with your write-enabled token from Hugging Face
login()

In [15]:
import trackio

trackio.init(
    project="qwen2-2b-instruct-trl-sft-ChartQA",
    name="qwen2-2b-instruct-trl-sft-ChartQA",
    config=training_args,
    space_id=training_args.output_dir + "-trackio"
)

* Trackio project initialized: qwen2-2b-instruct-trl-sft-ChartQA
* Trackio metrics will be synced to Hugging Face Dataset: Ibrah-N/qwen2.5-3b-bom-extraction-trackio-dataset
* Found existing space: https://huggingface.co/spaces/Ibrah-N/qwen2.5-3b-bom-extraction-trackio
* View dashboard by going to: https://Ibrah-N-qwen2.5-3b-bom-extraction-trackio.hf.space/


* Created new run: qwen2-2b-instruct-trl-sft-ChartQA


<trackio.run.Run at 0x7f6c2ad83740>

In [16]:
import gc 
torch.cuda.empty_cache()
gc.collect()

50345

In [17]:
from trl import SFTTrainer

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_config,
    processing_class=processor,
)

In [18]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


Step,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 9.77 GiB. GPU 0 has a total capacity of 39.49 GiB of which 7.67 GiB is free. Including non-PyTorch memory, this process has 31.69 GiB memory in use. Of the allocated memory 26.94 GiB is allocated by PyTorch, and 4.24 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [18]:
trainer.save_model(training_args.output_dir)

In [31]:
clear_memory()

GPU allocated memory: 0.02 GB
GPU reserved memory: 4.12 GB


In [32]:
# -- base model --
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

processor = Qwen2VLProcessor.from_pretrained(model_id)

# -- adopter --
adapter_path = "/home/PNID_VLM_Qwen2_Training/qwen2-2b-instruct-trl-sft-ChartQA/checkpoint-132"
model.load_adapter(adapter_path)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [33]:
test_sample_1 = eval_dataset[0]
test_sample_2 = eval_dataset[1]

test_sample_3 = eval_dataset[2]
test_sample_4 = eval_dataset[3]

test_sample_5 = eval_dataset[4]

In [34]:
output = generate_text_from_sample(model, processor, test_sample_3)
output

'```json\n{\n  "rows": [\n    {\n      "Position Number": "1",\n      "Tag Number": "1",\n      "Description of parts": "1",\n      "Material Specification": "1",\n      "Quantity": "1",\n      "Manufacturer\'s drawing or Ref.No.": "1",\n      "Original Equipment Manufacturer": "1",\n      "Original Part Manufacturer": "1",\n      "Manufacturer\'s real part No.": "1",\n      "Unit of Measurement (UOM)": "1",\n      "Approximate unit price in SAR": "1",\n      "Recommended Quantity": "1"\n    }\n  ]\n}\n```'

In [27]:
output

'```json\n{\n  "rows": [\n    {\n      "Position Number": "1",\n      "Tag Number": "1",\n      "Description of parts": "1",\n      "Material Specification": "1",\n      "Quantity": "1",\n      "Manufacturer\'s drawing or Ref.No.": "1",\n      "Original Equipment Manufacturer": "1",\n      "Original Part Manufacturer": "1",\n      "Manufacturer\'s real part No.": "1",\n      "Unit of Measurement (UOM)": "1",\n      "Approximate unit price in SAR": "1",\n      "Recommended Quantity": "1"\n    }\n  ]\n}\n```'