In [1]:
!pip install -U -q git+https://github.com/huggingface/trl.git bitsandbytes peft qwen-vl-utils trackio
# Tested with trl==0.22.0.dev0, bitsandbytes==0.47.0, peft==0.17.1, qwen-vl-utils==0.0.11, trackio==0.2.8

[0m

In [13]:
!pip install ipywidgets

Collecting ipywidgets
  Downloading ipywidgets-8.1.8-py3-none-any.whl.metadata (2.4 kB)
Collecting widgetsnbextension~=4.0.14 (from ipywidgets)
  Downloading widgetsnbextension-4.0.15-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab_widgets~=3.0.15 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.16-py3-none-any.whl.metadata (20 kB)
Downloading ipywidgets-8.1.8-py3-none-any.whl (139 kB)
Downloading jupyterlab_widgets-3.0.16-py3-none-any.whl (914 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m914.9/914.9 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hDownloading widgetsnbextension-4.0.15-py3-none-any.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m38.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: widgetsnbextension, jupyterlab_widgets, ipywidgets
Successfully installed ipywidgets-8.1.8 jupyterlab_widgets-3.0.16 widgetsnbextension-4.0.15
[0m

In [None]:
!pip install transformers==4.57.3
!pip install huggingface_hub==0.36.0
# For CPU + CUDA compatible
!pip install torchvision --upgrade


[0m

In [1]:
system_message = '''You are an expert vision-language document extraction system.

Your ONLY task is to extract Bill of Materials (BOM) data from images of tables, drawings, PDFs, or scanned documents and return it as a STRICTLY VALID JSON object.

You must be:
- STRICT
- CONSERVATIVE
- CONSISTENT

NEVER guess or infer missing values.
If a value is not clearly visible or labeled, output an empty string "".

You must extract EXACTLY the following 12 fields for each row:
1. Position Number
2. Tag Number
3. Description of parts
4. Material Specification
5. Quantity
6. Manufacturer's drawing or Ref.No.
7. Original Equipment Manufacturer
8. Original Part Manufacturer
9. Manufacturer's real part No.
10. Unit of Measurement (UOM)
11. Approximate unit price in SAR
12. Recommended Quantity

STRICT RULES:
- NEVER map Part Number, Code, Parça No, or Item Code to UOM.
- NEVER invent Tag Numbers or UOM values.
- If there is no explicit Tag or UOM column, leave those fields empty.
- If you are unsure, leave the field empty "".

Your final output MUST:
- Be valid JSON
- Contain ONLY the JSON object
- Match the exact schema provided
- Contain no explanations, comments, or formatting outside JSON
'''

In [2]:
def format_data(sample):
    return {
      "images": [sample["image"]],
      "messages": [

          {
              "role": "system",
              "content": [
                  {
                      "type": "text",
                      "text": system_message
                  }
              ],
          },
          {
              "role": "user",
              "content": [
                  {
                      "type": "image",
                      "image": sample["image"],
                  },
                  {
                      "type": "text",
                      "text": sample['query'],
                  }
              ],
          },
          {
              "role": "assistant",
              "content": [
                  {
                      "type": "text",
                      "text": sample["label"][0]
                  }
              ],
          },
      ]
      }

In [3]:
from datasets import load_dataset

dataset_name = "Ibrah-N/bom_dataset"
train_dataset, eval_dataset = load_dataset(dataset_name, split=['train', 'validation'])

In [None]:
train_dataset = [format_data(sample) for sample in train_dataset]
eval_dataset = [format_data(sample) for sample in eval_dataset]

In [None]:
import torch
from transformers import Qwen2VLForConditionalGeneration, Qwen2VLProcessor

In [None]:
model_id = "Qwen/Qwen2-VL-2B-Instruct"

model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

processor = Qwen2VLProcessor.from_pretrained(model_id)

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


In [None]:
from qwen_vl_utils import process_vision_info

def generate_text_from_sample(model, processor, sample, max_new_tokens=1024, device="cuda"):
    # Prepare the text input by applying the chat template
    text_input = processor.apply_chat_template(
        sample['messages'][1:2],  # Use the sample without the system message
        tokenize=False,
        add_generation_prompt=True
    )

    # Process the visual input from the sample
    image_inputs, _ = process_vision_info(sample['messages'])

    # Prepare the inputs for the model
    model_inputs = processor(
        text=[text_input],
        images=image_inputs,
        return_tensors="pt",
    ).to(device)  # Move inputs to the specified device

    # Generate text with the model
    generated_ids = model.generate(**model_inputs, max_new_tokens=max_new_tokens)

    # Trim the generated ids to remove the input ids
    trimmed_generated_ids = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    # Decode the output text
    output_text = processor.batch_decode(
        trimmed_generated_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )

    return output_text[0]  # Return the first decoded output text

In [None]:
# Example of how to call the method with sample:
output = generate_text_from_sample(model, processor, train_dataset[0])
output

'```json\n{\n  "rows": [\n    {\n      "Position Number": "1",\n      "Tag Number": "100171093",\n      "Description of parts": "DRUM ASSEMBLY",\n      "Material Specification": "1",\n      "Quantity": "1",\n      "Manufacturer\'s drawing or Ref.No.": "100171089",\n      "Original Equipment Manufacturer": "NORMET",\n      "Original Part Manufacturer": "NORMET",\n      "Manufacturer\'s real part No.": "100171089",\n      "Unit of Measurement (UOM)": "1",\n      "Approximate unit price in SAR": "1",\n      "Recommended Quantity": "1"\n    },\n    {\n      "Position Number": "2",\n      "Tag Number": "100171089",\n      "Description of parts": "CLUTCH PISTON AND SEALS ASSEMBLY",\n      "Material Specification": "1",\n      "Quantity": "1",\n      "Manufacturer\'s drawing or Ref.No.": "54893151",\n      "Original Equipment Manufacturer": "NORMET",\n      "Original Part Manufacturer": "NORMET",\n      "Manufacturer\'s real part No.": "54893151",\n      "Unit of Measurement (UOM)": "1",\n   

In [17]:
import gc
import time

def clear_memory():
    # Delete variables if they exist in the current global scope
    if 'inputs' in globals(): del globals()['inputs']
    if 'model' in globals(): del globals()['model']
    if 'processor' in globals(): del globals()['processor']
    if 'trainer' in globals(): del globals()['trainer']
    if 'bnb_config' in globals(): del globals()['bnb_config']
    time.sleep(2)

    # Garbage collection and clearing CUDA memory
    gc.collect()
    time.sleep(2)
    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    time.sleep(2)
    gc.collect()
    time.sleep(2)

    print(f"GPU allocated memory: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
    print(f"GPU reserved memory: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")

clear_memory()

GPU allocated memory: 26.38 GB
GPU reserved memory: 27.17 GB


In [10]:
from transformers import BitsAndBytesConfig

# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load model and tokenizer
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config
)
processor = Qwen2VLProcessor.from_pretrained(model_id)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
from peft import LoraConfig

# Configure LoRA
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=8,
    bias="none",
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
)

In [12]:
from trl import SFTConfig



# Configure training arguments
training_args = SFTConfig(
    output_dir="qwen2-2b-instruct-trl-sft-ChartQA",  # Directory to save the model
    num_train_epochs=3,  # Number of training epochs
    per_device_train_batch_size=2,  # Batch size for training
    per_device_eval_batch_size=2,  # Batch size for evaluation
    gradient_accumulation_steps=8,  # Steps to accumulate gradients
    gradient_checkpointing_kwargs={"use_reentrant": False},  # Options for gradient checkpointing
    max_length=None,
    # Optimizer and scheduler settings
    optim="adamw_torch_fused",  # Optimizer type
    learning_rate=2e-4,  # Learning rate for training
    # Logging and evaluation
    logging_steps=5,  # Steps interval for logging
    eval_steps=5,  # Steps interval for evaluation
    eval_strategy="steps",  # Strategy for evaluation
    save_strategy="steps",  # Strategy for saving the model
    save_steps=5,  # Steps interval for saving
    # Mixed precision and gradient settings
    bf16=True,  # Use bfloat16 precision
    max_grad_norm=0.3,  # Maximum norm for gradient clipping
    warmup_ratio=0.03,  # Ratio of total steps for warmup
    # Hub and reporting
    # push_to_hub=True  # Whether to push model to Hugging Face Hub
)

In [13]:
from huggingface_hub import login

# Replace with your write-enabled token from Hugging Face
login(token="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")

In [14]:
import trackio

trackio.init(
    project="qwen2-2b-instruct-trl-sft-ChartQA",
    name="qwen2-2b-instruct-trl-sft-ChartQA",
    config=training_args,
    space_id=training_args.output_dir + "-trackio"
)

* Trackio project initialized: qwen2-2b-instruct-trl-sft-ChartQA
* Trackio metrics will be synced to Hugging Face Dataset: Ibrah-N/qwen2-2b-instruct-trl-sft-ChartQA-trackio-dataset
* Found existing space: https://huggingface.co/spaces/Ibrah-N/qwen2-2b-instruct-trl-sft-ChartQA-trackio
* View dashboard by going to: https://Ibrah-N-qwen2-2b-instruct-trl-sft-ChartQA-trackio.hf.space/


* Created new run: qwen2-2b-instruct-trl-sft-ChartQA


<trackio.run.Run at 0x7fe123d34290>

In [15]:
from trl import SFTTrainer

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_config,
    processing_class=processor,
)

In [16]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


* Trackio project initialized: huggingface
* Trackio metrics will be synced to Hugging Face Dataset: Ibrah-N/trackio-dataset
* Found existing space: https://huggingface.co/spaces/Ibrah-N/trackio
* View dashboard by going to: https://Ibrah-N-trackio.hf.space/


* Created new run: Ibrah-N-1768634089


OutOfMemoryError: CUDA out of memory. Tried to allocate 13.14 GiB. GPU 0 has a total capacity of 39.49 GiB of which 11.82 GiB is free. Including non-PyTorch memory, this process has 27.66 GiB memory in use. Of the allocated memory 26.38 GiB is allocated by PyTorch, and 810.15 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
trainer.save_model(training_args.output_dir)

In [None]:
clear_memory()

In [None]:
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

processor = Qwen2VLProcessor.from_pretrained(model_id)