In [1]:
from unsloth import FastVisionModel

# Load the 4-bit Qwen2-VL 2B Instruct checkpoint through Unsloth's vision wrapper.
model, tokenizer = FastVisionModel.from_pretrained(
    "unsloth/Qwen2-VL-2B-Instruct",
    use_gradient_checkpointing="unsloth",  # reduces VRAM usage
    load_in_4bit=True,
)


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


Unsloth: Failed to patch Gemma3ForConditionalGeneration.
🦥 Unsloth Zoo will now patch everything to make training faster!


  GPU_BUFFERS = tuple([torch.empty(2*256*2048, dtype = dtype, device = f"cuda:{i}") for i in range(n_gpus)])


==((====))==  Unsloth 2025.3.19: Fast Qwen2 patching. Transformers: 4.52.0.dev0.
   \\   /|    NVIDIA GeForce RTX 4060 Laptop GPU. Num GPUs = 1. Max memory: 7.996 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.6.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [2]:
# Wrap the loaded model with LoRA adapters on both the vision and language layers.
model = FastVisionModel.get_peft_model(
    model,
    r=16,            # LoRA rank
    lora_alpha=16,   # scaling factor
    lora_dropout=0,
    bias="none",
    finetune_vision_layers=True,
    finetune_language_layers=True,
)


Unsloth: Making `model.base_model.model.visual` require gradients


In [18]:
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import TrainingArguments, Trainer
from PIL import Image
from torchvision import transforms

# Step 1: Load the base model and tokenizer
# The weights are loaded in float16 and placed by `device_map="auto"`.
# NOTE(review): this rebinds `model` and `tokenizer`, which the Unsloth cells at the
# top of the notebook also assign. The recorded run of this cell reported parameters
# offloaded to CPU/disk and ended in a CUDA OutOfMemoryError on an 8 GiB GPU --
# presumably the fp16 weights plus allocations left over from earlier cells/runs do
# not fit. Confirm, and consider releasing earlier models before running this cell.
model_name = "Qwen/Qwen2-VL-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Define image transformations
# Each image is resized to a fixed 224x224 and then converted to a tensor.
# NOTE(review): `AutoProcessor` is imported in this cell but does not appear to be
# used; confirm that hand-built pixel tensors are what the model's forward expects.
image_transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize all images to 224x224
    transforms.ToTensor()           # Convert images to tensors
])

# Step 2: Configure LoRA
# Qwen2-VL is a decoder-only (causal) language model, so the adapter has to be
# created with the CAUSAL_LM task type. With SEQ_2_SEQ_LM, PEFT wraps the model in
# its encoder-decoder wrapper, whose forward passes `decoder_*` arguments that
# Qwen2VLForConditionalGeneration does not accept.
lora_config = LoraConfig(
    r=16,                                 # LoRA rank
    lora_alpha=32,                        # LoRA scaling factor
    target_modules=["q_proj", "v_proj"],  # modules whose names match get adapters
    lora_dropout=0.05,
    bias="none",                          # bias terms are not trained
    task_type="CAUSAL_LM",
)
# Rebind `model` to the PEFT-wrapped version; only the adapter weights are trainable.
model = get_peft_model(model, lora_config)

# Step 3: Load dataset and preprocess
def preprocess_function(examples):
    """Tokenize descriptions and load images for one batch of examples.

    Args:
        examples: Batch mapping with a "description" entry (texts passed to the
            tokenizer) and an "image" entry (paths opened with PIL).

    Returns:
        dict with "input_ids" and "attention_mask" from the tokenizer (padded or
        truncated to 512 tokens) and "pixel_values", the stacked image tensors
        produced by ``image_transform``. No "labels" entry is produced.

    NOTE(review): the images bypass the model's own processor; confirm the
    resulting ``pixel_values`` layout matches what the model's forward expects.
    """
    encoded = tokenizer(
        examples["description"],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )

    # Load every image as RGB and apply the shared transform before stacking.
    image_tensors = []
    for image_path in examples["image"]:
        rgb_image = Image.open(image_path).convert("RGB")
        image_tensors.append(image_transform(rgb_image))
    encoded["pixel_values"] = torch.stack(image_tensors)

    wanted_keys = ("input_ids", "attention_mask", "pixel_values")
    return {key: encoded[key] for key in wanted_keys}

# Read the JSON training file and run the preprocessing over its "train" split in batches.
dataset = load_dataset("json", data_files="train_dataset.json")
train_dataset = dataset["train"].map(preprocess_function, batched=True)
# Expose only the three model-input columns, formatted as torch tensors.
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "pixel_values"])

# Step 4: Define Training Arguments
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./qwen2vl_lora_finetuned",
    logging_dir="./logs",
    # Optimisation schedule: 4 samples per device, accumulated over 8 steps per update.
    num_train_epochs=3,
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    fp16=True,
    # Logging / checkpoint cadence; at most two checkpoints are kept.
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
)

# Step 5: Define Trainer
class CustomTrainer(Trainer):
    """Trainer whose loss falls back to next-token prediction on ``input_ids``.

    The training dataset in this notebook carries no ``labels`` column, so the
    model would otherwise return ``loss=None``. When labels are absent they are
    derived from ``input_ids`` with padded positions ignored.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run a forward pass and return the language-modelling loss.

        Args:
            model: The model being trained; called as ``model(**inputs)``.
            inputs: Mapping of keyword arguments for the model's forward.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: Accepted because recent ``Trainer.training_step``
                implementations pass it as a keyword argument; unused here.

        Returns:
            The loss, or a ``(loss, outputs)`` tuple when ``return_outputs`` is set.

        Raises:
            ValueError: If the model produced no loss (no labels could be built).
        """
        # Work on a shallow copy so the caller's batch mapping is not mutated.
        inputs = dict(inputs)

        if inputs.get("labels") is None:
            inputs.pop("labels", None)
            if "input_ids" in inputs:
                labels = inputs["input_ids"].clone()
                attention_mask = inputs.get("attention_mask")
                if attention_mask is not None:
                    # -100 is the index ignored by the cross-entropy loss, so
                    # padding tokens do not contribute to the loss.
                    labels[attention_mask == 0] = -100
                inputs["labels"] = labels

        outputs = model(**inputs)
        loss = outputs.loss
        if loss is None:
            raise ValueError(
                "Model returned no loss: the batch has neither 'labels' nor "
                "'input_ids' to derive them from."
            )

        return (loss, outputs) if return_outputs else loss

# Build the trainer from the PEFT-wrapped model, the arguments above and the
# preprocessed training set. No evaluation dataset or data collator is passed.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

# Step 6: Train the Model
trainer.train()

# Save the LoRA fine-tuned model
# Both the model and the tokenizer are written to the same directory that
# TrainingArguments was given as `output_dir`.
model.save_pretrained("./qwen2vl_lora_finetuned")
tokenizer.save_pretrained("./qwen2vl_lora_finetuned")


Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████| 2/2 [00:15<00:00,  7.72s/it]
Some parameters are on the meta device because they were offloaded to the disk and cpu.


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Of the allocated memory 14.07 GiB is allocated by PyTorch, and 535.74 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [17]:

# Reset PyTorch's peak CUDA memory statistics so later readings start from here.
# NOTE(review): relies on `torch` having been imported by another cell -- confirm run order.
torch.cuda.reset_peak_memory_stats()

In [16]:
# Ask PyTorch to release cached CUDA memory that it is no longer using.
# NOTE(review): memory held by live objects (e.g. a loaded `model`) is presumably
# not released by this call -- drop those references first; confirm.
torch.cuda.empty_cache()

In [4]:
!pip uninstall transformers -y
!pip install git+https://github.com/huggingface/transformers

Found existing installation: transformers 4.52.0.dev0
Uninstalling transformers-4.52.0.dev0:
  Successfully uninstalled transformers-4.52.0.dev0
^C
Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to c:\users\kmano\appdata\local\temp\pip-req-build-j7zerm_w
  Resolved https://github.com/huggingface/transformers to commit 953196a43dae6a3c474165fba7d215fcbc7b7730
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml): started
  Building wheel for transformers (pyproject.toml): finished with status 'done'
  Created wheel for transformers: filename=transfo

  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers 'C:\Users\kmano\AppData\Local\Temp\pip-req-build-j7zerm_w'
