In [4]:
from transformers import AutoTokenizer, AutoModelForVision2Seq, TrainingArguments
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer
from datasets import load_dataset
import torch

# Load tokenizer and model
model_name = "Qwen/Qwen2-VL-7B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    llm_int8_enable_fp32_cpu_offload=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForVision2Seq.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map="auto"
)


# Apply LoRA
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],  # You might need to inspect this based on the architecture
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Load your dataset (replace this with your own)
dataset = load_dataset("json", data_files="train_dataset.json")  # Needs fields like {"text": ..., "image": ...}

from PIL import Image

def preprocess(example):
    image = Image.open(example["image"]).convert("RGB")  # Load image file
    pixel_values = image_processor(image, return_tensors="pt")["pixel_values"].squeeze(0)

    tokenized = tokenizer(
        example["text"],
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=512,
    )

    return {
        "input_ids": tokenized["input_ids"].squeeze(0),
        "pixel_values": pixel_values,
        "labels": tokenized["input_ids"].squeeze(0)
    }


dataset = dataset.map(preprocess)

# Training args
training_args = TrainingArguments(
    output_dir="./qwen2-vl-lora",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    logging_steps=10,
    save_steps=100,
    learning_rate=2e-4,
    bf16=True if torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8 else False,
    save_total_limit=2,
    report_to="none"
)

# Train
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    tokenizer=tokenizer,
    args=training_args,
    packing=False
)

trainer.train()


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

trainable params: 2,523,136 || all params: 29,890,894,336 || trainable%: 0.0084




Map:   0%|          | 0/5 [00:00<?, ? examples/s]

AttributeError: 'Qwen2VLForConditionalGeneration' object has no attribute 'vis_processor'

In [2]:
torch.cuda.empty_cache()