In [5]:
data=[]

In [6]:
data.append(
    {
        "image": "C:/AI/Github/Reconnaissance_drone_report/Data/Images/earthquake/download.jpg",
        "text": "Describe the image.",
        "content": "This image shows a distructed building due to earthquake since there is a lot of debris is setteled in this area"
    }
)

In [1]:
# fine_tune_qwen2_vl_2b.py

import os
import json
import torch
from PIL import Image
from transformers import (
    Qwen2VLForConditionalGeneration,
    Qwen2VLProcessor,
    TrainingArguments,
    Trainer
)
from peft import LoraConfig, get_peft_model
from datasets import Dataset

# Configuration
MODEL_NAME = "Qwen/Qwen2-VL-2B-Instruct"
DATASET_PATH = "./train_dataset.json"
IMAGE_DIR = "./Data/Images/"
OUTPUT_DIR = "./qwen2_vl_2b_finetuned"
LORA_CONFIG = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# 1. Prepare Dataset
def create_dataset(json_path, image_dir):
    with open(json_path) as f:
        data = json.load(f)
    
    processed_data = []
    for item in data:
        conversations = []
        for conv in item["messages"]:
            if conv["role"] == "user":
                content = []
                for c in conv["content"]:
                    if c["type"] == "image":
                        img_path = os.path.join(image_dir, c["image_path"])
                        content.append({"type": "image", "image": Image.open(img_path)})
                    else:
                        content.append({"type": "text", "text": c["text"]})
                conversations.append({"role": "user", "content": content})
            else:
                conversations.append({"role": "assistant", "content": conv["content"]})
        processed_data.append({"messages": conversations})
    
    return Dataset.from_list(processed_data)

# 2. Data Collator
def collator(features):
    processor = Qwen2VLProcessor.from_pretrained(MODEL_NAME)
    conversations = [feature["messages"] for feature in features]
    
    inputs = processor.process_conversation(
        conversations,
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=512
    )
    
    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "pixel_values": inputs["pixel_values"],
        "labels": inputs["labels"]
    }

# 3. Load Model and Processor
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    load_in_8bit=True,
    torch_dtype=torch.float16
)
processor = Qwen2VLProcessor.from_pretrained(MODEL_NAME)

# Apply LoRA
model = get_peft_model(model, LORA_CONFIG)
model.print_trainable_parameters()

# 4. Training Setup
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    learning_rate=2e-5,
    fp16=True,
    save_strategy="epoch",
    logging_steps=10,
    remove_unused_columns=False,
    optim="adafactor",  # Memory-efficient optimizer
    torch_compile=True  # Uses CUDA graphs
)

# 5. Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=create_dataset(DATASET_PATH, IMAGE_DIR),
    data_collator=collator,
)

# 6. Start Training
trainer.train()
trainer.save_model(OUTPUT_DIR)
processor.save_pretrained(OUTPUT_DIR)

print("Training complete! Model saved to:", OUTPUT_DIR)


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


trainable params: 4,358,144 || all params: 2,213,343,744 || trainable%: 0.1969


KeyError: 'image_path'

In [None]:
torch.cuda.empty_cache()

In [None]:
!pip install matplotlib tiktoken