In [1]:
!pip install transformers trl datasets bitsandbytes peft qwen-vl-utils accelerate

Collecting trl
  Downloading trl-0.19.1-py3-none-any.whl.metadata (10 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting qwen-vl-utils
  Downloading qwen_vl_utils-0.0.11-py3-none-any.whl.metadata (6.3 kB)
Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting av (from qwen-vl-utils)
  Downloading av-15.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting 

In [2]:
#google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
import torch
import random
import warnings
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from tqdm import tqdm
from transformers import Qwen2VLForConditionalGeneration, Qwen2VLProcessor, BitsAndBytesConfig
from transformers import AutoTokenizer,AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
from trl import SFTConfig, SFTTrainer
from qwen_vl_utils import process_vision_info
from sklearn.model_selection import train_test_split

In [4]:
# 환경 설정
warnings.filterwarnings("ignore")
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.empty_cache()
print("✅ Using device:", device)

✅ Using device: cuda


In [5]:
# 시드 고정
def seed_everything(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything()

In [None]:
os.chdir('/path/to/lib/Qwen2-VL-2B-MMStar')

# Load Datasets And Preprocessing

## local train data

In [7]:
system_message = """You are a multimodal assistant specialized in answering visual multiple-choice questions.
Given an image and a related question with options A, B, C, and D, select the single best answer based solely on the visual content and question.
Answer only with one of: A, B, C, or D. Do not provide explanations unless explicitly requested."""

In [8]:
def format_localdata(sample):
    return [
        {
            "role": "system",
            "content": [{"type": "text", "text": system_message}],
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": "." + sample["img_path"]
                },
                {
                    "type": "text",
                    "text": f"Question: {sample['Question']}\n" +
                    "\n".join([f"{chr(65+i)}. {choice}" for i, choice in enumerate([sample[c] for c in ['A', 'B', 'C', 'D']])]) +
                    "\nAnswer:"
                },
            ],
        },
        {
            "role": "assistant",
            "content": [{"type": "text", "text": sample["answer"]}],
        }
    ]

In [None]:
localtrain_dataset = load_dataset("csv", data_files = "../train.csv")

In [10]:
train_dataset = [format_localdata(sample) for sample in localtrain_dataset['train']]

In [11]:
train_dataset

[[{'role': 'system',
   'content': [{'type': 'text',
     'text': 'You are a multimodal assistant specialized in answering visual multiple-choice questions.\nGiven an image and a related question with options A, B, C, and D, select the single best answer based solely on the visual content and question.\nAnswer only with one of: A, B, C, or D. Do not provide explanations unless explicitly requested.'}]},
  {'role': 'user',
   'content': [{'type': 'image',
     'image': '../train_input_images/TRAIN_000.jpg'},
    {'type': 'text',
     'text': "Question: What might be the purpose of the person's workout in the image?\nA. Building muscle and strength\nB. Practicing for a marathon\nC. Training for a cycling race\nD. Preparing for a swimming competition\nAnswer:"}]},
  {'role': 'assistant', 'content': [{'type': 'text', 'text': 'A'}]}],
 [{'role': 'system',
   'content': [{'type': 'text',
     'text': 'You are a multimodal assistant specialized in answering visual multiple-choice questions.\n

# Model finetuning

In [12]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [13]:
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=8,
    bias="none",
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
)

In [14]:
model_id = "Qwen/Qwen2-VL-2B-Instruct"

In [15]:
model = Qwen2VLForConditionalGeneration.from_pretrained(
    './qwen2-VL-2B-instruct-trl-VQA-MMStar3/checkpoint-220',
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config
)
processor = Qwen2VLProcessor.from_pretrained(model_id)

config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/429M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/272 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/347 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.


chat_template.json: 0.00B [00:00, ?B/s]

In [16]:
peft_model = get_peft_model(model, peft_config)
peft_model
peft_model.print_trainable_parameters()

trainable params: 1,089,536 || all params: 2,210,075,136 || trainable%: 0.0493


In [17]:
model.to(device)

Qwen2VLForConditionalGeneration(
  (model): Qwen2VLModel(
    (visual): Qwen2VisionTransformerPretrainedModel(
      (patch_embed): PatchEmbed(
        (proj): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
      )
      (rotary_pos_emb): VisionRotaryEmbedding()
      (blocks): ModuleList(
        (0-31): 32 x Qwen2VLVisionBlock(
          (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
          (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
          (attn): VisionAttention(
            (qkv): Linear4bit(in_features=1280, out_features=3840, bias=True)
            (proj): Linear4bit(in_features=1280, out_features=1280, bias=True)
          )
          (mlp): VisionMlp(
            (fc1): Linear4bit(in_features=1280, out_features=5120, bias=True)
            (act): QuickGELUActivation()
            (fc2): Linear4bit(in_features=5120, out_features=1280, bias=True)
          )
        )
      )
      (merger): PatchMerger(
      

In [18]:
def collate_fn(examples):
    texts = [
        processor.apply_chat_template(example, tokenize = False) for example in examples
    ]
    image_inputs = [process_vision_info(example)[0] for example in examples]

    batch = processor(
        text = texts, images = image_inputs, return_tensors = "pt", padding = True
    )

    labels = batch["input_ids"].clone()
    labels[labels == processor.tokenizer.pad_token_id] = -100

    if isinstance(processor, Qwen2VLProcessor):
        image_tokens = [151652, 151653, 151655]
    else:
        image_tokens = [processor.tokenizer.convert_tokens_to_ids(processor.image_token)]

    for image_token_id in image_tokens:
        labels[labels == image_token_id] = -100

    batch["labels"] = labels

    return batch

In [22]:
training_args = SFTConfig(
    output_dir = "./qwen2-VL-2B-instruct-trl-VQA-MMStar4",
    num_train_epochs = 50,
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 8,
    gradient_checkpointing = True,
    optim = "adamw_torch",
    learning_rate = 2e-4,
    lr_scheduler_type = "constant",
    logging_steps = 10,
    eval_steps = 10,
    eval_strategy = "steps",
    save_strategy = "steps",
    save_steps = 10,
    greater_is_better = False,
    load_best_model_at_end = True,
    bf16 = True,
    tf32 = True,
    max_grad_norm = 0.3,
    warmup_ratio = 0.03,
    push_to_hub = False,
    report_to = "none",
    gradient_checkpointing_kwargs={"use_reentrant": False},
    dataset_text_field="",
    dataset_kwargs={"skip_prepare_dataset": True}
)
training_args.remove_unused_columns = False

In [23]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [24]:
from datasets import Dataset

trainer = SFTTrainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = train_dataset,
    data_collator = collate_fn,
    peft_config = peft_config,
    processing_class = processor.tokenizer
)

trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
10,17.8367,1.927547
20,12.1771,1.269841
30,7.2555,0.668608
40,4.2929,0.520298
50,3.671,0.482098
60,3.3965,0.443157
70,3.0762,0.401484
80,2.737,0.342422
90,2.1966,0.270416
100,1.8179,0.235475


TrainOutput(global_step=100, training_loss=5.845728263854981, metrics={'train_runtime': 1059.2809, 'train_samples_per_second': 2.832, 'train_steps_per_second': 0.094, 'total_flos': 3.434664237225984e+16, 'train_loss': 5.845728263854981})

In [25]:
trainer.save_model(training_args.output_dir)