In [1]:
!pip install transformers trl datasets bitsandbytes peft qwen-vl-utils accelerate

Collecting trl
  Downloading trl-0.19.1-py3-none-any.whl.metadata (10 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting qwen-vl-utils
  Downloading qwen_vl_utils-0.0.11-py3-none-any.whl.metadata (6.3 kB)
Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting av (from qwen-vl-utils)
  Downloading av-15.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting 

In [2]:
#google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
import torch
import random
import warnings
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from tqdm import tqdm
from transformers import Qwen2VLForConditionalGeneration, Qwen2VLProcessor, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
from trl import SFTConfig, SFTTrainer
from qwen_vl_utils import process_vision_info

In [4]:
# 환경 설정
warnings.filterwarnings("ignore")
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.empty_cache()
print("✅ Using device:", device)

✅ Using device: cuda


In [None]:
os.chdir('/path/to/lib/Qwen2-VL-2B')

In [6]:
# 시드 고정
def seed_everything(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything()

In [7]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [8]:
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=8,
    bias="none",
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
)

In [9]:
model_id = "Qwen/Qwen2-VL-2B-Instruct"

In [10]:
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config
)
processor = Qwen2VLProcessor.from_pretrained(model_id)

config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/429M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/272 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/347 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.


chat_template.json: 0.00B [00:00, ?B/s]

In [11]:
peft_model = get_peft_model(model, peft_config)
peft_model.to(device)
peft_model.print_trainable_parameters()

trainable params: 1,089,536 || all params: 2,210,075,136 || trainable%: 0.0493


In [12]:
def collate_fn(examples):
    texts = [
        processor.apply_chat_template(example, tokenize = False) for example in examples
    ]
    image_inputs = [process_vision_info(example)[0] for example in examples]

    batch = processor(
        text = texts, images = image_inputs, return_tensors = "pt", padding = True
    )

    labels = batch["input_ids"].clone()
    labels[labels == processor.tokenizer.pad_token_id] = -100

    if isinstance(processor, Qwen2VLProcessor):
        image_tokens = [151652, 151653, 151655]
    else:
        image_tokens = [processor.tokenizer.convert_tokens_to_ids(processor.image_token)]

    for image_token_id in image_tokens:
        labels[labels == image_token_id] = -100

    batch["labels"] = labels

    return batch

In [13]:
system_message = """You are a multimodal assistant specialized in answering visual multiple-choice questions.
Given an image and a related question with options A, B, C, and D, select the single best answer based solely on the visual content and question.
Answer only with one of: A, B, C, or D. Do not provide explanations unless explicitly requested."""

In [14]:
def format_traindata(sample):
    return [
        {
            "role": "system",
            "content": [{"type": "text", "text": system_message}],
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": "." + sample["img_path"]
                },
                {
                    "type": "text",
                    "text": f"Question: {sample['Question']}\n" +
                    "\n".join([f"{chr(65+i)}. {choice}" for i, choice in enumerate([sample[c] for c in ['A', 'B', 'C', 'D']])]) +
                    "\nAnswer:"
                },
            ],
        },
        {
            "role": "assistant",
            "content": [{"type": "text", "text": sample["answer"]}],
        }
    ]

def format_testdata(sample):
    return [
        {
            "role": "system",
            "content": [{"type": "text", "text": system_message}],
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": "." + sample["img_path"]
                },
                {
                    "type": "text",
                    "text": f"Question: {sample['Question']}\n" +
                    "\n".join([f"{chr(65+i)}. {choice}" for i, choice in enumerate([sample[c] for c in ['A', 'B', 'C', 'D']])]) +
                    "\nAnswer:"
                },
            ],
        }
    ]

In [15]:
train_dataset = load_dataset("csv", data_files = "../train.csv")

Generating train split: 0 examples [00:00, ? examples/s]

In [16]:
train_dataset = [format_traindata(sample) for sample in train_dataset['train']]

In [17]:
training_args = SFTConfig(
    output_dir = "./qwen2-VL-2B-instruct-trl-VQA",
    num_train_epochs = 100,
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 8,
    gradient_checkpointing = True,
    optim = "adamw_torch",
    learning_rate = 2e-4,
    lr_scheduler_type = "constant",
    logging_steps = 10,
    eval_steps = 10,
    eval_strategy = "steps",
    save_strategy = "steps",
    save_steps = 20,
    greater_is_better = False,
    load_best_model_at_end = True,
    bf16 = True,
    tf32 = True,
    max_grad_norm = 0.3,
    warmup_ratio = 0.03,
    push_to_hub = False,
    report_to = "none",
    gradient_checkpointing_kwargs={"use_reentrant": False},
    dataset_text_field="",
    dataset_kwargs={"skip_prepare_dataset": True}
)
training_args.remove_unused_columns = False

In [20]:
from datasets import Dataset

trainer = SFTTrainer(
    model = model.to(device),
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = train_dataset,
    data_collator = collate_fn,
    peft_config = peft_config,
    processing_class = processor.tokenizer
)

trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss
10,17.8359,1.928035
20,12.1748,1.270549
30,7.2567,0.667551
40,4.2933,0.520246
50,3.6717,0.482162
60,3.3973,0.44316
70,3.0767,0.401589
80,2.7416,0.342538
90,2.2008,0.271518
100,1.8281,0.236264


TrainOutput(global_step=200, training_loss=3.4224680161476133, metrics={'train_runtime': 2532.9968, 'train_samples_per_second': 2.369, 'train_steps_per_second': 0.079, 'total_flos': 6.900416453339136e+16, 'train_loss': 3.4224680161476133})

In [21]:
trainer.save_model(training_args.output_dir)

# Inference

In [22]:
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
).to(device)

processor = Qwen2VLProcessor.from_pretrained(model_id)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [23]:
adapter_path = "./qwen2-VL-2B-instruct-trl-VQA"
model.load_adapter(adapter_path)

In [24]:
test = load_dataset("csv", data_files = "../test.csv")

Generating train split: 0 examples [00:00, ? examples/s]

In [25]:
test_dataset = [format_testdata(sample) for sample in test['train']]

In [35]:
text = processor.apply_chat_template(test_dataset[1], tokenize=False, add_generation_prompt=True)

image_inputs, _ = process_vision_info(test_dataset[1])

inputs = processor(
    text=[text],
    images=image_inputs,
    return_tensors="pt",
)

inputs = inputs.to("cuda")

generated_ids = model.generate(**inputs, max_new_tokens=3)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]

output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)

output_text

['A']

In [36]:
results = []
for row in tqdm(test_dataset):
    text = processor.apply_chat_template(row, tokenize = False, add_generation_prompt = True)
    image_inputs, _ = process_vision_info(row)

    inputs = processor(
        text = [text],
        images = image_inputs,
        return_tensors = "pt"
    )

    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=3)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]

    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    results.append(output_text)

100%|██████████| 852/852 [03:59<00:00,  3.55it/s]


In [37]:
results

[['B'],
 ['A'],
 ['B'],
 ['C'],
 ['A'],
 ['C'],
 ['B'],
 ['D'],
 ['D'],
 ['B'],
 ['B'],
 ['B'],
 ['B'],
 ['C'],
 ['C'],
 ['C'],
 ['B'],
 ['A'],
 ['C'],
 ['C'],
 ['D'],
 ['C'],
 ['A'],
 ['A'],
 ['A'],
 ['A'],
 ['B'],
 ['C'],
 ['A'],
 ['C'],
 ['A'],
 ['C'],
 ['C'],
 ['A'],
 ['A'],
 ['D'],
 ['D'],
 ['D'],
 ['A'],
 ['A'],
 ['C'],
 ['B'],
 ['D'],
 ['C'],
 ['B'],
 ['B'],
 ['D'],
 ['C'],
 ['B'],
 ['C'],
 ['B'],
 ['D'],
 ['C'],
 ['D'],
 ['D'],
 ['A'],
 ['C'],
 ['D'],
 ['D'],
 ['C'],
 ['C'],
 ['C'],
 ['C'],
 ['C'],
 ['D'],
 ['B'],
 ['D'],
 ['A'],
 ['B'],
 ['D'],
 ['D'],
 ['D'],
 ['A'],
 ['C'],
 ['A'],
 ['D'],
 ['A'],
 ['A'],
 ['A'],
 ['C'],
 ['C'],
 ['D'],
 ['A'],
 ['D'],
 ['A'],
 ['D'],
 ['B'],
 ['A'],
 ['B'],
 ['D'],
 ['A'],
 ['D'],
 ['C'],
 ['B'],
 ['A'],
 ['A'],
 ['A'],
 ['C'],
 ['D'],
 ['D'],
 ['C'],
 ['D'],
 ['C'],
 ['D'],
 ['C'],
 ['A'],
 ['B'],
 ['D'],
 ['D'],
 ['C'],
 ['C'],
 ['B'],
 ['C'],
 ['B'],
 ['D'],
 ['C'],
 ['C'],
 ['A'],
 ['A'],
 ['C'],
 ['B'],
 ['A'],
 ['C'],
 ['B'],
 ['D'],


In [38]:
results = [result[0] for result in results]

In [39]:
results

['B',
 'A',
 'B',
 'C',
 'A',
 'C',
 'B',
 'D',
 'D',
 'B',
 'B',
 'B',
 'B',
 'C',
 'C',
 'C',
 'B',
 'A',
 'C',
 'C',
 'D',
 'C',
 'A',
 'A',
 'A',
 'A',
 'B',
 'C',
 'A',
 'C',
 'A',
 'C',
 'C',
 'A',
 'A',
 'D',
 'D',
 'D',
 'A',
 'A',
 'C',
 'B',
 'D',
 'C',
 'B',
 'B',
 'D',
 'C',
 'B',
 'C',
 'B',
 'D',
 'C',
 'D',
 'D',
 'A',
 'C',
 'D',
 'D',
 'C',
 'C',
 'C',
 'C',
 'C',
 'D',
 'B',
 'D',
 'A',
 'B',
 'D',
 'D',
 'D',
 'A',
 'C',
 'A',
 'D',
 'A',
 'A',
 'A',
 'C',
 'C',
 'D',
 'A',
 'D',
 'A',
 'D',
 'B',
 'A',
 'B',
 'D',
 'A',
 'D',
 'C',
 'B',
 'A',
 'A',
 'A',
 'C',
 'D',
 'D',
 'C',
 'D',
 'C',
 'D',
 'C',
 'A',
 'B',
 'D',
 'D',
 'C',
 'C',
 'B',
 'C',
 'B',
 'D',
 'C',
 'C',
 'A',
 'A',
 'C',
 'B',
 'A',
 'C',
 'B',
 'D',
 'A',
 'D',
 'D',
 'B',
 'D',
 'D',
 'A',
 'A',
 'D',
 'D',
 'D',
 'B',
 'C',
 'D',
 'A',
 'D',
 'A',
 'D',
 'A',
 'D',
 'A',
 'B',
 'C',
 'C',
 'A',
 'C',
 'B',
 'B',
 'D',
 'C',
 'D',
 'D',
 'D',
 'C',
 'B',
 'D',
 'D',
 'C',
 'B',
 'B',
 'A',
 'B'

In [40]:
submission = pd.read_csv('../sample_submission.csv')
submission['answer'] = results
submission.to_csv('./qwen2_submit.csv', index=False)
print("✅ Done.")

✅ Done.
