In [None]:
!pip install transformers trl datasets bitsandbytes peft qwen-vl-utils accelerate

Collecting trl
  Downloading trl-0.19.1-py3-none-any.whl.metadata (10 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting qwen-vl-utils
  Downloading qwen_vl_utils-0.0.11-py3-none-any.whl.metadata (6.3 kB)
Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting av (from qwen-vl-utils)
  Downloading av-15.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting 

In [None]:
# Mount Google Drive at /content/drive (requires the Colab-only `google.colab` package).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import torch
import random
import warnings
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from tqdm import tqdm
from transformers import Qwen2VLForConditionalGeneration, Qwen2VLProcessor, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
from trl import SFTConfig, SFTTrainer
from qwen_vl_utils import process_vision_info

In [None]:
# Environment setup: silence warnings, expose only GPU index 0 to CUDA, and pick the device.
warnings.filterwarnings("ignore")
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Release unused cached GPU memory held by PyTorch's allocator.
torch.cuda.empty_cache()
print("✅ Using device:", device)

✅ Using device: cuda


In [None]:
# Fix the random seeds
def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch (CPU and all CUDA devices) RNGs with `seed`.

    Also switches cuDNN to deterministic mode and turns its benchmark mode off.

    Args:
        seed: Integer seed passed unchanged to every seeding function.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


seed_everything()

In [None]:
# Switch the working directory to the project folder; the relative paths used later in
# this notebook (checkpoint directory, CSV files, image paths) are resolved against it.
# NOTE(review): '/path/to/lib/...' looks like a placeholder — confirm the real location before running.
os.chdir('/path/to/lib/Qwen2-VL-2B-MMStar')

In [None]:
# Hub identifier of the base model; used below to load the processor.
model_id = "Qwen/Qwen2-VL-2B-Instruct"

In [None]:
# Load the checkpoint stored under the working directory, with bfloat16 weights.
# `device_map="auto"` already places the weights on the available device(s), so the model
# is not moved again with `.to(...)`: that extra move is redundant, and moving an
# accelerate-dispatched model can warn or fail when weights are split or offloaded.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    './qwen2-VL-2B-instruct-trl-VQA-MMStar4',
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

# The processor is loaded from the base model id rather than from the checkpoint directory.
processor = Qwen2VLProcessor.from_pretrained(model_id)

config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/429M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/272 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/347 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.


chat_template.json: 0.00B [00:00, ?B/s]

In [None]:
# Load the test CSV. No split name is given, so the rows are exposed under the "train"
# split, which is how later cells access them (test['train']).
test = load_dataset("csv", data_files = "../test.csv")

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# System prompt placed in the "system" turn of every sample's chat messages; it tells the
# model to reply with a single option letter (A, B, C or D).
system_message = """You are a multimodal assistant specialized in answering visual multiple-choice questions.
Given an image and a related question with options A, B, C, and D, select the single best answer based solely on the visual content and question.
Answer only with one of: A, B, C, or D. Do not provide explanations unless explicitly requested."""

In [None]:
def format_testdata(sample):
    """Convert one test row into a two-turn chat message list (system, then user).

    The user turn holds the image reference (the row's ``img_path`` prefixed with ".")
    followed by a text prompt made of the question, the four lettered options A-D and
    a trailing "Answer:" cue. The system turn carries the module-level ``system_message``.

    Args:
        sample: Mapping with the keys ``img_path``, ``Question``, ``A``, ``B``, ``C``, ``D``.

    Returns:
        A list of two message dicts, each with ``role`` and ``content`` entries.
    """
    system_turn = {
        "role": "system",
        "content": [{"type": "text", "text": system_message}],
    }

    image_part = {"type": "image", "image": "." + sample["img_path"]}

    question_line = f"Question: {sample['Question']}\n"
    # One "<letter>. <choice>" line per option, in A-D order.
    option_lines = [f"{letter}. {sample[letter]}" for letter in ("A", "B", "C", "D")]
    text_part = {
        "type": "text",
        "text": question_line + "\n".join(option_lines) + "\nAnswer:",
    }

    user_turn = {"role": "user", "content": [image_part, text_part]}
    return [system_turn, user_turn]

In [None]:
# Convert every row of the loaded split into its chat-message form.
test_dataset = list(map(format_testdata, test['train']))

In [None]:
# Generate an answer for each sample, one at a time, and collect the decoded text.
# The system prompt asks for a single option letter, so only a few new tokens are generated.
MAX_NEW_TOKENS = 3

results = []
for row in tqdm(test_dataset):
    # Render the chat messages into a prompt string that ends with the generation prompt.
    text = processor.apply_chat_template(row, tokenize=False, add_generation_prompt=True)
    # Only the first return value (image inputs) is used; the second one is discarded.
    image_inputs, _ = process_vision_info(row)

    inputs = processor(
        text=[text],
        images=image_inputs,
        return_tensors="pt",
    )

    # Follow the model's own device instead of hard-coding "cuda", so the loop also
    # works when the notebook fell back to CPU.
    inputs = inputs.to(model.device)

    generated_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)
    # Drop the leading prompt-length slice of each output so only the continuation is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]

    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    results.append(output_text[0])

100%|██████████| 852/852 [06:48<00:00,  2.09it/s]


In [None]:
# Put the predictions into the sample submission's `answer` column and save the result
# as a new CSV without the index column.
submission = pd.read_csv('../sample_submission.csv').assign(answer=results)
submission.to_csv('./qwen2_MMstar_submit_4.csv', index=False)
print("✅ Done.")

✅ Done.
