# Colab All-In-One (Qwen3-VL-8B)

Colab A100 80GB 기준, 학습 1epoch와 추론을 각각 1시간 이내로 단축하도록 최적화한 통합 노트북입니다.
- 모델: Qwen/Qwen3-VL-8B-Instruct (bf16, FlashAttention2 시도)
- 학습: LoRA(q/v), bf16, fused AdamW, 큰 배치, 빠른 DataLoader
- 추론: 배치 생성, 짧은 토큰 길이, 그리디 디코딩, KV 캐시


In [None]:
# Colab 전용 설치
import os
if "COLAB_RELEASE_TAG" in os.environ or "COLAB_GPU" in os.environ:
    !pip install -U --quiet "transformers>=4.46.0" "accelerate>=0.34.0" "peft>=0.12.0" "bitsandbytes>=0.43.3"
    !pip install -U --quiet qwen-vl-utils[decord]>=0.0.10 datasets pillow opencv-python pandas tqdm matplotlib seaborn python-dotenv
    try:
        !pip install -U --quiet flash-attn --no-build-isolation
    except Exception as e:
        print(f'[WARN] flash-attn install skipped: {e}')
else:
    print('[INFO] Non-Colab env detected. Skipping installs.')


In [None]:
# 환경/전역 설정
import os, sys, math, random, time, gc, warnings
from pathlib import Path
import numpy as np
import pandas as pd
from PIL import Image

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import (
    AutoProcessor, AutoModelForCausalLM, BitsAndBytesConfig,
    TrainingArguments, Trainer
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

warnings.filterwarnings('ignore')
# Lift Pillow's pixel-count limit so very large images can still be opened.
Image.MAX_IMAGE_PIXELS = None

# Seed every RNG used by the notebook; the seed can be overridden with the SEED env var.
SEED = int(os.environ.get('SEED', 42))
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

os.environ['TOKENIZERS_PARALLELISM'] = 'false'
torch.backends.cuda.matmul.allow_tf32 = True
# `torch.backends.cuda.sdp_kernel(...)` is a context manager: calling it without `with`
# changes nothing. The persistent global toggles are used instead. The math kernel is
# left enabled so attention calls the fused kernels cannot serve still have a fallback.
if hasattr(torch.backends, 'cuda') and hasattr(torch.backends.cuda, 'enable_flash_sdp'):
    torch.backends.cuda.enable_flash_sdp(True)
if hasattr(torch.backends, 'cuda') and hasattr(torch.backends.cuda, 'enable_mem_efficient_sdp'):
    torch.backends.cuda.enable_mem_efficient_sdp(True)
if torch.cuda.is_available():
    torch.backends.cudnn.benchmark = True

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device:', device)
if device.type == 'cuda':
    print('GPU:', torch.cuda.get_device_name(0))
    print('VRAM(GB):', round(torch.cuda.get_device_properties(0).total_memory/1e9, 2))


In [None]:
# Mount Google Drive (only when running inside Colab).
import os

if "COLAB_RELEASE_TAG" not in os.environ and "COLAB_GPU" not in os.environ:
    print('[INFO] Non-Colab env detected. Skipping drive mount.')
else:
    # Best effort: a failed mount is reported but does not stop the notebook.
    try:
        from google.colab import drive
        drive.mount('/content/drive')
        print('[OK] Google Drive mounted.')
    except Exception as mount_err:
        print('[WARN] Drive mount skipped:', mount_err)


In [None]:
# Paths and settings. Every value below can be overridden through the environment
# variable of the same name (EPOCHS for TRAIN_EPOCHS, USE_4BIT for USE_4BIT_DEFAULT).
PROJECT_ROOT = Path(os.getenv('PROJECT_ROOT', '/content/drive/MyDrive/Colab Notebooks'))
# Default data location: /content/drive/MyDrive/Colab Notebooks/data
DATA_DIR = Path(os.getenv('DATA_DIR', '/content/drive/MyDrive/Colab Notebooks/data'))
# Defaults to DATA_DIR so the relative image paths stored in the CSVs (train/xxx.jpg, ...)
# can be joined onto it unchanged.
IMAGE_DIR = Path(os.getenv('IMAGE_DIR', DATA_DIR))
TRAIN_CSV = Path(os.getenv('TRAIN_CSV', DATA_DIR.joinpath('train.csv')))
TEST_CSV = Path(os.getenv('TEST_CSV', DATA_DIR.joinpath('test.csv')))

# Output directory for checkpoints, the LoRA adapter and the submission file.
OUT_DIR = Path(os.getenv('OUT_DIR', PROJECT_ROOT.joinpath('runs', 'qwen3vl_colab')))
OUT_DIR.mkdir(parents=True, exist_ok=True)

MODEL_ID = os.getenv('MODEL_ID', 'Qwen/Qwen3-VL-8B-Instruct')
USE_4BIT_DEFAULT = os.getenv('USE_4BIT', 'false').lower() == 'true'

# LoRA hyper-parameters.
LORA_R = int(os.getenv('LORA_R', 8))
LORA_ALPHA = int(os.getenv('LORA_ALPHA', 16))
LORA_DROPOUT = float(os.getenv('LORA_DROPOUT', 0.05))

# Input sizing and schedule.
IMAGE_SIZE = int(os.getenv('IMAGE_SIZE', 384))
MAX_SEQ_LEN = int(os.getenv('MAX_SEQ_LEN', 768))
TRAIN_EPOCHS = int(os.getenv('EPOCHS', 1))

# Full-precision (bf16/fp16) loading is only preferred when a CUDA device is in use.
PREFER_FULL_BF16 = device.type == 'cuda'
print('MODEL_ID =', MODEL_ID)


In [None]:
# VQA 데이터셋 (이미지 + 텍스트)
from typing import Dict, Any

class VqaDataset(Dataset):
    """Map-style dataset that turns one multiple-choice VQA row into processor tensors.

    Each row must provide an ``image`` path (relative to ``image_root``); ``question`` and
    the options ``A``-``D`` are read with empty-string defaults.

    In training mode, when the frame has an ``answer_col`` column, the answer (followed by
    the tokenizer's EOS token) is appended after the generation prompt and a ``labels``
    tensor is returned in which only those trailing answer tokens are supervised.  Without
    that column, or with ``train=False``, no ``labels`` key is produced.

    Args:
        df: Source rows; the index is reset.
        image_root: Directory that the ``image`` column is relative to.
        processor: Multimodal processor exposing ``apply_chat_template`` and ``__call__``.
        max_len: ``max_length`` passed to the processor (with ``truncation=True``).
        image_size: Target length in pixels of the image's shortest edge.
        train: Whether to append the answer and build ``labels``.
        answer_col: Name of the column holding the correct option letter.
    """

    # Label value excluded from the loss.
    IGNORE_INDEX = -100
    # Keys assumed to be packed along dim 0 (patches / images) rather than batched, as
    # Qwen-VL processors do — NOTE(review): confirm against the processor in use.
    _PACKED_VISION_KEYS = ('pixel_values', 'image_grid_thw', 'pixel_values_videos', 'video_grid_thw')

    def __init__(self, df: pd.DataFrame, image_root: Path, processor: "AutoProcessor", max_len: int = 768, image_size: int = 384, train: bool = True, answer_col: str = 'answer'):
        self.df = df.reset_index(drop=True)
        self.image_root = Path(image_root)
        self.processor = processor
        self.max_len = max_len
        self.image_size = image_size
        self.train = train
        self.answer_col = answer_col

    def __len__(self) -> int:
        return len(self.df)

    def _build_messages(self, row: pd.Series) -> Any:
        """Return the [system, user] chat messages for one row (text prompt, then image)."""
        question = row.get('question', '')
        options = ' '.join(f'{letter}) {row.get(letter, "")}' for letter in 'ABCD')
        text_prompt = (
            'Question: ' + str(question) + '\n' +
            'Choices: ' + options + '\n' +
            'Answer with only one letter (A/B/C/D).'
        )
        sys_msg = {'role': 'system', 'content': 'You are a helpful vision-language assistant for multiple-choice VQA.'}
        user_msg = {
            'role': 'user',
            'content': [
                {'type': 'text', 'text': text_prompt},
                {'type': 'image'}
            ]
        }
        return [sys_msg, user_msg]

    @staticmethod
    def _resize_shortest_edge(image, target: int):
        """Scale ``image`` (keeping aspect ratio) so that its shortest edge is ``target`` px."""
        width, height = image.size
        shortest = min(width, height)
        if target <= 0 or shortest <= 0 or shortest == target:
            return image
        scale = target / shortest
        new_size = (max(1, round(width * scale)), max(1, round(height * scale)))
        return image.resize(new_size, Image.BICUBIC)

    @staticmethod
    def _answer_only_labels(input_ids: torch.Tensor, answer_ids, ignore_index: int = -100) -> torch.Tensor:
        """Return labels that supervise only the trailing ``answer_ids`` of ``input_ids``.

        If the sequence does not end with ``answer_ids`` (for example because truncation
        removed the answer), every position is set to ``ignore_index`` so the sample adds
        no loss instead of supervising the wrong tokens.
        """
        labels = torch.full_like(input_ids, ignore_index)
        count = len(answer_ids)
        if count and input_ids.shape[0] >= count and input_ids[-count:].tolist() == list(answer_ids):
            labels[-count:] = input_ids[-count:]
        return labels

    def _answer_suffix(self, row: pd.Series):
        """Return ``(text, token_ids)`` of the answer continuation, or ``('', [])`` if absent."""
        answer = row[self.answer_col]
        if pd.isna(answer) or not str(answer).strip():
            return '', []
        tokenizer = getattr(self.processor, 'tokenizer', self.processor)
        # The EOS token is appended so the model also learns to stop after the letter.
        text = str(answer).strip() + (getattr(tokenizer, 'eos_token', None) or '')
        token_ids = tokenizer(text, add_special_tokens=False)['input_ids']
        return text, list(token_ids)

    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        row = self.df.iloc[idx]
        img_path = self.image_root / str(row['image'])
        # Use a context manager so the underlying file handle is released right away.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert('RGB')
        # Resize here rather than through the processor's `size` argument, whose keys do
        # not denote an edge length for every image processor (assumption for Qwen-VL:
        # they are pixel budgets — TODO confirm).
        image = self._resize_shortest_edge(image, self.image_size)

        messages = self._build_messages(row)
        chat_text = self.processor.apply_chat_template(messages, add_generation_prompt=True)

        supervise = self.train and self.answer_col in row.index
        suffix_text, suffix_ids = self._answer_suffix(row) if supervise else ('', [])

        enc = self.processor(
            text=[chat_text + suffix_text],
            images=[image],
            padding=True, truncation=True, max_length=self.max_len, return_tensors='pt'
        )
        item = {}
        for key, value in enc.items():
            # Token-level tensors carry a leading batch axis of size 1 that is dropped.
            # Packed vision tensors (at most 2-D) have no batch axis and are kept whole.
            keep_whole = key in self._PACKED_VISION_KEYS and value.dim() <= 2
            item[key] = value if keep_whole else value[0]
        if supervise:
            item['labels'] = self._answer_only_labels(item['input_ids'], suffix_ids, self.IGNORE_INDEX)
        return item


In [None]:
# Load model/processor: prefer full bf16 weights, fall back to 4-bit QLoRA on failure.
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
# Pad on the left when the processor exposes a tokenizer — presumably so that batched
# generation appends new tokens directly after each prompt (see the inference cell).
if hasattr(processor, 'tokenizer'):
    processor.tokenizer.padding_side = 'left'

def load_model(prefer_full_bf16: bool = True, use_4bit_default: bool = False):
    """Load ``MODEL_ID`` and report whether it was loaded in 4-bit.

    The full-precision (bf16, or fp16 when bf16 is unsupported) load is tried first when
    ``prefer_full_bf16`` is true and ``use_4bit_default`` is false; if it fails, or is
    not requested, the model is loaded with 4-bit NF4 quantization instead.

    Args:
        prefer_full_bf16: Try the un-quantized load first.
        use_4bit_default: Skip the un-quantized attempt and go straight to 4-bit.

    Returns:
        ``(model, is_4bit)`` where ``is_4bit`` is True for the quantized load.
    """
    import importlib.util

    # A vision-language checkpoint needs the image-text-to-text auto class; the causal-LM
    # auto class does not build the vision tower. Fall back only when the installed
    # transformers predates that class.
    try:
        from transformers import AutoModelForImageTextToText as model_cls
    except ImportError:
        model_cls = AutoModelForCausalLM

    compute_dtype = torch.bfloat16 if (torch.cuda.is_available() and torch.cuda.is_bf16_supported()) else torch.float16
    # FlashAttention-2 is only requested when the package is importable on a CUDA machine.
    want_flash_attn = torch.cuda.is_available() and importlib.util.find_spec('flash_attn') is not None

    def _from_pretrained(**load_kwargs):
        """Load with FlashAttention-2 when usable, else with the default attention."""
        if want_flash_attn:
            try:
                return model_cls.from_pretrained(
                    MODEL_ID, device_map='auto', trust_remote_code=True,
                    attn_implementation='flash_attention_2', **load_kwargs
                )
            except (ImportError, ValueError) as err:
                print('[WARN] flash_attention_2 unavailable, using default attention:', err)
        return model_cls.from_pretrained(MODEL_ID, device_map='auto', trust_remote_code=True, **load_kwargs)

    if prefer_full_bf16 and not use_4bit_default:
        try:
            model = _from_pretrained(torch_dtype=compute_dtype, low_cpu_mem_usage=True)
            print('[OK] Loaded full model (bf16/fp16).')
            return model, False
        except Exception as e:
            # Deliberate best effort: any failure (e.g. out of memory) drops to 4-bit.
            print('[WARN] Full model load failed, fallback to 4bit:', e)
    bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type='nf4', bnb_4bit_compute_dtype=compute_dtype)
    model = _from_pretrained(quantization_config=bnb_config)
    print('[OK] Loaded 4bit model.')
    return model, True

# Load the backbone and prepare it for adapter training.
model, is_4bit = load_model(PREFER_FULL_BF16, USE_4BIT_DEFAULT)
if is_4bit:
    model = prepare_model_for_kbit_training(model)
if hasattr(model, 'enable_input_require_grads'):
    model.enable_input_require_grads()
model.config.use_cache = False

# Attach LoRA adapters to the query/value projections only.
target_modules = ['q_proj', 'v_proj']
lora_cfg = LoraConfig(
    task_type='CAUSAL_LM',
    target_modules=target_modules,
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias='none',
)
model = get_peft_model(model, lora_cfg)

# Fill in missing pad/eos ids on the generation config from the tokenizer.
if hasattr(processor, 'tokenizer') and getattr(model, 'generation_config', None):
    if model.generation_config.pad_token_id is None:
        model.generation_config.pad_token_id = processor.tokenizer.pad_token_id
    if model.generation_config.eos_token_id is None:
        model.generation_config.eos_token_id = processor.tokenizer.eos_token_id
print('Trainable params (LoRA) ready.')

# Optional torch.compile, enabled unless USE_COMPILE is set to something other than 'true'.
# NOTE(review): torch.compile presumably defers compilation to the first forward call, so
# this try/except may only guard creation of the wrapper — confirm.
USE_COMPILE = os.getenv('USE_COMPILE', 'true').lower() == 'true'
if USE_COMPILE and hasattr(torch, 'compile') and device.type == 'cuda':
    try:
        model = torch.compile(model, mode='max-autotune')
        print('[OK] torch.compile enabled')
    except Exception as compile_err:
        print('[WARN] torch.compile disabled:', compile_err)


In [None]:
# Load the training CSV, hold out 5% for validation and wrap both splits in datasets.
assert TRAIN_CSV.exists(), f'TRAIN_CSV not found: {TRAIN_CSV}'
train_df = pd.read_csv(TRAIN_CSV)
from sklearn.model_selection import train_test_split
tr_df, val_df = train_test_split(
    train_df,
    test_size=0.05,
    shuffle=True,
    random_state=SEED,
)

train_ds = VqaDataset(
    tr_df, IMAGE_DIR, processor,
    max_len=MAX_SEQ_LEN, image_size=IMAGE_SIZE, train=True,
)
val_ds = VqaDataset(
    val_df, IMAGE_DIR, processor,
    max_len=MAX_SEQ_LEN, image_size=IMAGE_SIZE, train=False,
)

# Between 2 and 8 loader workers, depending on the reported CPU count.
cpu_workers = max(2, min(os.cpu_count() or 8, 8))
# Default batch sizes are larger when the model was loaded in 4-bit.
BATCH_TRAIN = int(os.getenv('BATCH_TRAIN', 12 if is_4bit else 8))
BATCH_EVAL = int(os.getenv('BATCH_EVAL', 16 if is_4bit else 12))

def collate_features(features, pad_token_id, padding_side='left', ignore_index=-100):
    """Merge per-sample feature dicts into one batch dict using plain torch ops.

    Token-level tensors (``input_ids``, ``attention_mask``, ``labels``, ``token_type_ids``,
    ``mm_token_type_ids``) are padded to the longest ``input_ids`` in the batch on
    ``padding_side``.  Vision tensors that are at most 2-D are concatenated along dim 0
    (1-D ones are first promoted to a single row); any other tensor is stacked.

    Args:
        features: Non-empty list of dicts; each must contain a 1-D ``input_ids``.
        pad_token_id: Fill value for ``input_ids``.
        padding_side: ``'left'`` pads in front, anything else pads at the end.
        ignore_index: Fill value for ``labels``.

    Returns:
        A dict that always contains ``input_ids``, ``attention_mask`` and ``labels``.
        When no sample provides labels, ``labels`` is a copy of ``input_ids`` with padded
        positions set to ``ignore_index``; when only some do, the others are fully ignored.

    Raises:
        ValueError: If ``features`` is empty.
    """
    import torch as _torch

    if not features:
        raise ValueError('data collator received an empty batch')

    width = max(int(_torch.as_tensor(f['input_ids']).shape[0]) for f in features)
    pad_left = padding_side == 'left'

    def _fit(seq, fill):
        """Pad a 1-D tensor with ``fill`` up to the batch width."""
        seq = _torch.as_tensor(seq)
        missing = width - int(seq.shape[0])
        if missing <= 0:
            return seq
        filler = seq.new_full((missing,), fill)
        return _torch.cat([filler, seq] if pad_left else [seq, filler])

    any_labels = any('labels' in f for f in features)
    ids, mask, labels = [], [], []
    for f in features:
        sample_ids = _torch.as_tensor(f['input_ids'])
        sample_mask = _torch.as_tensor(f['attention_mask']) if 'attention_mask' in f else _torch.ones_like(sample_ids)
        ids.append(_fit(sample_ids, pad_token_id))
        mask.append(_fit(sample_mask, 0))
        if any_labels:
            sample_labels = _torch.as_tensor(f['labels']) if 'labels' in f else _torch.full_like(sample_ids, ignore_index)
            labels.append(_fit(sample_labels, ignore_index))

    batch = {'input_ids': _torch.stack(ids), 'attention_mask': _torch.stack(mask)}
    if any_labels:
        batch['labels'] = _torch.stack(labels)
    else:
        # Fallback kept from the original collator, but without training on padding.
        fallback = batch['input_ids'].clone()
        fallback[batch['attention_mask'] == 0] = ignore_index
        batch['labels'] = fallback

    handled = {'input_ids', 'attention_mask', 'labels'}
    padded_keys = ('token_type_ids', 'mm_token_type_ids')
    packed_keys = ('pixel_values', 'image_grid_thw', 'pixel_values_videos', 'video_grid_thw')
    extra_keys = []
    for f in features:
        for key in f:
            if key not in handled and key not in extra_keys:
                extra_keys.append(key)

    for key in extra_keys:
        values = [f[key] for f in features if key in f]
        if not all(isinstance(v, _torch.Tensor) for v in values):
            batch[key] = values
        elif key in padded_keys:
            batch[key] = _torch.stack([_fit(v, 0) for v in values])
        elif key in packed_keys and all(v.dim() <= 2 for v in values):
            # Packed layout: samples contribute a variable number of rows, so join on dim 0.
            batch[key] = _torch.cat([v if v.dim() == 2 else v.reshape(1, -1) for v in values], dim=0)
        else:
            batch[key] = _torch.stack(values)
    return batch


def data_collator(features):
    """Collate dataset items with the pad id and padding side of the global ``processor``."""
    tokenizer = getattr(processor, 'tokenizer', processor)
    pad_id = getattr(tokenizer, 'pad_token_id', None)
    if pad_id is None:
        pad_id = getattr(tokenizer, 'eos_token_id', None)
    if pad_id is None:
        pad_id = 0
    return collate_features(features, pad_id, getattr(tokenizer, 'padding_side', 'right'))

# Loaders for manual iteration. `collate_fn` is required: the default collation stacks
# tensors and fails as soon as two samples have different sequence lengths.
train_loader = DataLoader(train_ds, batch_size=BATCH_TRAIN, shuffle=True, num_workers=cpu_workers, pin_memory=True, persistent_workers=True, prefetch_factor=4, collate_fn=data_collator)
eval_loader  = DataLoader(val_ds, batch_size=BATCH_EVAL, shuffle=False, num_workers=cpu_workers, pin_memory=True, persistent_workers=True, prefetch_factor=4, collate_fn=data_collator)
print('BATCH_TRAIN =', BATCH_TRAIN, 'BATCH_EVAL =', BATCH_EVAL, 'workers =', cpu_workers)


In [None]:
# Trainer configuration, training run and LoRA adapter export.
optim_name = 'adamw_torch_fused' if (torch.cuda.is_available()) else 'adamw_torch'
# The evaluation-schedule argument was renamed from `evaluation_strategy` to
# `eval_strategy`; use whichever name the installed transformers defines so that
# constructing TrainingArguments does not fail on an unknown keyword.
_eval_strategy_key = (
    'eval_strategy'
    if 'eval_strategy' in getattr(TrainingArguments, '__dataclass_fields__', {})
    else 'evaluation_strategy'
)
args = TrainingArguments(
    output_dir=str(OUT_DIR / 'checkpoints'),
    per_device_train_batch_size=BATCH_TRAIN,
    per_device_eval_batch_size=BATCH_EVAL,
    num_train_epochs=TRAIN_EPOCHS,
    gradient_accumulation_steps=1,
    learning_rate=2e-4,
    lr_scheduler_type='cosine',
    warmup_ratio=0.03,
    bf16=(torch.cuda.is_available() and torch.cuda.is_bf16_supported()),
    fp16=False,
    logging_steps=20,
    eval_steps=200,
    save_strategy='no', report_to='none',
    optim=optim_name,
    dataloader_num_workers=(os.cpu_count() or 8),
    dataloader_pin_memory=True,
    gradient_checkpointing=False,
    # The dataset returns ready-made tensors, so no column may be dropped by the Trainer.
    remove_unused_columns=False,
    **{_eval_strategy_key: 'steps'}
)

trainer = Trainer(
    model=model, args=args,
    train_dataset=train_ds, eval_dataset=val_ds,
    data_collator=data_collator
)
t0 = time.time()
train_out = trainer.train()
t1 = time.time()
print('Train time (min):', round((t1-t0)/60,2))

# Saving the adapter is best effort: a failure is reported without aborting the notebook.
(OUT_DIR / 'lora_adapter').mkdir(parents=True, exist_ok=True)
try:
    model.save_pretrained(OUT_DIR / 'lora_adapter')
except Exception as e:
    print('[WARN] save_pretrained failed:', e)


In [None]:
# Inference: batched generation with greedy decoding, then submission export.
import re

assert TEST_CSV.exists(), f'TEST_CSV not found: {TEST_CSV}'
test_df = pd.read_csv(TEST_CSV)
inf_ds = VqaDataset(test_df, IMAGE_DIR, processor, max_len=MAX_SEQ_LEN, image_size=IMAGE_SIZE, train=False)
# `collate_fn` is required: default collation cannot stack prompts of different lengths.
inf_loader = DataLoader(inf_ds, batch_size=BATCH_EVAL, shuffle=False, num_workers=max(2, (os.cpu_count() or 8)//2), pin_memory=True, persistent_workers=True, prefetch_factor=4, collate_fn=data_collator)

# Greedy decoding; sampling-only options (temperature/top_p) are ignored when
# do_sample=False, so they are not passed.
gen_cfg = dict(max_new_tokens=6, do_sample=False, use_cache=True)

_CHOICE_RE = re.compile(r'\b([ABCD])\b')


def extract_choice(text, default='A'):
    """Return the first standalone option letter (A-D) in ``text``, else ``default``.

    Handles answers such as ``"B"``, ``"B."``, ``"(C)"`` or ``"Answer: D"``; a lone
    lower-case letter at the start of the text is accepted as well.
    """
    stripped = str(text).strip()
    match = _CHOICE_RE.search(stripped)
    if match:
        return match.group(1)
    lowered = re.match(r'[^A-Za-z]*([abcd])\b', stripped)
    return lowered.group(1).upper() if lowered else default


model.eval()
preds = []
t0 = time.time()
with torch.inference_mode():
    for batch in inf_loader:
        # `labels` is a training-only key and must not be forwarded to generate().
        batch = {k: v.to(device) for k, v in batch.items() if k != 'labels' and hasattr(v, 'to')}
        prompt_len = batch['input_ids'].shape[1]
        out = model.generate(**batch, **gen_cfg)
        # generate() returns prompt + continuation; decode only the continuation so option
        # letters that occur in the prompt cannot be mistaken for the answer.
        texts = processor.batch_decode(out[:, prompt_len:], skip_special_tokens=True)
        preds.extend(extract_choice(txt) for txt in texts)
t1 = time.time()
print('Inference time (min):', round((t1-t0)/60,2))

sub = pd.DataFrame({'id': test_df.get('id', pd.RangeIndex(len(test_df))), 'answer': preds[:len(test_df)]})
sub_path = OUT_DIR / 'submission.csv'
sub.to_csv(sub_path, index=False)
print('Saved:', sub_path)
