# 📒 Kaggle_AllInOne_Pro2.ipynb – 고급 최적화 버전

## 🎯 Pro2 주요 개선사항

### ✅ 학습 개선
- Val Accuracy + Confusion Matrix 로깅
- Best 모델: Val Acc 우선 저장
- 학습 곡선 시각화
- 라벨 마스킹 (프롬프트 손실 제외)
- 검증 데이터 train=False

### ✅ 추론 개선
- Direct Logits (a/b/c/d 토큰 확률)
- TTA: 입력 이미지를 [0.9, 1.0, 1.1] 배율로 리사이즈해 각각 추론한 뒤 logits 평균
- 배치 추론
- pad_token_id 자동 보정

### ✅ 앙상블 개선
- Temperature Scaling
- 확률 앙상블
- 확률 컬럼 저장

### ⚙️ 튜닝 설정
```
USE_SAMPLE=False, IMAGE_SIZE=512, NUM_EPOCHS=3
GRAD_ACCUM_STEPS=8, WARMUP_RATIO=0.06, LORA_R=16
USE_DIRECT_LOGIT_DECODE=True, TTA_SCALES=[0.9,1.0,1.1]
ENSEMBLE_METHOD='prob'
```

**🤖 SSAFY AI Project 2025**

## 📦 1. 패키지 설치

In [None]:
# Dependency installation. The pip commands below are commented out; uncomment
# them on a fresh runtime before running the rest of the notebook.
# !pip install -q transformers accelerate peft bitsandbytes datasets pillow pandas torch torchvision scikit-learn matplotlib seaborn tqdm --upgrade
# !pip install -q qwen-vl-utils==0.0.8
# Prints "installation complete, please restart the runtime" (in Korean).
print("✅ 설치 완료! 런타임 재시작하세요.")

## 📚 2. 라이브러리 임포트

In [None]:
import os, sys, re, math, random, warnings, json, pickle
import numpy as np
import pandas as pd
from PIL import Image
from pathlib import Path
from dataclasses import dataclass
from typing import Dict, List, Any, Optional, Tuple
from collections import Counter, defaultdict
import unicodedata

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim.swa_utils import AveragedModel, SWALR

from transformers import (
    AutoModelForVision2Seq,
    Qwen2_5_VLForConditionalGeneration,
    AutoProcessor,
    BitsAndBytesConfig,
    get_cosine_schedule_with_warmup,
    get_linear_schedule_with_warmup
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from qwen_vl_utils import process_vision_info

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

# Notebook-wide setup: silence warnings, lift PIL's pixel-count limit,
# pick a plotting style and select the compute device.
warnings.filterwarnings('ignore')
Image.MAX_IMAGE_PIXELS = None
sns.set_style('whitegrid')

_cuda_ok = torch.cuda.is_available()
device = torch.device("cuda") if _cuda_ok else torch.device("cpu")
print(f"🔧 Device: {device}")
if _cuda_ok:
    # Report the first visible GPU and its total memory in GB (1e9 bytes).
    _gpu_props = torch.cuda.get_device_properties(0)
    print(f"   GPU: {torch.cuda.get_device_name(0)}")
    print(f"   Memory: {_gpu_props.total_memory / 1e9:.2f} GB")

## ⚙️ 3. Config 설정 (Pro2 튜닝)

In [None]:
class Config:
    """Central hyper-parameter and path registry for the notebook.

    All values are plain class attributes; ``cfg`` below is the instance
    the rest of the notebook reads from.
    """

    # --- Reproducibility ---------------------------------------------
    SEED = 42

    # --- Model -------------------------------------------------------
    MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
    IMAGE_SIZE = 512  # Pro2: higher resolution
    USE_ADVANCED_MODEL = False  # True selects the Qwen2_5_VL class (check VRAM)

    # --- Data --------------------------------------------------------
    DATA_DIR = "/content"
    TRAIN_CSV = DATA_DIR + "/train.csv"
    TEST_CSV = DATA_DIR + "/test.csv"

    # --- K-Fold ------------------------------------------------------
    N_FOLDS = 3
    USE_KFOLD = True
    TRAIN_FOLDS = [0, 1, 2]

    # --- QLoRA -------------------------------------------------------
    LORA_R = 16  # Pro2: larger adapter rank
    LORA_ALPHA = 32
    LORA_DROPOUT = 0.05
    TARGET_MODULES = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ]

    # --- Optimisation ------------------------------------------------
    NUM_EPOCHS = 3  # Pro2
    BATCH_SIZE = 1
    GRAD_ACCUM_STEPS = 8  # Pro2
    LEARNING_RATE = 1e-4
    WEIGHT_DECAY = 0.01
    WARMUP_RATIO = 0.06  # Pro2
    MAX_GRAD_NORM = 1.0

    # --- Advanced training tricks ------------------------------------
    USE_AMP = True
    USE_EMA = True
    EMA_DECAY = 0.999
    USE_SWA = True  # Pro2: active from SWA_START_EPOCH onwards
    SWA_START_EPOCH = 1
    USE_COSINE_SCHEDULE = True

    # --- Test-time augmentation --------------------------------------
    USE_TTA = True  # Pro2
    TTA_SCALES = [0.9, 1.0, 1.1]  # Pro2

    # --- Inference ---------------------------------------------------
    USE_DIRECT_LOGIT_DECODE = True  # Pro2: read a/b/c/d logits directly
    USE_BATCH_INFERENCE = False  # set True when memory allows
    INFER_BATCH_SIZE = 4
    MAX_NEW_TOKENS = 8

    # --- Temperature scaling -----------------------------------------
    USE_TEMPERATURE_SCALING = True  # Pro2

    # --- Ensemble ----------------------------------------------------
    ENSEMBLE_METHOD = "prob"  # Pro2: "prob" or "vote"

    # --- Output locations --------------------------------------------
    SAVE_DIR = DATA_DIR + "/checkpoints"
    OUTPUT_DIR = DATA_DIR + "/outputs"
    LOG_DIR = DATA_DIR + "/logs"

    # --- Sub-sampling ------------------------------------------------
    USE_SAMPLE = False  # Pro2: use the full training set
    SAMPLE_SIZE = 200

    # --- Prompt ------------------------------------------------------
    SYSTEM_INSTRUCT = (
        "You are a helpful visual question answering assistant. "
        "Answer using exactly one letter among a, b, c, or d. No explanation."
    )


cfg = Config()

def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch (CPU + all CUDA devices) RNGs.

    Also switches cuDNN to deterministic mode and disables its autotuner.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Seed every RNG, then echo the effective configuration.
set_seed(cfg.SEED)

_config_summary = (
    "✅ Config 설정 완료",
    f"   Model: {cfg.MODEL_ID}",
    f"   Image Size: {cfg.IMAGE_SIZE}",
    f"   Epochs: {cfg.NUM_EPOCHS}, Grad Accum: {cfg.GRAD_ACCUM_STEPS}",
    f"   LoRA R: {cfg.LORA_R}, Warmup: {cfg.WARMUP_RATIO}",
    f"   Direct Logits: {cfg.USE_DIRECT_LOGIT_DECODE}, TTA: {cfg.USE_TTA}",
    f"   Ensemble: {cfg.ENSEMBLE_METHOD}, Temp Scaling: {cfg.USE_TEMPERATURE_SCALING}",
)
for _line in _config_summary:
    print(_line)

## 📊 4. 데이터 로드 & EDA

In [None]:
# Load the train/test tables and show a quick exploratory summary.
train_df = pd.read_csv(cfg.TRAIN_CSV)
test_df = pd.read_csv(cfg.TEST_CSV)

for _split_name, _frame in (("Train", train_df), ("Test", test_df)):
    print(f"📁 {_split_name}: {len(_frame):,} samples")

if cfg.USE_SAMPLE:
    # Optional quick-run mode: keep a reproducible random subset.
    _n_keep = min(cfg.SAMPLE_SIZE, len(train_df))
    train_df = train_df.sample(n=_n_keep, random_state=cfg.SEED).reset_index(drop=True)
    print(f"⚠️  Sampled {len(train_df)} samples")

_answer_dist = train_df['answer'].value_counts().sort_index()
print("\n📊 Answer Distribution:")
print(_answer_dist)

fig, axes = plt.subplots(1, 2, figsize=(14, 4))
_ax_answers, _ax_lengths = axes[0], axes[1]

# Left panel: class balance of the answer column.
_answer_dist.plot(kind='bar', ax=_ax_answers, color='skyblue')
_ax_answers.set_title('Answer Distribution')
_ax_answers.set_xlabel('Answer')
_ax_answers.set_ylabel('Count')

# Right panel: character length of each question (stored as a new column).
train_df['question_len'] = train_df['question'].str.len()
train_df['question_len'].hist(bins=30, ax=_ax_lengths, color='salmon')
_ax_lengths.set_title('Question Length')

plt.tight_layout()
plt.show()

## 🔄 5. Stratified K-Fold CV

In [None]:
# Tag every training row with the fold in which it serves as validation data.
# -1 means "never used for validation".
train_df['fold'] = -1

if cfg.USE_KFOLD:
    skf = StratifiedKFold(n_splits=cfg.N_FOLDS, shuffle=True, random_state=cfg.SEED)
    fold_splits = skf.split(train_df, train_df['answer'])
    for fold, (train_idx, val_idx) in enumerate(fold_splits):
        train_df.loc[val_idx, 'fold'] = fold
    print(f"✅ {cfg.N_FOLDS}-Fold CV 생성")
    print(train_df['fold'].value_counts().sort_index())
else:
    # Single hold-out: rows from the 90% mark onwards become fold 0.
    split_idx = int(len(train_df) * 0.9)
    train_df.loc[split_idx:, 'fold'] = 0
    print("✅ Single split (90:10)")

## 🗂️ 6. Dataset & DataCollator

✅ **라벨 마스킹**: 프롬프트 토큰 손실 제외, assistant 정답 토큰만 감독

In [None]:
def build_mc_prompt(question, a, b, c, d):
    """Render a four-option multiple-choice prompt.

    The result is the question, one ``(letter) text`` line per option, a
    blank line, and a fixed Korean instruction asking for a single
    lowercase letter among a, b, c, d.
    """
    option_lines = [
        f"({letter}) {text}" for letter, text in zip("abcd", (a, b, c, d))
    ]
    instruction = "정답을 반드시 a, b, c, d 중 하나의 소문자 한 글자로만 출력하세요."
    return f"{question}\n" + "\n".join(option_lines) + "\n\n" + instruction

class VQADataset(Dataset):
    """Map-style dataset producing chat-formatted VQA samples.

    Each item is a dict with:
      * ``messages`` - system + user (image + prompt) turns, plus an
        assistant turn holding the answer when ``train`` is True;
      * ``image``    - the RGB ``PIL.Image`` referenced by the row;
      * ``answer``   - the normalised answer string, or ``None`` when
        ``train`` is False.

    Args:
        df: DataFrame with ``question``, ``a``..``d`` columns, an image
            path column (``path`` preferred, otherwise ``image``) and,
            for training, ``answer``. The index is reset internally.
        processor: Stored on the instance; not used by this class itself.
        data_dir: Directory prepended to every image path.
        train: When True the ground-truth answer is appended as the
            assistant turn and returned under ``answer``.
        use_advanced: Stored on the instance; not used by this class itself.
    """

    # Cap on how many image-load failures are reported per dataset instance,
    # so a broken data directory does not flood the notebook output.
    _MAX_LOAD_WARNINGS = 5

    def __init__(self, df, processor, data_dir="", train=True, use_advanced=False):
        self.df = df.reset_index(drop=True)
        self.processor = processor
        self.data_dir = data_dir
        self.train = train
        self.use_advanced = use_advanced
        self._load_failures = 0

    def __len__(self):
        return len(self.df)

    def _load_image(self, img_path):
        """Open ``img_path`` as RGB; fall back to a blank white image.

        Only I/O and decoding errors trigger the (best-effort) fallback;
        anything else, e.g. KeyboardInterrupt, propagates. The first few
        failures are printed so missing data does not go unnoticed.
        """
        try:
            # The context manager closes the file handle; convert() returns
            # an independent in-memory image.
            with Image.open(img_path) as handle:
                return handle.convert("RGB")
        except (OSError, ValueError) as exc:
            self._load_failures += 1
            if self._load_failures <= self._MAX_LOAD_WARNINGS:
                print(f"⚠️  Could not load image {img_path!r} ({exc}); using a blank placeholder")
            return Image.new('RGB', (cfg.IMAGE_SIZE, cfg.IMAGE_SIZE), color='white')

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        # Prefer the 'path' column; fall back to 'image'.
        img_col = 'path' if 'path' in row else 'image'
        img = self._load_image(os.path.join(self.data_dir, row[img_col]))

        user_text = build_mc_prompt(
            str(row["question"]), str(row["a"]),
            str(row["b"]), str(row["c"]), str(row["d"])
        )

        messages = [
            {"role": "system", "content": [{"type": "text", "text": cfg.SYSTEM_INSTRUCT}]},
            {"role": "user", "content": [
                {"type": "image", "image": img},
                {"type": "text", "text": user_text}
            ]}
        ]

        # The ground-truth answer is only attached in training mode.
        answer = None
        if self.train:
            answer = str(row["answer"]).strip().lower()
            messages.append({
                "role": "assistant",
                "content": [{"type": "text", "text": answer}]
            })

        return {"messages": messages, "image": img, "answer": answer}

@dataclass
class DataCollator:
    """Turn a list of ``VQADataset`` samples into one processor batch.

    Attributes:
        processor: Object providing ``apply_chat_template``, a
            ``tokenizer.encode`` method and a ``__call__`` that tokenises
            text + images into tensors.
        train: When True a ``labels`` tensor is added to the batch.
        use_advanced: Stored only; not used by the collator itself.

    Label masking: every position is set to -100 except the tokens of the
    answer, which are located *inside* ``input_ids`` (last occurrence).
    The previous implementation wrote the answer ids onto the final
    positions of the row; those positions hold whatever the chat template
    renders after the answer (or padding), so the supervised targets did
    not line up with the answer token.
    """
    processor: Any
    train: bool = True
    use_advanced: bool = False

    @staticmethod
    def _find_last_subsequence(sequence, pattern):
        """Return the start index of the last ``pattern`` in ``sequence``, or -1."""
        seq_len, pat_len = len(sequence), len(pattern)
        if pat_len == 0 or pat_len > seq_len:
            return -1
        for start in range(seq_len - pat_len, -1, -1):
            if sequence[start:start + pat_len] == pattern:
                return start
        return -1

    def __call__(self, batch):
        texts, images, answers = [], [], []

        for sample in batch:
            # No generation prompt: in training mode the assistant turn is
            # already part of the messages.
            text = self.processor.apply_chat_template(
                sample["messages"],
                tokenize=False,
                add_generation_prompt=False
            )
            text = unicodedata.normalize('NFKC', text)
            texts.append(text)
            images.append(sample["image"])
            answers.append(sample["answer"])

        enc = self.processor(
            text=texts,
            images=images,
            padding=True,
            return_tensors="pt"
        )

        if self.train:
            input_ids = enc["input_ids"]
            # Start fully masked; only answer tokens get a real label.
            labels = torch.full_like(input_ids, -100)
            for i, answer in enumerate(answers):
                if answer is None:
                    continue
                answer_ids = list(self.processor.tokenizer.encode(answer, add_special_tokens=False))
                start = self._find_last_subsequence(input_ids[i].tolist(), answer_ids)
                if start < 0:
                    # Answer tokens not found verbatim (e.g. merged with
                    # neighbouring text by the tokenizer): leave the row
                    # fully masked rather than supervise a wrong position.
                    continue
                end = start + len(answer_ids)
                labels[i, start:end] = input_ids[i, start:end]
            enc["labels"] = labels

        return enc

print("✅ Dataset & DataCollator 정의 완료")

## 🤖 7. Model & Processor 로드

✅ T4 호환: Float16, SDPA attention, 4-bit QLoRA

In [None]:
def create_model_and_processor(model_id, use_advanced=False):
    """Load a 4-bit quantised base model, wrap it with LoRA and return it with its processor.

    Args:
        model_id: Identifier or path passed to ``from_pretrained``.
        use_advanced: When True, load through
            ``Qwen2_5_VLForConditionalGeneration`` with fp16 weights and
            SDPA attention; otherwise through ``AutoModelForVision2Seq``.

    Returns:
        ``(model, processor)`` with the PEFT-wrapped model moved to the
        global ``device``.
    """
    # NF4 4-bit quantisation with double quantisation and fp16 compute.
    quant_cfg = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )

    # min_pixels == max_pixels pins the processor's pixel budget.
    pixel_budget = cfg.IMAGE_SIZE * cfg.IMAGE_SIZE
    processor = AutoProcessor.from_pretrained(
        model_id,
        min_pixels=pixel_budget,
        max_pixels=pixel_budget,
        trust_remote_code=True,
    )

    common_kwargs = {"quantization_config": quant_cfg, "trust_remote_code": True}
    if not use_advanced:
        base_model = AutoModelForVision2Seq.from_pretrained(model_id, **common_kwargs)
    else:
        base_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
            model_id,
            torch_dtype=torch.float16,
            attn_implementation="sdpa",
            **common_kwargs,
        )

    base_model = prepare_model_for_kbit_training(base_model)
    base_model.gradient_checkpointing_enable()

    adapter_cfg = LoraConfig(
        r=cfg.LORA_R,
        lora_alpha=cfg.LORA_ALPHA,
        lora_dropout=cfg.LORA_DROPOUT,
        bias="none",
        target_modules=cfg.TARGET_MODULES,
        task_type="CAUSAL_LM",
    )

    model = get_peft_model(base_model, adapter_cfg)
    model.print_trainable_parameters()

    # Move to the single target device (used instead of a device_map).
    model = model.to(device)

    return model, processor

# Instantiate the trainable model and its processor once for the notebook.
print("🔧 모델 로드 중...")
model, processor = create_model_and_processor(
    cfg.MODEL_ID,
    cfg.USE_ADVANCED_MODEL,
)
print("✅ 모델 로드 완료")

## 🎓 8. Training Loop

✅ **Val Accuracy 로깅** + Confusion Matrix + 학습 곡선

In [None]:
class EMA:
    """Exponential moving average over a model's trainable parameters.

    ``shadow`` holds the averaged tensors keyed by parameter name;
    ``backup`` temporarily holds the live weights while the averaged ones
    are swapped in via ``apply_shadow`` / ``restore``.
    """

    def __init__(self, model, decay=0.999):
        self.model = model
        self.decay = decay
        self.shadow = {}
        self.backup = {}
        self.register()

    def _trainable(self):
        """Iterate over ``(name, param)`` pairs that require gradients."""
        return (
            (name, param)
            for name, param in self.model.named_parameters()
            if param.requires_grad
        )

    def register(self):
        """Snapshot the current trainable weights as the initial average."""
        for name, param in self._trainable():
            self.shadow[name] = param.data.clone()

    def update(self):
        """Blend the current weights into the running average."""
        keep = self.decay
        for name, param in self._trainable():
            blended = keep * self.shadow[name] + (1.0 - keep) * param.data
            self.shadow[name] = blended.clone()

    def apply_shadow(self):
        """Swap the averaged weights into the model, backing up the live ones."""
        for name, param in self._trainable():
            self.backup[name] = param.data.clone()
            param.data = self.shadow[name]

    def restore(self):
        """Put the backed-up live weights back and clear the backup."""
        for name, param in self._trainable():
            param.data = self.backup[name]
        self.backup = {}


def validate_with_accuracy(model, valid_loader, processor):
    """Compute validation loss, accuracy and confusion matrix.

    The loader must yield teacher-forced batches containing ``labels``
    (-100 on every position except the answer tokens). For each sample the
    prediction is the arg-max token of the logits at the position *before*
    the last supervised label: in a causal LM the logits at position t
    score the token at position t + 1, so reading the logits at the label
    position itself (as the previous code did) evaluates the wrong token.
    Under causal attention that earlier position does not see the answer.

    Args:
        model: Model whose forward returns ``loss`` and ``logits``.
        valid_loader: Iterable of batches (dict-like of tensors) with ``labels``.
        processor: Provides ``tokenizer.decode`` to map token ids to text.

    Returns:
        ``(avg_loss, accuracy, confusion_matrix, preds, labels)``.
        Predictions that do not decode to a/b/c/d are recorded as
        ``"invalid"`` so they always count as errors; they are ignored by
        the 4x4 confusion matrix. Labels that do not decode to a/b/c/d are
        mapped to ``'a'`` (unchanged behaviour).

    Raises:
        KeyError: If a batch has no ``labels`` entry.
    """
    choices = ['a', 'b', 'c', 'd']
    total_loss = 0.0
    all_preds = []
    all_labels = []

    model.eval()
    try:
        with torch.no_grad():
            for batch in tqdm(valid_loader, desc="Validating", leave=False):
                if "labels" not in batch:
                    # Without labels the model returns no loss and there is
                    # no supervised position to score.
                    raise KeyError(
                        "validate_with_accuracy needs batches with 'labels'; "
                        "build the validation DataCollator with train=True"
                    )
                batch = {k: v.to(device) for k, v in batch.items()}

                with torch.amp.autocast('cuda', enabled=cfg.USE_AMP, dtype=torch.float16):
                    outputs = model(**batch)
                total_loss += outputs.loss.item()

                logits = outputs.logits
                labels = batch["labels"]

                for i in range(len(labels)):
                    supervised = (labels[i] != -100).nonzero(as_tuple=True)[0]
                    if supervised.numel() == 0:
                        continue
                    target_idx = supervised[-1].item()
                    if target_idx == 0:
                        # No preceding position whose logits could predict it.
                        continue

                    # Logits at t-1 are the distribution over the token at t.
                    pred_id = logits[i, target_idx - 1].argmax().item()
                    label_id = labels[i, target_idx].item()

                    pred_char = processor.tokenizer.decode([pred_id]).strip().lower()
                    label_char = processor.tokenizer.decode([label_id]).strip().lower()

                    all_preds.append(pred_char if pred_char in choices else 'invalid')
                    all_labels.append(label_char if label_char in choices else 'a')
    finally:
        # Always hand the model back in training mode, even on error.
        model.train()

    num_batches = len(valid_loader)
    avg_loss = total_loss / num_batches if num_batches else 0.0
    if all_labels:
        accuracy = accuracy_score(all_labels, all_preds)
        cm = confusion_matrix(all_labels, all_preds, labels=choices)
    else:
        # Nothing was scored (empty loader or no supervised tokens).
        accuracy = 0.0
        cm = np.zeros((len(choices), len(choices)), dtype=int)

    return avg_loss, accuracy, cm, all_preds, all_labels


def train_one_fold(model, train_loader, valid_loader, fold=0):
    """Train ``model`` for one fold and checkpoint the best epoch.

    Uses AdamW with warmup + cosine/linear decay, fp16 autocast with a
    GradScaler, gradient accumulation, optional EMA (used for validation
    and for the saved "best" checkpoint) and optional SWA.

    Fixes relative to the earlier version:
      * The last, possibly partial, accumulation window of every epoch is
        stepped, so no gradients leak into the next epoch and the number
        of optimizer steps matches ``num_training_steps`` (which uses
        ``ceil``). Gradients are also cleared at the start of each epoch.
      * The progress-bar loss is the mean over the accumulation window
        (it used to be divided by ``GRAD_ACCUM_STEPS`` twice) and the
        displayed LR is read from the optimizer, so it is also correct
        while the SWA scheduler is driving it.
      * ``history["train_loss"]`` is filled and printed per epoch.
      * The SWA checkpoint is only written when at least one epoch was
        averaged.

    Relies on the module-level ``cfg``, ``device`` and ``processor``.

    Args:
        model: Trainable model (its forward must return ``loss``).
        train_loader: Sized iterable of training batches with ``labels``.
        valid_loader: Validation batches, passed to ``validate_with_accuracy``.
        fold: Fold index, used in log messages and output paths.

    Returns:
        ``(best_val_acc, best_val_loss)`` of the saved best epoch.
    """
    print(f"\n{'='*60}")
    print(f"Training Fold {fold}")
    print(f"{'='*60}")

    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=cfg.LEARNING_RATE,
        weight_decay=cfg.WEIGHT_DECAY
    )

    num_batches = len(train_loader)
    num_training_steps = cfg.NUM_EPOCHS * math.ceil(num_batches / cfg.GRAD_ACCUM_STEPS)
    num_warmup_steps = int(num_training_steps * cfg.WARMUP_RATIO)

    if cfg.USE_COSINE_SCHEDULE:
        scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps)
    else:
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps)

    scaler = torch.amp.GradScaler('cuda', enabled=cfg.USE_AMP)
    ema = EMA(model, decay=cfg.EMA_DECAY) if cfg.USE_EMA else None

    swa_model = None
    swa_scheduler = None
    if cfg.USE_SWA:
        swa_model = AveragedModel(model)
        swa_scheduler = SWALR(optimizer, swa_lr=cfg.LEARNING_RATE * 0.1)

    best_val_acc = 0.0
    best_val_loss = float('inf')
    history = {"train_loss": [], "val_loss": [], "val_acc": []}

    for epoch in range(cfg.NUM_EPOCHS):
        model.train()
        # Start every epoch from clean gradients (also guards against
        # gradients left on the model by earlier code).
        optimizer.zero_grad(set_to_none=True)

        in_swa_phase = swa_model is not None and epoch >= cfg.SWA_START_EPOCH
        window_loss = 0.0      # sum of unscaled losses in the current window
        window_batches = 0     # micro-batches accumulated in the current window
        epoch_loss = 0.0       # sum of unscaled losses over the epoch

        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{cfg.NUM_EPOCHS} [train]")

        for step, batch in enumerate(progress_bar, start=1):
            batch = {k: v.to(device) for k, v in batch.items()}

            with torch.amp.autocast('cuda', enabled=cfg.USE_AMP, dtype=torch.float16):
                outputs = model(**batch)
                # Scale so that a full window sums to the mean gradient.
                loss = outputs.loss / cfg.GRAD_ACCUM_STEPS

            scaler.scale(loss).backward()

            batch_loss = outputs.loss.item()
            window_loss += batch_loss
            window_batches += 1
            epoch_loss += batch_loss

            # Step on every full window AND on the final batch, so a
            # trailing partial window is not silently dropped.
            if step % cfg.GRAD_ACCUM_STEPS == 0 or step == num_batches:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.MAX_GRAD_NORM)
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad(set_to_none=True)

                if in_swa_phase:
                    swa_scheduler.step()
                else:
                    scheduler.step()

                if ema is not None:
                    ema.update()

                progress_bar.set_postfix({
                    "loss": f"{window_loss / window_batches:.4f}",
                    "lr": f"{optimizer.param_groups[0]['lr']:.2e}",
                })
                window_loss = 0.0
                window_batches = 0

        train_loss = epoch_loss / num_batches if num_batches else 0.0
        history["train_loss"].append(train_loss)

        # Fold this epoch's weights into the SWA average.
        if in_swa_phase:
            swa_model.update_parameters(model)

        # Validate with the EMA weights when EMA is enabled.
        if ema is not None:
            ema.apply_shadow()

        val_loss, val_acc, cm, preds, labels = validate_with_accuracy(model, valid_loader, processor)

        if ema is not None:
            ema.restore()

        history["val_loss"].append(val_loss)
        history["val_acc"].append(val_acc)

        print(f"[Epoch {epoch+1}] Train Loss: {train_loss:.4f}")
        print(f"[Epoch {epoch+1}] Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")
        print(f"Confusion Matrix:\n{cm}")

        # Best checkpoint: higher accuracy wins; ties are broken by lower loss.
        is_best = False
        if val_acc > best_val_acc:
            is_best = True
            best_val_acc = val_acc
            best_val_loss = val_loss
        elif val_acc == best_val_acc and val_loss < best_val_loss:
            is_best = True
            best_val_loss = val_loss

        if is_best:
            save_path = f"{cfg.SAVE_DIR}/fold{fold}_best"
            os.makedirs(save_path, exist_ok=True)

            # Save the same (EMA) weights that were just validated.
            if ema is not None:
                ema.apply_shadow()

            model.save_pretrained(save_path)
            processor.save_pretrained(save_path)

            if ema is not None:
                ema.restore()

            print(f"   ✅ Best model saved (Acc={val_acc:.4f}, Loss={val_loss:.4f})")

    # Final SWA model, only if at least one epoch was actually averaged.
    if swa_model is not None:
        if int(swa_model.n_averaged) > 0:
            torch.optim.swa_utils.update_bn(train_loader, swa_model, device=device)
            save_path = f"{cfg.SAVE_DIR}/fold{fold}_swa"
            os.makedirs(save_path, exist_ok=True)
            swa_model.module.save_pretrained(save_path)
            processor.save_pretrained(save_path)
            print(f"   ✅ SWA model saved")
        else:
            print("   ⚠️  SWA checkpoint skipped: no epoch reached SWA_START_EPOCH")

    # Learning curves (validation loss / accuracy per epoch).
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
    ax1.plot(history["val_loss"], marker='o')
    ax1.set_title(f'Fold {fold} - Val Loss')
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Loss')
    ax1.grid(True)

    ax2.plot(history["val_acc"], marker='o', color='green')
    ax2.set_title(f'Fold {fold} - Val Accuracy')
    ax2.set_xlabel('Epoch')
    ax2.set_ylabel('Accuracy')
    ax2.grid(True)
    plt.tight_layout()

    log_dir = Path(cfg.LOG_DIR)
    log_dir.mkdir(parents=True, exist_ok=True)
    plt.savefig(log_dir / f"fold{fold}_learning_curve.png")
    plt.show()

    return best_val_acc, best_val_loss

print("✅ Training functions 정의 완료")

## 🚀 9. 학습 실행 (K-Fold)

In [None]:
# Training driver.
#
# Two correctness points:
#  * Validation batches are built in teacher-forced mode (train=True) so
#    they carry `labels`; without them the model returns no loss and
#    `validate_with_accuracy` cannot run. With causal attention, the
#    position that predicts the answer cannot attend to the answer token.
#  * Each fold after the first gets a freshly initialised model. Reusing
#    one model across folds would start fold k from weights already
#    trained on fold k's validation rows.
import gc


def _build_loaders(train_subset, valid_subset, processor):
    """Return ``(train_loader, valid_loader)`` for one train/validation split."""
    def _make(frame, shuffle):
        dataset = VQADataset(
            frame, processor, cfg.DATA_DIR,
            train=True, use_advanced=cfg.USE_ADVANCED_MODEL,
        )
        return DataLoader(
            dataset, batch_size=cfg.BATCH_SIZE, shuffle=shuffle,
            collate_fn=DataCollator(processor, train=True, use_advanced=cfg.USE_ADVANCED_MODEL),
            num_workers=0,
        )

    return _make(train_subset, True), _make(valid_subset, False)


if cfg.USE_KFOLD:
    results = {}

    for fold_pos, fold in enumerate(cfg.TRAIN_FOLDS):
        print(f"\n{'#'*60}")
        print(f"Starting Fold {fold}/{cfg.N_FOLDS-1}")
        print(f"{'#'*60}")

        if fold_pos > 0:
            # Drop the previous fold's weights and start from a fresh
            # base model + LoRA adapter.
            del model
            gc.collect()
            torch.cuda.empty_cache()
            model, processor = create_model_and_processor(cfg.MODEL_ID, cfg.USE_ADVANCED_MODEL)

        train_subset = train_df[train_df['fold'] != fold].reset_index(drop=True)
        valid_subset = train_df[train_df['fold'] == fold].reset_index(drop=True)

        print(f"Train: {len(train_subset)}, Valid: {len(valid_subset)}")

        train_loader, valid_loader = _build_loaders(train_subset, valid_subset, processor)

        best_acc, best_loss = train_one_fold(model, train_loader, valid_loader, fold=fold)
        results[fold] = {"acc": best_acc, "loss": best_loss}

        print(f"\n✅ Fold {fold} 완료: Best Val Acc={best_acc:.4f}, Loss={best_loss:.4f}")

    print(f"\n{'='*60}")
    print("All Folds Training Complete!")
    print(f"{'='*60}")
    for fold, metrics in results.items():
        print(f"Fold {fold}: Acc={metrics['acc']:.4f}, Loss={metrics['loss']:.4f}")
    print(f"Average Acc: {np.mean([m['acc'] for m in results.values()]):.4f}")

else:
    # Single hold-out split: fold == -1 rows train, fold == 0 rows validate.
    train_subset = train_df[train_df['fold'] == -1].reset_index(drop=True)
    valid_subset = train_df[train_df['fold'] == 0].reset_index(drop=True)

    train_loader, valid_loader = _build_loaders(train_subset, valid_subset, processor)

    best_acc, best_loss = train_one_fold(model, train_loader, valid_loader, fold=0)
    print(f"\n✅ Single model 학습 완료: Best Val Acc={best_acc:.4f}, Loss={best_loss:.4f}")

## 🔮 10. Inference with Direct Logits + TTA

✅ **Direct Logits**: a/b/c/d 토큰 확률 직접 계산 (생성 대비 안정)

In [None]:
def get_choice_token_ids(processor):
    """Map each option letter ('a'..'d') to its token-id list.

    Ids come from ``processor.tokenizer.encode`` with special tokens disabled.
    """
    encode = processor.tokenizer.encode
    return {
        letter: encode(letter, add_special_tokens=False)
        for letter in ('a', 'b', 'c', 'd')
    }


def _open_rgb_or_blank(img_path):
    """Open ``img_path`` as RGB; on I/O or decode errors return a blank white image."""
    try:
        # convert() returns an independent image, so the handle can be closed.
        with Image.open(img_path) as handle:
            return handle.convert("RGB")
    except (OSError, ValueError) as exc:
        print(f"⚠️  Could not load image {img_path!r} ({exc}); using a blank placeholder")
        return Image.new('RGB', (cfg.IMAGE_SIZE, cfg.IMAGE_SIZE), color='white')


def infer_with_direct_logits(model, processor, test_df, tta_scales=(1.0,), fold=0):
    """Predict a/b/c/d by reading next-token logits, with multi-scale TTA.

    For each row the image is loaded once and the prompt built once; for
    each TTA scale the (resized) image is run through the model, and the
    last-position logits are averaged over scales. The logits of the four
    option tokens are then soft-maxed into probabilities.

    Args:
        model: Model whose forward returns ``logits``.
        processor: Chat-template + tokenisation processor matching ``model``.
        test_df: DataFrame with ``id``, ``question``, ``a``..``d`` and an
            image path column (``path`` preferred, otherwise ``image``).
        tta_scales: Image scale factors; an empty value means ``(1.0,)``.
        fold: Only used in the progress-bar label.

    Returns:
        DataFrame with ``id``, ``answer`` and ``prob_a``..``prob_d``.
    """
    model.eval()
    scales = tuple(tta_scales) if tta_scales else (1.0,)

    # Fall back to EOS as the padding token when none is configured.
    if processor.tokenizer.pad_token_id is None:
        processor.tokenizer.pad_token_id = processor.tokenizer.eos_token_id

    choice_tokens = get_choice_token_ids(processor)

    all_predictions = []
    all_probs = []

    for i in tqdm(range(len(test_df)), desc=f"Fold {fold} Inference"):
        row = test_df.iloc[i]

        # Row-level work is hoisted out of the TTA loop.
        img_col = 'path' if 'path' in row else 'image'
        base_img = _open_rgb_or_blank(os.path.join(cfg.DATA_DIR, row[img_col]))
        user_text = build_mc_prompt(
            str(row["question"]), str(row["a"]),
            str(row["b"]), str(row["c"]), str(row["d"])
        )

        tta_logits = []
        for scale in scales:
            img = base_img
            if scale != 1.0:
                w, h = base_img.size
                # Clamp so tiny images never reach a zero-sized dimension.
                new_size = (max(1, int(w * scale)), max(1, int(h * scale)))
                img = base_img.resize(new_size, Image.BILINEAR)

            messages = [
                {"role": "system", "content": [{"type": "text", "text": cfg.SYSTEM_INSTRUCT}]},
                {"role": "user", "content": [
                    {"type": "image", "image": img},
                    {"type": "text", "text": user_text}
                ]}
            ]

            # The generation prompt opens the assistant turn, so the next
            # token is the answer letter.
            text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

            inputs = processor(text=[text], images=[img], return_tensors="pt")
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model(**inputs)
                logits = outputs.logits[0, -1, :]  # next-token logits

            # Average in float32 to avoid half-precision rounding.
            tta_logits.append(logits.float().cpu())

        avg_logits = torch.stack(tta_logits).mean(dim=0)

        # One score per option: sum of the logits of its token ids.
        choice_scores = {
            choice: sum(avg_logits[tid].item() for tid in token_ids)
            for choice, token_ids in choice_tokens.items()
        }

        # Softmax restricted to the four options.
        probs = F.softmax(torch.tensor(list(choice_scores.values())), dim=0).numpy()
        prob_dict = {choice: probs[idx] for idx, choice in enumerate(choice_scores)}

        all_predictions.append(max(prob_dict, key=prob_dict.get))
        all_probs.append(prob_dict)

    result_df = pd.DataFrame({
        'id': test_df['id'],
        'answer': all_predictions,
        'prob_a': [p['a'] for p in all_probs],
        'prob_b': [p['b'] for p in all_probs],
        'prob_c': [p['c'] for p in all_probs],
        'prob_d': [p['d'] for p in all_probs]
    })

    return result_df


# Run inference with every saved best checkpoint.
#
# Both modes (K-fold and single model) share one code path, so the output
# directory is always created before writing and the processor is always
# loaded with the same pixel budget that was used for training.

def _load_inference_bundle(model_path):
    """Load the fp16 model and its processor from ``model_path``."""
    if cfg.USE_ADVANCED_MODEL:
        model_cls = Qwen2_5_VLForConditionalGeneration
    else:
        model_cls = AutoModelForVision2Seq

    loaded_model = model_cls.from_pretrained(
        model_path, trust_remote_code=True, torch_dtype=torch.float16
    )
    loaded_model = loaded_model.to(device)
    loaded_model.eval()

    loaded_processor = AutoProcessor.from_pretrained(
        model_path,
        min_pixels=cfg.IMAGE_SIZE * cfg.IMAGE_SIZE,
        max_pixels=cfg.IMAGE_SIZE * cfg.IMAGE_SIZE,
        trust_remote_code=True,
    )
    return loaded_model, loaded_processor


predictions_all = []
tta_scales = cfg.TTA_SCALES if cfg.USE_TTA else [1.0]
os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)

if cfg.USE_KFOLD:
    inference_jobs = [(fold, f"submission_fold{fold}.csv") for fold in cfg.TRAIN_FOLDS]
else:
    inference_jobs = [(0, "submission_single.csv")]

for fold, file_name in inference_jobs:
    model_path = f"{cfg.SAVE_DIR}/fold{fold}_best"

    print(f"\n{'='*60}")
    print(f"Inferencing Fold {fold}")
    print(f"{'='*60}")

    model_infer, processor_infer = _load_inference_bundle(model_path)

    # Direct logits + TTA
    pred_df = infer_with_direct_logits(model_infer, processor_infer, test_df, tta_scales, fold)

    output_path = f"{cfg.OUTPUT_DIR}/{file_name}"
    pred_df.to_csv(output_path, index=False)
    print(f"✅ Saved to {output_path}")

    predictions_all.append(pred_df)

    # Free GPU memory before loading the next checkpoint.
    del model_infer
    torch.cuda.empty_cache()

print("\n✅ All inference complete!")

## 🌡️ 11. Temperature Scaling

✅ 검증 세트로 확률 교정 (선택 사항)

In [None]:
# Temperature scaling (optional).
# Given held-out scores and labels, find the temperature T that minimises
# the negative log-likelihood; the same T can then be applied to test scores.

def find_optimal_temperature(val_probs, val_labels):
    """Search the temperature in [0.1, 10] that minimises validation NLL.

    Temperature scaling acts on logits: ``softmax(logits / T)``. When the
    rows of ``val_probs`` are probabilities (non-negative, summing to 1)
    they are first mapped to log-probabilities, which are logits up to a
    per-row constant, so ``softmax(log(p) / T)`` equals the tempered
    distribution of the underlying logits. Any other input is treated as
    raw logits. (Dividing probabilities themselves by T, as the earlier
    version did, is not temperature scaling.)

    Args:
        val_probs: Array-like of shape (n_samples, n_classes) holding
            probabilities or logits.
        val_labels: Length-n sequence of class indices, or of the letters
            'a'..'d' (mapped to 0..3).

    Returns:
        The optimal temperature as a float.

    Raises:
        ValueError: If the inputs are empty, not 2-D, or of mismatched length.
    """
    from scipy.optimize import minimize

    scores = np.asarray(val_probs, dtype=np.float64)
    if scores.ndim != 2 or scores.shape[0] == 0:
        raise ValueError("val_probs must be a non-empty 2-D array (n_samples, n_classes)")

    letter_index = {'a': 0, 'b': 1, 'c': 2, 'd': 3}
    labels = np.asarray(
        [letter_index[lab.strip().lower()] if isinstance(lab, str) else int(lab)
         for lab in val_labels],
        dtype=np.int64,
    )
    if labels.shape[0] != scores.shape[0]:
        raise ValueError("val_probs and val_labels must have the same length")

    looks_like_probs = bool((scores >= 0).all()) and np.allclose(scores.sum(axis=1), 1.0, atol=1e-4)
    # Clip before the log so exact zeros do not produce -inf.
    logits = np.log(np.clip(scores, 1e-12, None)) if looks_like_probs else scores
    rows = np.arange(labels.shape[0])

    def nll_loss(temp):
        # scipy passes a length-1 array; extract the scalar.
        t = float(np.ravel(temp)[0])
        scaled = logits / t
        scaled = scaled - scaled.max(axis=1, keepdims=True)  # numerical stability
        log_probs = scaled - np.log(np.exp(scaled).sum(axis=1, keepdims=True))
        return float(-log_probs[rows, labels].mean())

    result = minimize(nll_loss, x0=[1.0], bounds=[(0.1, 10.0)])
    return float(result.x[0])

# To use it:
# 1. Collect validation probabilities (or logits) and the true labels.
# 2. optimal_temp = find_optimal_temperature(val_probs, val_labels)
# 3. Apply it to test probabilities in log space:
#    scaled = F.softmax(torch.log(torch.tensor(test_probs)) / optimal_temp, dim=1)

# The notebook currently keeps temperature = 1.0 (no rescaling).
print("✅ Temperature scaling은 선택 사항입니다.")
print("검증 세트가 있을 때 위 코드를 활용하여 최적 temperature를 찾을 수 있습니다.")

## 🎯 12. Ensemble (확률 평균)

✅ **Probability Averaging** (폴백: Majority Voting)

In [None]:
# Combine the per-fold predictions into the final submission.
if not predictions_all:
    raise RuntimeError("No predictions available: run the inference cell first")

# The output directory may not exist yet if this cell is run on its own.
os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)

if cfg.USE_KFOLD and len(predictions_all) > 1:
    print(f"\n{'='*60}")
    print(f"Ensemble Method: {cfg.ENSEMBLE_METHOD}")
    print(f"{'='*60}")

    if cfg.ENSEMBLE_METHOD == 'prob':
        # Probability averaging across folds, then arg-max.
        print("Using Probability Averaging...")

        prob_cols = ['prob_a', 'prob_b', 'prob_c', 'prob_d']
        ensemble_probs = pd.DataFrame({'id': test_df['id']})
        for col in prob_cols:
            ensemble_probs[col] = np.mean([df[col].values for df in predictions_all], axis=0)

        best_idx = ensemble_probs[prob_cols].values.argmax(axis=1)
        ensemble_probs['answer'] = pd.Series(best_idx, index=ensemble_probs.index).map(
            {0: 'a', 1: 'b', 2: 'c', 3: 'd'}
        )

        final_submission = ensemble_probs[['id', 'answer', 'prob_a', 'prob_b', 'prob_c', 'prob_d']]

    else:
        # Majority voting (fallback). On ties the earliest fold's vote wins,
        # because Counter.most_common keeps first-seen order.
        print("Using Majority Voting...")

        per_row_votes = zip(*(df['answer'].tolist() for df in predictions_all))
        ensemble_preds = [Counter(votes).most_common(1)[0][0] for votes in per_row_votes]

        final_submission = pd.DataFrame({
            'id': test_df['id'],
            'answer': ensemble_preds
        })

    final_path = f"{cfg.OUTPUT_DIR}/submission_ensemble.csv"
    final_submission.to_csv(final_path, index=False)

    print(f"✅ Ensemble submission saved to {final_path}")
    print(f"\nAnswer Distribution:")
    print(final_submission['answer'].value_counts().sort_index())

else:
    print("\n✅ Single model - No ensemble needed")
    final_submission = predictions_all[0]
    final_path = f"{cfg.OUTPUT_DIR}/submission_single.csv"
    final_submission.to_csv(final_path, index=False)

## 📊 13. 결과 분석 및 시각화

In [None]:
# Visual and textual summary of the final submission.
_rule = '=' * 60
_n_total = len(final_submission)
answer_counts = final_submission['answer'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=answer_counts.index, y=answer_counts.values, palette='viridis', ax=ax)
ax.set_title('Final Submission Answer Distribution', fontsize=14, weight='bold')
ax.set_xlabel('Answer')
ax.set_ylabel('Count')
ax.grid(axis='y', alpha=0.3)

# Annotate each bar with its share of all predictions.
for _bar_pos, _bar_count in enumerate(answer_counts.values):
    _share = _bar_count / _n_total * 100
    ax.text(_bar_pos, _bar_count + 10, f"{_share:.1f}%", ha='center', fontsize=10)

plt.tight_layout()
plt.show()

print(f"\n{_rule}")
print("Final Statistics")
print(_rule)
print(f"Total predictions: {_n_total}")
print("\nAnswer counts:")
for ans, count in answer_counts.items():
    print(f"  {ans}: {count:5d} ({count/_n_total*100:5.1f}%)")

# Probability summary, only when probability columns are present.
if 'prob_a' in final_submission.columns:
    print(f"\n{_rule}")
    print("Probability Statistics")
    print(_rule)
    prob_cols = ['prob_a', 'prob_b', 'prob_c', 'prob_d']
    print(final_submission[prob_cols].describe())

print(f"\n{_rule}")
print("Sample Predictions")
print(_rule)
print(final_submission.head(10))

## ✅ 14. 최종 정리

### 🎉 완료된 작업

1. ✅ Config 설정 (Pro2 튜닝)
2. ✅ 데이터 로드 & EDA
3. ✅ Stratified K-Fold CV
4. ✅ Dataset & DataCollator (라벨 마스킹)
5. ✅ Model & Processor (T4 호환)
6. ✅ Training Loop (Val Acc + Confusion Matrix)
7. ✅ Inference (Direct Logits + TTA)
8. ✅ Temperature Scaling (선택 사항)
9. ✅ Ensemble (확률 평균)
10. ✅ 결과 분석 & 시각화

### 🚀 주요 개선사항

#### 학습
- Val Accuracy 로깅 및 Best 모델 저장 (Acc 우선)
- Confusion Matrix 출력
- 학습 곡선 시각화
- 검증 데이터 train=False (정답 주입 방지)

#### 추론
- Direct Logits: a/b/c/d 토큰 확률 직접 계산
- TTA: [0.9, 1.0, 1.1] 스케일 평균
- pad_token_id 자동 보정
- 확률 컬럼 저장

#### 앙상블
- 확률 앙상블 (Probability Averaging)
- 폴백: Majority Voting

### 📊 Pro2 설정

```python
USE_SAMPLE = False          # 전체 데이터
IMAGE_SIZE = 512
NUM_EPOCHS = 3
GRAD_ACCUM_STEPS = 8
WARMUP_RATIO = 0.06
LORA_R = 16
USE_DIRECT_LOGIT_DECODE = True
TTA_SCALES = [0.9, 1.0, 1.1]
ENSEMBLE_METHOD = 'prob'
```

### 📌 Important Notes

- **디바이스 정렬**: 모든 모델/입력을 단일 device로 통일
- **라벨 마스킹**: 프롬프트 토큰 손실 제외, assistant 정답만 감독
- **검증 플래그**: valid_ds에 train=False 적용
- **Direct Logits**: 생성 대비 안정적이고 빠른 추론
- **확률 앙상블**: Fold 간 확률 평균으로 robust한 예측

---

**🤖 SSAFY AI Project 2025 - Pro2 Version**

**⭐ 행운을 빕니다!**