# 🚀 Kaggle_Qwen3_30B_MultiGPU.ipynb

## 🎯 특징

### 📊 모델
- **Qwen/Qwen2.5-VL-30B-A3B-Instruct** (30B 파라미터)
- 4-bit Quantization (필수)
- Multi-GPU 병렬 처리 (T4 * 2)

### 🔧 최적화
- ✅ **Model Parallelism**: 모델을 여러 GPU에 분산
- ✅ **Memory Optimization**: CPU offloading, gradient checkpointing
- ✅ **4-bit QLoRA**: 메모리 효율 극대화
- ✅ **Batch Size 1**: OOM 방지
- ✅ **Gradient Accumulation**: 효과적인 배치 크기
- ✅ **Mixed Precision**: Float16 (T4는 BFloat16 미지원)
- ✅ **Accelerate Integration**: 자동 병렬 처리

### ⚠️ 중요 사항
- T4 * 2 (총 32GB) 환경 최적화
- EMA/SWA 비활성화 (메모리 부족)
- Inference도 병렬 처리
- 매우 높은 gradient accumulation (16 step, 배치 1 × 16 = 유효 배치 크기 16)

**🤖 SSAFY AI Project 2025 - Qwen3-30B Multi-GPU Edition**

## 📦 1. 패키지 설치

In [None]:
# Install the required packages (uncomment the lines below on a fresh runtime).
# The version specifiers are quoted so the shell does not interpret ">" as an
# output redirection (which would drop the constraint and create stray files).
# !pip install -q "transformers>=4.45.0" "accelerate>=0.34.0" "peft>=0.13.0" "bitsandbytes>=0.43.0"
# !pip install -q datasets pillow pandas torch torchvision scikit-learn matplotlib seaborn tqdm scipy
# !pip install -q qwen-vl-utils==0.0.8
# !pip install -q deepspeed  # Optional: DeepSpeed for advanced optimization

# Printed unconditionally: with the install lines commented out, this message
# does not by itself confirm that anything was installed.
print("✅ 패키지 설치 완료! 런타임 재시작하세요.")

## 📚 2. 라이브러리 임포트 & GPU 확인

In [None]:
import os, sys, re, math, random, warnings, json, logging
from datetime import datetime
import numpy as np
import pandas as pd
from PIL import Image
from pathlib import Path
from dataclasses import dataclass
from typing import Dict, List, Any, Optional, Tuple
from collections import Counter, defaultdict
import unicodedata
import gc

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import (
    Qwen2VLForConditionalGeneration,
    AutoProcessor,
    BitsAndBytesConfig,
    get_cosine_schedule_with_warmup,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from accelerate import Accelerator, DistributedDataParallelKwargs
from accelerate.utils import set_seed as accelerate_set_seed

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix
from scipy.optimize import minimize

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

# Notebook-wide settings: silence all warnings, remove Pillow's pixel-count
# limit for large images, and use seaborn's "whitegrid" plot style.
warnings.filterwarnings('ignore')
Image.MAX_IMAGE_PIXELS = None
sns.set_style('whitegrid')

# Report every visible CUDA device; stop immediately when there is none.
print("🔍 GPU 확인:")
if not torch.cuda.is_available():
    print("   ❌ GPU를 사용할 수 없습니다!")
    raise RuntimeError("이 노트북은 GPU가 필수입니다.")

gpu_count = torch.cuda.device_count()
print(f"   사용 가능 GPU: {gpu_count}개")

# Accumulate the raw byte counts while listing the devices, then convert once.
total_bytes = 0
for i in range(gpu_count):
    gpu_name = torch.cuda.get_device_name(i)
    device_bytes = torch.cuda.get_device_properties(i).total_memory
    gpu_memory = device_bytes / 1e9
    total_bytes += device_bytes
    print(f"   GPU {i}: {gpu_name} ({gpu_memory:.1f} GB)")
total_memory = total_bytes / 1e9
print(f"   총 GPU 메모리: {total_memory:.1f} GB")
print(f"   CUDA Version: {torch.version.cuda}")

# Library versions, for reproducibility of the run.
print(f"   PyTorch: {torch.__version__}")
print(f"   Transformers: {__import__('transformers').__version__}")
print(f"   Accelerate: {__import__('accelerate').__version__}")

## ⚙️ 3. Config 설정 (Qwen3-30B Multi-GPU 최적화)

### 🎯 핵심 설정
- **메모리 최적화**: 4-bit, CPU offload, gradient checkpointing
- **병렬 처리**: Model parallelism across 2 GPUs
- **안정성**: 매우 작은 배치, 높은 gradient accumulation

In [None]:
class Config:
    """Static configuration for the multi-GPU QLoRA VQA notebook.

    Every setting is a class attribute; the notebook instantiates the class
    once and reads the values through that instance.
    """

    # ============ Seed ============
    SEED = 42
    
    # ============ Model ============
    # NOTE(review): this repo id combines the "Qwen2.5-VL" family name with a
    # "30B-A3B" suffix, while the notebook title and the log banner say Qwen3 —
    # confirm that this exact id exists on the Hub before running.
    MODEL_ID = "Qwen/Qwen2.5-VL-30B-A3B-Instruct"  # 30B model
    IMAGE_SIZE = 384  # start at 384 to save memory (raise to 512 if needed)
    
    # ============ Data ============
    # NOTE(review): "/content" looks like a Colab path although the notebook
    # title mentions Kaggle — confirm the directory for the target runtime.
    DATA_DIR = "/content"
    TRAIN_CSV = f"{DATA_DIR}/train.csv"
    TEST_CSV = f"{DATA_DIR}/test.csv"
    
    # ============ K-Fold ============
    N_FOLDS = 3
    USE_KFOLD = True
    TRAIN_FOLDS = [0, 1, 2]
    
    # ============ QLoRA (memory-optimized) ============
    LORA_R = 8  # use a small rank for the 30B model
    LORA_ALPHA = 16
    LORA_DROPOUT = 0.05
    TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "o_proj"]  # essential modules only
    
    # ============ Training (memory-optimized) ============
    NUM_EPOCHS = 2  # a few epochs are enough for the 30B model
    BATCH_SIZE = 1  # required: prevents OOM
    GRAD_ACCUM_STEPS = 16  # high accumulation to reach a useful effective batch size
    LEARNING_RATE = 5e-5  # small LR for a large model
    WEIGHT_DECAY = 0.01
    WARMUP_RATIO = 0.1  # longer warmup
    MAX_GRAD_NORM = 0.5  # tighter gradient clipping
    
    # ============ Memory optimization ============
    USE_AMP = True  # Mixed precision
    USE_GRADIENT_CHECKPOINTING = True  # required
    USE_CPU_OFFLOAD = True  # move optimizer states to the CPU
    USE_8BIT_OPTIMIZER = False  # bitsandbytes 8-bit Adam (optional)
    
    # ============ Advanced techniques (disabled for memory reasons) ============
    USE_EMA = False  # disabled: not enough memory
    USE_SWA = False  # disabled: not enough memory
    USE_COSINE_SCHEDULE = True
    
    # ============ Early Stopping ============
    USE_EARLY_STOPPING = False
    EARLY_STOPPING_PATIENCE = 1
    
    # ============ TTA ============
    USE_TTA = False  # disabled to save memory (enable if needed)
    TTA_SCALES = [1.0]  # Single scale
    
    # ============ Inference ============
    USE_DIRECT_LOGIT_DECODE = True
    USE_BATCH_INFERENCE = False  # saves memory
    INFER_BATCH_SIZE = 1
    MAX_NEW_TOKENS = 8
    
    # ============ Temperature Scaling ============
    USE_TEMPERATURE_SCALING = True
    
    # ============ Ensemble ============
    ENSEMBLE_METHOD = "prob"
    FOLD_WEIGHTS = None
    
    # ============ Output locations ============
    SAVE_DIR = f"{DATA_DIR}/checkpoints_30b"
    OUTPUT_DIR = f"{DATA_DIR}/outputs_30b"
    LOG_DIR = f"{DATA_DIR}/logs_30b"
    SAVE_EVERY_EPOCH = False  # saves memory
    
    # ============ Sampling ============
    USE_SAMPLE = False
    SAMPLE_SIZE = 100
    
    # ============ Prompt ============
    SYSTEM_INSTRUCT = (
        "You are a helpful visual question answering assistant. "
        "Answer using exactly one letter among a, b, c, or d. No explanation."
    )
    
    # ============ Logging ============
    LOG_LEVEL = logging.INFO
    LOG_TO_FILE = True
    
    # ============ Multi-GPU settings ============
    USE_MULTI_GPU = True  # automatically use multiple GPUs
    MAX_MEMORY_PER_GPU = {0: "14GB", 1: "14GB"}  # T4 x 2 (use 14GB of each card's 16GB)
    DEVICE_MAP = "auto"  # automatic model parallelism
    
    # ============ Quantization ============
    LOAD_IN_4BIT = True  # required: 4-bit quantization
    LOAD_IN_8BIT = False
    BNB_4BIT_COMPUTE_DTYPE = torch.float16  # T4 does not support BF16
    BNB_4BIT_QUANT_TYPE = "nf4"
    BNB_4BIT_USE_DOUBLE_QUANT = True

cfg = Config()

# Make sure every output directory exists (parents included) before use.
for dir_path in (cfg.SAVE_DIR, cfg.OUTPUT_DIR, cfg.LOG_DIR):
    os.makedirs(dir_path, exist_ok=True)

# Logging setup
def setup_logging(config=None):
    """Create (or reset) the ``VQA_30B`` logger and return it.

    The logger always writes to ``sys.stdout``; when ``LOG_TO_FILE`` is true
    it also writes to a timestamped file inside ``LOG_DIR``.

    Args:
        config: Object exposing ``LOG_LEVEL``, ``LOG_TO_FILE`` and ``LOG_DIR``.
            Defaults to the module-level ``cfg`` when omitted, so existing
            zero-argument calls behave as before.

    Returns:
        The configured ``logging.Logger`` named ``VQA_30B``.
    """
    settings = cfg if config is None else config

    logger = logging.getLogger('VQA_30B')
    logger.setLevel(settings.LOG_LEVEL)

    # Re-running the cell must not leak file descriptors: close each handler
    # left over from a previous call instead of merely dropping the reference.
    # Closing a StreamHandler does not close sys.stdout itself.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )

    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)

    if settings.LOG_TO_FILE:
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        # Messages contain emoji and Korean text, so pin the encoding rather
        # than relying on the platform default.
        file_handler = logging.FileHandler(
            f"{settings.LOG_DIR}/training_30b_{timestamp}.log",
            encoding='utf-8',
        )
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    return logger

# Module-level logger used by the helper functions and cells that follow.
logger = setup_logging()

# Fix the random seeds
def set_seed(seed=42):
    """Apply ``seed`` to every random number generator the notebook uses.

    Seeds Python's ``random``, NumPy, PyTorch (CPU and all CUDA devices) and
    Accelerate, then sets cuDNN to deterministic mode with benchmarking off.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
        accelerate_set_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

set_seed(cfg.SEED)

# Log a banner summarising the active configuration, one entry per line.
separator = "=" * 60
config_summary = (
    separator,
    "🚀 Qwen3-VL-30B Multi-GPU Configuration",
    separator,
    f"Model: {cfg.MODEL_ID}",
    f"Image Size: {cfg.IMAGE_SIZE}",
    f"Epochs: {cfg.NUM_EPOCHS}, Batch: {cfg.BATCH_SIZE}, Grad Accum: {cfg.GRAD_ACCUM_STEPS}",
    f"Effective Batch Size: {cfg.BATCH_SIZE * cfg.GRAD_ACCUM_STEPS}",
    f"LoRA R: {cfg.LORA_R}, LR: {cfg.LEARNING_RATE}",
    f"Multi-GPU: {cfg.USE_MULTI_GPU}, Device Map: {cfg.DEVICE_MAP}",
    f"4-bit Quantization: {cfg.LOAD_IN_4BIT}",
    f"Gradient Checkpointing: {cfg.USE_GRADIENT_CHECKPOINTING}",
    f"CPU Offload: {cfg.USE_CPU_OFFLOAD}",
    separator,
)
for summary_line in config_summary:
    logger.info(summary_line)
print(f"\n📝 로그 저장: {cfg.LOG_DIR}")

## 💾 4. 메모리 최적화 유틸리티

In [None]:
def print_gpu_memory(prefix=""):
    """Log allocated, reserved and total memory for every visible CUDA device.

    Args:
        prefix: Text placed in front of the "GPU Memory Status" header line.

    Returns:
        None. Nothing is logged when CUDA is unavailable.
    """
    if torch.cuda.is_available():
        rule = "=" * 60
        logger.info(rule)
        logger.info(f"{prefix} GPU Memory Status")
        logger.info(rule)

        bytes_per_gb = 1e9
        for device_idx in range(torch.cuda.device_count()):
            allocated = torch.cuda.memory_allocated(device_idx) / bytes_per_gb
            reserved = torch.cuda.memory_reserved(device_idx) / bytes_per_gb
            total = torch.cuda.get_device_properties(device_idx).total_memory / bytes_per_gb
            logger.info(
                f"GPU {device_idx}: Allocated={allocated:.2f}GB, "
                f"Reserved={reserved:.2f}GB, Total={total:.1f}GB"
            )

        logger.info(rule)

def clear_memory():
    """Run the garbage collector and empty the CUDA cache on every device.

    Each device is selected in turn, its cache is emptied and the device is
    synchronized. A log line is written whether or not CUDA is available.
    """
    gc.collect()

    device_total = torch.cuda.device_count() if torch.cuda.is_available() else 0
    for device_idx in range(device_total):
        with torch.cuda.device(device_idx):
            torch.cuda.empty_cache()
            torch.cuda.synchronize()

    logger.info("💾 Memory cleared")

def estimate_model_memory(model):
    """Estimate the size of ``model`` from its parameters and buffers.

    Args:
        model: Object exposing ``parameters()`` and ``buffers()`` whose items
            provide ``nelement()`` and ``element_size()``.

    Returns:
        The combined size in GB (bytes / 1e9); the value is also logged.
    """
    def _total_bytes(tensors):
        # Element count times bytes per element, summed over all tensors.
        return sum(t.nelement() * t.element_size() for t in tensors)

    param_bytes = _total_bytes(model.parameters())
    buffer_bytes = _total_bytes(model.buffers())

    total_size_gb = (param_bytes + buffer_bytes) / 1e9
    logger.info(f"📊 Estimated model size: {total_size_gb:.2f} GB")
    return total_size_gb

# Initial memory state: log GPU usage, clear the caches, then log it again
# so the effect of the cleanup is visible in the log.
print_gpu_memory("Initial")
clear_memory()
print_gpu_memory("After clearing")

## 📊 5. 데이터 로드 & EDA

In [None]:
# Load the train/test CSVs; any failure is logged and then re-raised.
try:
    train_df = pd.read_csv(cfg.TRAIN_CSV)
    test_df = pd.read_csv(cfg.TEST_CSV)
except Exception as e:
    logger.error(f"❌ 데이터 로드 실패: {e}")
    raise
else:
    logger.info(f"📁 Train: {len(train_df):,} samples")
    logger.info(f"📁 Test: {len(test_df):,} samples")

# Validate the training schema before anything depends on it.
required_cols = ['question', 'a', 'b', 'c', 'd', 'answer']
missing_cols = set(required_cols) - set(train_df.columns)
if missing_cols:
    raise ValueError(f"❌ 필수 컬럼 누락: {missing_cols}")

# The image column may be called 'path' or 'image' ('path' wins when both
# exist). Fail here, like for the other required columns, instead of silently
# selecting a column that does not exist and failing later with a KeyError.
img_col = next((col for col in ('path', 'image') if col in train_df.columns), None)
if img_col is None:
    raise ValueError("❌ 필수 컬럼 누락: 이미지 컬럼('path' 또는 'image')이 필요합니다.")
logger.info(f"📷 이미지 컬럼: {img_col}")

# Optional down-sampling for quick smoke tests.
if cfg.USE_SAMPLE:
    train_df = train_df.sample(n=min(cfg.SAMPLE_SIZE, len(train_df)), random_state=cfg.SEED).reset_index(drop=True)
    logger.warning(f"⚠️  Sampled {len(train_df)} samples for testing")

# Log how many samples each answer label has, with its share of the data.
logger.info("\n📊 Answer Distribution:")
answer_dist = train_df['answer'].value_counts().sort_index()
for ans, count in answer_dist.items():
    logger.info(f"   {ans}: {count:4d} ({count/len(train_df)*100:.1f}%)")

# Visualisation: answer counts (left) and question-length histogram (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 4))
answer_dist.plot(kind='bar', ax=axes[0], color='skyblue')
axes[0].set_title('Answer Distribution', fontsize=12, weight='bold')
axes[0].set_xlabel('Answer')
axes[0].set_ylabel('Count')

# Note: this adds a 'question_len' column to train_df as a side effect.
train_df['question_len'] = train_df['question'].str.len()
train_df['question_len'].hist(bins=30, ax=axes[1], color='salmon')
axes[1].set_title('Question Length', fontsize=12, weight='bold')
plt.tight_layout()
plt.savefig(f"{cfg.LOG_DIR}/data_dist_30b.png", dpi=150)
plt.show()

logger.info("✅ 데이터 로드 완료")

## 🔄 6. Stratified K-Fold CV

In [None]:
# Assign a validation-fold id to every training row (-1 = never validation).
# StratifiedKFold.split yields positional indices, so the ids are collected in
# a positional NumPy array and attached in one step. Writing them through the
# label-based `.loc` is only correct while train_df has a default RangeIndex.
fold_ids = np.full(len(train_df), -1, dtype=np.int64)

if cfg.USE_KFOLD:
    skf = StratifiedKFold(n_splits=cfg.N_FOLDS, shuffle=True, random_state=cfg.SEED)
    for fold, (train_idx, val_idx) in enumerate(skf.split(train_df, train_df['answer'])):
        fold_ids[val_idx] = fold
    train_df['fold'] = fold_ids

    logger.info(f"✅ {cfg.N_FOLDS}-Fold CV 생성")
    for fold in range(cfg.N_FOLDS):
        fold_count = (train_df['fold'] == fold).sum()
        logger.info(f"   Fold {fold}: {fold_count:4d} samples")
else:
    # Single hold-out split: the last 10% of rows (in file order, unshuffled)
    # become validation fold 0.
    split_idx = int(len(train_df) * 0.9)
    fold_ids[split_idx:] = 0
    train_df['fold'] = fold_ids
    logger.info("✅ Single split (90:10)")