# Experiment Snapshot
- Name: 251023_Kaggle_AllInOne
- Score (Public LB): 0.80452
- Baseline: 251023_Baseline (0.76028) — Delta: +0.04424
- Model: Qwen/Qwen2.5-VL-3B-Instruct
- Image size: 384
- Mode: Zero-shot inference (no training)
- Date: 2025-10-23


# SSAFY VQA — All‑in‑One Kaggle Notebook

This single notebook consolidates setup and inference so you can run on Kaggle with just one `.ipynb`.

How to use:
- Create a Kaggle Dataset for your data (CSV + images).
- Open a Kaggle Notebook, enable a GPU accelerator and Internet access, and add your Dataset to the notebook.
- Edit `DATA_DIR` below to match the mount path shown in the left Files panel.
- Run zero‑shot inference immediately (no training) to create `submission_baseline.csv`.

Notes:
- This notebook does not depend on `scripts/` or `config/`.
- For meaningful results, your Kaggle Dataset must include the images referenced by the CSV `path` (or `image`) column.
- Optional fine‑tuning on Kaggle T4 is resource‑heavy; a simple stub is included but disabled by default.


In [None]:
# User Config
import os

# The presence of this environment variable is used as the "running on Kaggle" marker.
IN_KAGGLE = os.environ.get('KAGGLE_KERNEL_RUN_TYPE') is not None

# Dataset mount directory.
# After adding your Dataset on Kaggle, look up its path in the Files panel,
# e.g. '/kaggle/input/ssafy-ai-pjt-data' or '/kaggle/input/<your-dataset-name>'.
if IN_KAGGLE:
    DATA_DIR = '/kaggle/input/ssafy-ai-pjt-data'
else:
    DATA_DIR = './SSAFY_AI_PJT_2025/data'


def _locate_csv(filename):
    """Return '<DATA_DIR>/<filename>' when it exists, otherwise the nested 'data/' variant."""
    flat_path = f"{DATA_DIR}/{filename}"
    if os.path.exists(flat_path):
        return flat_path
    return f"{DATA_DIR}/data/{filename}"


# CSV locations: the flat layout wins, the nested 'data/' layout is the fallback.
TRAIN_CSV = _locate_csv('train.csv')
TEST_CSV = _locate_csv('test.csv')

# Where the checkpoint directory and the submission file are written.
if IN_KAGGLE:
    CKPT_DIR = '/kaggle/working/checkpoints/baseline'
    SUB_CSV = '/kaggle/working/submission_baseline.csv'
else:
    CKPT_DIR = './checkpoints/baseline'
    SUB_CSV = './submission_baseline.csv'

# Model checkpoint and the square image side (pixels) handed to the processor.
BASE_MODEL_ID = 'Qwen/Qwen2.5-VL-3B-Instruct'
IMAGE_SIZE = 384

# Run mode. Zero-shot inference is the default; DO_TRAIN=True switches to the
# heavy fine-tuning stub, and the remaining knobs only matter in that mode.
DO_TRAIN = False
SAMPLE_TRAIN_SIZE = 200  # rows sampled from the training CSV
EPOCHS = 1
BATCH_SIZE = 1
GRAD_ACCUM = 4
LR = 1e-4
SEED = 42

# Echo the resolved configuration so a wrong mount path is visible right away.
print('IN_KAGGLE:', IN_KAGGLE)
print('DATA_DIR:', DATA_DIR)
print('TRAIN_CSV exists:', os.path.exists(TRAIN_CSV))
print('TEST_CSV  exists:', os.path.exists(TEST_CSV))


In [None]:
# Install minimal dependencies (Torch is provided by Kaggle image; keep it)
import sys, subprocess

def pip_install(pkgs):
    """Run a quiet ``pip install`` with the interpreter that executes this notebook.

    ``pkgs`` is either a single requirement string or a list of pip arguments
    (flags such as '-U' are allowed). A non-zero pip exit status surfaces as
    ``subprocess.CalledProcessError`` because ``check_call`` is used.
    """
    args = [pkgs] if isinstance(pkgs, str) else pkgs
    command = [sys.executable, '-m', 'pip', 'install', '-q'] + args
    print('pip install:', ' '.join(args))
    subprocess.check_call(command)

# Transformers latest for Qwen2.5-VL
pip_install(['-U', 'git+https://github.com/huggingface/transformers.git'])
# Core deps for I/O + VLM utils
# NOTE: opencv-python releases use four-part versions (e.g. 4.10.0.84), so an exact
# '==4.10.0' pin matches no published release and makes pip (and this cell) fail.
# The '.*' suffix selects the 4.10.0 series instead.
pip_install(['qwen-vl-utils[decord]==0.0.8', 'pillow==10.4.0', 'opencv-python==4.10.0.*',
            'pandas==2.2.2', 'numpy==1.26.4', 'tqdm==4.66.4', 'pyyaml==6.0.1',
            'ipywidgets==8.1.3', 'matplotlib==3.9.1', 'seaborn==0.13.2'])

# Optional: only needed if you plan to try fine‑tuning in this single notebook
if DO_TRAIN:
    pip_install(['peft==0.12.0', 'bitsandbytes==0.43.3', 'accelerate==0.33.0'])


In [None]:
# Imports + environment
import os, sys, math, random, re
import torch
import pandas as pd
from PIL import Image
from tqdm import tqdm
from transformers import AutoModelForVision2Seq, AutoProcessor, get_linear_schedule_with_warmup

# Weights & Biases logging stays off unless this variable is overridden later.
os.environ.update(WANDB_MODE='disabled')
# Report GPU visibility and the torch build in use.
print('CUDA available:', torch.cuda.is_available(), '| torch', torch.__version__)

def set_seed(seed: int = 42):
    """Seed Python's ``random`` and torch (CPU plus every CUDA device) with ``seed``.

    The seed is also written, as a string, to the ``PYTHONHASHSEED``
    environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)

# Apply the configured SEED right away, as soon as set_seed is defined.
set_seed(SEED)


In [None]:
# Prompt + parsing helpers
# System message placed in front of every conversation: it asks the model to
# reply with a single option letter and nothing else.
SYSTEM_INSTRUCT = (
    'You are a helpful visual question answering assistant. '
    'Answer using exactly one letter among a, b, c, or d. No explanation.'
)

def build_mc_prompt(question, a, b, c, d):
    """Format a four-choice question as the user-turn text for the model.

    Layout: the question on its own line, the options labelled (a) to (d) one
    per line, a blank line, then a Korean instruction asking for exactly one
    lowercase letter among a, b, c, d.

    Args:
        question: question text.
        a, b, c, d: the four answer options, in order.

    Returns:
        The assembled prompt string.
    """
    return (
        # The line break after the question must be the escape sequence: a raw
        # newline inside a single-quoted f-string is a syntax error.
        f"{question}\n"
        f"(a) {a}\n(b) {b}\n(c) {c}\n(d) {d}\n\n"
        '정답은 반드시 a, b, c, d 중 하나의 소문자만 출력하세요.'
    )

def parse_answer(text: str) -> str:
    """Extract the chosen option letter ('a' to 'd') from decoded model output.

    The text is lower-cased and, when it contains the chat role marker
    'assistant', only the part after its last occurrence is searched. The
    first stand-alone a/b/c/d found there is returned.

    Args:
        text: decoded model output (may still include the prompt).

    Returns:
        One of 'a', 'b', 'c', 'd'; 'a' is the fallback when no letter is found.
    """
    # rsplit keeps the whole string when the marker is absent.
    tail = text.lower().rsplit('assistant', 1)[-1]
    # re.ASCII restricts \b to ASCII word characters, so a letter glued to
    # Hangul (e.g. 'b입니다') still counts as stand-alone. With Unicode-aware \b
    # such replies never matched and silently fell back to 'a'.
    hit = re.search(r'\b([abcd])\b', tail, flags=re.ASCII)
    return hit.group(1) if hit else 'a'  # fallback


In [None]:
# Zero‑shot inference (no training)
def load_model_and_processor(model_id: str, image_size: int = 384):
    """Load the processor and the model for ``model_id``; return ``(processor, model)``.

    The processor is created with ``min_pixels`` and ``max_pixels`` both set to
    ``image_size * image_size``. The model is loaded with ``device_map='auto'``,
    in float16 when CUDA is available (no explicit dtype otherwise), and is
    switched to eval mode before being returned.
    """
    pixel_budget = image_size * image_size
    processor = AutoProcessor.from_pretrained(
        model_id,
        min_pixels=pixel_budget,
        max_pixels=pixel_budget,
        trust_remote_code=True,
    )

    weights_dtype = torch.float16 if torch.cuda.is_available() else None
    model = AutoModelForVision2Seq.from_pretrained(
        model_id,
        device_map='auto',
        trust_remote_code=True,
        torch_dtype=weights_dtype,
    )
    model.eval()

    return processor, model

def predict_row(model, processor, row, data_root: str, device: str = 'cuda', image_size: int = 384):
    """Predict the answer letter for one multiple-choice VQA row.

    Args:
        model: model exposing ``generate`` (e.g. from ``load_model_and_processor``).
        processor: the matching processor (chat template, tokenizer, image processing).
        row: record with 'question', 'a', 'b', 'c', 'd' and an image column
            named 'path' or 'image'.
        data_root: directory the image column is relative to.
        device: fallback target for the inputs when the model has no ``.device``
            attribute and CUDA is available.
        image_size: side length of the blank placeholder image used when the
            real image cannot be opened.

    Returns:
        One of 'a', 'b', 'c', 'd' as produced by ``parse_answer``.

    Raises:
        ValueError: if the row has neither a 'path' nor an 'image' column.
    """
    import warnings

    # Build image path
    if 'path' in row:
        img_path = os.path.join(data_root, row['path'])
    elif 'image' in row:
        img_path = os.path.join(data_root, row['image'])
    else:
        raise ValueError("No 'path' or 'image' column found")

    try:
        # The context manager closes the file handle once the RGB copy exists.
        with Image.open(img_path) as handle:
            img = handle.convert('RGB')
    except Exception as exc:
        # Still best-effort, but no longer silent: a blank image yields a
        # meaningless prediction, so the user should learn which file failed.
        warnings.warn(f'Could not open image {img_path!r} ({exc}); using a blank placeholder.')
        img = Image.new('RGB', (image_size, image_size), color='white')

    user_text = build_mc_prompt(str(row['question']), str(row['a']), str(row['b']), str(row['c']), str(row['d']))

    messages = [
        {'role': 'system', 'content': [{'type': 'text', 'text': SYSTEM_INSTRUCT}]},
        {'role': 'user', 'content': [
            {'type': 'image', 'image': img},
            {'type': 'text',  'text': user_text}
        ]}
    ]

    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[text], images=[img], padding=True, return_tensors='pt')

    # Send the inputs to wherever the model lives instead of a hard-coded
    # 'cuda'; `device` is only the fallback when the model does not say.
    target = getattr(model, 'device', None)
    if target is None and torch.cuda.is_available():
        target = device
    if target is not None:
        inputs = {k: v.to(target) for k, v in inputs.items()}

    with torch.no_grad():
        # Greedy decoding; `temperature` is dropped because it has no effect
        # when do_sample=False.
        outputs = model.generate(**inputs, max_new_tokens=10, do_sample=False)

    # Decode only the newly generated tokens when the output starts with the
    # prompt, so option letters in the prompt cannot leak into the parsed answer.
    sequence = outputs[0]
    prompt_ids = inputs['input_ids'][0].to(sequence.device)
    prompt_len = prompt_ids.shape[0]
    if sequence.shape[0] >= prompt_len and torch.equal(sequence[:prompt_len], prompt_ids):
        sequence = sequence[prompt_len:]

    generated = processor.decode(sequence, skip_special_tokens=True)
    return parse_answer(generated)

def run_zero_shot_inference(model_id: str, test_csv: str, data_root: str, output_csv: str, image_size: int = 384):
    """Predict every row of the test CSV and write a submission file.

    Args:
        model_id: model identifier passed to ``load_model_and_processor``.
        test_csv: CSV with an 'id' column plus the columns ``predict_row`` reads.
        data_root: directory that image paths in the CSV are relative to.
        output_csv: destination of the submission CSV (columns 'id', 'answer').
        image_size: image side length forwarded to loading and prediction.

    Returns:
        The submission DataFrame, sorted by 'id' with a fresh index.
    """
    print('Loading test CSV:', test_csv)
    test_df = pd.read_csv(test_csv)
    print('Test samples:', len(test_df))

    processor, model = load_model_and_processor(model_id, image_size=image_size)

    results = []
    for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc='Predict'):
        ans = predict_row(model, processor, row, data_root, image_size=image_size)
        results.append({'id': row['id'], 'answer': ans})

    # Explicit columns keep sort_values('id') working when the test set is empty.
    sub = pd.DataFrame(results, columns=['id', 'answer']).sort_values('id').reset_index(drop=True)

    # A bare file name has an empty dirname, and os.makedirs('') raises.
    out_dir = os.path.dirname(output_csv)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    sub.to_csv(output_csv, index=False)
    print('Saved submission to:', output_csv)
    return sub


In [None]:
# (Optional) Very simple fine‑tune stub — disabled by default
# This is a minimal baseline loop and may OOM on T4.
# Prefer zero‑shot or LoRA training in a dedicated pipeline.
def train_minimal(model_id: str, train_csv: str, data_root: str, output_dir: str, image_size: int = 384,
                  epochs: int = 1, batch_size: int = 1, grad_accum: int = 4, lr: float = 1e-4,
                  sample_n: int | None = None):
    """Fine-tune all model weights with a minimal mixed-precision loop.

    Args:
        model_id: model identifier to load the processor and model from.
        train_csv: CSV with 'question', 'a'-'d', 'answer' and an image column
            ('path' or 'image').
        data_root: directory the image column is relative to.
        output_dir: directory that receives the saved model and processor.
        image_size: image side length; sets the processor pixel budget and the
            size of the blank placeholder for unreadable images.
        epochs: number of passes over the (sampled) training data.
        batch_size: samples per DataLoader batch.
        grad_accum: batches accumulated per optimizer step (values < 1 act as 1).
        lr: AdamW learning rate.
        sample_n: if given, train on a random sample of at most this many rows
            (drawn with the module-level SEED).

    Returns:
        ``output_dir``.
    """
    from torch.utils.data import Dataset, DataLoader

    class VQADataset(Dataset):
        """Turns CSV rows into chat messages plus the loaded RGB image."""

        def __init__(self, df, processor, data_dir: str, train: bool = True):
            self.df = df.reset_index(drop=True)
            self.processor = processor
            self.data_dir = data_dir
            self.train = train

        def __len__(self):
            return len(self.df)

        def __getitem__(self, idx):
            row = self.df.iloc[idx]
            img_rel = row['path'] if 'path' in row else row.get('image', '')
            img_path = os.path.join(self.data_dir, img_rel)
            try:
                with Image.open(img_path) as handle:
                    img = handle.convert('RGB')
            except Exception:
                # Best-effort: an unreadable image becomes a blank placeholder.
                img = Image.new('RGB', (image_size, image_size), color='white')
            user_text = build_mc_prompt(str(row['question']), str(row['a']), str(row['b']), str(row['c']), str(row['d']))
            messages = [
                {'role': 'system', 'content': [{'type': 'text', 'text': SYSTEM_INSTRUCT}]},
                {'role': 'user',   'content': [
                    {'type': 'image', 'image': img},
                    {'type': 'text',  'text': user_text}
                ]}
            ]
            if self.train and 'answer' in row:
                messages.append({'role': 'assistant', 'content': [{'type': 'text', 'text': str(row['answer']).strip().lower()}]})
            return {'messages': messages, 'image': img}

    class Collator:
        """Renders the chat template for a batch and tokenizes text and images together."""

        def __init__(self, processor):
            self.processor = processor

        def __call__(self, batch):
            texts, images = [], []
            for sample in batch:
                text = self.processor.apply_chat_template(sample['messages'], tokenize=False, add_generation_prompt=False)
                texts.append(text)
                images.append(sample['image'])
            enc = self.processor(text=texts, images=images, padding=True, return_tensors='pt')
            # Naive labels on the full sequence, but padding positions are
            # masked with -100 so they do not contribute to the loss.
            labels = enc['input_ids'].clone()
            labels[enc['attention_mask'] == 0] = -100
            enc['labels'] = labels
            if torch.cuda.is_available():
                enc = {k: v.to('cuda') for k, v in enc.items()}
            return enc

    use_cuda = torch.cuda.is_available()
    accum = max(1, grad_accum)

    print('Loading training CSV:', train_csv)
    df = pd.read_csv(train_csv)
    if sample_n is not None:
        df = df.sample(n=min(sample_n, len(df)), random_state=SEED).reset_index(drop=True)
    print('Train samples:', len(df))

    processor = AutoProcessor.from_pretrained(model_id, min_pixels=image_size*image_size, max_pixels=image_size*image_size, trust_remote_code=True)
    # Weights stay in float32: GradScaler refuses to unscale float16 gradients
    # ("Attempting to unscale FP16 gradients"), so float16 weights would crash
    # at the first optimizer step. autocast below still runs the forward pass
    # in float16 on GPU.
    model = AutoModelForVision2Seq.from_pretrained(model_id, device_map='auto', trust_remote_code=True, torch_dtype=torch.float32)
    if use_cuda:
        model = model.to('cuda')
    model.train()

    ds = VQADataset(df, processor, data_root, train=True)
    dl = DataLoader(ds, batch_size=batch_size, shuffle=True, collate_fn=Collator(processor))
    n_batches = len(dl)

    optim = torch.optim.AdamW(model.parameters(), lr=lr)
    # One optimizer step per accumulation group, counting a trailing partial group.
    steps = epochs * math.ceil(n_batches / accum)
    sched = get_linear_schedule_with_warmup(optim, int(steps*0.03), steps)
    scaler = torch.cuda.amp.GradScaler(enabled=use_cuda)

    for epoch in range(epochs):
        for i, batch in enumerate(tqdm(dl, desc=f'Epoch {epoch+1}')):
            with torch.cuda.amp.autocast(enabled=use_cuda, dtype=torch.float16 if use_cuda else None):
                out = model(**batch)
                loss = out.loss / accum
            scaler.scale(loss).backward()
            # Also step on the last batch of the epoch, so a trailing partial
            # group is applied instead of being dropped or leaking into the
            # next epoch. This matches the `steps` count given to the scheduler.
            if (i + 1) % accum == 0 or (i + 1) == n_batches:
                scaler.step(optim)
                scaler.update()
                optim.zero_grad(set_to_none=True)
                sched.step()
        print(f'epoch {epoch+1} done')

    os.makedirs(output_dir, exist_ok=True)
    model.save_pretrained(output_dir)
    processor.save_pretrained(output_dir)
    print('Saved checkpoint to:', output_dir)
    return output_dir


In [None]:
# Run — zero‑shot by default
# Dispatch on DO_TRAIN: zero-shot inference is the default path and shows the
# first rows of the submission; the fine-tuning stub runs only when enabled.
if not DO_TRAIN:
    print('Running zero‑shot inference (no training)...')
    sub = run_zero_shot_inference(
        BASE_MODEL_ID,
        TEST_CSV,
        DATA_DIR,
        SUB_CSV,
        image_size=IMAGE_SIZE,
    )
    display(sub.head())
else:
    print('Fine‑tuning is enabled — this may OOM on T4.\nTry SAMPLE_TRAIN_SIZE small first or keep DO_TRAIN=False.')
    _ = train_minimal(
        BASE_MODEL_ID,
        TRAIN_CSV,
        DATA_DIR,
        CKPT_DIR,
        image_size=IMAGE_SIZE,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        grad_accum=GRAD_ACCUM,
        lr=LR,
        sample_n=SAMPLE_TRAIN_SIZE,
    )
