# Belle-VLM Inference (Standalone)

Notebook độc lập để chạy inference với Belle-VLM từ HuggingFace.

**Không cần clone repo GitHub!**

## Requirements
- Python 3.10+
- CUDA GPU (recommended)
- ~8GB VRAM

In [None]:
# Install dependencies
!pip install -q transformers>=4.51.0 torch torchvision
!pip install -q accelerate pillow einops timm
!pip install -q huggingface_hub datasets

In [None]:
# ============================================
# CONFIG
# ============================================

# Hugging Face Hub repository that the tokenizer and model are loaded from.
HF_MODEL_ID = "beyoru/Belle-VLM"

# Side length images are resized to; this is the size used during training.
IMAGE_SIZE = 384

# Echo the active settings so they are recorded in the notebook output.
print(f"Model: {HF_MODEL_ID}", f"Image size: {IMAGE_SIZE}", sep="\n")

In [None]:
# ============================================
# CONSTANTS (from LLaVA)
# ============================================

# Not referenced in the visible part of this notebook.
# NOTE(review): presumably the label value skipped by the training loss — confirm.
IGNORE_INDEX = -100
# Default placeholder id that tokenizer_image_token() inserts wherever the
# prompt contains DEFAULT_IMAGE_TOKEN.
IMAGE_TOKEN_INDEX = -200
# Textual marker inside a prompt that stands for the image.
DEFAULT_IMAGE_TOKEN = "<image>"
# The three markers below are not referenced in the visible part of this notebook.
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"

In [None]:
# ============================================
# CONVERSATION TEMPLATE (Qwen3)
# ============================================

from dataclasses import dataclass
from typing import List, Optional, Tuple

@dataclass
class Conversation:
    """Chat transcript that renders itself in the Qwen3 ``<|im_start|>`` markup."""
    # Optional system instruction; when empty, no system turn is emitted.
    system: str
    # Role names; ask_vlm() uses index 0 for the user and index 1 for the assistant.
    roles: Tuple[str, str]
    # Ordered [role, message] pairs.
    messages: List[List[str]]
    # Separators are stored and copied, but get_prompt() does not read them.
    sep: str
    sep2: Optional[str] = None

    def copy(self):
        """Return a new Conversation whose message list can be changed independently."""
        cloned_messages = [[speaker, text] for speaker, text in self.messages]
        return Conversation(
            self.system,
            self.roles,
            cloned_messages,
            self.sep,
            self.sep2,
        )

    def append_message(self, role: str, message: str):
        """Add one turn; a falsy ``message`` is rendered as an open (unterminated) turn."""
        self.messages.append([role, message])

    def get_prompt(self) -> str:
        """Render the system text and all turns as a single Qwen3-format prompt string."""
        pieces = []

        # The system turn is emitted only when a system text is set.
        if self.system:
            pieces.append(f"<|im_start|>system\n{self.system}<|im_end|>\n")

        for speaker, text in self.messages:
            if not text:
                # Open turn: header only, so generation continues from here.
                pieces.append(f"<|im_start|>{speaker}\n")
                continue
            pieces.append(f"<|im_start|>{speaker}\n{text}<|im_end|>\n")

        return "".join(pieces)


# Qwen3 conversation template
# Shared base template with an empty message list. ask_vlm() works on
# CONV_QWEN3.copy(), so turns are appended to the copy rather than to this object.
CONV_QWEN3 = Conversation(
    system="You are a helpful assistant.",
    roles=("user", "assistant"),
    messages=[],
    sep="<|im_end|>",
)

print("Conversation template ready!")

In [None]:
# ============================================
# TOKENIZER UTILS
# ============================================

import torch

def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
    """
    Tokenize a prompt, replacing each <image> marker with a placeholder id.

    The prompt is split on DEFAULT_IMAGE_TOKEN, every text piece is encoded
    on its own, and ``image_token_index`` is inserted between consecutive
    pieces.

    Args:
        prompt: Prompt text, possibly containing DEFAULT_IMAGE_TOKEN markers.
        tokenizer: Object exposing ``encode(text, add_special_tokens=...)``.
        image_token_index: Id inserted for every marker
            (default IMAGE_TOKEN_INDEX).
        return_tensors: None for a plain list of ids, 'pt' for a 1-D
            ``torch.long`` tensor.

    Returns:
        List of token ids, or a tensor when ``return_tensors == 'pt'``.

    Raises:
        ValueError: If ``return_tensors`` is neither None nor 'pt'.
    """
    # Fail fast on an unsupported format instead of silently returning a list.
    if return_tensors not in (None, 'pt'):
        raise ValueError(f"Unsupported tensor type: {return_tensors}")

    input_ids = []
    for position, chunk in enumerate(prompt.split(DEFAULT_IMAGE_TOKEN)):
        # Every piece after the first is preceded by one image placeholder.
        if position > 0:
            input_ids.append(image_token_index)
        # Special tokens are requested only for the first piece so they are
        # not repeated in the middle of the sequence.
        input_ids.extend(tokenizer.encode(chunk, add_special_tokens=(position == 0)))

    if return_tensors == 'pt':
        return torch.tensor(input_ids, dtype=torch.long)
    return input_ids

print("Tokenizer utils ready!")

In [None]:
# ============================================
# IMAGE PROCESSING
# ============================================

from PIL import Image
import torch
from torchvision import transforms

def expand2square(pil_img, background_color):
    """Pad an image to a square canvas, centring it along the shorter axis.

    A square input is returned unchanged (the same object). Otherwise a new
    image with side ``max(width, height)`` is created in the input's mode,
    filled with ``background_color``, and the input is pasted into it.
    """
    width, height = pil_img.size
    if width == height:
        return pil_img

    side = max(width, height)
    canvas = Image.new(pil_img.mode, (side, side), background_color)

    # Landscape images are shifted down, portrait images are shifted right.
    if width > height:
        offset = (0, (width - height) // 2)
    else:
        offset = ((height - width) // 2, 0)

    canvas.paste(pil_img, offset)
    return canvas


def process_image(image, image_size=384):
    """
    Turn a PIL image into a normalized tensor for the model.

    Steps: force RGB, pad to a square with gray, bicubic-resize to
    ``image_size`` x ``image_size``, convert to a tensor, then normalize
    with the CLIP mean/std.

    Args:
        image: PIL Image
        image_size: Target size (default 384 for mobileclip_l_384)

    Returns:
        Tensor of shape (3, image_size, image_size)
    """
    rgb_image = image if image.mode == 'RGB' else image.convert('RGB')

    # Pad with gray instead of cropping, so the resize below sees a square.
    squared = expand2square(rgb_image, (128, 128, 128))

    clip_mean = [0.48145466, 0.4578275, 0.40821073]
    clip_std = [0.26862954, 0.26130258, 0.27577711]

    resize = transforms.Resize(
        (image_size, image_size),
        interpolation=transforms.InterpolationMode.BICUBIC,
    )
    to_tensor = transforms.ToTensor()
    normalize = transforms.Normalize(mean=clip_mean, std=clip_std)

    pipeline = transforms.Compose([resize, to_tensor, normalize])
    return pipeline(squared)

print("Image processing ready!")

In [None]:
# ============================================
# LOAD MODEL & TOKENIZER
# ============================================

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig

print(f"Loading model from: {HF_MODEL_ID}")
print("This may take a few minutes...")

# Load tokenizer
# NOTE(review): trust_remote_code=True permits Python code shipped in the
# model repository to run locally — confirm HF_MODEL_ID points to a trusted repo.
tokenizer = AutoTokenizer.from_pretrained(
    HF_MODEL_ID,
    trust_remote_code=True,
    use_fast=False
)

# Fall back to the EOS token for padding when the tokenizer defines no pad
# token; ask_vlm() passes tokenizer.pad_token_id to generate().
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(f"Tokenizer loaded! Vocab size: {tokenizer.vocab_size}")

# Load model
# Weights are requested in float16, matching the float16 casts applied to the
# vision tower and to the image tensor in later cells.
model = AutoModelForCausalLM.from_pretrained(
    HF_MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)
# Inference only: switch the module to evaluation mode.
model.eval()

print(f"Model loaded!")
print(f"Model type: {model.config.model_type}")
print(f"Device: {model.device}")

In [None]:
# ============================================
# SETUP VISION TOWER
# ============================================

# Probe for the accessor first; without it there is no vision tower to set up.
has_vision_tower = hasattr(model, 'get_vision_tower')
print(f"Has vision tower method: {has_vision_tower}")

if not has_vision_tower:
    print("Model doesn't have vision tower - using as LLM-only")
else:
    vision_tower = model.get_vision_tower()

    if vision_tower is None:
        print("Vision tower is None - model may be LLM-only")
    else:
        # Load the tower's weights on demand if that has not happened yet.
        if not vision_tower.is_loaded:
            print("Loading vision tower...")
            vision_tower.load_model()

        # Put the tower on the model's device, in float16.
        vision_tower = vision_tower.to(device=model.device, dtype=torch.float16)
        print(f"Vision tower ready: {type(vision_tower).__name__}")

In [None]:
# ============================================
# INFERENCE FUNCTION
# ============================================

def ask_vlm(image, question, max_tokens=512, temperature=0.7):
    """
    Ask Belle-VLM a question about an image.

    Args:
        image: PIL Image, or a path to an image file (``str`` or any
            ``os.PathLike`` object such as ``pathlib.Path``)
        question: Question about the image (Vietnamese or English)
        max_tokens: Maximum tokens to generate (default 512)
        temperature: Sampling temperature (default 0.7); values <= 0
            disable sampling

    Returns:
        Model response as string

    Example:
        response = ask_vlm("image.jpg", "Mo ta hinh anh nay.")
        print(response)
    """
    import os  # function-scope import: only needed for the path check below

    # Load image if a path was given. os.PathLike is accepted as well as str,
    # so pathlib.Path arguments are opened instead of failing in process_image().
    if isinstance(image, (str, os.PathLike)):
        image = Image.open(image)

    # Process image
    image_tensor = process_image(image, IMAGE_SIZE)

    # Build prompt: one user turn holding the image marker and the question,
    # followed by an open assistant turn for the model to complete.
    conv = CONV_QWEN3.copy()
    conv.append_message(conv.roles[0], f"{DEFAULT_IMAGE_TOKEN}\n{question}")
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()

    # Tokenize (adds a batch dimension and moves the ids to the model's device)
    input_ids = tokenizer_image_token(
        prompt,
        tokenizer,
        IMAGE_TOKEN_INDEX,
        return_tensors='pt'
    ).unsqueeze(0).to(model.device)

    # Prepare image tensor: batch of one, float16, on the model's device
    images = image_tensor.unsqueeze(0).to(dtype=torch.float16, device=model.device)

    # Sampling parameters are only supplied when sampling is enabled.
    sampling = temperature > 0

    # Generate
    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=images,
            # Size of the PIL image as passed in / opened, before padding and resizing.
            image_sizes=[image.size],
            do_sample=sampling,
            temperature=temperature if sampling else None,
            top_p=0.8 if sampling else None,
            max_new_tokens=max_tokens,
            use_cache=True,
            pad_token_id=tokenizer.pad_token_id,
        )

    # Decode
    response = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()

    # Extract assistant response (after last "assistant\n").
    # NOTE(review): a reply that itself contains "assistant\n" is cut at the
    # last occurrence of that text.
    if "assistant\n" in response:
        response = response.split("assistant\n")[-1].strip()

    return response

print("Inference function ready!")
print("Usage: response = ask_vlm(image, question)")

## Test with Sample Image

In [None]:
# ============================================
# TEST WITH DATASET SAMPLE
# ============================================

from datasets import load_dataset
from IPython.display import display

# Fetch the evaluation data from the Hugging Face Hub.
print("Loading test dataset...")
dataset = load_dataset("5CD-AI/Viet-multimodal-open-r1-8k-verified", split="train")

# Pick one row and pull out the image, the question and the reference answer.
sample_idx = 0
test_image, test_question, expected_answer = (
    dataset[sample_idx][field] for field in ('image', 'vi_problem', 'vi_solution')
)

# Show a fixed-size 400x400 preview of the image.
print("Test Image:")
display(test_image.resize((400, 400)))

print(f"\nQuestion: {test_question[:300]}...")

In [None]:
# ============================================
# RUN INFERENCE
# ============================================

print("Generating response...")
response = ask_vlm(test_image, test_question)

# Print the report in one call; each argument ends up on its own line,
# with the question and expected answer truncated to 500 characters.
print(
    "=" * 60,
    "INFERENCE RESULT",
    "=" * 60,
    f"\nQuestion:\n{test_question[:500]}",
    f"\nModel Response:\n{response}",
    f"\nExpected Answer:\n{expected_answer[:500]}...",
    "=" * 60,
    sep="\n",
)

In [None]:
# ============================================
# TEST WITH CUSTOM IMAGE
# ============================================

# Option 1: From URL
import requests
from io import BytesIO

def load_image_from_url(url, timeout=30):
    """
    Download an image over HTTP(S) and open it with PIL.

    Args:
        url: Address of the image.
        timeout: Seconds to wait for the server before giving up
            (default 30). Without a timeout the request could block
            indefinitely on an unresponsive server.

    Returns:
        PIL Image opened from the downloaded bytes.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.RequestException: On timeouts and other network failures.
    """
    response = requests.get(url, timeout=timeout)
    # Surface HTTP errors here, rather than handing an error page to PIL and
    # getting an unrelated "cannot identify image file" failure.
    response.raise_for_status()
    return Image.open(BytesIO(response.content))

# The examples below call ask_vlm(), so the earlier cells must have been run first.

# Example with URL (uncomment to use)
# image_url = "https://example.com/image.jpg"
# custom_image = load_image_from_url(image_url)
# response = ask_vlm(custom_image, "Mo ta hinh anh nay.")
# print(response)

# Option 2: From file path
# custom_image = Image.open("/path/to/your/image.jpg")
# response = ask_vlm(custom_image, "Trong hinh co gi?")
# print(response)

print("Ready to test with custom images!")
print("Uncomment the code above to try with your own images.")

In [None]:
# ============================================
# BATCH INFERENCE (Multiple samples)
# ============================================

print("Testing on multiple samples...\n")

# Samples are processed one at a time: at most the first three rows of the dataset.
for i in range(min(3, len(dataset))):
    sample = dataset[i]
    image, question = sample['image'], sample['vi_problem']

    print(f"--- Sample {i+1} ---", f"Q: {question[:150]}...", sep="\n")

    response = ask_vlm(image, question, max_tokens=256)
    # The extra newline leaves a blank line between samples.
    print(f"A: {response[:300]}...", end="\n\n")

## Usage Summary

```python
# Basic usage
response = ask_vlm(image, "Mo ta hinh anh nay.")

# With file path
response = ask_vlm("/path/to/image.jpg", "Trong hinh co gi?")

# With custom parameters
response = ask_vlm(
    image,
    "Giai bai toan trong hinh.",
    max_tokens=1024,  # Longer response
    temperature=0.3   # More focused/deterministic
)
```

### Parameters
- `image`: PIL Image or path to image file
- `question`: Your question (Vietnamese or English)
- `max_tokens`: Maximum response length (default: 512)
- `temperature`: Creativity (0 = deterministic, 1 = creative, default: 0.7)