# Belle-VLM Inference

Vietnamese Vision Language Model - Load từ HuggingFace

**Không cần clone repo!**

In [None]:
# Install dependencies
!pip install -q transformers>=4.51.0 torch torchvision pillow accelerate

In [None]:
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer
from torchvision import transforms

MODEL_ID = "beyoru/Belle-VLM"

print(f"Loading {MODEL_ID}...")

# Tokenizer first, then the model in float16 with automatic device placement.
# trust_remote_code=True allows the code shipped in the model repo to run.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
# nn.Module.eval() returns the module itself, so it is chained onto the load.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
).eval()

print(f"Model loaded on {model.device}")

In [None]:
# Image processing & inference function
# Sentinel id that tokenize_with_image() inserts into the token sequence at
# every "<image>" placeholder in the prompt.
# NOTE(review): presumably negative so it cannot collide with a real vocabulary
# id, and expected by the model's remote code — confirm against the model repo.
IMAGE_TOKEN_INDEX = -200

def process_image(image, size=384):
    """Turn an image (or a path to one) into a normalized square tensor.

    Args:
        image: A PIL image, or a string path that is opened with ``Image.open``.
        size: Edge length, in pixels, of the square the image is resized to.

    Returns:
        The output of the torchvision pipeline: resize to ``(size, size)``,
        ``ToTensor``, then per-channel ``Normalize``.
    """
    source = Image.open(image) if isinstance(image, str) else image
    rgb = source if source.mode == 'RGB' else source.convert('RGB')

    # Non-square input is centred on a gray square canvas whose edge equals
    # the longer side, so the resize below does not change the aspect ratio.
    width, height = rgb.size
    if width != height:
        edge = max(width, height)
        offset = ((edge - width) // 2, (edge - height) // 2)
        canvas = Image.new('RGB', (edge, edge), (128, 128, 128))
        canvas.paste(rgb, offset)
        rgb = canvas

    channel_mean = [0.48145466, 0.4578275, 0.40821073]
    channel_std = [0.26862954, 0.26130258, 0.27577711]
    pipeline = transforms.Compose([
        transforms.Resize((size, size)),
        transforms.ToTensor(),
        transforms.Normalize(channel_mean, channel_std),
    ])
    return pipeline(rgb)


def tokenize_with_image(prompt, tokenizer):
    """Encode ``prompt`` into token ids, marking each ``<image>`` placeholder.

    The prompt is cut at every literal ``<image>``. Each piece is encoded on
    its own and ``IMAGE_TOKEN_INDEX`` is inserted between consecutive pieces.
    Only the first piece is encoded with ``add_special_tokens=True``.

    Args:
        prompt: Full prompt text, optionally containing ``<image>`` markers.
        tokenizer: Object exposing ``encode(text, add_special_tokens=...)``.

    Returns:
        A 1-D ``torch.long`` tensor holding the combined ids.
    """
    leading, *following = prompt.split("<image>")
    ids = list(tokenizer.encode(leading, add_special_tokens=True))
    for piece in following:
        # One sentinel per "<image>" occurrence, placed before the next piece.
        ids.append(IMAGE_TOKEN_INDEX)
        ids.extend(tokenizer.encode(piece, add_special_tokens=False))
    return torch.tensor(ids, dtype=torch.long)


def ask_vlm(image, question, max_tokens=512, temperature=0.7):
    """
    Ask Belle-VLM a question about an image.

    Args:
        image: PIL Image or a path to an image file.
        question: The question (Vietnamese or English).
        max_tokens: Maximum number of new tokens generated for the answer.
        temperature: Values <= 0 select greedy decoding; values > 0 enable
            sampling (with top_p=0.8), higher meaning more varied output.

    Returns:
        The decoded answer text with surrounding whitespace removed.
    """
    if isinstance(image, str):
        pil_image = Image.open(image)
    else:
        pil_image = image

    image_tensor = process_image(pil_image)

    prompt = f"""<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
<image>
{question}<|im_end|>
<|im_start|>assistant
"""

    input_ids = tokenize_with_image(prompt, tokenizer).unsqueeze(0).to(model.device)
    # Tensor.to() has no positional (dtype, device) overload, so that call
    # raises TypeError; keyword arguments make the conversion unambiguous.
    images = image_tensor.unsqueeze(0).to(device=model.device, dtype=torch.float16)

    with torch.inference_mode():
        outputs = model.generate(
            input_ids,
            images=images,
            image_sizes=[pil_image.size],
            max_new_tokens=max_tokens,
            do_sample=temperature > 0,
            temperature=temperature if temperature > 0 else None,
            top_p=0.8 if temperature > 0 else None,
            pad_token_id=tokenizer.pad_token_id,
        )

    generated = outputs[0]
    prompt_ids = input_ids[0].to(generated.device)
    prompt_len = prompt_ids.shape[0]

    # If generate() echoed the prompt ids, drop exactly those tokens. Splitting
    # the decoded text on the word "assistant" would also cut any answer that
    # merely contains that word.
    if generated.shape[0] >= prompt_len and torch.equal(generated[:prompt_len], prompt_ids):
        return tokenizer.decode(generated[prompt_len:], skip_special_tokens=True).strip()

    response = tokenizer.decode(generated, skip_special_tokens=True)

    # Fallback for a prompt echoed in a different token form: with the special
    # tokens skipped, such text would begin with the "system" role line and the
    # answer would follow the last "assistant" role line.
    # NOTE(review): assumes <|im_start|>/<|im_end|> are special tokens for this
    # tokenizer — confirm.
    role_marker = "\nassistant\n"
    if response.startswith("system\n") and role_marker in response:
        response = response.rsplit(role_marker, 1)[-1]

    return response.strip()

# Confirm that the helper functions defined in this cell are now available.
print("ask_vlm() ready!")

In [None]:
# Try the model on a sample from a Hugging Face dataset
from datasets import load_dataset
from IPython.display import display

dataset = load_dataset(
    "5CD-AI/Viet-multimodal-open-r1-8k-verified",
    split="train",
)

# Take the first record's 'image' and 'vi_problem' fields as the test input.
test_image, test_question = (dataset[0][field] for field in ('image', 'vi_problem'))

# Show a 300x300 preview and the first 200 characters of the question.
display(test_image.resize((300, 300)))
print(f"Question: {test_question[:200]}...")

In [None]:
# Run inference on the sample and print the question/answer pair between
# two 50-character rulers (question truncated to 300 characters).
response = ask_vlm(test_image, test_question)

print(
    "=" * 50,
    f"Q: {test_question[:300]}...",
    f"\nA: {response}",
    "=" * 50,
    sep="\n",
)

In [None]:
# Try a simple prompt: ask for a description of the image in Vietnamese.
response = ask_vlm(
    image=test_image,
    question="Mô tả hình ảnh này bằng tiếng Việt.",
)
print(response)

In [None]:
# Test với URL
import requests
from io import BytesIO

def load_url(url, timeout=30):
    """Download an image over HTTP(S) and open it with PIL.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds passed to ``requests.get``; without a timeout a
            stalled server would block the notebook indefinitely.

    Returns:
        The PIL image opened from the response body.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
    """
    reply = requests.get(url, timeout=timeout)
    # Fail on HTTP errors here instead of handing an error page to Image.open.
    reply.raise_for_status()
    return Image.open(BytesIO(reply.content))

# Uncomment để test:
# img = load_url("https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/PNG_transparency_demonstration_1.png/300px-PNG_transparency_demonstration_1.png")
# display(img)
# print(ask_vlm(img, "Đây là gì?"))

## Cách dùng

```python
# Cơ bản
response = ask_vlm(image, "Mô tả hình ảnh này.")

# Từ file
response = ask_vlm("/path/to/image.jpg", "Trong hình có gì?")

# Tùy chỉnh
response = ask_vlm(
    image,
    "Giải bài toán trong hình.",
    max_tokens=1024,   # Câu trả lời dài hơn
    temperature=0.3    # Chính xác hơn
)
```