# Belle-VLM Inference - Simple Version

Notebook inference cho Belle-VLM từ HuggingFace.

**Cần clone repo `ml-fastvlm-v2` một lần để lấy code model (package `llava`).**

In [None]:
# ============================================
# STEP 1: Install & Clone
# ============================================

!pip install -q transformers>=4.51.0 torch torchvision
!pip install -q accelerate pillow einops timm peft
!pip install -q huggingface_hub datasets

# Clone repo (chỉ cần làm 1 lần)
import os
if not os.path.exists('ml-fastvlm-v2'):
    !git clone --depth 1 https://github.com/Hert4/ml-fastvlm-v2.git
    print("Cloned ml-fastvlm-v2")
else:
    print("ml-fastvlm-v2 already exists")

In [None]:
# ============================================
# STEP 2: Setup paths
# ============================================

import os
import sys

# Directory of the cloned repo, resolved to an absolute path once so the
# import path does not depend on the working directory staying the same.
REPO_DIR = os.path.abspath('ml-fastvlm-v2')

# Guarded so that re-running this cell does not stack duplicate entries.
if REPO_DIR not in sys.path:
    sys.path.insert(0, REPO_DIR)

# Config
HF_MODEL_ID = "beyoru/Belle-VLM"

print(f"Model: {HF_MODEL_ID}")
print("Path setup done!")

In [None]:
# ============================================
# STEP 3: Load Model
# ============================================

import torch
from llava.model.builder import load_pretrained_model

print(f"Loading model from {HF_MODEL_ID}...")
print("This may take a few minutes...")

# Arguments for the builder, gathered in one place so they are easy to tweak.
builder_kwargs = dict(
    model_path=HF_MODEL_ID,
    model_base=None,
    model_name="llava-qwen3",  # MUST contain 'llava'
    device_map="auto",
    device="cuda",
)

# A single builder call yields the tokenizer, the model, the image processor
# and the context length.
tokenizer, model, image_processor, context_len = load_pretrained_model(**builder_kwargs)

model.eval()

# image_processor may be falsy here, in which case the literal 'None' is shown.
processor_name = type(image_processor).__name__ if image_processor else 'None'

print("\nModel loaded!")
print(f"Image processor: {processor_name}")
print(f"Context length: {context_len}")

In [None]:
# ============================================
# STEP 4: Fallback for image_processor
# ============================================

# A replacement processor is built only when none is available yet.
if image_processor is not None:
    print(f"image_processor OK: {type(image_processor).__name__}")
else:
    print("image_processor is None, loading fallback...")
    from transformers import CLIPImageProcessor

    fallback_processor = CLIPImageProcessor.from_pretrained(
        "openai/clip-vit-large-patch14-336"
    )
    # Adjust for mobileclip_l_384 size
    fallback_processor.size = {"shortest_edge": 384}
    fallback_processor.crop_size = {"height": 384, "width": 384}
    image_processor = fallback_processor
    print("Fallback image_processor loaded!")

In [None]:
# ============================================
# STEP 5: Define inference function
# ============================================

from PIL import Image
from llava.conversation import conv_templates
from llava.mm_utils import tokenizer_image_token, process_images
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN

def ask_vlm(image, question, max_tokens=512, temperature=0.7):
    """
    Ask Belle-VLM a question about an image.

    Args:
        image: PIL Image, or a path to an image file (str or os.PathLike)
        question: Question in Vietnamese or English
        max_tokens: Max response length (number of newly generated tokens)
        temperature: Sampling temperature (0 = deterministic)

    Returns:
        Response string
    """
    import os  # function-scope import: this cell does not rely on another cell's imports

    # Load the image when a path is given. convert() returns a new, fully
    # loaded image, so the file handle can be released as soon as the
    # `with` block exits instead of staying open.
    if isinstance(image, (str, os.PathLike)):
        with Image.open(image) as opened:
            image = opened.convert('RGB')
    elif image.mode != 'RGB':
        image = image.convert('RGB')

    # Process image (single-image batch; keep the first entry)
    image_tensor = process_images([image], image_processor, model.config)[0]

    # Build prompt with Qwen3 template: image token + question as the first
    # role's turn, then an empty turn for the second role to be completed.
    conv = conv_templates["qwen_3"].copy()
    conv.append_message(conv.roles[0], f"{DEFAULT_IMAGE_TOKEN}\n{question}")
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()

    # Tokenize
    input_ids = tokenizer_image_token(
        prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt'
    ).unsqueeze(0).to(model.device)

    # Some tokenizers define no pad token; fall back to EOS so generate()
    # is never handed pad_token_id=None.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    # Sampling is enabled only for a positive temperature; otherwise the
    # sampling knobs are passed as None.
    sampling = temperature > 0

    # Generate
    # NOTE(review): the image tensor is cast to float16 unconditionally; this
    # assumes the model weights are float16 - confirm against the loader.
    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=image_tensor.unsqueeze(0).to(dtype=torch.float16, device=model.device),
            image_sizes=[image.size],
            do_sample=sampling,
            temperature=temperature if sampling else None,
            top_p=0.8 if sampling else None,
            max_new_tokens=max_tokens,
            use_cache=True,
            pad_token_id=pad_token_id,
        )

    # Decode
    response = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()

    # Extract assistant response: if the decoded text contains the
    # "assistant\n" marker, keep only what follows its last occurrence.
    if "assistant\n" in response:
        response = response.split("assistant\n")[-1].strip()

    return response

# Announce that the helper is defined and show a one-line usage hint.
print(
    "ask_vlm() function ready!",
    "Usage: response = ask_vlm(image, 'Mo ta hinh anh')",
    sep="\n",
)

In [None]:
# ============================================
# STEP 6: Test with dataset
# ============================================

from datasets import load_dataset
from IPython.display import display

# Load dataset
print("Loading test dataset...")
dataset = load_dataset("5CD-AI/Viet-multimodal-open-r1-8k-verified", split="train")

# Get sample. The row is fetched once and both fields are read from it,
# instead of indexing the dataset twice for the same row.
idx = 0
sample = dataset[idx]
test_image = sample['image']
test_question = sample['vi_problem']

# Display a fixed 300x300 preview and the first 200 characters of the question
print("Test image:")
display(test_image.resize((300, 300)))
print(f"\nQuestion: {test_question[:200]}...")

In [None]:
# ============================================
# STEP 7: Run inference
# ============================================

print("Generating response...\n")

response = ask_vlm(test_image, test_question)

# Report: a "RESULT" header framed by '=' rules, the first 300 characters of
# the question, the answer, and a closing rule.
banner = "=" * 50
report_lines = (
    banner,
    "RESULT",
    banner,
    f"\nQ: {test_question[:300]}...",
    f"\nA: {response}",
    banner,
)
for report_line in report_lines:
    print(report_line)

In [None]:
# ============================================
# Test more samples
# ============================================

# Try a simple description prompt on the same test image.
simple_prompt = "Mo ta hinh anh nay bang tieng Viet."
response = ask_vlm(test_image, simple_prompt)
print("Simple description:", response, sep="\n")

In [None]:
# ============================================
# Test with URL image
# ============================================

import requests
from io import BytesIO

def load_image_url(url, timeout=30):
    """
    Download an image over HTTP(S) and open it as a PIL Image.

    Args:
        url: Address of the image to download
        timeout: Seconds to wait for the server before giving up

    Returns:
        PIL Image opened from the downloaded bytes

    Raises:
        requests.HTTPError: If the server answers with an error status
        requests.Timeout: If the server does not answer within `timeout`
    """
    # Without a timeout, requests can wait indefinitely on an unresponsive host.
    http_response = requests.get(url, timeout=timeout)
    # Surface HTTP errors directly instead of handing an error page to the
    # image decoder.
    http_response.raise_for_status()
    return Image.open(BytesIO(http_response.content))

# Example: remove the leading "# " from the lines below to download an image,
# show a 300x300 preview, and ask the model about it.
# url = "https://upload.wikimedia.org/wikipedia/commons/thumb/1/1f/Oryctolagus_cuniculus_Rcdo.jpg/1200px-Oryctolagus_cuniculus_Rcdo.jpg"
# img = load_image_url(url)
# display(img.resize((300, 300)))
# response = ask_vlm(img, "Day la con gi?")
# print(response)

print("Ready for URL images! Uncomment code above to test.")

## Usage Summary

```python
# Basic
response = ask_vlm(image, "Mo ta hinh anh nay.")

# From file
response = ask_vlm("/path/to/image.jpg", "Trong hinh co gi?")

# Custom params
response = ask_vlm(image, "Giai bai toan.", max_tokens=1024, temperature=0.3)
```