# FastVLM Training with Qwen3-0.6B on Kaggle

This notebook trains FastVLM (Vision Language Model) using:
- **LLM Backbone**: Qwen3-0.6B
- **Vision Encoder**: FastViTHD (MobileCLIP)
- **Dataset**: 5CD-AI/Viet-multimodal-open-r1-8k-verified
- **Output**: Push to HuggingFace: beyoru/Belle-VLM

## Requirements
- Kaggle GPU (P100/T4/A100)
- ~16GB VRAM for full training, or use QLoRA for less

## 1. Install Dependencies

In [None]:
# Install required packages
!pip install -q transformers>=4.51.0,<5.0.0
!pip install -q torch>=2.1.0 torchvision>=0.16.0
!pip install -q accelerate>=0.26.0 peft>=0.10.0
!pip install -q bitsandbytes>=0.43.0
!pip install -q datasets pillow einops timm>=0.9.0
!pip install -q deepspeed sentencepiece
!pip install -q huggingface_hub

In [None]:
# Clone the FastVLM repository and make it the working directory; later cells
# rely on relative paths inside the checkout (`test_qwen3_load.py`, the `llava`
# package added via `sys.path.insert(0, '.')`).
# NOTE(review): re-running this cell would invoke `git clone` from inside the
# checkout — presumably it is meant to run once per session.
!git clone https://github.com/Hert4/ml-fastvlm-v2.git
%cd ml-fastvlm-v2

In [None]:
# Verify Qwen3 support
!python test_qwen3_load.py

## 2. Setup & Configuration

In [None]:
import os
import json
import torch
from pathlib import Path

# Configuration
# Central settings dict: the dataset, training, merge and upload cells below
# read their parameters from CONFIG.
CONFIG = {
    # Model
    "llm_model": "Qwen/Qwen3-0.6B",
    # Vision tower format: mobileclip_l_<image_size>
    # Available: mobileclip_l_256, mobileclip_l_384, mobileclip_l_512
    # The config uses fastvithd backbone with embed_dim=3072
    "vision_tower": "mobileclip_l_384",  # FastViTHD with 384x384 input
    
    # Dataset
    "dataset_name": "5CD-AI/Viet-multimodal-open-r1-8k-verified",
    "image_column": "image",
    "question_column": "vi_problem",
    "answer_column": "vi_solution",
    
    # Training
    "output_dir": "./outputs/fastvlm-qwen3-0.6b-vietnamese",
    "num_train_epochs": 3,
    "per_device_train_batch_size": 2,
    "gradient_accumulation_steps": 8,
    "learning_rate": 2e-5,
    "warmup_ratio": 0.03,
    "lr_scheduler_type": "cosine",
    # NOTE(review): the requirements list P100/T4 GPUs — confirm that those
    # devices can actually train in bf16 before relying on this default.
    "bf16": True,
    "model_max_length": 2048,
    
    # LoRA (for memory efficiency)
    "use_lora": True,
    "lora_r": 64,
    "lora_alpha": 128,
    "lora_dropout": 0.05,
    
    # HuggingFace - Set your token in Kaggle Secrets as HF_TOKEN
    # NOTE(review): the token is read from the process environment only;
    # confirm that Kaggle Secrets are really exported as the HF_TOKEN
    # environment variable (they may need to be fetched explicitly).
    "hf_repo": "beyoru/Belle-VLM",
    "hf_token": os.environ.get("HF_TOKEN", ""),  # Get from environment/Kaggle secrets
}

# Create output directory
os.makedirs(CONFIG["output_dir"], exist_ok=True)

# Echo the configuration, leaving the token out so it never lands in the
# notebook output.
print("Configuration:")
for k, v in CONFIG.items():
    if k != "hf_token":
        print(f"  {k}: {v}")

# An empty token is only a warning here; the login cell checks it again.
if not CONFIG["hf_token"]:
    print("\n⚠️ WARNING: HF_TOKEN not set! Set it in Kaggle Secrets to push model to HuggingFace.")

In [None]:
# Authenticate against the HuggingFace Hub when a token is configured;
# otherwise print the steps needed to provide one.
from huggingface_hub import login

if not CONFIG["hf_token"]:
    print("\n".join([
        "⚠️ HF_TOKEN not set. To push to HuggingFace, set your token:",
        "   1. Go to Kaggle notebook settings",
        "   2. Add Secret: Name='HF_TOKEN', Value='your_hf_token'",
        "   3. Enable 'Add-ons' > 'Secrets' in your notebook",
    ]))
else:
    login(token=CONFIG["hf_token"])
    print("✅ Logged in to HuggingFace!")

## 3. Load and Prepare Dataset

In [None]:
from datasets import load_dataset
from PIL import Image
import io

# Load the training split of the configured dataset and print a short summary.
print(f"Loading dataset: {CONFIG['dataset_name']}")
dataset = load_dataset(CONFIG["dataset_name"], split="train")

print(f"\nDataset info:")
print(f"  Total samples: {len(dataset)}")
print(f"  Columns: {dataset.column_names}")

# Show a preview of the first sample. The column names come from CONFIG so
# that changing the dataset settings there is enough to keep this cell working.
sample = dataset[0]
print(f"\nSample data:")
print(f"  {CONFIG['question_column']}: {sample[CONFIG['question_column']][:200]}...")
print(f"  {CONFIG['answer_column']}: {sample[CONFIG['answer_column']][:200]}...")

In [None]:
# Display sample image
from IPython.display import display

# Preview the first sample's image at 400x400; when the value is not a PIL
# image, report its type instead.
sample_image = sample['image']
if not isinstance(sample_image, Image.Image):
    print(f"Image type: {type(sample_image)}")
else:
    display(sample_image.resize((400, 400)))

In [None]:
import json
import os
from tqdm import tqdm

def _as_pil_image(raw):
    """Return *raw* as a PIL image, decoding byte payloads; None if unusable."""
    import io  # local import keeps this cell self-contained

    if isinstance(raw, Image.Image):
        return raw
    # Undecoded image columns may arrive as {"bytes": ..., "path": ...}.
    if isinstance(raw, dict):
        raw = raw.get("bytes")
    if isinstance(raw, (bytes, bytearray)):
        try:
            return Image.open(io.BytesIO(raw))
        except OSError:
            # Corrupt or unrecognised payload: treat as "no image".
            return None
    return None


def convert_dataset_to_llava_format(
    dataset,
    output_dir,
    image_folder,
    *,
    image_column="image",
    question_column="vi_problem",
    answer_column="vi_solution",
    max_answer_len=4096,
):
    """
    Convert dataset to LLaVA training format.

    Each usable sample has its image written to ``image_folder`` as a JPEG and
    is appended to ``<output_dir>/train_data.json`` as:

    {
        "id": "unique_id",
        "image": "image_filename.jpg",
        "conversations": [
            {"from": "human", "value": "<image>" + newline + question},
            {"from": "gpt", "value": answer}
        ]
    }

    Samples without a decodable image, or with an empty question or answer,
    are skipped so that the JSON never references an image file that was not
    written.

    Args:
        dataset: Iterable of dict-like samples.
        output_dir: Directory that receives ``train_data.json``.
        image_folder: Directory that receives the JPEG files.
        image_column: Key holding the image (PIL image, raw bytes, or a dict
            with a ``"bytes"`` entry).
        question_column: Key holding the question text.
        answer_column: Key holding the answer text.
        max_answer_len: Answers longer than this many characters are cut and
            suffixed with ``"..."``.

    Returns:
        Tuple ``(json_path, image_folder)``.
    """
    os.makedirs(image_folder, exist_ok=True)
    os.makedirs(output_dir, exist_ok=True)

    llava_data = []
    skipped = 0

    for idx, sample in enumerate(tqdm(dataset, desc="Converting dataset")):
        img = _as_pil_image(sample.get(image_column))
        question = (sample.get(question_column) or "").strip()
        answer = (sample.get(answer_column) or "").strip()

        # Validate before touching the disk so no orphan image is written.
        if img is None or not question or not answer:
            skipped += 1
            continue

        # Save image (JPEG cannot store alpha/palette modes, hence RGB).
        image_filename = f"{idx:06d}.jpg"
        image_path = os.path.join(image_folder, image_filename)
        if img.mode != 'RGB':
            img = img.convert('RGB')
        img.save(image_path, 'JPEG', quality=95)

        # Truncate very long solutions to avoid memory issues
        if len(answer) > max_answer_len:
            answer = answer[:max_answer_len] + "..."

        llava_data.append({
            "id": str(sample.get('id', idx)),
            "image": image_filename,
            "conversations": [
                {"from": "human", "value": f"<image>\n{question}"},
                {"from": "gpt", "value": answer},
            ],
        })

    # Save JSON
    json_path = os.path.join(output_dir, "train_data.json")
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(llava_data, f, ensure_ascii=False, indent=2)

    print(f"\nDataset converted:")
    print(f"  JSON file: {json_path}")
    print(f"  Images: {image_folder}")
    print(f"  Total samples: {len(llava_data)}")
    if skipped:
        print(f"  Skipped samples (missing image or text): {skipped}")

    return json_path, image_folder

# Convert the loaded dataset; the JSON manifest goes to DATA_DIR and the
# extracted images to DATA_DIR/images.
DATA_DIR = "./data"
IMAGE_FOLDER = os.path.join(DATA_DIR, "images")

json_path, image_folder = convert_dataset_to_llava_format(
    dataset=dataset,
    output_dir=DATA_DIR,
    image_folder=IMAGE_FOLDER,
)

## 4. Setup Training

In [None]:
# Create training script arguments
# Make the repository root importable so `llava.*` modules resolve.
import sys
sys.path.insert(0, '.')

# Summary of the intended training settings, derived from CONFIG.
# NOTE: no later cell visible in this notebook reads `training_args`; the
# actual training command is assembled directly from CONFIG in section 5, so
# edits made only here do not change training.
training_args = {
    # Model
    "model_name_or_path": CONFIG["llm_model"],
    "version": "qwen_3",
    "vision_tower": CONFIG["vision_tower"],  # mobileclip_l_384
    "mm_projector_type": "mlp2x_gelu",
    
    # Data
    "data_path": json_path,
    "image_folder": image_folder,
    "image_aspect_ratio": "pad",
    
    # Training
    "output_dir": CONFIG["output_dir"],
    "num_train_epochs": CONFIG["num_train_epochs"],
    "per_device_train_batch_size": CONFIG["per_device_train_batch_size"],
    "gradient_accumulation_steps": CONFIG["gradient_accumulation_steps"],
    "learning_rate": CONFIG["learning_rate"],
    "warmup_ratio": CONFIG["warmup_ratio"],
    "lr_scheduler_type": CONFIG["lr_scheduler_type"],
    "model_max_length": CONFIG["model_max_length"],
    
    # Precision
    "bf16": CONFIG["bf16"],
    
    # LoRA
    "lora_enable": CONFIG["use_lora"],
    "lora_r": CONFIG["lora_r"],
    "lora_alpha": CONFIG["lora_alpha"],
    "lora_dropout": CONFIG["lora_dropout"],
    
    # Other
    "gradient_checkpointing": True,
    "dataloader_num_workers": 4,
    "logging_steps": 1,
    "save_steps": 500,
    "save_total_limit": 2,
    "report_to": "none",
}

print("Training arguments prepared!")
print(f"Vision tower: {CONFIG['vision_tower']}")

In [None]:
# Alternative: Direct Python training (recommended for Kaggle)
# NOTE: this and the following three cells only build an in-kernel QLoRA
# model; the training run in section 5 launches `llava.train.train_qwen` as a
# separate process and does not use the objects created here.

import torch
from transformers import AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Check GPU: report CUDA availability and, when present, the name and total
# memory (in GB, base 1e9) of device 0.
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

In [None]:
# Load tokenizer
# The tokenizer for the LLM backbone is loaded with right-side padding and the
# maximum sequence length from CONFIG; the slow (non-"fast") implementation is
# requested explicitly.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    CONFIG["llm_model"],
    model_max_length=CONFIG["model_max_length"],
    padding_side="right",
    use_fast=False,
)

# Set pad token: fall back to the EOS token when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(f"Tokenizer loaded: {CONFIG['llm_model']}")
print(f"Vocab size: {tokenizer.vocab_size}")

In [None]:
# Load model with QLoRA (4-bit quantization for memory efficiency)
# NOTE(review): the model created here is not handed to the training
# subprocess in section 5 (that command passes CONFIG['llm_model'] by name),
# and `model` is rebound in section 6. Presumably it keeps GPU memory
# allocated while training runs — confirm whether this cell is needed.
from llava.model.language_model.llava_qwen import LlavaQwen3ForCausalLM, LlavaQwen3Config

# Quantization config for 4-bit: NF4 weights with double quantization and
# bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

print("Loading model...")
# Weights come from the plain LLM checkpoint named in CONFIG, loaded through
# the repository's LLaVA wrapper class.
model = LlavaQwen3ForCausalLM.from_pretrained(
    CONFIG["llm_model"],
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

print(f"Model loaded!")
print(f"Model type: {model.config.model_type}")

In [None]:
# Prepare model for k-bit training (with gradient checkpointing enabled)
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

# Setup LoRA: rank, alpha and dropout come from CONFIG; adapters are attached
# to the attention projections (q/k/v/o) and the MLP projections
# (gate/up/down). Bias terms are left untouched.
lora_config = LoraConfig(
    r=CONFIG["lora_r"],
    lora_alpha=CONFIG["lora_alpha"],
    lora_dropout=CONFIG["lora_dropout"],
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    bias="none",
    task_type="CAUSAL_LM",
)

# Wrap the model with the adapters and report how many parameters train.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

## 5. Train the Model

In [None]:
# Run training using the train_qwen.py script
import subprocess
import torch
import os

# Force single GPU to avoid DataParallel issues with variable-length sequences
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Check GPU capability for TF32 support (requires Ampere or newer)
def supports_tf32():
    """Return True when a CUDA device with compute capability >= 8.0 is present.

    TF32 needs an Ampere-or-newer GPU; without CUDA the answer is always False.
    """
    if not torch.cuda.is_available():
        return False
    major, _minor = torch.cuda.get_device_capability()
    return major >= 8

import sys  # local to this cell; used to launch training with the kernel's interpreter

use_tf32 = supports_tf32()
print(f"GPU supports TF32: {use_tf32}")

# Build command
# The module is started with the same interpreter that runs this notebook so
# the packages installed above are the ones the training process sees.
cmd = [
    sys.executable, "-m", "llava.train.train_qwen",
    f"--model_name_or_path={CONFIG['llm_model']}",
    "--version=qwen_3",
    f"--data_path={json_path}",
    f"--image_folder={image_folder}",
    f"--vision_tower={CONFIG['vision_tower']}",  # mobileclip_l_384
    "--mm_projector_type=mlp2x_gelu",
    "--mm_vision_select_layer=-2",
    "--mm_use_im_start_end=False",
    "--mm_use_im_patch_token=False",
    "--image_aspect_ratio=pad",
    f"--output_dir={CONFIG['output_dir']}",
    f"--num_train_epochs={CONFIG['num_train_epochs']}",
    f"--per_device_train_batch_size={CONFIG['per_device_train_batch_size']}",
    f"--gradient_accumulation_steps={CONFIG['gradient_accumulation_steps']}",
    f"--learning_rate={CONFIG['learning_rate']}",
    "--weight_decay=0.0",
    f"--warmup_ratio={CONFIG['warmup_ratio']}",
    f"--lr_scheduler_type={CONFIG['lr_scheduler_type']}",
    "--logging_steps=1",
    "--save_steps=500",
    "--save_total_limit=2",
    f"--model_max_length={CONFIG['model_max_length']}",
    "--gradient_checkpointing=True",
    "--dataloader_num_workers=4",
    "--lazy_preprocess=True",
    # Precision follows CONFIG so the setting has a single source of truth.
    f"--bf16={CONFIG['bf16']}",
    "--report_to=none",
    # LoRA
    "--lora_enable=True",
    f"--lora_r={CONFIG['lora_r']}",
    f"--lora_alpha={CONFIG['lora_alpha']}",
    f"--lora_dropout={CONFIG['lora_dropout']}",
    # TF32 is only requested when the GPU supports it (Ampere A100+)
    f"--tf32={use_tf32}",
]

print("Starting training...")
print("Using single GPU (CUDA_VISIBLE_DEVICES=0) to avoid DataParallel issues")
print("Command:", " ".join(cmd[:5]), "...")

# Run training with single GPU environment
env = os.environ.copy()
env["CUDA_VISIBLE_DEVICES"] = "0"

# The context manager closes the pipe and waits for the child even if
# streaming is interrupted. stderr is folded into stdout so log lines appear
# in order.
with subprocess.Popen(
    cmd,
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    universal_newlines=True,
    bufsize=1,
    env=env,
) as process:
    # Stream output
    for line in process.stdout:
        print(line, end='')

# Stop here on failure so the merge/upload cells never run on a missing or
# partial checkpoint.
if process.returncode != 0:
    raise RuntimeError(f"Training failed with return code: {process.returncode}")

print(f"\nTraining completed with return code: {process.returncode}")

## 6. Merge LoRA and Save Model

In [None]:
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import os

# Paths
# The LoRA adapter is read from the training output directory, and the merged
# model is written to a "merged" subdirectory of that same directory.
BASE_MODEL = CONFIG["llm_model"]
LORA_PATH = CONFIG["output_dir"]
MERGED_PATH = os.path.join(CONFIG["output_dir"], "merged")

print(f"Base model: {BASE_MODEL}")
print(f"LoRA path: {LORA_PATH}")
print(f"Merged output: {MERGED_PATH}")

In [None]:
# Load base model
# NOTE(review): this loads the plain language model via AutoModelForCausalLM,
# whereas the earlier cells use LlavaQwen3ForCausalLM — confirm that the
# vision tower and projector weights still end up in the merged checkpoint.
# NOTE(review): weights are loaded in float16 while training is configured
# for bf16 — confirm the dtype change is intended.
print("Loading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)

# Load tokenizer (replaces the `tokenizer` created in section 4)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)

print("Base model loaded!")

In [None]:
# Load and merge LoRA
# The adapter stored in LORA_PATH is attached to the base model, then folded
# into the base weights; `model` is rebound to the merged result.
print("Loading LoRA weights...")
model = PeftModel.from_pretrained(base_model, LORA_PATH)

print("Merging LoRA weights...")
model = model.merge_and_unload()

print("LoRA merged successfully!")

In [None]:
# Write the merged weights (safetensors) and the tokenizer into MERGED_PATH.
print(f"Saving merged model to {MERGED_PATH}...")
os.makedirs(MERGED_PATH, exist_ok=True)

model.save_pretrained(MERGED_PATH, safe_serialization=True)
tokenizer.save_pretrained(MERGED_PATH)

# Carry over the projector weights and the training-time config when the
# training output directory contains them; missing files are ignored.
import shutil
for file in ("mm_projector.bin", "config.json"):
    src = os.path.join(LORA_PATH, file)
    if not os.path.exists(src):
        continue
    shutil.copy(src, os.path.join(MERGED_PATH, file))
    print(f"Copied: {file}")

print("Merged model saved!")

## 7. Push to HuggingFace

In [None]:
from huggingface_hub import HfApi, create_repo

# Initialize API with the token from CONFIG (an empty string when HF_TOKEN
# was not set).
api = HfApi(token=CONFIG["hf_token"])

# Create repo if not exists
# `exist_ok=True` makes an already-existing repository a non-error, so an
# exception reaching the handler below indicates some other failure. The
# handler only reports it; the notebook continues to the next cell.
try:
    create_repo(
        repo_id=CONFIG["hf_repo"],
        repo_type="model",
        exist_ok=True,
        token=CONFIG["hf_token"]
    )
    print(f"Repository ready: {CONFIG['hf_repo']}")
except Exception as e:
    print(f"Repo exists or error: {e}")

In [None]:
# Create model card
# Dataset name, sample count and vision tower are filled in from CONFIG and
# the loaded dataset so the card cannot drift from what was actually trained.
# NOTE(review): the "QLoRA (4-bit)" wording below is kept from the original
# card, but the training command in section 5 passes no quantization flag —
# confirm which method was really used.
model_card = f"""---
license: apache-2.0
language:
- vi
- en
tags:
- vision-language-model
- vlm
- qwen3
- fastvlm
- vietnamese
base_model: {CONFIG['llm_model']}
datasets:
- {CONFIG['dataset_name']}
---

# Belle-VLM: Vietnamese Vision Language Model

## Model Description

Belle-VLM is a Vision Language Model trained for Vietnamese multimodal reasoning tasks.

### Architecture
- **LLM Backbone**: Qwen3-0.6B
- **Vision Encoder**: FastViTHD ({CONFIG['vision_tower']})
- **Projector**: MLP 2-layer

### Training
- **Dataset**: {CONFIG['dataset_name']} ({len(dataset):,} samples)
- **Method**: LoRA fine-tuning with QLoRA (4-bit)
- **Epochs**: {CONFIG['num_train_epochs']}
- **Learning Rate**: {CONFIG['learning_rate']}

## Usage

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image

model = AutoModelForCausalLM.from_pretrained("{CONFIG['hf_repo']}")
tokenizer = AutoTokenizer.from_pretrained("{CONFIG['hf_repo']}")

# For VLM inference, use with FastVLM pipeline
```

## Training Details

| Parameter | Value |
|-----------|-------|
| Base Model | {CONFIG['llm_model']} |
| LoRA Rank | {CONFIG['lora_r']} |
| LoRA Alpha | {CONFIG['lora_alpha']} |
| Batch Size | {CONFIG['per_device_train_batch_size']} x {CONFIG['gradient_accumulation_steps']} |
| Max Length | {CONFIG['model_max_length']} |

## License

Apache 2.0

## Acknowledgments

- FastVLM architecture from Apple
- Qwen3 from Alibaba
- Dataset from 5CD-AI
"""

# Save model card next to the merged weights so it is uploaded with them
readme_path = os.path.join(MERGED_PATH, "README.md")
with open(readme_path, "w", encoding="utf-8") as f:
    f.write(model_card)

print("Model card created!")

In [None]:
# Upload to HuggingFace
# Pushes the whole MERGED_PATH folder (weights, tokenizer, README and any
# copied files) to the model repository named in CONFIG as a single commit.
print(f"Uploading to {CONFIG['hf_repo']}...")

api.upload_folder(
    folder_path=MERGED_PATH,
    repo_id=CONFIG["hf_repo"],
    repo_type="model",
    commit_message="Upload Belle-VLM trained on Vietnamese multimodal dataset",
)

print(f"\n{'='*50}")
print(f"Model uploaded successfully!")
print(f"View at: https://huggingface.co/{CONFIG['hf_repo']}")
print(f"{'='*50}")

## 8. Test Inference

In [None]:
# Quick inference test
from llava.model.builder import load_pretrained_model
from llava.conversation import conv_templates
from llava.mm_utils import tokenizer_image_token, process_images
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
from PIL import Image
import torch

# Load model
# The merged checkpoint directory is loaded through the repository's own
# loader, which returns the tokenizer, model, image processor and context
# length. These rebind `tokenizer` and `model` from the merge section.
model_path = MERGED_PATH
tokenizer, model, image_processor, context_len = load_pretrained_model(
    model_path, None, "fastvlm-qwen3", device="cuda"
)

print("Model loaded for inference!")

In [None]:
# Test with sample image
# Uses the first training sample as a smoke test (not a held-out evaluation).
test_image = dataset[0]['image']
test_question = dataset[0]['vi_problem']

# Prepare input: ensure RGB, then preprocess into a single image tensor
if test_image.mode != 'RGB':
    test_image = test_image.convert('RGB')

image_tensor = process_images([test_image], image_processor, model.config)[0]

# Build prompt with the "qwen_3" conversation template: one user turn holding
# the image placeholder plus the question, and an empty assistant turn for
# the model to complete.
conv = conv_templates["qwen_3"].copy()
conv.append_message(conv.roles[0], f"{DEFAULT_IMAGE_TOKEN}\n{test_question}")
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()

# Tokenize the prompt and add a batch dimension before moving it to the GPU
input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()

# Generate
# Sampling is enabled, so the response can differ between runs.
# NOTE(review): the image tensor is cast to float16 here — confirm this
# matches the dtype the loaded model runs in.
with torch.inference_mode():
    output_ids = model.generate(
        input_ids,
        images=image_tensor.unsqueeze(0).half().cuda(),
        image_sizes=[test_image.size],
        do_sample=True,
        temperature=0.7,
        top_p=0.8,
        max_new_tokens=512,
        use_cache=True
    )

# Decode the first (only) sequence, dropping special tokens
output = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()

# Print truncated views of the question and the response
print("Question:", test_question[:200], "...")
print("\nModel Response:", output[:500], "...")

## Done!

The model has been:
1. Trained on the Vietnamese multimodal dataset
2. Merged with its LoRA weights
3. Pushed to HuggingFace

View your model at: https://huggingface.co/beyoru/Belle-VLM