# Zalo AI Traffic-MLLM Inference Test

Testing Qwen2.5-VL 3B with Flash Attention on Vietnamese traffic videos

**Requirements:**
- Python 3.13+
- CUDA 12.9+
- Flash Attention installed
- Model: Qwen2.5-VL-3B-Instruct

In [1]:
# Install required packages if not already installed
# !uv pip install transformers torch torchvision accelerate flash-attn
# !uv pip install opencv-python av decord pillow
# !uv pip install underthesea pyvi  # Vietnamese NLP

In [2]:
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import os
import json
import torch
from qwen_vl_utils import process_vision_info
# import cv2
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from underthesea import word_tokenize
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"CUDA Version: {torch.version.cuda}")
    print(f"GPU: {torch.cuda.get_device_name(0)}")

Using device: cuda
CUDA Version: 12.8
GPU: NVIDIA GeForce RTX 5060 Ti


In [5]:
# Load Qwen2.5-VL 3B with Flash Attention
from transformers import BitsAndBytesConfig


model_id = "Qwen/Qwen2.5-VL-3B-Instruct"

print("Loading model and processor...")
try:
    # Load processor
    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,     # Nested quantization → less VRAM
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    # Load model with flash attention
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
        attn_implementation="flash_attention_2",  # Enable flash attention
        use_cache=True,
        quantization_config=bnb_config,
        dtype=torch.bfloat16,
    )



    print(f"✅ Model loaded successfully!")
    print(f"Model parameters: {model.num_parameters():,}")
    print(f"Flash Attention: Enabled")

except Exception as e:
    print(f"❌ Error loading model: {e}")
    print("Make sure you have enough GPU memory (at least 8GB VRAM recommended)")

Loading model and processor...


The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.
`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 2/2 [00:08<00:00,  4.46s/it]


✅ Model loaded successfully!
Model parameters: 3,754,622,976
Flash Attention: Enabled


In [None]:
# Load training data
train_data_path = r"d:\ZALO_AI\\trainining\\train\\train.json"
videos_path = r"d:\ZALO_AI\\trainining\\train\\videos"

print(f"Loading data from: {train_data_path}")

try:
    with open(train_data_path, 'r', encoding='utf-8') as f:
        train_data = json.load(f)

    print(f"✅ Loaded {len(train_data)} training samples")

    # Show sample structure
    if train_data:
        sample = train_data[0]
        print("\n📋 Sample data structure:")
        for key, value in sample.items():
            if key == 'question':
                print(f"  {key}: {value[:100]}...")
            elif key == 'support_frames':
                print(f"  {key}: {len(value)} frames")
            else:
                print(f"  {key}: {value}")

except Exception as e:
    print(f"❌ Error loading data: {e}")
    train_data = []

Loading data from: d:\ZALO_AI\trainining\train\train.json
✅ Loaded 2 training samples
❌ Error loading data: 0


In [7]:
import cv2

# Function to extract frames from video at specific timestamps
def extract_frames_at_timestamps(video_path, timestamps, max_frames=4):
    """
    Extract frames from video at specific timestamps

    Args:
        video_path: Path to video file
        timestamps: List of timestamps in seconds
        max_frames: Maximum number of frames to extract

    Returns:
        List of PIL Images
    """
    frames = []

    if not os.path.exists(video_path):
        print(f"Video not found: {video_path}")
        return frames

    try:
        cap = cv2.VideoCapture(video_path)
        fps = cap.get(cv2.CAP_PROP_FPS)

        # Sort timestamps and limit to max_frames
        timestamps = sorted(timestamps)[:max_frames]

        for timestamp in timestamps:
            # Convert timestamp to frame number
            frame_number = int(timestamp * fps)
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)

            ret, frame = cap.read()
            if ret:
                # Convert BGR to RGB
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                # Convert to PIL Image
                pil_image = Image.fromarray(frame_rgb)
                frames.append(pil_image)

        cap.release()

    except Exception as e:
        print(f"Error extracting frames: {e}")

    return frames

# Function to display frames
def display_frames(frames, title="Video Frames"):
    if not frames:
        print("No frames to display")
        return

    fig, axes = plt.subplots(1, len(frames), figsize=(15, 5))
    if len(frames) == 1:
        axes = [axes]

    fig.suptitle(title)
    for i, frame in enumerate(frames):
        axes[i].imshow(frame)
        axes[i].axis('off')
        axes[i].set_title(f'Frame {i+1}')

    plt.tight_layout()
    plt.show()

In [8]:
# Function to run inference
def run_inference(sample, model, processor, max_new_tokens=512):
    """
    Run inference on a single sample

    Args:
        sample: Dictionary containing question, video_id, support_frames, etc.
        model: Loaded Qwen2.5-VL model
        processor: Model processor
        max_new_tokens: Maximum tokens to generate

    Returns:
        Generated answer
    """
    try:
        # Get video path
        video_id = sample.get('video_id', '')
        video_path = os.path.join(videos_path, f"{video_id}.mp4")

        # Extract frames at support timestamps
        support_frames = sample.get('support_frames', [])
        frames = extract_frames_at_timestamps(video_path, support_frames, max_frames=4)

        if not frames:
            return "Could not extract frames from video"

        # Prepare messages for Qwen2.5-VL
        question = sample.get('question', '')

        # Create message with images and question
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": f"Đây là video giao thông Việt Nam. {question}"},
                    *[
                        {"type": "image", "image": frame}
                        for frame in frames
                    ]
                ]
            }
        ]

        # Process inputs
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        image_inputs, video_inputs = process_vision_info(messages)

        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt"
        )

        # Move to device
        inputs = inputs.to(device)

        # Generate response
        with torch.no_grad():
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                use_cache=True
            )

        # Decode response
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0]

        return output_text.strip()

    except Exception as e:
        return f"Error during inference: {str(e)}"

# Test with a few samples
print("🧪 Testing inference on sample data...")

# Select first 3 samples for testing
test_samples = train_data[:3] if len(train_data) >= 3 else train_data

for i, sample in enumerate(test_samples):
    print(f"\n{'='*80}")
    print(f"🧪 TEST SAMPLE {i+1}")
    print(f"{'='*80}")

    # Show question and ground truth
    question = sample.get('question', 'N/A')
    answer = sample.get('answer', 'N/A')
    video_id = sample.get('video_id', 'N/A')

    print(f"📹 Video ID: {video_id}")
    print(f"❓ Question: {question}")
    print(f"✅ Ground Truth: {answer}")

    # Extract and show frames
    video_path = os.path.join(videos_path, f"{video_id}.mp4")
    support_frames = sample.get('support_frames', [])
    frames = extract_frames_at_timestamps(video_path, support_frames, max_frames=4)

    if frames:
        print(f"📸 Extracted {len(frames)} frames at timestamps: {support_frames[:len(frames)]}")
        display_frames(frames, f"Sample {i+1} - Video {video_id}")
    else:
        print("❌ Could not extract frames")
        continue

    # Run inference
    print("\n🤖 Generating answer...")
    model_answer = run_inference(sample, model, processor)

    print(f"🤖 Model Answer: {model_answer}")

    # Simple evaluation
    if answer in model_answer:
        print("✅ Answer matches ground truth!")
    else:
        print("❌ Answer differs from ground truth")

    print()

🧪 Testing inference on sample data...


In [9]:
# Performance analysis
print("📊 Performance Analysis")
print("="*50)

# Memory usage
if torch.cuda.is_available():
    memory_allocated = torch.cuda.memory_allocated() / 1024**3
    memory_reserved = torch.cuda.memory_reserved() / 1024**3
    print(f"GPU Memory Allocated: {memory_allocated:.2f} GB")
    print(f"GPU Memory Reserved: {memory_reserved:.2f} GB")

# Model info
print(f"Model Parameters: {model.num_parameters():,}")
print(f"Model Size: {model.num_parameters() * 2 / 1024**3:.2f} GB (bfloat16)")

print("\n✅ Inference test completed!")
print("\n💡 Next steps:")
print("   1. Fine-tune the model on your dataset")
print("   2. Add RAG for traffic rules knowledge")
print("   3. Implement Chain-of-Thought reasoning")

📊 Performance Analysis
GPU Memory Allocated: 2.25 GB
GPU Memory Reserved: 3.59 GB
Model Parameters: 3,754,622,976
Model Size: 6.99 GB (bfloat16)

✅ Inference test completed!

💡 Next steps:
   1. Fine-tune the model on your dataset
   2. Add RAG for traffic rules knowledge
   3. Implement Chain-of-Thought reasoning
