From a12e49268b58446e0d5c1a222c09ba6715ceb132 Mon Sep 17 00:00:00 2001
From: zxy <zhou0493@e.ntu.edu.sg>
Date: Tue, 14 Apr 2026 15:05:44 +0800
Subject: [PATCH 01/21] WIP: support mixed modality

---
 lmdeploy/pytorch/messages.py            |   4 +-
 lmdeploy/pytorch/models/qwen3_vl.py     |  63 ++--
 lmdeploy/serve/core/async_engine.py     |   5 +-
 lmdeploy/serve/processors/multimodal.py |  18 +-
 lmdeploy/vl/constants.py                |  12 +-
 lmdeploy/vl/engine.py                   |  60 ++--
 lmdeploy/vl/model/base.py               | 390 ++++++++++++++++++------
 lmdeploy/vl/model/qwen3.py              | 184 +----------
 8 files changed, 410 insertions(+), 326 deletions(-)

diff --git a/lmdeploy/pytorch/messages.py b/lmdeploy/pytorch/messages.py
index 218cfb5113..885988e507 100644
--- a/lmdeploy/pytorch/messages.py
+++ b/lmdeploy/pytorch/messages.py
@@ -566,13 +566,13 @@ def __init__(self, multimodals: MultiModalInputs = None):
         self.multimodals = multimodals
 
     def get_datas(self, start=0, end=-1):
-        """Get multimodals from prompts position [start, end)."""
+        """Get multimodals from prompts position [start, end]."""
         outs: MultiModalInputs = dict()
         test_range = range(start, end)
         for modal_type, modal_datas in self.multimodals.items():
             data = []
             for modal_data in modal_datas:
-                if (modal_data.start not in test_range and modal_data.end - 1 not in test_range):
+                if (modal_data.start not in test_range and modal_data.end not in test_range):
                     continue
                 data.append(modal_data)
             if len(data) > 0:
diff --git a/lmdeploy/pytorch/models/qwen3_vl.py b/lmdeploy/pytorch/models/qwen3_vl.py
index 8a12c878e5..96478349da 100644
--- a/lmdeploy/pytorch/models/qwen3_vl.py
+++ b/lmdeploy/pytorch/models/qwen3_vl.py
@@ -551,7 +551,7 @@ def forward(
         pixel_values: torch.Tensor = None,
         vis_cu_seqlens: torch.Tensor = None,
         vis_pos_emb: torch.Tensor = None,
-        image_mask: torch.Tensor = None,
+        multimodal_mask: torch.Tensor = None,
         pos_embeds: torch.Tensor = None,
         grid_thw: torch.Tensor = None,
         **kwargs,
@@ -580,10 +580,9 @@ def forward(
                 image_embeds = torch.cat(image_embeds, dim=0).to(inputs_embeds.device, dtype)
 
                 # mask and scatter to create final input embeddings
-                expanded_image_mask = image_mask.unsqueeze(-1).expand_as(inputs_embeds)
-                inputs_embeds = inputs_embeds.masked_scatter(expanded_image_mask, image_embeds)
-
-                visual_pos_masks = expanded_image_mask
+                multimodal_mask = multimodal_mask.unsqueeze(-1).expand_as(inputs_embeds)
+                inputs_embeds = inputs_embeds.masked_scatter(multimodal_mask, image_embeds)
+                visual_pos_masks = multimodal_mask
 
         hidden_states = self.language_model(
             input_ids=input_ids,
@@ -602,6 +601,27 @@ def get_input_embeddings(self):
         """Get input embeddings."""
         return self.language_model.get_input_embeddings()
 
+    def get_multimodal_mask(self, input_ids: torch.Tensor, mm_inputs: list[MultiModalData]) -> torch.Tensor:
+        """Get position masks for vision tokens."""
+        image_token_id = next((m.meta.get('image_token_id') for m in mm_inputs if m.modality == Modality.IMAGE), None)
+        video_token_id = next((m.meta.get('video_token_id') for m in mm_inputs if m.modality == Modality.VIDEO), None)
+
+        image_mask, video_mask = None, None
+        if image_token_id is not None:
+            image_mask = (input_ids == image_token_id)
+        if video_token_id is not None:
+            video_mask = (input_ids == video_token_id)
+
+        multimodal_mask = None
+        if image_mask is not None and video_mask is not None:
+            multimodal_mask = image_mask | video_mask
+        elif image_mask is not None:
+            multimodal_mask = image_mask
+        elif video_mask is not None:
+            multimodal_mask = video_mask
+
+        return multimodal_mask
+
     def prepare_inputs_for_generation(
         self,
         past_key_values: list[list[torch.Tensor]],
@@ -618,7 +638,7 @@ def prepare_inputs_for_generation(
         pixel_values = None
         vis_cu_seqlens = None
         vis_pos_emb = None
-        image_mask = None
+        multimodal_mask = None
         grid_thw = None
         pos_embeds = None
         if context.input_multimodals is not None:
@@ -627,15 +647,9 @@ def prepare_inputs_for_generation(
             mm_inputs = [item for sublist in mm_inputs for item in sublist]
 
             if len(mm_inputs) > 0:
-                modality = mm_inputs[0].modality
                 pixel_values = torch.cat([inp.data for inp in mm_inputs])
-
-                image_token_id = mm_inputs[0].meta.get('image_token_id')
-                video_token_id = mm_inputs[0].meta.get('video_token_id')
-                mm_token_id = image_token_id if modality == Modality.IMAGE else video_token_id
-                image_mask = (input_ids == mm_token_id)
-
-                grid_thw = torch.cat([data.meta['grid_thw'] for data in mm_inputs]).cpu()
+                multimodal_mask = self.get_multimodal_mask(input_ids, mm_inputs)
+                grid_thw = torch.stack([data.meta['grid_thw'] for data in mm_inputs]).cpu()
                 vis_pos_emb = self.visual.rot_pos_emb(grid_thw)
                 pos_embeds = self.visual.fast_pos_embed_interpolate(grid_thw)
                 vis_cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2],
@@ -665,7 +679,7 @@ def prepare_inputs_for_generation(
             pixel_values=pixel_values,
             vis_cu_seqlens=vis_cu_seqlens,
             vis_pos_emb=vis_pos_emb,
-            image_mask=image_mask,
+            multimodal_mask=multimodal_mask,
             grid_thw=grid_thw,
             pos_embeds=pos_embeds,
         )
@@ -744,7 +758,8 @@ def _get_multimodal_pos_ids(cls, grid_thw: Sequence[int]) -> np.ndarray:
 
     @classmethod
     def make_mrope(cls, grid_thw: torch.Tensor):
-        img_pos_ids = cls._get_multimodal_pos_ids(grid_thw[0].tolist())
+        grid_thw = grid_thw.tolist() if grid_thw.dim() == 1 else grid_thw[0].tolist()
+        img_pos_ids = cls._get_multimodal_pos_ids(grid_thw)
         return img_pos_ids
 
     def _make_image_mm_data(self, input_mm: dict[str, Any]) -> MultiModalData:
@@ -752,18 +767,14 @@ def _make_image_mm_data(self, input_mm: dict[str, Any]) -> MultiModalData:
         pixel_values = input_mm['pixel_values']
         image_grid_thw = input_mm['image_grid_thw']
         offset = input_mm['offset']
-        start = offset
         image_token_id = input_mm['image_token_id']
-        num_pad = input_mm['image_tokens']
-        if isinstance(num_pad, torch.Tensor):
-            num_pad = num_pad.item()
 
         mrope_pos_ids = self.make_mrope(image_grid_thw)
 
         mm_data = MultiModalData(modality=Modality.IMAGE,
                                  data=pixel_values,
-                                 start=start,
-                                 end=start + num_pad,
+                                 start=offset[0],
+                                 end=offset[1],
                                  mrope_pos_ids=mrope_pos_ids,
                                  meta=dict(grid_thw=image_grid_thw, image_token_id=image_token_id))
         return mm_data
@@ -773,18 +784,14 @@ def _make_video_mm_data(self, input_mm: dict[str, Any]) -> MultiModalData:
         pixel_values_videos = input_mm['pixel_values_videos']
         video_grid_thw = input_mm['video_grid_thw']
         offset = input_mm['offset']
-        start = offset
         video_token_id = input_mm['video_token_id']
-        num_pad = input_mm['video_tokens']
-        if isinstance(num_pad, torch.Tensor):
-            num_pad = num_pad.item()
 
         mrope_pos_ids = self.make_mrope(video_grid_thw)
 
         mm_data = MultiModalData(modality=Modality.VIDEO,
                                  data=pixel_values_videos,
-                                 start=start,
-                                 end=start + num_pad,
+                                 start=offset[0],
+                                 end=offset[1],
                                  mrope_pos_ids=mrope_pos_ids,
                                  meta=dict(
                                      grid_thw=video_grid_thw,
diff --git a/lmdeploy/serve/core/async_engine.py b/lmdeploy/serve/core/async_engine.py
index a259fbdd90..9483ae6254 100644
--- a/lmdeploy/serve/core/async_engine.py
+++ b/lmdeploy/serve/core/async_engine.py
@@ -396,10 +396,11 @@ async def generate(
                                                                         media_io_kwargs=media_io_kwargs,
                                                                         mm_processor_kwargs=mm_processor_kwargs,
                                                                         **kwargs)
-            prompt = prompt_input['prompt']
+            # prompt = prompt_input['prompt']
             input_ids = prompt_input['input_ids']
             self.request_logger.log_inputs(session,
-                                           prompt=prompt,
+                                        #    prompt=prompt,
+                                           prompt='DEBUG!',
                                            prompt_token_ids=input_ids,
                                            gen_config=gen_config,
                                            adapter_name=adapter_name)
diff --git a/lmdeploy/serve/processors/multimodal.py b/lmdeploy/serve/processors/multimodal.py
index 8847c6f2c1..7cce025f17 100644
--- a/lmdeploy/serve/processors/multimodal.py
+++ b/lmdeploy/serve/processors/multimodal.py
@@ -112,6 +112,7 @@ def _parse_multimodal_item(i: int, in_messages: list[dict], out_messages: list[d
             item_params = item.get(item_type, {}).copy()
             data_src = item_params.pop('url', None) or item_params.pop('data', None)
 
+            modality = None
             if item_type == 'image_data':
                 modality = Modality.IMAGE
                 data = data_src
@@ -131,7 +132,7 @@ def _parse_multimodal_item(i: int, in_messages: list[dict], out_messages: list[d
             else:
                 raise NotImplementedError(f'unknown type: {item_type}')
 
-            out_message['content'].append({'type': modality, 'data': data, **item_params})
+            out_message['content'].append({'type': modality.value, 'data': data, **item_params})
 
         out_messages[i] = out_message
 
@@ -356,8 +357,13 @@ async def _get_multimodal_prompt_input(self,
         engines."""
         chat_template = self.chat_template if do_preprocess else BaseChatTemplate()
         messages = await self.async_parse_multimodal_item(messages, media_io_kwargs)
-        results = await self.vl_encoder.preprocess(messages, mm_processor_kwargs)
+        input_text = self.vl_encoder.apply_chat_template(messages=messages,
+                                                         chat_template=chat_template,
+                                                         sequence_start=sequence_start,
+                                                         chat_template_kwargs=chat_template_kwargs)
+        input_ids, multimodal = await self.vl_encoder.preprocess(messages, input_text, mm_processor_kwargs)
 
+        results = {'input_ids': input_ids, 'multimodal': multimodal}
         if self.backend == 'turbomind':
             # for tm engine, this module perform vision embedding after image
             # preprocessing. It utilizes the hf model's vision embeddings
@@ -374,10 +380,6 @@ async def _get_multimodal_prompt_input(self,
         elif self.backend == 'pytorch':
             # for pt engine, this module only conduct the image preprocessing
             # It leaves the vision embedding to the pt engine
-            results = await self.vl_encoder.wrap_for_pytorch(messages=results,
-                                                             chat_template=chat_template,
-                                                             tokenizer=self.tokenizer,
-                                                             sequence_start=sequence_start,
-                                                             tools=tools,
-                                                             chat_template_kwargs=chat_template_kwargs)
+            return results
+
         return results
diff --git a/lmdeploy/vl/constants.py b/lmdeploy/vl/constants.py
index e0cd744f15..4a9b486ecf 100644
--- a/lmdeploy/vl/constants.py
+++ b/lmdeploy/vl/constants.py
@@ -4,8 +4,18 @@
 IMAGE_TOKEN = '<IMAGE_TOKEN>'
 
 
-class Modality(str, Enum):
+class Modality(Enum):
     IMAGE = 'image'
     VIDEO = 'video'
     AUDIO = 'audio'
     TIME_SERIES = 'time_series'
+
+    def __eq__(self, other):
+        if isinstance(other, Modality):
+            return self.value == other.value
+        if isinstance(other, str):
+            return self.value == other
+        return NotImplemented
+
+    def __hash__(self):
+        return hash(self.value)
diff --git a/lmdeploy/vl/engine.py b/lmdeploy/vl/engine.py
index 8cd179df8a..e8d4dd0451 100644
--- a/lmdeploy/vl/engine.py
+++ b/lmdeploy/vl/engine.py
@@ -47,15 +47,27 @@ def __init__(
         self.executor = ThreadPoolExecutor(max_workers=1)
         torch.cuda.empty_cache()
 
+    def apply_chat_template(self, messages, chat_template, sequence_start, chat_template_kwargs=None):
+        return self.model.apply_chat_template(
+            messages, chat_template, sequence_start, chat_template_kwargs
+        )
+
     async def preprocess(self,
                          messages: list[dict],
+                         input_text: str,
                          mm_processor_kwargs: dict[str, Any] | None = None) -> list[dict]:
         """Preprocess multimodal data in the messages."""
         if _accepts_arg(self.model.preprocess, 'mm_processor_kwargs'):
-            future = asyncio.get_event_loop().run_in_executor(self.executor, self.model.preprocess, messages,
+            future = asyncio.get_event_loop().run_in_executor(self.executor,
+                                                              self.model.preprocess,
+                                                              messages,
+                                                              input_text,
                                                               mm_processor_kwargs)
         else:
-            future = asyncio.get_event_loop().run_in_executor(self.executor, self.model.preprocess, messages)
+            future = asyncio.get_event_loop().run_in_executor(self.executor,
+                                                              self.model.preprocess,
+                                                              messages,
+                                                              input_text)
         future.add_done_callback(_raise_exception_on_finish)
         outputs = await future
         return outputs
@@ -75,12 +87,14 @@ async def async_infer(self, messages: list[dict]) -> list[dict]:
 
     async def wrap_for_pytorch(
         self,
-        messages: list[dict],
-        chat_template,
-        tokenizer,
-        sequence_start,
-        tools: list[object] | None = None,
-        chat_template_kwargs: dict | None = None,
+        input_ids: list[torch.Tensor] | None = None,
+        multimodal: list[dict] | None = None,
+        # messages: list[dict],
+        # chat_template,
+        # tokenizer,
+        # sequence_start,
+        # tools: list[object] | None = None,
+        # chat_template_kwargs: dict | None = None,
     ) -> list[dict]:
         """
         Args:
@@ -100,20 +114,26 @@ async def wrap_for_pytorch(
                         },
                     }
         """
-        has_input_ids = self.model.has_input_ids(messages)
+        # has_input_ids = self.model.has_input_ids(messages)
+        has_input_ids = False
+        result = {}
         if not has_input_ids:
-            result = self.model.to_pytorch(messages,
-                                           chat_template,
-                                           tokenizer,
-                                           sequence_start,
-                                           tools=tools,
-                                           chat_template_kwargs=chat_template_kwargs)
+            return dict(
+                input_ids=input_ids,
+                multimodal=multimodal,
+            )
+            # result = self.model.to_pytorch(messages,
+            #                                chat_template,
+            #                                tokenizer,
+            #                                sequence_start,
+            #                                tools=tools,
+            #                                chat_template_kwargs=chat_template_kwargs
+            #                                )
         else:
-            result = self.model.to_pytorch_with_input_ids(messages)
-        # clear data
-        for i, message in enumerate(messages):
-            if isinstance(message['content'], list):
-                messages[i]['preprocess'] = None
+            # TODO: support input_ids inputs
+            # result = self.model.to_pytorch_with_input_ids(messages)
+            pass
+
         return result
 
     async def wrap_for_turbomind(
diff --git a/lmdeploy/vl/model/base.py b/lmdeploy/vl/model/base.py
index 51ebb44419..8b61465f64 100644
--- a/lmdeploy/vl/model/base.py
+++ b/lmdeploy/vl/model/base.py
@@ -1,12 +1,16 @@
 # Copyright (c) OpenMMLab. All rights reserved.
+import dataclasses
 from abc import ABC, abstractmethod
 from itertools import groupby
+from typing import Any
 
 import numpy as np
+import torch
 from mmengine import Registry
 from transformers import AutoConfig, AutoTokenizer
 
 from lmdeploy.archs import get_model_arch
+from lmdeploy.vl.constants import Modality
 
 VISION_MODELS = Registry('vision_model')
 
@@ -31,6 +35,62 @@ def __init__(self,
         self.hf_config = hf_config
         self.image_token_id = self.get_pad_token_id(model_path, hf_config) or 0
 
+        # mapping from attribute names to modality types
+        self.ATTR_NAME_TO_MODALITY = {
+            # image-related attributes
+            'pixel_values': Modality.IMAGE,
+            'image_sizes': Modality.IMAGE,
+            'image_grid_thw': Modality.IMAGE,
+            'image_attention_mask': Modality.IMAGE,
+            'image_emb_mask': Modality.IMAGE,
+            'images_spatial_crop': Modality.IMAGE,
+            'images_crop': Modality.IMAGE,
+            'has_local_crops': Modality.IMAGE,
+            'has_images': Modality.IMAGE,
+            'tgt_size': Modality.IMAGE,
+            'image_grid_hws': Modality.IMAGE,
+            'aspect_ratio_ids': Modality.IMAGE,
+            'aspect_ratio_mask': Modality.IMAGE,
+            'num_patches': Modality.IMAGE,
+            'patch_pixel_values': Modality.IMAGE,
+            'block_sizes': Modality.IMAGE,
+            # audio-related attributes
+            'audio_features': Modality.AUDIO,
+            'audio_feature_lens': Modality.AUDIO,
+            'input_features': Modality.AUDIO,
+            'input_features_mask': Modality.AUDIO,
+            'audio_attention_mask': Modality.AUDIO,
+            'feature_attention_mask': Modality.AUDIO,
+            # video-related attributes
+            'pixel_values_videos': Modality.VIDEO,
+            'second_per_grid_ts': Modality.VIDEO,
+            'video_grid_thw': Modality.VIDEO,
+        }
+
+        # name of the feature filed
+        self.FEATURE_NAMES = [
+            'pixel_values',
+            'pixel_values_videos',
+            'audio_features',
+            'input_features',
+        ]
+
+    @staticmethod
+    def get_mm_items_offset(
+        input_ids: torch.Tensor, mm_token_id: int
+    ) -> list[tuple[int, int]]:
+        """
+        Get a set of range for mm_items from input_ids
+        Example:
+            input_ids = [1, 2, 3, 3, 3, 4, 3, 3]
+            mm_token_id = 3
+            return result = [(2,4),(6,7)]
+        """
+        mask = input_ids == mm_token_id
+        start_positions = (mask & ~torch.roll(mask, 1)).nonzero(as_tuple=True)[0]
+        end_positions = (mask & ~torch.roll(mask, -1)).nonzero(as_tuple=True)[0]
+        return list(zip(start_positions.tolist(), end_positions.tolist()))
+
     def get_pad_token_id(self, model_path, hf_config):
         """Get pad_token_id from hf_config or tokenizer."""
         pad_token_id = getattr(hf_config, 'pad_token_id', None)
@@ -60,49 +120,209 @@ def build_model(self, ):
         if self.backend == 'turbomind' or self.with_llm:
             raise NotImplementedError()
 
-    @abstractmethod
-    def preprocess(self, messages: list[dict]) -> list[dict]:
-        """Preprocess multimodal data in the messages.
-
-        The derived class,
-        i.e., a specific vision model, takes the charge of image preprocessing
-        and the result management.
-        It can integrate the result into the messages list, or insert it to
-        the individual image item.
-        Args:
-            message(dict): multimodal data in a dict, which is as follows:
-            [
-                {'role': 'user', 'content': 'user prompt'},
-                {'role': 'assisant', 'content': 'AI reponse'},
-                {
-                    'role': 'user',
-                    'content': [
-                        {
-                            'type': 'text',
-                            'text': 'string',
-                        },
-                        {
-                            'type': 'image',
-                            'image': pillow.Image,
-                            'key1': value1,
-                            ...
-                        },
-                        {
-                            'type': 'image',
-                            'image': pillow.Image,
-                            'key1': value1,
-                            ...
-                        },
-                        ...
-                    ]
-                }
-                {....}
-            ]
-        Returns:
-            the message list with preprocessing results included, which is
-            determined by the derived classes
-        """  # noqa
-        raise NotImplementedError()
+    # adapted from https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/managers/mm_utils.py
+    def _get_expanded_mm_items(self, collected_mm_items):
+        """Hf processor outputs produced bundled data for multiple
+        images/videos we need to expand them into per-image/video entries for
+        better cache locality and fine-grained scheduling."""
+        expanded_mm_items = []
+        for modality, item in collected_mm_items.items():
+            is_bundled = item.get('offset', None) is not None and len(item['offset']) > 1
+
+            # non-bundled case
+            if not is_bundled:
+                expanded_mm_items.append(
+                    dict(
+                        modality=modality,
+                        pixel_values=item['feature'],
+                        image_grid_thw=item['image_grid_thw'][0],
+                        offset=item['offset'][0],
+                        image_token_id=self.image_token_id
+                        )
+                    )
+                continue
+
+            # bundled case
+            num_items = len(item['offset'])
+            if modality == Modality.IMAGE:
+                image_grid_thw = item['image_grid_thw']
+                grid_len = image_grid_thw.shape[0]
+
+                patches_per_item = []
+                for grid in image_grid_thw:
+                    grid_tensor = torch.as_tensor(grid, dtype=torch.long)
+                    patches_per_item.append(int(torch.prod(grid_tensor).item()))
+
+                cumulative = torch.cumsum(
+                    torch.tensor(patches_per_item, dtype=torch.long), dim=0
+                )
+                slice_indices = [0] + cumulative.tolist()
+
+                # expand each image into a separate item
+                for i in range(num_items):
+                    start_idx, end_idx = slice_indices[i], slice_indices[i + 1]
+                    # TODO: may compute mm mask and remove mm token id inputs
+                    expanded_mm_items.append(
+                        dict(
+                            modality=modality,
+                            pixel_values=item['feature'][start_idx:end_idx],
+                            image_grid_thw=image_grid_thw[i],
+                            offset=item['offset'][i],
+                            image_token_id=self.image_token_id,
+                        )
+                    )
+            elif modality == Modality.VIDEO:
+                video_grid_thw = item['video_grid_thw']
+
+                # video_grid_thw shape: [num_videos, 3] where each row is [T, H, W]
+                # When T > 1, item.offsets contains frames (num_items = total frames)
+                # grid_len = num_videos, num_items = sum(T for each video) = total frames
+                grid_len = video_grid_thw.shape[0]
+                num_videos = grid_len
+
+                # calculate total frames and frames per video
+                frames_per_video = []
+                total_frames = 0
+                for i in range(num_videos):
+                    grid = video_grid_thw[i]
+                    if isinstance(grid, torch.Tensor):
+                        T = int(grid[0].item())  # T is the first element [T, H, W]
+                    else:
+                        grid_tensor = torch.as_tensor(grid, dtype=torch.long)
+                        T = int(grid_tensor[0].item())
+                    frames_per_video.append(T)
+                    total_frames += T
+
+                # num_items should equal total_frames when T > 1
+                if num_items != total_frames:
+                    expanded_mm_items.append(item)
+                    continue
+
+                # calculate patches per video: T * H * W for each video
+                patches_per_video = []
+                for i in range(num_videos):
+                    grid = video_grid_thw[i]
+                    if isinstance(grid, torch.Tensor):
+                        patches_per_video.append(int(torch.prod(grid).item()))
+                    else:
+                        grid_tensor = torch.as_tensor(grid, dtype=torch.long)
+                        patches_per_video.append(int(torch.prod(grid_tensor).item()))
+
+                # calculate cumulative patches to get slice indices for each video
+                cumulative = torch.cumsum(
+                    torch.tensor(patches_per_video, dtype=torch.long), dim=0
+                )
+                slice_indices = [0] + cumulative.tolist()
+
+                # group frames by video, calculate frame indices for each video
+                frame_start_indices = [0]
+                for i in range(num_videos):
+                    frame_start_indices.append(
+                        frame_start_indices[-1] + frames_per_video[i]
+                    )
+
+                # expand each video into a separate item
+                for video_idx in range(num_videos):
+                    start, end = (
+                        slice_indices[video_idx],
+                        slice_indices[video_idx + 1],
+                    )
+                    frame_start, frame_end = (
+                        frame_start_indices[video_idx],
+                        frame_start_indices[video_idx + 1],
+                    )
+
+                    # import pdb; pdb.set_trace()
+                    # expand each frame into a separate item, not sure good or no
+                    t, h, w = video_grid_thw[video_idx].tolist()
+                    for frame_idx in range(t):
+                        video_feature = item['feature'][start:end]
+                        # FIXME: grid_thw [1, h, w] is only for qwen3vl
+                        expanded_mm_items.append(
+                            dict(
+                                modality=modality,
+                                pixel_values_videos=video_feature[frame_idx * h * w:(frame_idx + 1) * h * w],
+                                video_grid_thw=torch.tensor([1, h, w]),
+                                offset=item['offset'][frame_start:frame_end][frame_idx],
+                                video_token_id=self.video_token_id,
+                            )
+                        )
+
+        return expanded_mm_items
+
+    def preprocess(self,
+                   messages: list[dict],
+                   input_text: str,
+                   mm_processor_kwargs: dict[str, Any] | None = None) -> list[dict]:
+        """Refer to `super().preprocess()` for spec."""
+
+        kwargs = mm_processor_kwargs or {}
+        mm_items = self.collect_multimodal_items(messages)
+
+        # TODO: get kwargs from params
+        # TODO: careful about mm processor kwargs, in mixed modality case
+        # may need to treat each modality differently
+        raw_images, raw_videos, video_metadatas = [], [], []
+        for modality, data, params in mm_items:
+            if modality == Modality.IMAGE:
+                raw_images.append(data)
+            elif modality == Modality.VIDEO:
+                raw_videos.append(data)
+                video_metadatas.append(params['video_metadata'])
+            else:
+                raise ValueError(f'unsupported modality {modality}')
+
+        # get kwrags for processor
+        if raw_images:
+            kwargs['images'] = raw_images
+        if raw_videos:
+            kwargs['videos'] = raw_videos
+            kwargs['video_metadata'] = video_metadatas
+            # leave resize to hf processor
+            # sample frames is done in video loader, avoid duplication
+            kwargs['do_resize'] = True
+            kwargs['do_sample_frames'] = False
+            kwargs['return_metadata'] = True
+
+        # process raw items with hf processor
+        processor_outputs = self.processor(
+            text=[input_text],
+            padding=True,
+            return_tensors='pt',
+            **kwargs,
+        )
+        input_ids = processor_outputs['input_ids'].flatten()
+
+        # collect from processor outputs and categorized by modality
+        collected_mm_items: dict[Modality, dict[str, Any]] = {}
+        for attr_name, value in processor_outputs.items():
+            if attr_name == 'input_ids':
+                continue
+
+            current_modality = self.ATTR_NAME_TO_MODALITY.get(attr_name)
+
+            if current_modality:
+                if current_modality not in collected_mm_items:
+                    collected_mm_items[current_modality] = {}
+
+                if attr_name in self.FEATURE_NAMES:
+                    attr_name = 'feature'
+
+                collected_mm_items[current_modality][attr_name] = value
+
+        # compute offsets for all items
+        for modality, item in collected_mm_items.items():
+            mm_token_id = self.mm_tokens.get_token_id_by_modality(modality)
+            item['offset'] = self.get_mm_items_offset(
+                input_ids=input_ids,
+                mm_token_id=mm_token_id,
+            )
+
+        # expand bundled hf processor outputs into per-image/video entry
+        expanded_mm_items = self._get_expanded_mm_items(collected_mm_items)
+
+        # import pdb; pdb.set_trace()
+        return input_ids.tolist(), expanded_mm_items
 
     def has_input_ids(self, messages: list[dict]) -> bool:
         """Check whether the messages contain input_ids directly.
@@ -131,21 +351,21 @@ def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]:
         if self.backend == 'turbomind':
             raise NotImplementedError()
 
-    def to_pytorch(self, messages, chat_template, tokenizer, sequence_start, chat_template_kwargs=None, **kwargs):
-        """Pack the preprocessing results in a format compatible with what is
-        required by pytorch engine. ONLY implement it when the backend is
-        pytorch engine.
-
-        Args:
-            messages(list[dict]): the output of `preprocess`
-            chat_template: the chat template defined in `lmdeploy/model.py`
-            tokenzer: the tokenizer model
-            sequence_start: starting flag of a sequence
-            chat_template_kwargs: additional arguments for chat template
-                processing, such as `add_vision_id` and `enable_thinking`
-        """
-        if self.backend == 'pytorch':
-            raise NotImplementedError()
+    # def to_pytorch(self, messages, chat_template, tokenizer, sequence_start, chat_template_kwargs=None, **kwargs):
+    #     """Pack the preprocessing results in a format compatible with what is
+    #     required by pytorch engine. ONLY implement it when the backend is
+    #     pytorch engine.
+
+    #     Args:
+    #         messages(list[dict]): the output of `preprocess`
+    #         chat_template: the chat template defined in `lmdeploy/model.py`
+    #         tokenzer: the tokenizer model
+    #         sequence_start: starting flag of a sequence
+    #         chat_template_kwargs: additional arguments for chat template
+    #             processing, such as `add_vision_id` and `enable_thinking`
+    #     """
+    #     if self.backend == 'pytorch':
+    #         raise NotImplementedError()
 
     def to_turbomind(self, messages, chat_template, tokenizer, sequence_start, chat_template_kwargs=None, **kwargs):
         """Pack the forwarding results in a format compatible with what is
@@ -251,41 +471,6 @@ def to_pytorch_with_input_ids(self, messages):
 
         return dict(prompt=None, input_ids=input_ids, multimodal=preps)
 
-    def to_pytorch_aux(self, messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start):
-        """Auxiliary function to pack the preprocessing results in a format
-        compatible with what is required by pytorch engine.
-
-        Args:
-            messages(list[dict]): the output of `preprocess`
-            prompt(str): the prompt after applying chat template
-            IMAGE_TOKEN(str): a placeholder where image tokens will be
-                inserted
-            tokenzer: the tokenizer model
-            sequence_start: starting flag of a sequence
-        """
-        # collect all preprocessing result from messages
-        preps = [x['content'] for x in messages if x['role'] == 'preprocess']
-        assert len(preps) == 1
-        preps = preps[0]
-
-        # split prompt into segments and validate data
-        segs = prompt.split(IMAGE_TOKEN)
-        assert len(segs) == len(preps) + 1, (f'the number of {IMAGE_TOKEN} is not equal '
-                                             f'to input images, {len(segs) - 1} vs {len(preps)}')
-
-        # calculate the image token offset for each image
-        input_ids = []
-        for i, seg in enumerate(segs):
-            if i > 0 and i <= len(preps):
-                preps[i - 1].update(offset=len(input_ids))
-                image_tokens = preps[i - 1]['image_tokens']
-                assert self.image_token_id == preps[i - 1]['image_token_id']
-                input_ids.extend([self.image_token_id] * image_tokens)
-            token_ids = tokenizer.encode(seg, add_bos=((i == 0) and sequence_start))
-            input_ids.extend(token_ids)
-
-        return dict(prompt=prompt, input_ids=input_ids, multimodal=preps)
-
     def to_turbomind_aux(self, messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start):
         """Auxiliary function to pack the forwarding results in a format
         compatible with what is required by turbomind engine.
@@ -329,3 +514,22 @@ def match(cls, config: AutoConfig):
         if arch and (arch == cls._arch or arch in cls._arch):
             return True
         return False
+
+
+@dataclasses.dataclass
+class MultomodalSpecialTokens:
+    image_token: str | list[str] | None = None
+    video_token: str | list[str] | None = None
+    audio_token: str | list[str] | None = None
+
+    image_token_id: int | None = None
+    video_token_id: int | None = None
+    audio_token_id: int | None = None
+
+    def get_token_id_by_modality(self, modality: Modality) -> int | None:
+        """Get token ID for a given modality."""
+        return {
+            Modality.IMAGE: self.image_token_id,
+            Modality.VIDEO: self.video_token_id,
+            Modality.AUDIO: self.audio_token_id,
+        }.get(modality)
diff --git a/lmdeploy/vl/model/qwen3.py b/lmdeploy/vl/model/qwen3.py
index e43dad838c..6a47f3fa74 100644
--- a/lmdeploy/vl/model/qwen3.py
+++ b/lmdeploy/vl/model/qwen3.py
@@ -1,12 +1,10 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from typing import Any
 
-import torch
 from transformers import AutoProcessor
 
 from lmdeploy.utils import get_logger
-from lmdeploy.vl.constants import Modality
-from lmdeploy.vl.model.base import VISION_MODELS, VisionModel
+from lmdeploy.vl.model.base import VISION_MODELS, MultomodalSpecialTokens, VisionModel
 
 logger = get_logger('lmdeploy')
 
@@ -41,6 +39,14 @@ def build_preprocessor(self):
         self.vision_start_token = self.processor.vision_start_token
         self.vision_end_token = self.processor.vision_end_token
 
+        # special tokens
+        self.mm_tokens = MultomodalSpecialTokens(
+            image_token=self.image_token,
+            video_token=self.video_token,
+            image_token_id=self.image_token_id,
+            video_token_id=self.video_token_id,
+        )
+
     def resolve_size_params(self, processor, mm_processor_kwargs: dict[str, Any] | None = None):
         default_min = processor.size['shortest_edge']
         default_max = processor.size['longest_edge']
@@ -57,72 +63,7 @@ def resolve_size_params(self, processor, mm_processor_kwargs: dict[str, Any] | N
 
         return {'shortest_edge': min_pixels, 'longest_edge': max_pixels}
 
-    def _preprocess_image(self,
-                          data: list[Any],
-                          params: dict[str, Any],
-                          mm_processor_kwargs: dict[str, Any] | None = None) -> list[dict]:
-
-        size = self.resolve_size_params(self.processor.image_processor, mm_processor_kwargs)
-        result = self.processor.image_processor(images=data, size=size, return_tensors='pt')
-        merge_length = self.processor.image_processor.merge_size**2
-        image_tokens = result['image_grid_thw'].prod(dim=1) // merge_length
-        result.update(dict(image_size=data.size, image_tokens=image_tokens, image_token_id=self.image_token_id))
-        return result
-
-    def _preprocess_video(self,
-                          data: list[Any],
-                          params: dict[str, Any],
-                          mm_processor_kwargs: dict[str, Any] | None = None) -> list[dict]:
-
-        metadata = params['video_metadata']
-        if metadata.get('fps') is None or metadata['fps'] <= 0:
-            logger.warning('fps not found or invalid, fallback to 24.')
-            metadata['fps'] = 24
-        size = self.resolve_size_params(self.processor.video_processor, mm_processor_kwargs)
-
-        # do_resize = True, we leave resize to hf processor
-        # do_sample_frames = False, we already sample frames in video loader, avoid duplicates in hf processor
-        result = self.processor.video_processor(videos=data,
-                                                size=size,
-                                                return_metadata=True,
-                                                do_resize=True,
-                                                do_sample_frames=False,
-                                                video_metadata=metadata,
-                                                return_tensors='pt')
-
-        merge_length = self.processor.video_processor.merge_size**2
-        video_grid_thw = result['video_grid_thw']
-        frame_seqlen = video_grid_thw[0][1:].prod() // merge_length
-        curr_timestamp = self.processor._calculate_timestamps(
-            metadata['frames_indices'],
-            metadata['fps'],
-            self.processor.video_processor.merge_size,
-        )
-
-        result.update(curr_timestamp=curr_timestamp, frame_seqlen=frame_seqlen, video_token_id=self.video_token_id)
-        return result
-
-    def preprocess(self, messages: list[dict], mm_processor_kwargs: dict[str, Any] | None = None) -> list[dict]:
-        """Refer to `super().preprocess()` for spec."""
-        outputs = []
-        self.contains_video_input = False
-
-        mm_items = self.collect_multimodal_items(messages)
-        for modality, data, params in mm_items:
-            result = {}
-            if modality == Modality.IMAGE:
-                result = self._preprocess_image(data, params, mm_processor_kwargs)
-            elif modality == Modality.VIDEO:
-                self.contains_video_input = True
-                result = self._preprocess_video(data, params, mm_processor_kwargs)
-
-            result.update(modality=modality)
-            outputs.append(result)
-
-        messages.append(dict(role='preprocess', content=outputs))
-        return messages
-
-    def proc_messages(self, messages, chat_template, sequence_start, chat_template_kwargs=None):
+    def apply_chat_template(self, messages, chat_template, sequence_start, chat_template_kwargs=None):
         """Apply chat template to get the prompt."""
         chat_template_kwargs = chat_template_kwargs or {}
         prompt_messages = []
@@ -141,107 +82,6 @@ def proc_messages(self, messages, chat_template, sequence_start, chat_template_k
                 prompt_messages.append(dict(role='user', content=prompt))
         else:
             prompt_messages = messages
+
         prompt = chat_template.messages2prompt(prompt_messages, sequence_start, **chat_template_kwargs)
-        return prompt, None
-
-    def to_pytorch_aux_video(self, messages, prompt, VIDEO_TOKEN, tokenizer, sequence_start):
-        """Pack the video input to the compatible format with pytorch engine.
-
-        Each video is split into per-frame (temporal step) entries so that the timestamp text tokens between frames get
-        sequential mrope positions and each frame's video-pad tokens get independent 3D spatial positions.
-        """
-
-        # collect all preprocessing result from messages
-        preps = [x['content'] for x in messages if x['role'] == 'preprocess']
-        assert len(preps) == 1
-        preps = preps[0]
-
-        # split prompt into segments and validate data
-        segs = prompt.split(self.vision_start_token + self.video_token + self.vision_end_token)
-        assert len(segs) == len(preps) + 1, (f'the number of {self.video_token} is not equal '
-                                             f'to input videos, {len(segs) - 1} vs {len(preps)}')
-
-        # calculate the video token offset for each frame
-        input_ids = []
-        frame_preps = []
-
-        for i, seg in enumerate(segs):
-            if i > 0 and i <= len(preps):
-                video_prep = preps[i - 1]
-                frame_seqlen = video_prep['frame_seqlen']
-                curr_timestamp = video_prep['curr_timestamp']
-                video_grid_thw = video_prep['video_grid_thw']
-                pixel_values_videos = video_prep['pixel_values_videos']
-                assert self.video_token_id == video_prep['video_token_id']
-
-                t, h, w = video_grid_thw[0].tolist()
-
-                # each temporal step becomes an independent multimodal entry
-                for frame_idx in range(t):
-                    curr_time = curr_timestamp[frame_idx]
-
-                    # timestamp text + vision_start (regular text tokens)
-                    prefix = f'<{curr_time:.1f} seconds>' + self.vision_start_token
-                    prefix_ids = tokenizer.encode(prefix, add_bos=False)
-                    input_ids.extend(prefix_ids)
-
-                    # video pad tokens for this frame
-                    frame_offset = len(input_ids)
-                    input_ids.extend([self.video_token_id] * frame_seqlen)
-
-                    # vision_end (regular text token)
-                    suffix_ids = tokenizer.encode(self.vision_end_token, add_bos=False)
-                    input_ids.extend(suffix_ids)
-
-                    # since we use timestamps to separate videos
-                    # like <t1> <vision_start> <frame1> <vision_end> <t2> <vision_start> <frame2> <vision_end>
-                    # the video_grid_thw should also be split, becomes [1, h, w] for each frame
-                    frame_preps.append(
-                        dict(
-                            offset=frame_offset,
-                            video_tokens=frame_seqlen,
-                            pixel_values_videos=pixel_values_videos[frame_idx * h * w:(frame_idx + 1) * h * w],
-                            video_grid_thw=torch.tensor([[1, h, w]]),
-                            video_token_id=self.video_token_id,
-                            modality=video_prep['modality'],
-                        )
-                    )
-
-            token_ids = tokenizer.encode(seg, add_bos=((i == 0) and sequence_start))
-            input_ids.extend(token_ids)
-
-        return dict(prompt=prompt, input_ids=input_ids, multimodal=frame_preps)
-
-    def to_pytorch(self,
-                   messages,
-                   chat_template,
-                   tokenizer,
-                   sequence_start,
-                   chat_template_kwargs: dict | None = None,
-                   **kwargs):
-        """Return to the information needed by pytorch engine."""
-        prompt, _ = self.proc_messages(messages, chat_template, sequence_start, chat_template_kwargs)
-
-        if self.contains_video_input:
-            return self.to_pytorch_aux_video(messages, prompt, self.video_token, tokenizer, sequence_start)
-        else:
-            return self.to_pytorch_aux(messages, prompt, self.image_token, tokenizer, sequence_start)
-
-    def build_model(self):
-        # TODO: implement for turbomind
-        pass
-
-    @torch.no_grad()
-    def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]:
-        # TODO: implement for turbomind
-        pass
-
-    def to_turbomind(self,
-                     messages,
-                     chat_template,
-                     tokenizer,
-                     sequence_start,
-                     chat_template_kwargs: dict | None = None,
-                     **kwargs):
-        # TODO: implement for turbomind
-        pass
+        return prompt

From 79b99536e5bfd86fc1c2fdeceb2e664404f8713d Mon Sep 17 00:00:00 2001
From: zxy <zhou0493@e.ntu.edu.sg>
Date: Wed, 15 Apr 2026 11:47:39 +0800
Subject: [PATCH 02/21] fix mm processor kwargs, cleanup

---
 lmdeploy/vl/model/base.py  | 61 +++++++++++++++++++++++++++++++-------
 lmdeploy/vl/model/qwen3.py | 17 -----------
 2 files changed, 50 insertions(+), 28 deletions(-)

diff --git a/lmdeploy/vl/model/base.py b/lmdeploy/vl/model/base.py
index 8b61465f64..ccd6998bdd 100644
--- a/lmdeploy/vl/model/base.py
+++ b/lmdeploy/vl/model/base.py
@@ -10,10 +10,13 @@
 from transformers import AutoConfig, AutoTokenizer
 
 from lmdeploy.archs import get_model_arch
+from lmdeploy.utils import get_logger
 from lmdeploy.vl.constants import Modality
 
 VISION_MODELS = Registry('vision_model')
 
+logger = get_logger('lmdeploy')
+
 
 class VisionModel(ABC):
     """Visual model which extract image feature."""
@@ -161,7 +164,7 @@ def _get_expanded_mm_items(self, collected_mm_items):
                 # expand each image into a separate item
                 for i in range(num_items):
                     start_idx, end_idx = slice_indices[i], slice_indices[i + 1]
-                    # TODO: may compute mm mask and remove mm token id inputs
+                    # TODO: zhouxinyu, compute mask and avoid passing token id
                     expanded_mm_items.append(
                         dict(
                             modality=modality,
@@ -232,12 +235,12 @@ def _get_expanded_mm_items(self, collected_mm_items):
                         frame_start_indices[video_idx + 1],
                     )
 
-                    # import pdb; pdb.set_trace()
-                    # expand each frame into a separate item, not sure good or no
+                    # expand each frame into a separate item
+                    # TODO: zhouxinyu, not sure per-frame split is good or not
+                    # TODO: zhouxinyu, grid_thw [1, h, w] is only for qwen3vl
                     t, h, w = video_grid_thw[video_idx].tolist()
                     for frame_idx in range(t):
                         video_feature = item['feature'][start:end]
-                        # FIXME: grid_thw [1, h, w] is only for qwen3vl
                         expanded_mm_items.append(
                             dict(
                                 modality=modality,
@@ -250,18 +253,45 @@ def _get_expanded_mm_items(self, collected_mm_items):
 
         return expanded_mm_items
 
+    def _get_override_kwargs(self, processor, mm_processor_kwargs: dict[str, Any] | None = None):
+        if not mm_processor_kwargs:
+            return {}, {}
+
+        def apply_override_size(sub_processor, kwargs):
+            if not kwargs or sub_processor is None or not hasattr(sub_processor, 'size'):
+                return
+            default_min = sub_processor.size['shortest_edge']
+            default_max = sub_processor.size['longest_edge']
+            override_min = kwargs.get('min_pixels', default_min)
+            override_max = kwargs.get('max_pixels', default_max)
+            if override_min > override_max:
+                logger.info(
+                    f'Overriding min_pixels {override_min} > max_pixels {override_max}, ' \
+                     'falling back to defaults.'
+                )
+                return
+            logger.info(f'Overriding processor size with min_pixels={override_min} and max_pixels={override_max}.')
+            kwargs.pop('min_pixels', None)
+            kwargs.pop('max_pixels', None)
+            kwargs['size'] = {'shortest_edge': override_min, 'longest_edge': override_max}
+
+        image_processor = getattr(processor, 'image_processor', None)
+        video_processor = getattr(processor, 'video_processor', None)
+        image_processor_kwargs = mm_processor_kwargs.get('image', {})
+        video_processor_kwargs = mm_processor_kwargs.get('video', {})
+
+        apply_override_size(image_processor, image_processor_kwargs)
+        apply_override_size(video_processor, video_processor_kwargs)
+        return image_processor_kwargs, video_processor_kwargs
+
     def preprocess(self,
                    messages: list[dict],
                    input_text: str,
                    mm_processor_kwargs: dict[str, Any] | None = None) -> list[dict]:
         """Refer to `super().preprocess()` for spec."""
 
-        kwargs = mm_processor_kwargs or {}
         mm_items = self.collect_multimodal_items(messages)
 
-        # TODO: get kwargs from params
-        # TODO: careful about mm processor kwargs, in mixed modality case
-        # may need to treat each modality differently
         raw_images, raw_videos, video_metadatas = [], [], []
         for modality, data, params in mm_items:
             if modality == Modality.IMAGE:
@@ -273,16 +303,26 @@ def preprocess(self,
                 raise ValueError(f'unsupported modality {modality}')
 
         # get kwrags for processor
+        kwargs = {}
+        image_processor_kwargs, video_processor_kwargs = self._get_override_kwargs(self.processor, mm_processor_kwargs)
         if raw_images:
             kwargs['images'] = raw_images
+            kwargs['images_kwargs'] = image_processor_kwargs
         if raw_videos:
             kwargs['videos'] = raw_videos
             kwargs['video_metadata'] = video_metadatas
             # leave resize to hf processor
-            # sample frames is done in video loader, avoid duplication
+            # sample frames is done in video loader, set False to avoid duplication
             kwargs['do_resize'] = True
             kwargs['do_sample_frames'] = False
-            kwargs['return_metadata'] = True
+            kwargs['videos_kwargs'] = video_processor_kwargs
+        if raw_images and raw_videos and (image_processor_kwargs or video_processor_kwargs):
+            logger.warning(
+                'Overriding mm processor kwargs for mixed modality can be problematic, ' \
+                'skip kwargs setting for both processors to avoid potential issues.'
+            )
+            kwargs.pop('images_kwargs', None)
+            kwargs.pop('videos_kwargs', None)
 
         # process raw items with hf processor
         processor_outputs = self.processor(
@@ -321,7 +361,6 @@ def preprocess(self,
         # expand bundled hf processor outputs into per-image/video entry
         expanded_mm_items = self._get_expanded_mm_items(collected_mm_items)
 
-        # import pdb; pdb.set_trace()
         return input_ids.tolist(), expanded_mm_items
 
     def has_input_ids(self, messages: list[dict]) -> bool:
diff --git a/lmdeploy/vl/model/qwen3.py b/lmdeploy/vl/model/qwen3.py
index 6a47f3fa74..69771dc3d1 100644
--- a/lmdeploy/vl/model/qwen3.py
+++ b/lmdeploy/vl/model/qwen3.py
@@ -1,5 +1,4 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Any
 
 from transformers import AutoProcessor
 
@@ -47,22 +46,6 @@ def build_preprocessor(self):
             video_token_id=self.video_token_id,
         )
 
-    def resolve_size_params(self, processor, mm_processor_kwargs: dict[str, Any] | None = None):
-        default_min = processor.size['shortest_edge']
-        default_max = processor.size['longest_edge']
-
-        if not mm_processor_kwargs:
-            return {'shortest_edge': default_min, 'longest_edge': default_max}
-
-        min_pixels = mm_processor_kwargs.get('min_pixels', default_min)
-        max_pixels = mm_processor_kwargs.get('max_pixels', default_max)
-
-        if min_pixels > max_pixels:
-            logger.warning(f'min_pixels {min_pixels} > max_pixels {max_pixels}, falling back to defaults.')
-            return {'shortest_edge': default_min, 'longest_edge': default_max}
-
-        return {'shortest_edge': min_pixels, 'longest_edge': max_pixels}
-
     def apply_chat_template(self, messages, chat_template, sequence_start, chat_template_kwargs=None):
         """Apply chat template to get the prompt."""
         chat_template_kwargs = chat_template_kwargs or {}

From c12b02c89917167dd9ddfeb3acef6e021a68f1b7 Mon Sep 17 00:00:00 2001
From: zxy <zhou0493@e.ntu.edu.sg>
Date: Wed, 15 Apr 2026 12:11:06 +0800
Subject: [PATCH 03/21] qwen3.5 mixed modality

---
 lmdeploy/pytorch/models/qwen3_5.py | 45 ++++++++++++++++++++----------
 lmdeploy/vl/model/qwen3_5.py       | 16 +----------
 2 files changed, 32 insertions(+), 29 deletions(-)

diff --git a/lmdeploy/pytorch/models/qwen3_5.py b/lmdeploy/pytorch/models/qwen3_5.py
index aae94cb6d7..2983ec3951 100644
--- a/lmdeploy/pytorch/models/qwen3_5.py
+++ b/lmdeploy/pytorch/models/qwen3_5.py
@@ -15,6 +15,7 @@
 from lmdeploy.pytorch.distributed import get_tp_world_rank
 from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor
 from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
+from lmdeploy.pytorch.multimodal.data_type import MultiModalData
 from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, LayerNorm, RMSNorm, SiluAndMul
 from lmdeploy.pytorch.nn.gated_delta import CausalConv1d, GatedDelta, GatedDeltaMeta, build_rmsnorm_gated
 from lmdeploy.pytorch.nn.linear import (
@@ -1022,7 +1023,7 @@ def forward(
         pixel_values: torch.Tensor | None = None,
         vis_cu_seqlens: torch.Tensor | None = None,
         vis_pos_emb: torch.Tensor | None = None,
-        image_mask: torch.Tensor | None = None,
+        multimodal_mask: torch.Tensor | None = None,
         pos_embeds: torch.Tensor | None = None,
         grid_thw: torch.Tensor | None = None,
         all_routed_experts: torch.Tensor | None = None,
@@ -1051,8 +1052,8 @@ def forward(
                 image_embeds = torch.cat(image_embeds, dim=0).to(inputs_embeds.device, dtype)
 
                 # mask and scatter to create final input embeddings
-                expanded_image_mask = image_mask.unsqueeze(-1).expand_as(inputs_embeds)
-                inputs_embeds = inputs_embeds.masked_scatter(expanded_image_mask, image_embeds)
+                multimodal_mask = multimodal_mask.unsqueeze(-1).expand_as(inputs_embeds)
+                inputs_embeds = inputs_embeds.masked_scatter(multimodal_mask, image_embeds)
 
         output_inputs_embeds = inputs_embeds if return_input_embeds else None
 
@@ -1126,7 +1127,7 @@ def forward(
         pixel_values: torch.Tensor | None = None,
         vis_cu_seqlens: torch.Tensor | None = None,
         vis_pos_emb: torch.Tensor | None = None,
-        image_mask: torch.Tensor | None = None,
+        multimodal_mask: torch.Tensor | None = None,
         pos_embeds: torch.Tensor | None = None,
         grid_thw: torch.Tensor | None = None,
         return_input_embeds: bool = False,
@@ -1151,7 +1152,7 @@ def forward(
             pixel_values=pixel_values,
             vis_cu_seqlens=vis_cu_seqlens,
             vis_pos_emb=vis_pos_emb,
-            image_mask=image_mask,
+            multimodal_mask=multimodal_mask,
             pos_embeds=pos_embeds,
             grid_thw=grid_thw,
             all_routed_experts=all_routed_experts,
@@ -1165,6 +1166,27 @@ def get_input_embeddings(self):
         """Get input embeddings."""
         return self.model.get_input_embeddings()
 
+    def get_multimodal_mask(self, input_ids: torch.Tensor, mm_inputs: list[MultiModalData]) -> torch.Tensor:
+        """Get position masks for vision tokens."""
+        image_token_id = next((m.meta.get('image_token_id') for m in mm_inputs if m.modality == Modality.IMAGE), None)
+        video_token_id = next((m.meta.get('video_token_id') for m in mm_inputs if m.modality == Modality.VIDEO), None)
+
+        image_mask, video_mask = None, None
+        if image_token_id is not None:
+            image_mask = (input_ids == image_token_id)
+        if video_token_id is not None:
+            video_mask = (input_ids == video_token_id)
+
+        multimodal_mask = None
+        if image_mask is not None and video_mask is not None:
+            multimodal_mask = image_mask | video_mask
+        elif image_mask is not None:
+            multimodal_mask = image_mask
+        elif video_mask is not None:
+            multimodal_mask = video_mask
+
+        return multimodal_mask
+
     def prepare_inputs_for_generation(
         self,
         past_key_values: list[list[torch.Tensor]],
@@ -1192,7 +1214,7 @@ def prepare_inputs_for_generation(
         pixel_values = None
         vis_cu_seqlens = None
         vis_pos_emb = None
-        image_mask = None
+        multimodal_mask = None
         grid_thw = None
         pos_embeds = None
         if context.input_multimodals is not None:
@@ -1201,15 +1223,10 @@ def prepare_inputs_for_generation(
             mm_inputs = [item for sublist in mm_inputs for item in sublist]
 
             if len(mm_inputs) > 0:
-                modality = mm_inputs[0].modality
                 pixel_values = torch.cat([inp.data for inp in mm_inputs])
 
-                image_token_id = mm_inputs[0].meta.get('image_token_id')
-                video_token_id = mm_inputs[0].meta.get('video_token_id')
-                mm_token_id = image_token_id if modality == Modality.IMAGE else video_token_id
-                image_mask = (input_ids == mm_token_id)
-
-                grid_thw = torch.cat([data.meta['grid_thw'] for data in mm_inputs]).cpu()
+                multimodal_mask = self.get_multimodal_mask(input_ids, mm_inputs)
+                grid_thw = torch.stack([data.meta['grid_thw'] for data in mm_inputs]).cpu()
                 vis_pos_emb = self.model.visual.rot_pos_emb(grid_thw)
                 pos_embeds = self.model.visual.fast_pos_embed_interpolate(grid_thw)
                 vis_cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2],
@@ -1244,7 +1261,7 @@ def prepare_inputs_for_generation(
             pixel_values=pixel_values,
             vis_cu_seqlens=vis_cu_seqlens,
             vis_pos_emb=vis_pos_emb,
-            image_mask=image_mask,
+            multimodal_mask=multimodal_mask,
             grid_thw=grid_thw,
             pos_embeds=pos_embeds,
             return_input_embeds=return_input_embeds,
diff --git a/lmdeploy/vl/model/qwen3_5.py b/lmdeploy/vl/model/qwen3_5.py
index f030de5e5e..3f46f0a38d 100644
--- a/lmdeploy/vl/model/qwen3_5.py
+++ b/lmdeploy/vl/model/qwen3_5.py
@@ -1,5 +1,4 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from transformers import AutoProcessor
 
 from lmdeploy.utils import get_logger
 from lmdeploy.vl.model.base import VISION_MODELS
@@ -25,17 +24,4 @@ class Qwen3_5Model(Qwen3VLModel):
 
     def build_preprocessor(self):
         check_transformers()
-
-        self.processor = AutoProcessor.from_pretrained(self.model_path)
-
-        # image tokens
-        self.image_token = self.processor.image_token
-        self.image_token_id = self.processor.image_token_id
-
-        # video tokens
-        self.video_token = self.processor.video_token
-        self.video_token_id = self.processor.video_token_id
-
-        # vision start and end tokens
-        self.vision_start_token = self.processor.vision_start_token
-        self.vision_end_token = self.processor.vision_end_token
+        super().build_preprocessor()

From cd85dcf64ee5d6b5092e7aa8b2592115653b0a80 Mon Sep 17 00:00:00 2001
From: zxy <zhou0493@e.ntu.edu.sg>
Date: Wed, 15 Apr 2026 18:10:25 +0800
Subject: [PATCH 04/21] interns1 pro mixed modality, fix kwargs

---
 lmdeploy/pytorch/models/interns1_pro.py |  76 ++++++------
 lmdeploy/vl/model/base.py               | 106 ++++++++++-------
 lmdeploy/vl/model/interns1_pro.py       | 148 ++++++++----------------
 3 files changed, 151 insertions(+), 179 deletions(-)

diff --git a/lmdeploy/pytorch/models/interns1_pro.py b/lmdeploy/pytorch/models/interns1_pro.py
index a78b9803f5..c902202c83 100644
--- a/lmdeploy/pytorch/models/interns1_pro.py
+++ b/lmdeploy/pytorch/models/interns1_pro.py
@@ -88,14 +88,13 @@ def forward(
         pixel_values: torch.Tensor = None,
         vis_cu_seqlens: torch.Tensor = None,
         vis_pos_emb: torch.Tensor = None,
-        image_mask: torch.Tensor = None,
+        multimodal_mask: torch.Tensor = None,
         pos_embeds: torch.Tensor = None,
         grid_thw: torch.Tensor = None,
         # for time series
         ts_values: torch.Tensor = None,
         ts_lens: torch.Tensor = None,
         ts_sr: torch.Tensor = None,
-        ts_mask: torch.Tensor = None,
         **kwargs,
     ):
         """Model forward, return logits."""
@@ -121,12 +120,11 @@ def forward(
                 image_embeds = torch.cat(image_embeds, dim=0).to(inputs_embeds.device, dtype)
 
                 # mask and scatter to create final input embeddings
-                expanded_image_mask = image_mask.unsqueeze(-1).expand_as(inputs_embeds)
-                inputs_embeds = inputs_embeds.masked_scatter(expanded_image_mask, image_embeds)
-
+                multimodal_mask = multimodal_mask.unsqueeze(-1).expand_as(inputs_embeds)
+                inputs_embeds = inputs_embeds.masked_scatter(multimodal_mask, image_embeds)
             elif ts_values is not None:
                 ts_embeds = self.time_series(ts_values, ts_lens, ts_sr)  # [B, T, C]
-                inputs_embeds = inputs_embeds.masked_scatter_(ts_mask[..., None], ts_embeds)
+                inputs_embeds = inputs_embeds.masked_scatter(multimodal_mask[..., None], ts_embeds)
 
         # router replay
         all_routed_experts = None
@@ -150,6 +148,32 @@ def get_input_embeddings(self):
         """Get input embeddings."""
         return self.language_model.get_input_embeddings()
 
+    def get_multimodal_mask(self, input_ids: torch.Tensor, mm_inputs: list[MultiModalData]) -> torch.Tensor:
+        """Get position masks for vision tokens."""
+        image_token_id = next((m.meta.get('image_token_id') for m in mm_inputs if m.modality == Modality.IMAGE), None)
+        video_token_id = next((m.meta.get('video_token_id') for m in mm_inputs if m.modality == Modality.VIDEO), None)
+        ts_token_id = next((m.meta.get('ts_token_id') for m in mm_inputs if m.modality == Modality.TIME_SERIES), None)
+
+        image_mask, video_mask, ts_mask = None, None, None
+        if image_token_id is not None:
+            image_mask = (input_ids == image_token_id)
+        if video_token_id is not None:
+            video_mask = (input_ids == video_token_id)
+        if ts_token_id is not None:
+            ts_mask = (input_ids == ts_token_id)
+
+        multimodal_mask = None
+        if image_mask is not None and video_mask is not None:
+            multimodal_mask = image_mask | video_mask
+        elif image_mask is not None:
+            multimodal_mask = image_mask
+        elif video_mask is not None:
+            multimodal_mask = video_mask
+        elif ts_mask is not None:
+            multimodal_mask = ts_mask
+
+        return multimodal_mask
+
     def prepare_inputs_for_generation(
         self,
         past_key_values: list[list[torch.Tensor]],
@@ -166,14 +190,13 @@ def prepare_inputs_for_generation(
         pixel_values = None
         vis_cu_seqlens = None
         vis_pos_emb = None
-        image_mask = None
+        multimodal_mask = None
         grid_thw = None
         pos_embeds = None
         # for time series
         ts_values = None
         ts_lens = None
         ts_sr = None
-        ts_mask = None
         if context.input_multimodals is not None:
             mm_inputs = [input_mm.get('mm_data', []) for input_mm in context.input_multimodals]
             # flatten batch
@@ -181,22 +204,15 @@ def prepare_inputs_for_generation(
 
             if len(mm_inputs) > 0:
                 modality = mm_inputs[0].modality
-                image_token_id = mm_inputs[0].meta.get('image_token_id')
-                video_token_id = mm_inputs[0].meta.get('video_token_id')
-                ts_token_id = mm_inputs[0].meta.get('ts_token_id')
+                multimodal_mask = self.get_multimodal_mask(input_ids, mm_inputs)
 
                 if modality == Modality.TIME_SERIES:
                     ts_values = torch.cat([inp.data for inp in mm_inputs])
-                    ts_mask = input_ids == ts_token_id
-
                     ts_lens = mm_inputs[0].meta['ts_lens']
                     ts_sr = mm_inputs[0].meta['ts_sr']
                 else:
                     pixel_values = torch.cat([inp.data for inp in mm_inputs])
-                    mm_token_id = image_token_id if modality == Modality.IMAGE else video_token_id
-                    image_mask = (input_ids == mm_token_id)
-
-                    grid_thw = torch.cat([data.meta['grid_thw'] for data in mm_inputs]).cpu()
+                    grid_thw = torch.stack([data.meta['grid_thw'] for data in mm_inputs]).cpu()
                     vis_pos_emb = self.visual.rot_pos_emb(grid_thw)
                     pos_embeds = self.visual.fast_pos_embed_interpolate(grid_thw)
                     vis_cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2],
@@ -223,14 +239,13 @@ def prepare_inputs_for_generation(
             pixel_values=pixel_values,
             vis_cu_seqlens=vis_cu_seqlens,
             vis_pos_emb=vis_pos_emb,
-            image_mask=image_mask,
+            multimodal_mask=multimodal_mask,
             grid_thw=grid_thw,
             pos_embeds=pos_embeds,
             # for time series
             ts_values=ts_values,
             ts_lens=ts_lens,
             ts_sr=ts_sr,
-            ts_mask=ts_mask,
         )
 
     @classmethod
@@ -375,16 +390,12 @@ def _make_image_mm_data(self, input_mm: dict[str, Any]) -> MultiModalData:
         pixel_values = input_mm['pixel_values'].to(self.dtype)
         image_grid_thw = input_mm['image_grid_thw']
         offset = input_mm['offset']
-        start = offset
         image_token_id = input_mm['image_token_id']
-        num_pad = input_mm['image_tokens']
-        if isinstance(num_pad, torch.Tensor):
-            num_pad = num_pad.item()
 
         mm_data = MultiModalData(modality=Modality.IMAGE,
                                  data=pixel_values,
-                                 start=start,
-                                 end=start + num_pad,
+                                 start=offset[0],
+                                 end=offset[1],
                                  meta=dict(grid_thw=image_grid_thw, image_token_id=image_token_id))
         return mm_data
 
@@ -393,16 +404,12 @@ def _make_video_mm_data(self, input_mm: dict[str, Any]) -> MultiModalData:
         pixel_values_videos = input_mm['pixel_values_videos'].to(self.dtype)
         video_grid_thw = input_mm['video_grid_thw']
         offset = input_mm['offset']
-        start = offset
         video_token_id = input_mm['video_token_id']
-        num_pad = input_mm['video_tokens']
-        if isinstance(num_pad, torch.Tensor):
-            num_pad = num_pad.item()
 
         mm_data = MultiModalData(modality=Modality.VIDEO,
                                  data=pixel_values_videos,
-                                 start=start,
-                                 end=start + num_pad,
+                                 start=offset[0],
+                                 end=offset[1],
                                  meta=dict(
                                      grid_thw=video_grid_thw,
                                      video_token_id=video_token_id,
@@ -416,14 +423,11 @@ def _make_time_series_mm_data(self, input_mm: dict[str, Any]) -> MultiModalData:
         ts_token_id = input_mm['ts_token_id']
         ts_lens = input_mm['ts_lens']
         ts_sr = input_mm['ts_sr']
-        num_pad = input_mm['ts_tokens']
-        if isinstance(num_pad, torch.Tensor):
-            num_pad = num_pad.item()
 
         mm_data = MultiModalData(modality=Modality.TIME_SERIES,
                                  data=ts_values,
-                                 start=offset,
-                                 end=offset + num_pad,
+                                 start=offset[0],
+                                 end=offset[1],
                                  meta=dict(ts_lens=ts_lens, ts_sr=ts_sr, ts_token_id=ts_token_id))
         return mm_data
 
diff --git a/lmdeploy/vl/model/base.py b/lmdeploy/vl/model/base.py
index ccd6998bdd..46aa9137e2 100644
--- a/lmdeploy/vl/model/base.py
+++ b/lmdeploy/vl/model/base.py
@@ -68,6 +68,10 @@ def __init__(self,
             'pixel_values_videos': Modality.VIDEO,
             'second_per_grid_ts': Modality.VIDEO,
             'video_grid_thw': Modality.VIDEO,
+            # time series-related attributes
+            'ts_values': Modality.TIME_SERIES,
+            'ts_sr': Modality.TIME_SERIES,
+            'ts_lens': Modality.TIME_SERIES,
         }
 
         # name of the feature filed
@@ -76,6 +80,7 @@ def __init__(self,
             'pixel_values_videos',
             'audio_features',
             'input_features',
+            'ts_values',
         ]
 
     @staticmethod
@@ -134,13 +139,25 @@ def _get_expanded_mm_items(self, collected_mm_items):
 
             # non-bundled case
             if not is_bundled:
-                expanded_mm_items.append(
-                    dict(
-                        modality=modality,
-                        pixel_values=item['feature'],
-                        image_grid_thw=item['image_grid_thw'][0],
-                        offset=item['offset'][0],
-                        image_token_id=self.image_token_id
+                if modality == Modality.IMAGE:
+                    expanded_mm_items.append(
+                        dict(
+                            modality=modality,
+                            pixel_values=item['feature'],
+                            image_grid_thw=item['image_grid_thw'][0],
+                            offset=item['offset'][0],
+                            image_token_id=self.image_token_id
+                            )
+                        )
+                elif modality == Modality.TIME_SERIES:
+                    expanded_mm_items.append(
+                        dict(
+                            modality=modality,
+                            ts_values=item['feature'],
+                            ts_sr=item['ts_sr'],
+                            ts_lens=item['ts_lens'],
+                            offset=item['offset'][0],
+                            ts_token_id=self.ts_token_id
                         )
                     )
                 continue
@@ -253,36 +270,23 @@ def _get_expanded_mm_items(self, collected_mm_items):
 
         return expanded_mm_items
 
-    def _get_override_kwargs(self, processor, mm_processor_kwargs: dict[str, Any] | None = None):
+    def _get_override_size(self, processor, mm_processor_kwargs: dict[str, Any] | None = None):
+        default_min = processor.size['shortest_edge']
+        default_max = processor.size['longest_edge']
         if not mm_processor_kwargs:
-            return {}, {}
-
-        def apply_override_size(sub_processor, kwargs):
-            if not kwargs or sub_processor is None or not hasattr(sub_processor, 'size'):
-                return
-            default_min = sub_processor.size['shortest_edge']
-            default_max = sub_processor.size['longest_edge']
-            override_min = kwargs.get('min_pixels', default_min)
-            override_max = kwargs.get('max_pixels', default_max)
-            if override_min > override_max:
-                logger.info(
-                    f'Overriding min_pixels {override_min} > max_pixels {override_max}, ' \
-                     'falling back to defaults.'
-                )
-                return
-            logger.info(f'Overriding processor size with min_pixels={override_min} and max_pixels={override_max}.')
-            kwargs.pop('min_pixels', None)
-            kwargs.pop('max_pixels', None)
-            kwargs['size'] = {'shortest_edge': override_min, 'longest_edge': override_max}
-
-        image_processor = getattr(processor, 'image_processor', None)
-        video_processor = getattr(processor, 'video_processor', None)
-        image_processor_kwargs = mm_processor_kwargs.get('image', {})
-        video_processor_kwargs = mm_processor_kwargs.get('video', {})
+            return {'shortest_edge': default_min, 'longest_edge': default_max}
+
+        override_min = mm_processor_kwargs.get('min_pixels', default_min)
+        override_max = mm_processor_kwargs.get('max_pixels', default_max)
+        if override_min > override_max:
+            logger.info(
+                f'Overriding min_pixels {override_min} > max_pixels {override_max}, ' \
+                f'falling back to defaults, min_pixels={default_min} and max_pixels={default_max}.'
+            )
+            return {'shortest_edge': default_min, 'longest_edge': default_max}
 
-        apply_override_size(image_processor, image_processor_kwargs)
-        apply_override_size(video_processor, video_processor_kwargs)
-        return image_processor_kwargs, video_processor_kwargs
+        logger.info(f'Overriding processor size with min_pixels={override_min} and max_pixels={override_max}.')
+        return {'shortest_edge': override_min, 'longest_edge': override_max}
 
     def preprocess(self,
                    messages: list[dict],
@@ -292,37 +296,50 @@ def preprocess(self,
 
         mm_items = self.collect_multimodal_items(messages)
 
-        raw_images, raw_videos, video_metadatas = [], [], []
+        raw_images = []
+        raw_videos, video_metadatas = [], []
+        raw_time_series, sampling_rates = [], []
         for modality, data, params in mm_items:
             if modality == Modality.IMAGE:
                 raw_images.append(data)
             elif modality == Modality.VIDEO:
                 raw_videos.append(data)
-                video_metadatas.append(params['video_metadata'])
+                video_metadatas.append(params.get('video_metadata', None))
+            elif modality == Modality.TIME_SERIES:
+                raw_time_series.append(data)
+                sampling_rates.append(params.get('sampling_rate', None))
             else:
                 raise ValueError(f'unsupported modality {modality}')
 
         # get kwrags for processor
         kwargs = {}
-        image_processor_kwargs, video_processor_kwargs = self._get_override_kwargs(self.processor, mm_processor_kwargs)
+        mm_processor_kwargs = mm_processor_kwargs or {}
         if raw_images:
             kwargs['images'] = raw_images
-            kwargs['images_kwargs'] = image_processor_kwargs
+            image_processor_kwargs = mm_processor_kwargs.get('image', None)
+            kwargs['size'] = self._get_override_size(self.processor.image_processor, image_processor_kwargs)
         if raw_videos:
             kwargs['videos'] = raw_videos
             kwargs['video_metadata'] = video_metadatas
+            video_processor_kwargs = mm_processor_kwargs.get('video', None)
+            kwargs['size'] = self._get_override_size(self.processor.video_processor, video_processor_kwargs)
             # leave resize to hf processor
             # sample frames is done in video loader, set False to avoid duplication
             kwargs['do_resize'] = True
             kwargs['do_sample_frames'] = False
-            kwargs['videos_kwargs'] = video_processor_kwargs
-        if raw_images and raw_videos and (image_processor_kwargs or video_processor_kwargs):
+        if raw_time_series:
+            assert hasattr(self, 'time_series_processor'), 'time series processor is not defined for time series input'
+            assert not raw_images and not raw_videos, 'time series is not compatible with image/video input'
+            self.tokenizer = self.processor.tokenizer
+            self.processor = self.time_series_processor
+            kwargs['time_series'] = raw_time_series
+            kwargs['sampling_rate'] = sampling_rates
+        if raw_images and raw_videos and kwargs.get('size', None) is not None:
             logger.warning(
                 'Overriding mm processor kwargs for mixed modality can be problematic, ' \
                 'skip kwargs setting for both processors to avoid potential issues.'
             )
-            kwargs.pop('images_kwargs', None)
-            kwargs.pop('videos_kwargs', None)
+            kwargs.pop('size', None)
 
         # process raw items with hf processor
         processor_outputs = self.processor(
@@ -560,10 +577,12 @@ class MultomodalSpecialTokens:
     image_token: str | list[str] | None = None
     video_token: str | list[str] | None = None
     audio_token: str | list[str] | None = None
+    ts_token: str | list[str] | None = None
 
     image_token_id: int | None = None
     video_token_id: int | None = None
     audio_token_id: int | None = None
+    ts_token_id: int | None = None
 
     def get_token_id_by_modality(self, modality: Modality) -> int | None:
         """Get token ID for a given modality."""
@@ -571,4 +590,5 @@ def get_token_id_by_modality(self, modality: Modality) -> int | None:
             Modality.IMAGE: self.image_token_id,
             Modality.VIDEO: self.video_token_id,
             Modality.AUDIO: self.audio_token_id,
+            Modality.TIME_SERIES: self.ts_token_id,
         }.get(modality)
diff --git a/lmdeploy/vl/model/interns1_pro.py b/lmdeploy/vl/model/interns1_pro.py
index 6534886017..45622d9a88 100644
--- a/lmdeploy/vl/model/interns1_pro.py
+++ b/lmdeploy/vl/model/interns1_pro.py
@@ -6,7 +6,7 @@
 
 from lmdeploy.utils import get_logger
 from lmdeploy.vl.constants import Modality
-from lmdeploy.vl.model.base import VISION_MODELS, VisionModel
+from lmdeploy.vl.model.base import VISION_MODELS, MultomodalSpecialTokens, VisionModel
 from lmdeploy.vl.model.qwen3 import Qwen3VLModel
 
 logger = get_logger('lmdeploy')
@@ -28,14 +28,27 @@ def build_preprocessor(self):
         # time series tokens
         self.ts_token = getattr(self.processor, 'ts_token', None)
         self.ts_token_id = getattr(self.processor, 'ts_token_id', None)
-
-    def _preprocess_time_series(self,
-                                data: list[Any],
-                                params: dict[str, Any],
-                                mm_processor_kwargs: dict[str, Any] | None = None) -> list[dict]:
-
-        ts_input = data
-        sr = params.get('sampling_rate') if params is not None else None
+        self.ts_start_token = getattr(self.processor, 'ts_start_token', None)
+        self.ts_end_token = getattr(self.processor, 'ts_end_token', None)
+
+        # special tokens
+        self.mm_tokens = MultomodalSpecialTokens(
+            image_token=self.image_token,
+            video_token=self.video_token,
+            ts_token=self.ts_token,
+            image_token_id=self.image_token_id,
+            video_token_id=self.video_token_id,
+            ts_token_id=self.ts_token_id
+        )
+
+    def time_series_processor(self,
+                              text: str,
+                              time_series: list[Any],
+                              sampling_rate: float | None = None,
+                              **kwargs):
+
+        ts_input = time_series[0] if isinstance(time_series, list) else time_series
+        sampling_rate = sampling_rate[0] if isinstance(sampling_rate, list) else sampling_rate
 
         if not isinstance(ts_input, np.ndarray):
             ts_input = np.array(ts_input, dtype=np.float32)
@@ -55,46 +68,37 @@ def _preprocess_time_series(self,
         ts_len = ts_input.shape[0]
 
         # set the default value to ts_len / 4 if sr is not provided or invalid
-        if sr is None or sr <= 0:
-            sr = max(ts_len / 4, 1.0)
+        if sampling_rate is None or sampling_rate <= 0:
+            sampling_rate = max(ts_len / 4, 1.0)
 
         # compute num ts tokens
-        stride = np.floor(160 / ((1 + np.exp(-sr / 100))**6))
+        stride = np.floor(160 / ((1 + np.exp(-sampling_rate / 100))**6))
         patch_size = stride * 2
         embed_length = (np.ceil((ts_len - patch_size) / stride) + 1)
         ts_tokens = int((embed_length // 2 + 1) // 2)
 
-        return dict(ts_values=[ts_input],
-                    ts_sr=[sr],
-                    ts_lens=[ts_len],
-                    ts_tokens=[ts_tokens],
+        # generate text with ts tokens
+        for i in range(len(text)):
+            if f'{self.ts_start_token}{self.ts_token}{self.ts_end_token}' in text[i]:
+                ts_placeholder = self.ts_start_token + self.ts_token * ts_tokens + self.ts_end_token
+                text[i] = text[i].replace(
+                        f'{self.ts_start_token}{self.ts_token}{self.ts_end_token}', ts_placeholder, 1
+                    )
+            elif self.ts_token in text[i]:
+                text[i] = text[i].replace(self.ts_token, self.ts_token * ts_tokens)
+
+        input_ids = self.tokenizer(text, add_special_tokens=False, **kwargs)['input_ids']
+
+        ts_input = torch.from_numpy(np.array([ts_input])).to(dtype=torch.bfloat16)
+        ts_sr = torch.tensor([sampling_rate])
+        ts_lens = torch.tensor([ts_len])
+        return dict(input_ids=input_ids,
+                    ts_values=ts_input,
+                    ts_sr=ts_sr,
+                    ts_lens=ts_lens,
                     ts_token_id=self.ts_token_id)
 
-    def preprocess(self, messages: list[dict], mm_processor_kwargs: dict[str, Any] | None = None) -> list[dict]:
-        """Refer to `super().preprocess()` for spec."""
-        outputs = []
-        self.contains_video_input = False
-        self.contains_ts_input = False
-
-        mm_items = self.collect_multimodal_items(messages)
-        for modality, data, params in mm_items:
-            result = {}
-            if modality == Modality.IMAGE:
-                result = self._preprocess_image(data, params, mm_processor_kwargs)
-            elif modality == Modality.VIDEO:
-                self.contains_video_input = True
-                result = self._preprocess_video(data, params, mm_processor_kwargs)
-            elif modality == Modality.TIME_SERIES:
-                self.contains_ts_input = True
-                result = self._preprocess_time_series(data, params, mm_processor_kwargs)
-
-            result.update(modality=modality)
-            outputs.append(result)
-
-        messages.append(dict(role='preprocess', content=outputs))
-        return messages
-
-    def proc_messages(self,
+    def apply_chat_template(self,
                       messages,
                       chat_template,
                       sequence_start,
@@ -119,65 +123,9 @@ def proc_messages(self,
         else:
             prompt_messages = messages
 
-        # time series input requires enabling_thinking = False
-        if self.contains_ts_input:
+        # time series requires enabling_thinking = False
+        if any(m == Modality.TIME_SERIES for m, _, _ in self.collect_multimodal_items(messages)):
             chat_template_kwargs['enable_thinking'] = False
 
         prompt = chat_template.messages2prompt(prompt_messages, sequence_start, tools=tools, **chat_template_kwargs)
-        return prompt, None
-
-    def to_pytorch_aux_ts(self, messages, prompt, TS_TOKEN, tokenizer, sequence_start):
-        """Pack the time series input to the compatible format with pytorch
-        engine."""
-        # collect all preprocessing result from messages
-        preps = [x['content'] for x in messages if x['role'] == 'preprocess']
-        assert len(preps) == 1
-        preps = preps[0]
-
-        # split prompt into segments and validate data
-        segs = prompt.split(TS_TOKEN)
-        assert len(segs) == len(preps) + 1, (f'the number of {TS_TOKEN} is not equal '
-                                             f'to input time series data, {len(segs) - 1} vs {len(preps)}')
-
-        input_ids = []
-        for i, seg in enumerate(segs):
-            if i > 0 and i <= len(preps):
-                preps[i - 1].update(offset=len(input_ids))
-                ts_tokens = preps[i - 1]['ts_tokens']
-
-                ts_tokens = ts_tokens[0]
-                ts_array = np.array(preps[i - 1]['ts_values'])
-
-                preps[i - 1].update(ts_tokens=ts_tokens)
-                preps[i - 1].update(ts_values=torch.from_numpy(ts_array).to(dtype=torch.bfloat16))
-                preps[i - 1].update(ts_lens=torch.tensor(preps[i - 1]['ts_lens']))
-                preps[i - 1].update(ts_sr=torch.tensor(preps[i - 1]['ts_sr']))
-
-                assert self.ts_token_id == preps[i - 1]['ts_token_id']
-                input_ids.extend([self.ts_token_id] * ts_tokens)
-            token_ids = tokenizer.encode(seg, add_bos=((i == 0) and sequence_start))
-            input_ids.extend(token_ids)
-
-        return dict(prompt=prompt, input_ids=input_ids, multimodal=preps)
-
-    def to_pytorch(self,
-                   messages,
-                   chat_template,
-                   tokenizer,
-                   sequence_start,
-                   tools: list[object] | None = None,
-                   chat_template_kwargs: dict | None = None,
-                   **kwargs):
-        """Return to the information needed by pytorch engine."""
-        prompt, _ = self.proc_messages(messages,
-                                       chat_template,
-                                       sequence_start,
-                                       tools=tools,
-                                       chat_template_kwargs=chat_template_kwargs)
-
-        if self.contains_video_input:
-            return self.to_pytorch_aux_video(messages, prompt, self.video_token, tokenizer, sequence_start)
-        elif self.contains_ts_input:
-            return self.to_pytorch_aux_ts(messages, prompt, self.ts_token, tokenizer, sequence_start)
-        else:
-            return self.to_pytorch_aux(messages, prompt, self.image_token, tokenizer, sequence_start)
+        return prompt

From dc8e388f085007011f8c7c269c80f755f0a19613 Mon Sep 17 00:00:00 2001
From: zxy <zhou0493@e.ntu.edu.sg>
Date: Thu, 16 Apr 2026 12:42:36 +0800
Subject: [PATCH 05/21] fix generate, cleanup

---
 lmdeploy/serve/processors/multimodal.py |  21 +++-
 lmdeploy/vl/engine.py                   |  51 ----------
 lmdeploy/vl/model/base.py               | 126 ++++++++----------------
 lmdeploy/vl/model/interns1_pro.py       |   4 +-
 lmdeploy/vl/model/qwen3.py              |   4 +-
 5 files changed, 61 insertions(+), 145 deletions(-)

diff --git a/lmdeploy/serve/processors/multimodal.py b/lmdeploy/serve/processors/multimodal.py
index 7cce025f17..b07374e5e9 100644
--- a/lmdeploy/serve/processors/multimodal.py
+++ b/lmdeploy/serve/processors/multimodal.py
@@ -310,6 +310,11 @@ def _has_multimodal_input(self, messages: list[dict]) -> bool:
             isinstance(message.get('content'), list) and any(
                 item.get('type') in multimodal_types for item in message['content']) for message in messages)
 
+    def _has_input_ids_input(self, messages: list[dict]) -> bool:
+        """Check whether the messages contain input_ids directly."""
+        users = [x['content'] for x in messages if x['role'] == 'user']
+        return len(users) == 1 and isinstance(users[0], list) and isinstance(users[0][0].get('text', ''), list)
+
     async def _get_text_prompt_input(self,
                                      prompt: str | list[dict],
                                      do_preprocess: bool,
@@ -357,11 +362,17 @@ async def _get_multimodal_prompt_input(self,
         engines."""
         chat_template = self.chat_template if do_preprocess else BaseChatTemplate()
         messages = await self.async_parse_multimodal_item(messages, media_io_kwargs)
-        input_text = self.vl_encoder.apply_chat_template(messages=messages,
-                                                         chat_template=chat_template,
-                                                         sequence_start=sequence_start,
-                                                         chat_template_kwargs=chat_template_kwargs)
-        input_ids, multimodal = await self.vl_encoder.preprocess(messages, input_text, mm_processor_kwargs)
+
+        # get input prompt as either text or input ids list
+        if self._has_input_ids_input(messages):
+            input_prompt = messages[0]['content'][0]['text']
+        else:
+            input_prompt = self.vl_encoder.apply_chat_template(messages=messages,
+                                                               chat_template=chat_template,
+                                                               sequence_start=sequence_start,
+                                                               chat_template_kwargs=chat_template_kwargs)
+
+        input_ids, multimodal = await self.vl_encoder.preprocess(messages, input_prompt, mm_processor_kwargs)
 
         results = {'input_ids': input_ids, 'multimodal': multimodal}
         if self.backend == 'turbomind':
diff --git a/lmdeploy/vl/engine.py b/lmdeploy/vl/engine.py
index e8d4dd0451..a15231a5e6 100644
--- a/lmdeploy/vl/engine.py
+++ b/lmdeploy/vl/engine.py
@@ -85,57 +85,6 @@ async def async_infer(self, messages: list[dict]) -> list[dict]:
         outputs = await future
         return outputs
 
-    async def wrap_for_pytorch(
-        self,
-        input_ids: list[torch.Tensor] | None = None,
-        multimodal: list[dict] | None = None,
-        # messages: list[dict],
-        # chat_template,
-        # tokenizer,
-        # sequence_start,
-        # tools: list[object] | None = None,
-        # chat_template_kwargs: dict | None = None,
-    ) -> list[dict]:
-        """
-        Args:
-            messages (list[dict]): a list of message, which is supposed to be
-                the output of `preprocess`
-
-        Returns:
-            list[dict]: a list of dicts passed to pytorch engine_instance's forward.
-                Each dict has the following structure::
-
-                    {
-                        'prompt': 'the prompt after applying chat template',
-                        'input_ids': [],
-                        'multimodal': {
-                            'pixel_values': torch.Tensor,
-                            ...
-                        },
-                    }
-        """
-        # has_input_ids = self.model.has_input_ids(messages)
-        has_input_ids = False
-        result = {}
-        if not has_input_ids:
-            return dict(
-                input_ids=input_ids,
-                multimodal=multimodal,
-            )
-            # result = self.model.to_pytorch(messages,
-            #                                chat_template,
-            #                                tokenizer,
-            #                                sequence_start,
-            #                                tools=tools,
-            #                                chat_template_kwargs=chat_template_kwargs
-            #                                )
-        else:
-            # TODO: support input_ids inputs
-            # result = self.model.to_pytorch_with_input_ids(messages)
-            pass
-
-        return result
-
     async def wrap_for_turbomind(
         self,
         messages: list[dict],
diff --git a/lmdeploy/vl/model/base.py b/lmdeploy/vl/model/base.py
index 46aa9137e2..53a4e42025 100644
--- a/lmdeploy/vl/model/base.py
+++ b/lmdeploy/vl/model/base.py
@@ -1,7 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import dataclasses
 from abc import ABC, abstractmethod
-from itertools import groupby
 from typing import Any
 
 import numpy as np
@@ -288,16 +287,31 @@ def _get_override_size(self, processor, mm_processor_kwargs: dict[str, Any] | No
         logger.info(f'Overriding processor size with min_pixels={override_min} and max_pixels={override_max}.')
         return {'shortest_edge': override_min, 'longest_edge': override_max}
 
+    def _get_expanded_input_ids(self, input_prompt, collected_mm_items) -> torch.Tensor:
+        """Get input_ids with multimodal tokens expanded."""
+        image_grid_thw = collected_mm_items.get(Modality.IMAGE, {}).get('image_grid_thw', None)
+        merge_length = self.processor.image_processor.merge_size ** 2
+        image_index = 0
+        input_ids = []
+        for token in input_prompt:
+            if token == self.image_token_id:
+                image_tokens = image_grid_thw[image_index].prod() // merge_length
+                input_ids.extend([self.image_token_id] * image_tokens)
+                image_index += 1
+            else:
+                input_ids.append(token)
+        input_ids = torch.tensor(input_ids)
+        return input_ids
+
     def preprocess(self,
                    messages: list[dict],
-                   input_text: str,
+                   input_prompt: str | list[int],
                    mm_processor_kwargs: dict[str, Any] | None = None) -> list[dict]:
         """Refer to `super().preprocess()` for spec."""
 
         mm_items = self.collect_multimodal_items(messages)
 
-        raw_images = []
-        raw_videos, video_metadatas = [], []
+        raw_images, raw_videos, video_metadatas = [], [], []
         raw_time_series, sampling_rates = [], []
         for modality, data, params in mm_items:
             if modality == Modality.IMAGE:
@@ -316,39 +330,40 @@ def preprocess(self,
         mm_processor_kwargs = mm_processor_kwargs or {}
         if raw_images:
             kwargs['images'] = raw_images
-            image_processor_kwargs = mm_processor_kwargs.get('image', None)
-            kwargs['size'] = self._get_override_size(self.processor.image_processor, image_processor_kwargs)
+            kwargs['size'] = self._get_override_size(self.processor.image_processor,
+                                                     mm_processor_kwargs.get('image', None))
         if raw_videos:
             kwargs['videos'] = raw_videos
             kwargs['video_metadata'] = video_metadatas
-            video_processor_kwargs = mm_processor_kwargs.get('video', None)
-            kwargs['size'] = self._get_override_size(self.processor.video_processor, video_processor_kwargs)
-            # leave resize to hf processor
-            # sample frames is done in video loader, set False to avoid duplication
+            kwargs['size'] = self._get_override_size(self.processor.video_processor,
+                                                     mm_processor_kwargs.get('video', None))
+            # perform resize in hf processor, while sample frames has been done in video loader
             kwargs['do_resize'] = True
             kwargs['do_sample_frames'] = False
+        if raw_images and raw_videos and kwargs.get('size', None) is not None:
+            logger.warning(
+                'Overriding mm processor kwargs for mixed modality can be problematic, '
+                'skip kwargs setting for both processors to avoid potential issues.'
+            )
+            kwargs.pop('size')
         if raw_time_series:
-            assert hasattr(self, 'time_series_processor'), 'time series processor is not defined for time series input'
-            assert not raw_images and not raw_videos, 'time series is not compatible with image/video input'
+            assert hasattr(self, 'time_series_processor'), \
+                'time series processor is not defined for time series input'
+            assert not raw_images and not raw_videos, \
+                'time series is not compatible with image/video input'
             self.tokenizer = self.processor.tokenizer
             self.processor = self.time_series_processor
             kwargs['time_series'] = raw_time_series
             kwargs['sampling_rate'] = sampling_rates
-        if raw_images and raw_videos and kwargs.get('size', None) is not None:
-            logger.warning(
-                'Overriding mm processor kwargs for mixed modality can be problematic, ' \
-                'skip kwargs setting for both processors to avoid potential issues.'
-            )
-            kwargs.pop('size', None)
 
         # process raw items with hf processor
+        input_text = input_prompt if isinstance(input_prompt, str) else ''
         processor_outputs = self.processor(
             text=[input_text],
             padding=True,
             return_tensors='pt',
             **kwargs,
         )
-        input_ids = processor_outputs['input_ids'].flatten()
 
         # collect from processor outputs and categorized by modality
         collected_mm_items: dict[Modality, dict[str, Any]] = {}
@@ -367,6 +382,12 @@ def preprocess(self,
 
                 collected_mm_items[current_modality][attr_name] = value
 
+        # get input_ids
+        if isinstance(input_prompt, str):
+            input_ids = processor_outputs['input_ids'].flatten()
+        else:
+            input_ids = self._get_expanded_input_ids(input_prompt, collected_mm_items)
+
         # compute offsets for all items
         for modality, item in collected_mm_items.items():
             mm_token_id = self.mm_tokens.get_token_id_by_modality(modality)
@@ -380,18 +401,6 @@ def preprocess(self,
 
         return input_ids.tolist(), expanded_mm_items
 
-    def has_input_ids(self, messages: list[dict]) -> bool:
-        """Check whether the messages contain input_ids directly.
-
-        Args:
-            messages (list[dict]): a list of message, which is supposed to be
-                the output of `preprocess`
-        Returns:
-            bool: whether the messages contain input_ids directly
-        """
-        users = [x['content'] for x in messages if x['role'] == 'user']
-        return len(users) == 1 and isinstance(users[0], list) and isinstance(users[0][0].get('text', ''), list)
-
     def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]:
         """Extract image feature. ONLY implement it when the backend is
         turbomind engine.
@@ -407,22 +416,6 @@ def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]:
         if self.backend == 'turbomind':
             raise NotImplementedError()
 
-    # def to_pytorch(self, messages, chat_template, tokenizer, sequence_start, chat_template_kwargs=None, **kwargs):
-    #     """Pack the preprocessing results in a format compatible with what is
-    #     required by pytorch engine. ONLY implement it when the backend is
-    #     pytorch engine.
-
-    #     Args:
-    #         messages(list[dict]): the output of `preprocess`
-    #         chat_template: the chat template defined in `lmdeploy/model.py`
-    #         tokenzer: the tokenizer model
-    #         sequence_start: starting flag of a sequence
-    #         chat_template_kwargs: additional arguments for chat template
-    #             processing, such as `add_vision_id` and `enable_thinking`
-    #     """
-    #     if self.backend == 'pytorch':
-    #         raise NotImplementedError()
-
     def to_turbomind(self, messages, chat_template, tokenizer, sequence_start, chat_template_kwargs=None, **kwargs):
         """Pack the forwarding results in a format compatible with what is
         required by turbomind engine. ONLY implement it when the backend is
@@ -490,43 +483,6 @@ def IMAGE_TOKEN_included(messages):
                     return True
         return False
 
-    def to_pytorch_with_input_ids(self, messages):
-        """Pack the preprocessing results in a format compatible with what is
-        required by pytorch engine when input_ids are provided directly.
-
-        Args:
-            messages(list[dict]): the output of `preprocess`
-        """
-        # collect all preprocessing result from messages
-        preps = [x['content'] for x in messages if x['role'] == 'preprocess']
-        assert len(preps) == 1
-        preps = preps[0]
-
-        _input_ids = messages[0]['content'][0]['text']
-        segs = []
-        for k, g in groupby(_input_ids, lambda x: x == self.image_token_id):
-            if not k:
-                segs.append(list(g))
-            else:
-                segs.extend([[]] * (len(list(g)) - 1))
-        if _input_ids[0] == self.image_token_id:
-            segs = [[]] + segs
-        if _input_ids[-1] == self.image_token_id:
-            segs = segs + [[]]
-
-        assert self.image_token_id == preps[0]['image_token_id']
-        assert len(segs) == len(preps) + 1, (f'the number of image token id {self.image_token_id} is not equal '
-                                             f'to input images, {len(segs) - 1} vs {len(preps)}')
-        input_ids = []
-        for i, seg in enumerate(segs):
-            if i > 0 and i <= len(preps):
-                preps[i - 1].update(offset=len(input_ids))
-                image_tokens = preps[i - 1]['image_tokens']
-                input_ids.extend([self.image_token_id] * image_tokens)
-            input_ids.extend(seg)
-
-        return dict(prompt=None, input_ids=input_ids, multimodal=preps)
-
     def to_turbomind_aux(self, messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start):
         """Auxiliary function to pack the forwarding results in a format
         compatible with what is required by turbomind engine.
@@ -573,7 +529,7 @@ def match(cls, config: AutoConfig):
 
 
 @dataclasses.dataclass
-class MultomodalSpecialTokens:
+class MultimodalSpecialTokens:
     image_token: str | list[str] | None = None
     video_token: str | list[str] | None = None
     audio_token: str | list[str] | None = None
diff --git a/lmdeploy/vl/model/interns1_pro.py b/lmdeploy/vl/model/interns1_pro.py
index 45622d9a88..66b21a233b 100644
--- a/lmdeploy/vl/model/interns1_pro.py
+++ b/lmdeploy/vl/model/interns1_pro.py
@@ -6,7 +6,7 @@
 
 from lmdeploy.utils import get_logger
 from lmdeploy.vl.constants import Modality
-from lmdeploy.vl.model.base import VISION_MODELS, MultomodalSpecialTokens, VisionModel
+from lmdeploy.vl.model.base import VISION_MODELS, MultimodalSpecialTokens, VisionModel
 from lmdeploy.vl.model.qwen3 import Qwen3VLModel
 
 logger = get_logger('lmdeploy')
@@ -32,7 +32,7 @@ def build_preprocessor(self):
         self.ts_end_token = getattr(self.processor, 'ts_end_token', None)
 
         # special tokens
-        self.mm_tokens = MultomodalSpecialTokens(
+        self.mm_tokens = MultimodalSpecialTokens(
             image_token=self.image_token,
             video_token=self.video_token,
             ts_token=self.ts_token,
diff --git a/lmdeploy/vl/model/qwen3.py b/lmdeploy/vl/model/qwen3.py
index 69771dc3d1..2efa35df34 100644
--- a/lmdeploy/vl/model/qwen3.py
+++ b/lmdeploy/vl/model/qwen3.py
@@ -3,7 +3,7 @@
 from transformers import AutoProcessor
 
 from lmdeploy.utils import get_logger
-from lmdeploy.vl.model.base import VISION_MODELS, MultomodalSpecialTokens, VisionModel
+from lmdeploy.vl.model.base import VISION_MODELS, MultimodalSpecialTokens, VisionModel
 
 logger = get_logger('lmdeploy')
 
@@ -39,7 +39,7 @@ def build_preprocessor(self):
         self.vision_end_token = self.processor.vision_end_token
 
         # special tokens
-        self.mm_tokens = MultomodalSpecialTokens(
+        self.mm_tokens = MultimodalSpecialTokens(
             image_token=self.image_token,
             video_token=self.video_token,
             image_token_id=self.image_token_id,

From 8d525fc87ab500ce3846c5e92008771636d1e50c Mon Sep 17 00:00:00 2001
From: zxy <zhou0493@e.ntu.edu.sg>
Date: Thu, 16 Apr 2026 12:52:24 +0800
Subject: [PATCH 06/21] minor

---
 lmdeploy/vl/engine.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lmdeploy/vl/engine.py b/lmdeploy/vl/engine.py
index a15231a5e6..fa3c5fd955 100644
--- a/lmdeploy/vl/engine.py
+++ b/lmdeploy/vl/engine.py
@@ -54,20 +54,20 @@ def apply_chat_template(self, messages, chat_template, sequence_start, chat_temp
 
     async def preprocess(self,
                          messages: list[dict],
-                         input_text: str,
+                         input_prompt: str | list[int],
                          mm_processor_kwargs: dict[str, Any] | None = None) -> list[dict]:
         """Preprocess multimodal data in the messages."""
         if _accepts_arg(self.model.preprocess, 'mm_processor_kwargs'):
             future = asyncio.get_event_loop().run_in_executor(self.executor,
                                                               self.model.preprocess,
                                                               messages,
-                                                              input_text,
+                                                              input_prompt,
                                                               mm_processor_kwargs)
         else:
             future = asyncio.get_event_loop().run_in_executor(self.executor,
                                                               self.model.preprocess,
                                                               messages,
-                                                              input_text)
+                                                              input_prompt)
         future.add_done_callback(_raise_exception_on_finish)
         outputs = await future
         return outputs

From 20cd4ec45951ac9876b9d8603fb128c6786cd4bf Mon Sep 17 00:00:00 2001
From: zxy <zhou0493@e.ntu.edu.sg>
Date: Thu, 16 Apr 2026 14:17:19 +0800
Subject: [PATCH 07/21] simplify

---
 lmdeploy/pytorch/models/interns1_pro.py | 26 ------------------------
 lmdeploy/pytorch/models/qwen3_5.py      | 23 ---------------------
 lmdeploy/pytorch/models/qwen3_vl.py     | 21 -------------------
 lmdeploy/pytorch/models/utils/model.py  | 27 +++++++++++++++++++++++++
 lmdeploy/vl/model/base.py               |  1 -
 5 files changed, 27 insertions(+), 71 deletions(-)

diff --git a/lmdeploy/pytorch/models/interns1_pro.py b/lmdeploy/pytorch/models/interns1_pro.py
index c902202c83..4467fc0bba 100644
--- a/lmdeploy/pytorch/models/interns1_pro.py
+++ b/lmdeploy/pytorch/models/interns1_pro.py
@@ -148,32 +148,6 @@ def get_input_embeddings(self):
         """Get input embeddings."""
         return self.language_model.get_input_embeddings()
 
-    def get_multimodal_mask(self, input_ids: torch.Tensor, mm_inputs: list[MultiModalData]) -> torch.Tensor:
-        """Get position masks for vision tokens."""
-        image_token_id = next((m.meta.get('image_token_id') for m in mm_inputs if m.modality == Modality.IMAGE), None)
-        video_token_id = next((m.meta.get('video_token_id') for m in mm_inputs if m.modality == Modality.VIDEO), None)
-        ts_token_id = next((m.meta.get('ts_token_id') for m in mm_inputs if m.modality == Modality.TIME_SERIES), None)
-
-        image_mask, video_mask, ts_mask = None, None, None
-        if image_token_id is not None:
-            image_mask = (input_ids == image_token_id)
-        if video_token_id is not None:
-            video_mask = (input_ids == video_token_id)
-        if ts_token_id is not None:
-            ts_mask = (input_ids == ts_token_id)
-
-        multimodal_mask = None
-        if image_mask is not None and video_mask is not None:
-            multimodal_mask = image_mask | video_mask
-        elif image_mask is not None:
-            multimodal_mask = image_mask
-        elif video_mask is not None:
-            multimodal_mask = video_mask
-        elif ts_mask is not None:
-            multimodal_mask = ts_mask
-
-        return multimodal_mask
-
     def prepare_inputs_for_generation(
         self,
         past_key_values: list[list[torch.Tensor]],
diff --git a/lmdeploy/pytorch/models/qwen3_5.py b/lmdeploy/pytorch/models/qwen3_5.py
index 2983ec3951..92135321bb 100644
--- a/lmdeploy/pytorch/models/qwen3_5.py
+++ b/lmdeploy/pytorch/models/qwen3_5.py
@@ -15,7 +15,6 @@
 from lmdeploy.pytorch.distributed import get_tp_world_rank
 from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor
 from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
-from lmdeploy.pytorch.multimodal.data_type import MultiModalData
 from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, LayerNorm, RMSNorm, SiluAndMul
 from lmdeploy.pytorch.nn.gated_delta import CausalConv1d, GatedDelta, GatedDeltaMeta, build_rmsnorm_gated
 from lmdeploy.pytorch.nn.linear import (
@@ -27,7 +26,6 @@
 )
 from lmdeploy.pytorch.nn.rotary_embedding import get_rope_parameters
 from lmdeploy.pytorch.weight_loader.model_weight_loader import default_weight_loader, load_weight
-from lmdeploy.vl.constants import Modality
 
 from .patch import add_prefix, get_build_model_context
 from .qwen2_5_vl import Qwen2_5_VisionRotaryEmbedding as Qwen3_5VisionRotaryEmbedding
@@ -1166,27 +1164,6 @@ def get_input_embeddings(self):
         """Get input embeddings."""
         return self.model.get_input_embeddings()
 
-    def get_multimodal_mask(self, input_ids: torch.Tensor, mm_inputs: list[MultiModalData]) -> torch.Tensor:
-        """Get position masks for vision tokens."""
-        image_token_id = next((m.meta.get('image_token_id') for m in mm_inputs if m.modality == Modality.IMAGE), None)
-        video_token_id = next((m.meta.get('video_token_id') for m in mm_inputs if m.modality == Modality.VIDEO), None)
-
-        image_mask, video_mask = None, None
-        if image_token_id is not None:
-            image_mask = (input_ids == image_token_id)
-        if video_token_id is not None:
-            video_mask = (input_ids == video_token_id)
-
-        multimodal_mask = None
-        if image_mask is not None and video_mask is not None:
-            multimodal_mask = image_mask | video_mask
-        elif image_mask is not None:
-            multimodal_mask = image_mask
-        elif video_mask is not None:
-            multimodal_mask = video_mask
-
-        return multimodal_mask
-
     def prepare_inputs_for_generation(
         self,
         past_key_values: list[list[torch.Tensor]],
diff --git a/lmdeploy/pytorch/models/qwen3_vl.py b/lmdeploy/pytorch/models/qwen3_vl.py
index 96478349da..ca8e359ded 100644
--- a/lmdeploy/pytorch/models/qwen3_vl.py
+++ b/lmdeploy/pytorch/models/qwen3_vl.py
@@ -601,27 +601,6 @@ def get_input_embeddings(self):
         """Get input embeddings."""
         return self.language_model.get_input_embeddings()
 
-    def get_multimodal_mask(self, input_ids: torch.Tensor, mm_inputs: list[MultiModalData]) -> torch.Tensor:
-        """Get position masks for vision tokens."""
-        image_token_id = next((m.meta.get('image_token_id') for m in mm_inputs if m.modality == Modality.IMAGE), None)
-        video_token_id = next((m.meta.get('video_token_id') for m in mm_inputs if m.modality == Modality.VIDEO), None)
-
-        image_mask, video_mask = None, None
-        if image_token_id is not None:
-            image_mask = (input_ids == image_token_id)
-        if video_token_id is not None:
-            video_mask = (input_ids == video_token_id)
-
-        multimodal_mask = None
-        if image_mask is not None and video_mask is not None:
-            multimodal_mask = image_mask | video_mask
-        elif image_mask is not None:
-            multimodal_mask = image_mask
-        elif video_mask is not None:
-            multimodal_mask = video_mask
-
-        return multimodal_mask
-
     def prepare_inputs_for_generation(
         self,
         past_key_values: list[list[torch.Tensor]],
diff --git a/lmdeploy/pytorch/models/utils/model.py b/lmdeploy/pytorch/models/utils/model.py
index 3c99240f07..ffa9b546e5 100644
--- a/lmdeploy/pytorch/models/utils/model.py
+++ b/lmdeploy/pytorch/models/utils/model.py
@@ -8,8 +8,10 @@
 from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor
 from lmdeploy.pytorch.model_inputs import ModelInputs, ModelInputsDelta, StepContext
 from lmdeploy.pytorch.models.patch import get_build_model_context
+from lmdeploy.pytorch.multimodal.data_type import MultiModalData
 from lmdeploy.pytorch.nn.embedding import ParallelEmbedding
 from lmdeploy.pytorch.nn.linear import build_rowwise_linear
+from lmdeploy.vl.constants import Modality
 
 
 class BaseModelMetaProcessor:
@@ -150,6 +152,31 @@ def build_lm_head(self,
         )
         return lm_head
 
+    def get_multimodal_mask(self, input_ids: torch.Tensor, mm_inputs: list[MultiModalData]) -> torch.Tensor:
+        """Get position masks for vision tokens."""
+        image_token_id = next((m.meta.get('image_token_id') for m in mm_inputs if m.modality == Modality.IMAGE), None)
+        video_token_id = next((m.meta.get('video_token_id') for m in mm_inputs if m.modality == Modality.VIDEO), None)
+        ts_token_id = next((m.meta.get('ts_token_id') for m in mm_inputs if m.modality == Modality.TIME_SERIES), None)
+
+        image_mask, video_mask, ts_mask = None, None, None
+        if image_token_id is not None:
+            image_mask = (input_ids == image_token_id)
+        if video_token_id is not None:
+            video_mask = (input_ids == video_token_id)
+        if ts_token_id is not None:
+            ts_mask = (input_ids == ts_token_id)
+
+        multimodal_mask = None
+        if image_mask is not None and video_mask is not None:
+            multimodal_mask = image_mask | video_mask
+        elif image_mask is not None:
+            multimodal_mask = image_mask
+        elif video_mask is not None:
+            multimodal_mask = video_mask
+        elif ts_mask is not None:
+            multimodal_mask = ts_mask
+
+        return multimodal_mask
 
 def vlm_model(vlm_cls):
     if not issubclass(vlm_cls, torch.nn.Module):
diff --git a/lmdeploy/vl/model/base.py b/lmdeploy/vl/model/base.py
index 53a4e42025..961dd9607c 100644
--- a/lmdeploy/vl/model/base.py
+++ b/lmdeploy/vl/model/base.py
@@ -372,7 +372,6 @@ def preprocess(self,
                 continue
 
             current_modality = self.ATTR_NAME_TO_MODALITY.get(attr_name)
-
             if current_modality:
                 if current_modality not in collected_mm_items:
                     collected_mm_items[current_modality] = {}

From 71112f4539815e8800f90304bbf2efcd30e8283f Mon Sep 17 00:00:00 2001
From: zxy <zhou0493@e.ntu.edu.sg>
Date: Thu, 16 Apr 2026 18:30:48 +0800
Subject: [PATCH 08/21] fix glm4.1v

---
 lmdeploy/pytorch/configurations/glm4_1v.py | 19 +++++
 lmdeploy/pytorch/models/glm4_1v.py         | 63 +++++++++++++++--
 lmdeploy/serve/processors/multimodal.py    |  3 +-
 lmdeploy/vl/model/base.py                  | 80 +++++++++++-----------
 lmdeploy/vl/model/glm4_1v.py               | 55 +++------------
 5 files changed, 128 insertions(+), 92 deletions(-)
 create mode 100644 lmdeploy/pytorch/configurations/glm4_1v.py

diff --git a/lmdeploy/pytorch/configurations/glm4_1v.py b/lmdeploy/pytorch/configurations/glm4_1v.py
new file mode 100644
index 0000000000..fbad2616ed
--- /dev/null
+++ b/lmdeploy/pytorch/configurations/glm4_1v.py
@@ -0,0 +1,19 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .builder import AutoModelConfigBuilder
+from .default import DefaultModelConfigBuilder
+
+
+class Glm4vModelConfigBuilder(AutoModelConfigBuilder):
+
+    @classmethod
+    def condition(cls, hf_config):
+        return hf_config.model_type == 'glm4v'
+
+    @classmethod
+    def build(cls, hf_config, model_path: str = None, **kwargs):
+        """build."""
+        bos_token_id = getattr(hf_config, 'bos_token_id', None)
+        hf_config.text_config.bos_token_id = bos_token_id
+        cfg = DefaultModelConfigBuilder.build(hf_config.text_config, model_path, **kwargs)
+        cfg.hf_config = hf_config
+        return cfg
diff --git a/lmdeploy/pytorch/models/glm4_1v.py b/lmdeploy/pytorch/models/glm4_1v.py
index bc7be1a07b..7ab240208b 100644
--- a/lmdeploy/pytorch/models/glm4_1v.py
+++ b/lmdeploy/pytorch/models/glm4_1v.py
@@ -2,22 +2,23 @@
 # adapted from:
 # https://github.com/huggingface/transformers/blob/main/src/transformers/models/glm4v/modeling_glm4v.py
 
-from collections.abc import Callable, Iterable
+from collections.abc import Callable, Iterable, Sequence
 from typing import Any
 
+import numpy as np
 import torch
 import torch.nn.functional as F
 from torch import nn
 from transformers.configuration_utils import PretrainedConfig
 
-from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor
+from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult
 from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
+from lmdeploy.pytorch.multimodal.data_type import MultiModalData
 from lmdeploy.pytorch.nn import ApplyRotaryEmb, FlashAttention, RMSNorm, SiluAndMul, build_rotary_embedding_from_config
 from lmdeploy.pytorch.nn.linear import build_merged_colwise_linear, build_qkv_proj, build_rowwise_linear
 from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight
 
 from .glm4 import Glm4DecoderLayer
-from .qwen2_vl import Qwen2VLInputProcessor as Glm4vInputProcessor
 from .utils.cudagraph import CudaGraphMixin
 from .utils.model import DeployModelMixin, vlm_model
 
@@ -629,7 +630,7 @@ def prepare_inputs_for_generation(
                 pixel_values = torch.cat([data.data for data in image_data])
                 image_token_id = image_data[0].meta['image_token_id']
                 image_mask = input_ids == image_token_id
-                grid_thw = torch.cat([data.meta['grid_thw'] for data in image_data]).cpu()
+                grid_thw = torch.stack([data.meta['grid_thw'] for data in image_data]).cpu()
                 vis_pos_emb, image_type_ids = self.visual.rot_pos_emb(grid_thw)
                 vis_cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2],
                                                          grid_thw[:, 0]).to(pixel_values.device)
@@ -722,3 +723,57 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
     def get_input_processor(self) -> BaseModelInputProcessor:
         """Get input processor."""
         return self.input_processor
+
+class Glm4vInputProcessor(BaseModelInputProcessor):
+    """Glm4v input processor."""
+
+    def __init__(self, config: PretrainedConfig) -> None:
+        self.config = config
+
+    @classmethod
+    def _get_multimodal_pos_ids(cls, grid_thw: Sequence[int]) -> np.ndarray:
+        """Get mrope ids."""
+        t, h, w = grid_thw
+        h = h // 2
+        w = w // 2
+        stride = np.array([h * w, w, 1])[None]
+        size = np.array([t, h, w])[None]
+        pos_ids = np.arange(t * h * w)[:, None].repeat(3, axis=1)
+        pos_ids = pos_ids // stride % size
+        return pos_ids
+
+    @classmethod
+    def make_mrope(cls, grid_thw: torch.Tensor):
+        grid_thw = grid_thw.tolist() if grid_thw.dim() == 1 else grid_thw[0].tolist()
+        img_pos_ids = cls._get_multimodal_pos_ids(grid_thw)
+        return img_pos_ids
+
+    def preprocess_input(self,
+                         input_ids: list[int],
+                         input_multimodals: list[dict[str, Any]] = None,
+                         **kwargs) -> PreprocessInputResult:
+        """Prepare multimodal input."""
+        if input_multimodals is None or len(input_multimodals) == 0:
+            return input_ids, input_multimodals
+
+        input_imgs = []
+        for input_mm in input_multimodals:
+            pixel_values = input_mm['pixel_values']
+            image_grid_thw = input_mm['image_grid_thw']
+            offset = input_mm['offset']
+            image_token_id = input_mm['image_token_id']
+
+            mrope_pos_ids = self.make_mrope(image_grid_thw)
+
+            mm_data = MultiModalData(data=pixel_values,
+                                     start=offset[0],
+                                     end=offset[1],
+                                     mrope_pos_ids=mrope_pos_ids,
+                                     meta=dict(grid_thw=image_grid_thw, image_token_id=image_token_id))
+            input_imgs.append(mm_data)
+
+        result = PreprocessInputResult(
+            input_ids=input_ids,
+            input_multimodals=dict(image=input_imgs),
+        )
+        return result
diff --git a/lmdeploy/serve/processors/multimodal.py b/lmdeploy/serve/processors/multimodal.py
index b07374e5e9..6034434a2d 100644
--- a/lmdeploy/serve/processors/multimodal.py
+++ b/lmdeploy/serve/processors/multimodal.py
@@ -372,9 +372,8 @@ async def _get_multimodal_prompt_input(self,
                                                                sequence_start=sequence_start,
                                                                chat_template_kwargs=chat_template_kwargs)
 
-        input_ids, multimodal = await self.vl_encoder.preprocess(messages, input_prompt, mm_processor_kwargs)
+        results = await self.vl_encoder.preprocess(messages, input_prompt, mm_processor_kwargs)
 
-        results = {'input_ids': input_ids, 'multimodal': multimodal}
         if self.backend == 'turbomind':
             # for tm engine, this module perform vision embedding after image
             # preprocessing. It utilizes the hf model's vision embeddings
diff --git a/lmdeploy/vl/model/base.py b/lmdeploy/vl/model/base.py
index 961dd9607c..3dbc41ae8a 100644
--- a/lmdeploy/vl/model/base.py
+++ b/lmdeploy/vl/model/base.py
@@ -127,8 +127,42 @@ def build_model(self, ):
         if self.backend == 'turbomind' or self.with_llm:
             raise NotImplementedError()
 
+    def get_override_size(self, processor, mm_processor_kwargs: dict[str, Any] | None = None):
+        default_min = processor.size['shortest_edge']
+        default_max = processor.size['longest_edge']
+        if not mm_processor_kwargs:
+            return {'shortest_edge': default_min, 'longest_edge': default_max}
+
+        override_min = mm_processor_kwargs.get('min_pixels', default_min)
+        override_max = mm_processor_kwargs.get('max_pixels', default_max)
+        if override_min > override_max:
+            logger.info(
+                f'Overriding min_pixels {override_min} > max_pixels {override_max}, ' \
+                f'falling back to defaults, min_pixels={default_min} and max_pixels={default_max}.'
+            )
+            return {'shortest_edge': default_min, 'longest_edge': default_max}
+
+        logger.info(f'Overriding processor size with min_pixels={override_min} and max_pixels={override_max}.')
+        return {'shortest_edge': override_min, 'longest_edge': override_max}
+
+    def get_expanded_input_ids(self, input_prompt, collected_mm_items) -> torch.Tensor:
+        """Get input_ids with multimodal tokens expanded."""
+        image_grid_thw = collected_mm_items.get(Modality.IMAGE, {}).get('image_grid_thw', None)
+        merge_length = self.processor.image_processor.merge_size ** 2
+        image_index = 0
+        input_ids = []
+        for token in input_prompt:
+            if token == self.image_token_id:
+                image_tokens = image_grid_thw[image_index].prod() // merge_length
+                input_ids.extend([self.image_token_id] * image_tokens)
+                image_index += 1
+            else:
+                input_ids.append(token)
+        input_ids = torch.tensor(input_ids)
+        return input_ids
+
     # adapted from https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/managers/mm_utils.py
-    def _get_expanded_mm_items(self, collected_mm_items):
+    def get_expanded_mm_items(self, collected_mm_items):
         """Hf processor outputs produced bundled data for multiple
         images/videos we need to expand them into per-image/video entries for
         better cache locality and fine-grained scheduling."""
@@ -269,40 +303,6 @@ def _get_expanded_mm_items(self, collected_mm_items):
 
         return expanded_mm_items
 
-    def _get_override_size(self, processor, mm_processor_kwargs: dict[str, Any] | None = None):
-        default_min = processor.size['shortest_edge']
-        default_max = processor.size['longest_edge']
-        if not mm_processor_kwargs:
-            return {'shortest_edge': default_min, 'longest_edge': default_max}
-
-        override_min = mm_processor_kwargs.get('min_pixels', default_min)
-        override_max = mm_processor_kwargs.get('max_pixels', default_max)
-        if override_min > override_max:
-            logger.info(
-                f'Overriding min_pixels {override_min} > max_pixels {override_max}, ' \
-                f'falling back to defaults, min_pixels={default_min} and max_pixels={default_max}.'
-            )
-            return {'shortest_edge': default_min, 'longest_edge': default_max}
-
-        logger.info(f'Overriding processor size with min_pixels={override_min} and max_pixels={override_max}.')
-        return {'shortest_edge': override_min, 'longest_edge': override_max}
-
-    def _get_expanded_input_ids(self, input_prompt, collected_mm_items) -> torch.Tensor:
-        """Get input_ids with multimodal tokens expanded."""
-        image_grid_thw = collected_mm_items.get(Modality.IMAGE, {}).get('image_grid_thw', None)
-        merge_length = self.processor.image_processor.merge_size ** 2
-        image_index = 0
-        input_ids = []
-        for token in input_prompt:
-            if token == self.image_token_id:
-                image_tokens = image_grid_thw[image_index].prod() // merge_length
-                input_ids.extend([self.image_token_id] * image_tokens)
-                image_index += 1
-            else:
-                input_ids.append(token)
-        input_ids = torch.tensor(input_ids)
-        return input_ids
-
     def preprocess(self,
                    messages: list[dict],
                    input_prompt: str | list[int],
@@ -330,12 +330,12 @@ def preprocess(self,
         mm_processor_kwargs = mm_processor_kwargs or {}
         if raw_images:
             kwargs['images'] = raw_images
-            kwargs['size'] = self._get_override_size(self.processor.image_processor,
+            kwargs['size'] = self.get_override_size(self.processor.image_processor,
                                                      mm_processor_kwargs.get('image', None))
         if raw_videos:
             kwargs['videos'] = raw_videos
             kwargs['video_metadata'] = video_metadatas
-            kwargs['size'] = self._get_override_size(self.processor.video_processor,
+            kwargs['size'] = self.get_override_size(self.processor.video_processor,
                                                      mm_processor_kwargs.get('video', None))
             # perform resize in hf processor, while sample frames has been done in video loader
             kwargs['do_resize'] = True
@@ -385,7 +385,7 @@ def preprocess(self,
         if isinstance(input_prompt, str):
             input_ids = processor_outputs['input_ids'].flatten()
         else:
-            input_ids = self._get_expanded_input_ids(input_prompt, collected_mm_items)
+            input_ids = self.get_expanded_input_ids(input_prompt, collected_mm_items)
 
         # compute offsets for all items
         for modality, item in collected_mm_items.items():
@@ -396,9 +396,9 @@ def preprocess(self,
             )
 
         # expand bundled hf processor outputs into per-image/video entry
-        expanded_mm_items = self._get_expanded_mm_items(collected_mm_items)
+        expanded_mm_items = self.get_expanded_mm_items(collected_mm_items)
 
-        return input_ids.tolist(), expanded_mm_items
+        return dict(input_ids=input_ids.tolist(), multimodal=expanded_mm_items)
 
     def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]:
         """Extract image feature. ONLY implement it when the backend is
diff --git a/lmdeploy/vl/model/glm4_1v.py b/lmdeploy/vl/model/glm4_1v.py
index 6a796105fc..fc997d9ac3 100644
--- a/lmdeploy/vl/model/glm4_1v.py
+++ b/lmdeploy/vl/model/glm4_1v.py
@@ -3,7 +3,7 @@
 from transformers import AutoConfig
 
 from lmdeploy.utils import get_logger
-from lmdeploy.vl.model.base import VISION_MODELS, VisionModel
+from lmdeploy.vl.model.base import VISION_MODELS, MultimodalSpecialTokens, VisionModel
 
 logger = get_logger('lmdeploy')
 
@@ -25,52 +25,15 @@ def match(cls, config: AutoConfig):
     def build_preprocessor(self):
         from transformers import AutoProcessor
         self.processor = AutoProcessor.from_pretrained(self.model_path)
-        tokenizer = self.processor.tokenizer
-        image_token = self.processor.image_token
-        self.image_token_id = tokenizer.encode(image_token)[-1]
 
-    def build_model(self):
-        raise NotImplementedError('turbomind has not supported glm4v yet')
+        self.image_token = self.processor.image_token
+        self.image_token_id = self.processor.image_token_id
 
-    def preprocess(self, messages: list[dict]) -> list[dict]:
-        """Refer to `super().preprocess()` for spec."""
-        images = self.collect_multimodal_items(messages)
-        optional_keys = {'resized_height', 'resized_width', 'min_pixels', 'max_pixels'}
-        outputs = []
-        for modality, image, params in images:
-            item = dict(type='image', image=image)
-            item.update({key: params[key] for key in params.keys() if key in optional_keys})
-            result = self.processor.image_processor(images=image, videos=None, return_tensors='pt')
-            merge_length = self.processor.image_processor.merge_size**2
-            image_tokens = result['image_grid_thw'].prod(dim=1) // merge_length
-            result.update(dict(image_size=image.size, image_tokens=image_tokens, image_token_id=self.image_token_id))
-            outputs.append(result)
-        messages.append(dict(role='preprocess', content=outputs))
-        return messages
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token=self.image_token,
+            image_token_id=self.image_token_id
+        )
 
-    @staticmethod
-    def proc_messages(messages, chat_template, sequence_start):
+    def apply_chat_template(self, messages, chat_template, sequence_start, chat_template_kwargs=None):
         """Apply chat template to get the prompt."""
-        prompt_messages = []
-        IMAGE_TOKEN = '<IMAGE_TOKEN>'
-        for message in messages:
-            if isinstance(message['content'], str):
-                prompt_messages.append(message)
-                continue
-            elif message['role'] in ['images', 'preprocess', 'forward']:
-                continue
-            n_images = len([1 for x in message['content'] if x['type'] == 'image'])
-            content = [item['text'] for item in message['content'] if item['type'] == 'text']
-            prompt = content[0]
-            if IMAGE_TOKEN in prompt and '<|begin_of_image|>' not in prompt:
-                prompt = prompt.replace(IMAGE_TOKEN, f'<|begin_of_image|>{IMAGE_TOKEN}<|end_of_image|>')
-            else:
-                prompt = f'<|begin_of_image|>{IMAGE_TOKEN}<|end_of_image|>' * \
-                    n_images + prompt
-            prompt_messages.append(dict(role=message['role'], content=prompt))
-        prompt = chat_template.messages2prompt(prompt_messages, sequence_start)
-        return prompt, IMAGE_TOKEN
-
-    def to_pytorch(self, messages, chat_template, tokenizer, sequence_start, **kwargs):
-        prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, sequence_start)
-        return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start)
+        return chat_template.messages2prompt(messages, sequence_start, **chat_template_kwargs)

From 1994bfa7e4deecc2b7237aee71d9d3eb7badbbfd Mon Sep 17 00:00:00 2001
From: zxy <zhou0493@e.ntu.edu.sg>
Date: Thu, 16 Apr 2026 20:57:22 +0800
Subject: [PATCH 09/21] compatible with legacy preprocess, give up re-writing
 all ...

---
 lmdeploy/serve/processors/multimodal.py |  44 +++++-----
 lmdeploy/vl/engine.py                   |  66 +++++++++++----
 lmdeploy/vl/model/base.py               | 108 +++++++++++++++++++++++-
 3 files changed, 177 insertions(+), 41 deletions(-)

diff --git a/lmdeploy/serve/processors/multimodal.py b/lmdeploy/serve/processors/multimodal.py
index 6034434a2d..54bbaa062d 100644
--- a/lmdeploy/serve/processors/multimodal.py
+++ b/lmdeploy/serve/processors/multimodal.py
@@ -1,5 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import asyncio
+import inspect
 from typing import Any, Literal
 
 import PIL
@@ -310,11 +311,6 @@ def _has_multimodal_input(self, messages: list[dict]) -> bool:
             isinstance(message.get('content'), list) and any(
                 item.get('type') in multimodal_types for item in message['content']) for message in messages)
 
-    def _has_input_ids_input(self, messages: list[dict]) -> bool:
-        """Check whether the messages contain input_ids directly."""
-        users = [x['content'] for x in messages if x['role'] == 'user']
-        return len(users) == 1 and isinstance(users[0], list) and isinstance(users[0][0].get('text', ''), list)
-
     async def _get_text_prompt_input(self,
                                      prompt: str | list[dict],
                                      do_preprocess: bool,
@@ -363,23 +359,11 @@ async def _get_multimodal_prompt_input(self,
         chat_template = self.chat_template if do_preprocess else BaseChatTemplate()
         messages = await self.async_parse_multimodal_item(messages, media_io_kwargs)
 
-        # get input prompt as either text or input ids list
-        if self._has_input_ids_input(messages):
-            input_prompt = messages[0]['content'][0]['text']
-        else:
-            input_prompt = self.vl_encoder.apply_chat_template(messages=messages,
-                                                               chat_template=chat_template,
-                                                               sequence_start=sequence_start,
-                                                               chat_template_kwargs=chat_template_kwargs)
-
-        results = await self.vl_encoder.preprocess(messages, input_prompt, mm_processor_kwargs)
+        uses_new_preprocess = 'mm_processor_kwargs' in inspect.signature(
+            self.vl_encoder.model.preprocess).parameters
 
         if self.backend == 'turbomind':
-            # for tm engine, this module perform vision embedding after image
-            # preprocessing. It utilizes the hf model's vision embeddings
-            # functions and returns the input_ids, input_embeddings,
-            # embedding_ranges and so on. All the returned values are passed
-            # to tm engine for token generation
+            results = await self.vl_encoder.preprocess(messages, None, mm_processor_kwargs)
             results = await self.vl_encoder.async_infer(results)
             results = await self.vl_encoder.wrap_for_turbomind(messages=results,
                                                                chat_template=chat_template,
@@ -388,8 +372,22 @@ async def _get_multimodal_prompt_input(self,
                                                                tools=tools,
                                                                chat_template_kwargs=chat_template_kwargs)
         elif self.backend == 'pytorch':
-            # for pt engine, this module only conduct the image preprocessing
-            # It leaves the vision embedding to the pt engine
-            return results
+            if uses_new_preprocess:
+                if self.vl_encoder.model.has_input_ids(messages):
+                    input_prompt = messages[0]['content'][0]['text']
+                else:
+                    input_prompt = self.vl_encoder.apply_chat_template(messages=messages,
+                                                                       chat_template=chat_template,
+                                                                       sequence_start=sequence_start,
+                                                                       chat_template_kwargs=chat_template_kwargs)
+                results = await self.vl_encoder.preprocess(messages, input_prompt, mm_processor_kwargs)
+            else:
+                results = await self.vl_encoder.preprocess(messages, mm_processor_kwargs)
+                results = await self.vl_encoder.wrap_for_pytorch(messages=results,
+                                                                 chat_template=chat_template,
+                                                                 tokenizer=self.tokenizer,
+                                                                 sequence_start=sequence_start,
+                                                                 tools=tools,
+                                                                 chat_template_kwargs=chat_template_kwargs)
 
         return results
diff --git a/lmdeploy/vl/engine.py b/lmdeploy/vl/engine.py
index fa3c5fd955..b1e0331c13 100644
--- a/lmdeploy/vl/engine.py
+++ b/lmdeploy/vl/engine.py
@@ -24,11 +24,6 @@ def _raise_exception_on_finish(task: asyncio.Task) -> None:
         raise e
 
 
-def _accepts_arg(func, arg_name: str) -> bool:
-    """Check if a function accepts a specific keyword argument."""
-    return arg_name in inspect.signature(func).parameters
-
-
 class ImageEncoder:
     """Image encoder."""
 
@@ -54,20 +49,14 @@ def apply_chat_template(self, messages, chat_template, sequence_start, chat_temp
 
     async def preprocess(self,
                          messages: list[dict],
-                         input_prompt: str | list[int],
+                         input_prompt: str | list[int] | None = None,
                          mm_processor_kwargs: dict[str, Any] | None = None) -> list[dict]:
         """Preprocess multimodal data in the messages."""
-        if _accepts_arg(self.model.preprocess, 'mm_processor_kwargs'):
-            future = asyncio.get_event_loop().run_in_executor(self.executor,
-                                                              self.model.preprocess,
-                                                              messages,
-                                                              input_prompt,
-                                                              mm_processor_kwargs)
-        else:
-            future = asyncio.get_event_loop().run_in_executor(self.executor,
-                                                              self.model.preprocess,
-                                                              messages,
-                                                              input_prompt)
+        sig_params = inspect.signature(self.model.preprocess).parameters
+        kwargs = {k: v for k, v in [('input_prompt', input_prompt), ('mm_processor_kwargs', mm_processor_kwargs)]
+                  if k in sig_params}
+        future = asyncio.get_event_loop().run_in_executor(
+            self.executor, lambda: self.model.preprocess(messages, **kwargs))
         future.add_done_callback(_raise_exception_on_finish)
         outputs = await future
         return outputs
@@ -123,3 +112,46 @@ async def wrap_for_turbomind(
                 messages[i]['preprocess'] = None
                 messages[i]['forward'] = None
         return result
+
+    async def wrap_for_pytorch(
+            self,
+            messages: list[dict],
+            chat_template,
+            tokenizer,
+            sequence_start,
+            tools: list[object] | None = None,
+            chat_template_kwargs: dict | None = None,
+        ) -> list[dict]:
+            """
+            Args:
+                messages (list[dict]): a list of message, which is supposed to be
+                    the output of `preprocess`
+
+            Returns:
+                list[dict]: a list of dicts passed to pytorch engine_instance's forward.
+                    Each dict has the following structure::
+
+                        {
+                            'prompt': 'the prompt after applying chat template',
+                            'input_ids': [],
+                            'multimodal': {
+                                'pixel_values': torch.Tensor,
+                                ...
+                            },
+                        }
+            """
+            has_input_ids = self.model.has_input_ids(messages)
+            if not has_input_ids:
+                result = self.model.to_pytorch(messages,
+                                            chat_template,
+                                            tokenizer,
+                                            sequence_start,
+                                            tools=tools,
+                                            chat_template_kwargs=chat_template_kwargs)
+            else:
+                result = self.model.to_pytorch_with_input_ids(messages)
+            # clear data
+            for i, message in enumerate(messages):
+                if isinstance(message['content'], list):
+                    messages[i]['preprocess'] = None
+            return result
diff --git a/lmdeploy/vl/model/base.py b/lmdeploy/vl/model/base.py
index 3dbc41ae8a..64280c233a 100644
--- a/lmdeploy/vl/model/base.py
+++ b/lmdeploy/vl/model/base.py
@@ -1,6 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import dataclasses
 from abc import ABC, abstractmethod
+from itertools import groupby
 from typing import Any
 
 import numpy as np
@@ -307,7 +308,11 @@ def preprocess(self,
                    messages: list[dict],
                    input_prompt: str | list[int],
                    mm_processor_kwargs: dict[str, Any] | None = None) -> list[dict]:
-        """Refer to `super().preprocess()` for spec."""
+        """Preprocess multimodal data and return input_ids + multimodal
+        features.
+
+        New-style models inherit this implementation. Legacy models override with `def preprocess(self, messages)`.
+        """
 
         mm_items = self.collect_multimodal_items(messages)
 
@@ -400,6 +405,19 @@ def preprocess(self,
 
         return dict(input_ids=input_ids.tolist(), multimodal=expanded_mm_items)
 
+    @staticmethod
+    def has_input_ids(messages: list[dict]) -> bool:
+        """Check whether the messages contain input_ids directly.
+
+        Args:
+            messages (list[dict]): a list of message, which is supposed to be
+                the output of `preprocess`
+        Returns:
+            bool: whether the messages contain input_ids directly
+        """
+        users = [x['content'] for x in messages if x['role'] == 'user']
+        return len(users) == 1 and isinstance(users[0], list) and isinstance(users[0][0].get('text', ''), list)
+
     def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]:
         """Extract image feature. ONLY implement it when the backend is
         turbomind engine.
@@ -415,6 +433,22 @@ def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]:
         if self.backend == 'turbomind':
             raise NotImplementedError()
 
+    def to_pytorch(self, messages, chat_template, tokenizer, sequence_start, chat_template_kwargs=None, **kwargs):
+        """Pack the preprocessing results in a format compatible with what is
+        required by pytorch engine. ONLY implement it when the backend is
+        pytorch engine.
+
+        Args:
+            messages(list[dict]): the output of `preprocess`
+            chat_template: the chat template defined in `lmdeploy/model.py`
+            tokenzer: the tokenizer model
+            sequence_start: starting flag of a sequence
+            chat_template_kwargs: additional arguments for chat template
+                processing, such as `add_vision_id` and `enable_thinking`
+        """
+        if self.backend == 'pytorch':
+            raise NotImplementedError()
+
     def to_turbomind(self, messages, chat_template, tokenizer, sequence_start, chat_template_kwargs=None, **kwargs):
         """Pack the forwarding results in a format compatible with what is
         required by turbomind engine. ONLY implement it when the backend is
@@ -482,6 +516,78 @@ def IMAGE_TOKEN_included(messages):
                     return True
         return False
 
+    def to_pytorch_with_input_ids(self, messages):
+        """Pack the preprocessing results in a format compatible with what is
+        required by pytorch engine when input_ids are provided directly.
+
+        Args:
+            messages(list[dict]): the output of `preprocess`
+        """
+        # collect all preprocessing result from messages
+        preps = [x['content'] for x in messages if x['role'] == 'preprocess']
+        assert len(preps) == 1
+        preps = preps[0]
+
+        _input_ids = messages[0]['content'][0]['text']
+        segs = []
+        for k, g in groupby(_input_ids, lambda x: x == self.image_token_id):
+            if not k:
+                segs.append(list(g))
+            else:
+                segs.extend([[]] * (len(list(g)) - 1))
+        if _input_ids[0] == self.image_token_id:
+            segs = [[]] + segs
+        if _input_ids[-1] == self.image_token_id:
+            segs = segs + [[]]
+
+        assert self.image_token_id == preps[0]['image_token_id']
+        assert len(segs) == len(preps) + 1, (f'the number of image token id {self.image_token_id} is not equal '
+                                             f'to input images, {len(segs) - 1} vs {len(preps)}')
+        input_ids = []
+        for i, seg in enumerate(segs):
+            if i > 0 and i <= len(preps):
+                preps[i - 1].update(offset=len(input_ids))
+                image_tokens = preps[i - 1]['image_tokens']
+                input_ids.extend([self.image_token_id] * image_tokens)
+            input_ids.extend(seg)
+
+        return dict(prompt=None, input_ids=input_ids, multimodal=preps)
+
+    def to_pytorch_aux(self, messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start):
+        """Auxiliary function to pack the preprocessing results in a format
+        compatible with what is required by pytorch engine.
+
+        Args:
+            messages(list[dict]): the output of `preprocess`
+            prompt(str): the prompt after applying chat template
+            IMAGE_TOKEN(str): a placeholder where image tokens will be
+                inserted
+            tokenzer: the tokenizer model
+            sequence_start: starting flag of a sequence
+        """
+        # collect all preprocessing result from messages
+        preps = [x['content'] for x in messages if x['role'] == 'preprocess']
+        assert len(preps) == 1
+        preps = preps[0]
+
+        # split prompt into segments and validate data
+        segs = prompt.split(IMAGE_TOKEN)
+        assert len(segs) == len(preps) + 1, (f'the number of {IMAGE_TOKEN} is not equal '
+                                             f'to input images, {len(segs) - 1} vs {len(preps)}')
+
+        # calculate the image token offset for each image
+        input_ids = []
+        for i, seg in enumerate(segs):
+            if i > 0 and i <= len(preps):
+                preps[i - 1].update(offset=len(input_ids))
+                image_tokens = preps[i - 1]['image_tokens']
+                assert self.image_token_id == preps[i - 1]['image_token_id']
+                input_ids.extend([self.image_token_id] * image_tokens)
+            token_ids = tokenizer.encode(seg, add_bos=((i == 0) and sequence_start))
+            input_ids.extend(token_ids)
+
+        return dict(prompt=prompt, input_ids=input_ids, multimodal=preps)
+
     def to_turbomind_aux(self, messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start):
         """Auxiliary function to pack the forwarding results in a format
         compatible with what is required by turbomind engine.

From f47d0113fb90e376bdb3293e294fac08760e60c6 Mon Sep 17 00:00:00 2001
From: zxy <zhou0493@e.ntu.edu.sg>
Date: Thu, 16 Apr 2026 21:22:45 +0800
Subject: [PATCH 10/21] fix bugs

---
 lmdeploy/pytorch/messages.py            |  4 +--
 lmdeploy/serve/core/async_engine.py     |  4 +--
 lmdeploy/serve/processors/multimodal.py | 12 ++++-----
 lmdeploy/vl/model/base.py               | 33 +++++++++++++------------
 4 files changed, 26 insertions(+), 27 deletions(-)

diff --git a/lmdeploy/pytorch/messages.py b/lmdeploy/pytorch/messages.py
index 8dd2656fa4..966bd430cd 100644
--- a/lmdeploy/pytorch/messages.py
+++ b/lmdeploy/pytorch/messages.py
@@ -576,13 +576,13 @@ def __init__(self, multimodals: MultiModalInputs = None):
         self.multimodals = multimodals
 
     def get_datas(self, start=0, end=-1):
-        """Get multimodals from prompts position [start, end]."""
+        """Get multimodals from prompts position [start, end)."""
         outs: MultiModalInputs = dict()
         test_range = range(start, end)
         for modal_type, modal_datas in self.multimodals.items():
             data = []
             for modal_data in modal_datas:
-                if (modal_data.start not in test_range and modal_data.end not in test_range):
+                if (modal_data.start not in test_range or modal_data.end - 1 not in test_range):
                     continue
                 data.append(modal_data)
             if len(data) > 0:
diff --git a/lmdeploy/serve/core/async_engine.py b/lmdeploy/serve/core/async_engine.py
index 9483ae6254..9d05bbdab7 100644
--- a/lmdeploy/serve/core/async_engine.py
+++ b/lmdeploy/serve/core/async_engine.py
@@ -396,11 +396,9 @@ async def generate(
                                                                         media_io_kwargs=media_io_kwargs,
                                                                         mm_processor_kwargs=mm_processor_kwargs,
                                                                         **kwargs)
-            # prompt = prompt_input['prompt']
             input_ids = prompt_input['input_ids']
             self.request_logger.log_inputs(session,
-                                        #    prompt=prompt,
-                                           prompt='DEBUG!',
+                                           prompt=prompt_input.get('prompt'),
                                            prompt_token_ids=input_ids,
                                            gen_config=gen_config,
                                            adapter_name=adapter_name)
diff --git a/lmdeploy/serve/processors/multimodal.py b/lmdeploy/serve/processors/multimodal.py
index 54bbaa062d..56cbfe365a 100644
--- a/lmdeploy/serve/processors/multimodal.py
+++ b/lmdeploy/serve/processors/multimodal.py
@@ -38,6 +38,10 @@ def __init__(self,
         self.chat_template = chat_template
         self.vl_encoder = vl_encoder
         self.backend = backend
+        self._uses_new_preprocess = (
+            vl_encoder is not None and
+            'mm_processor_kwargs' in inspect.signature(vl_encoder.model.preprocess).parameters
+        )
 
     @staticmethod
     def merge_message_content(msg: dict) -> dict:
@@ -113,7 +117,6 @@ def _parse_multimodal_item(i: int, in_messages: list[dict], out_messages: list[d
             item_params = item.get(item_type, {}).copy()
             data_src = item_params.pop('url', None) or item_params.pop('data', None)
 
-            modality = None
             if item_type == 'image_data':
                 modality = Modality.IMAGE
                 data = data_src
@@ -359,11 +362,8 @@ async def _get_multimodal_prompt_input(self,
         chat_template = self.chat_template if do_preprocess else BaseChatTemplate()
         messages = await self.async_parse_multimodal_item(messages, media_io_kwargs)
 
-        uses_new_preprocess = 'mm_processor_kwargs' in inspect.signature(
-            self.vl_encoder.model.preprocess).parameters
-
         if self.backend == 'turbomind':
-            results = await self.vl_encoder.preprocess(messages, None, mm_processor_kwargs)
+            results = await self.vl_encoder.preprocess(messages, mm_processor_kwargs)
             results = await self.vl_encoder.async_infer(results)
             results = await self.vl_encoder.wrap_for_turbomind(messages=results,
                                                                chat_template=chat_template,
@@ -372,7 +372,7 @@ async def _get_multimodal_prompt_input(self,
                                                                tools=tools,
                                                                chat_template_kwargs=chat_template_kwargs)
         elif self.backend == 'pytorch':
-            if uses_new_preprocess:
+            if self._uses_new_preprocess:
                 if self.vl_encoder.model.has_input_ids(messages):
                     input_prompt = messages[0]['content'][0]['text']
                 else:
diff --git a/lmdeploy/vl/model/base.py b/lmdeploy/vl/model/base.py
index 64280c233a..b95122e696 100644
--- a/lmdeploy/vl/model/base.py
+++ b/lmdeploy/vl/model/base.py
@@ -83,22 +83,6 @@ def __init__(self,
             'ts_values',
         ]
 
-    @staticmethod
-    def get_mm_items_offset(
-        input_ids: torch.Tensor, mm_token_id: int
-    ) -> list[tuple[int, int]]:
-        """
-        Get a set of range for mm_items from input_ids
-        Example:
-            input_ids = [1, 2, 3, 3, 3, 4, 3, 3]
-            mm_token_id = 3
-            return result = [(2,4),(6,7)]
-        """
-        mask = input_ids == mm_token_id
-        start_positions = (mask & ~torch.roll(mask, 1)).nonzero(as_tuple=True)[0]
-        end_positions = (mask & ~torch.roll(mask, -1)).nonzero(as_tuple=True)[0]
-        return list(zip(start_positions.tolist(), end_positions.tolist()))
-
     def get_pad_token_id(self, model_path, hf_config):
         """Get pad_token_id from hf_config or tokenizer."""
         pad_token_id = getattr(hf_config, 'pad_token_id', None)
@@ -128,6 +112,23 @@ def build_model(self, ):
         if self.backend == 'turbomind' or self.with_llm:
             raise NotImplementedError()
 
+    @staticmethod
+    def get_mm_items_offset(
+        input_ids: torch.Tensor, mm_token_id: int
+    ) -> list[tuple[int, int]]:
+        """
+        Get a set of range for mm_items from input_ids
+        Example:
+            input_ids = [1, 2, 3, 3, 3, 4, 3, 3]
+            mm_token_id = 3
+            return result = [(2,4),(6,7)]
+        """
+        mask = input_ids == mm_token_id
+        start_positions = (mask & ~torch.roll(mask, 1)).nonzero(as_tuple=True)[0]
+        end_positions = (mask & ~torch.roll(mask, -1)).nonzero(as_tuple=True)[0]
+        end_positions += 1 # convert to exclusive end index, compatible with legacy pytorch implementation
+        return list(zip(start_positions.tolist(), end_positions.tolist()))
+
     def get_override_size(self, processor, mm_processor_kwargs: dict[str, Any] | None = None):
         default_min = processor.size['shortest_edge']
         default_max = processor.size['longest_edge']

From 1d40a7620790e60e3ec4bb351f2f7325d53c4dea Mon Sep 17 00:00:00 2001
From: zxy <zhou0493@e.ntu.edu.sg>
Date: Thu, 16 Apr 2026 21:27:28 +0800
Subject: [PATCH 11/21] minor

---
 lmdeploy/serve/core/async_engine.py |  5 +-
 lmdeploy/vl/engine.py               | 78 ++++++++++++++---------------
 2 files changed, 42 insertions(+), 41 deletions(-)

diff --git a/lmdeploy/serve/core/async_engine.py b/lmdeploy/serve/core/async_engine.py
index 9d05bbdab7..f8bb9ccda2 100644
--- a/lmdeploy/serve/core/async_engine.py
+++ b/lmdeploy/serve/core/async_engine.py
@@ -396,9 +396,10 @@ async def generate(
                                                                         media_io_kwargs=media_io_kwargs,
                                                                         mm_processor_kwargs=mm_processor_kwargs,
                                                                         **kwargs)
-            input_ids = prompt_input['input_ids']
+            prompt = prompt_input.get('prompt')
+            input_ids = prompt_input.get('input_ids')
             self.request_logger.log_inputs(session,
-                                           prompt=prompt_input.get('prompt'),
+                                           prompt=prompt,
                                            prompt_token_ids=input_ids,
                                            gen_config=gen_config,
                                            adapter_name=adapter_name)
diff --git a/lmdeploy/vl/engine.py b/lmdeploy/vl/engine.py
index b1e0331c13..5b8634dcbb 100644
--- a/lmdeploy/vl/engine.py
+++ b/lmdeploy/vl/engine.py
@@ -74,45 +74,6 @@ async def async_infer(self, messages: list[dict]) -> list[dict]:
         outputs = await future
         return outputs
 
-    async def wrap_for_turbomind(
-        self,
-        messages: list[dict],
-        chat_template,
-        tokenizer,
-        sequence_start,
-        tools: list[object] | None = None,
-        chat_template_kwargs: dict | None = None,
-    ) -> dict:
-        """
-        Args:
-            messages (list[dict]): a list of message, which is supposed to be
-                the output of `async_infer`
-
-        Returns:
-            dict: a dict passed to turbomind engine_instance's forward.
-                The dict has the following structure::
-
-                    {
-                        'prompt': 'the prompt after applying chat template',
-                        'input_ids': [],
-                        'input_embeddings': list[torch.Tensor],
-                        'input_embedding_ranges': list[torch.Tensor],
-                        ...
-                    }
-        """
-        result = self.model.to_turbomind(messages,
-                                         chat_template,
-                                         tokenizer,
-                                         sequence_start,
-                                         tools=tools,
-                                         chat_template_kwargs=chat_template_kwargs)
-        # clear data
-        for i, message in enumerate(messages):
-            if isinstance(message['content'], list):
-                messages[i]['preprocess'] = None
-                messages[i]['forward'] = None
-        return result
-
     async def wrap_for_pytorch(
             self,
             messages: list[dict],
@@ -155,3 +116,42 @@ async def wrap_for_pytorch(
                 if isinstance(message['content'], list):
                     messages[i]['preprocess'] = None
             return result
+
+    async def wrap_for_turbomind(
+        self,
+        messages: list[dict],
+        chat_template,
+        tokenizer,
+        sequence_start,
+        tools: list[object] | None = None,
+        chat_template_kwargs: dict | None = None,
+    ) -> dict:
+        """
+        Args:
+            messages (list[dict]): a list of message, which is supposed to be
+                the output of `async_infer`
+
+        Returns:
+            dict: a dict passed to turbomind engine_instance's forward.
+                The dict has the following structure::
+
+                    {
+                        'prompt': 'the prompt after applying chat template',
+                        'input_ids': [],
+                        'input_embeddings': list[torch.Tensor],
+                        'input_embedding_ranges': list[torch.Tensor],
+                        ...
+                    }
+        """
+        result = self.model.to_turbomind(messages,
+                                         chat_template,
+                                         tokenizer,
+                                         sequence_start,
+                                         tools=tools,
+                                         chat_template_kwargs=chat_template_kwargs)
+        # clear data
+        for i, message in enumerate(messages):
+            if isinstance(message['content'], list):
+                messages[i]['preprocess'] = None
+                messages[i]['forward'] = None
+        return result

From b69fd8b0a4c41b9daebdb6f0987d76e3dd19347e Mon Sep 17 00:00:00 2001
From: zxy <zhou0493@e.ntu.edu.sg>
Date: Thu, 16 Apr 2026 21:28:53 +0800
Subject: [PATCH 12/21] minor

---
 lmdeploy/vl/engine.py | 82 +++++++++++++++++++++----------------------
 1 file changed, 41 insertions(+), 41 deletions(-)

diff --git a/lmdeploy/vl/engine.py b/lmdeploy/vl/engine.py
index 5b8634dcbb..67cfbb82f0 100644
--- a/lmdeploy/vl/engine.py
+++ b/lmdeploy/vl/engine.py
@@ -75,47 +75,47 @@ async def async_infer(self, messages: list[dict]) -> list[dict]:
         return outputs
 
     async def wrap_for_pytorch(
-            self,
-            messages: list[dict],
-            chat_template,
-            tokenizer,
-            sequence_start,
-            tools: list[object] | None = None,
-            chat_template_kwargs: dict | None = None,
-        ) -> list[dict]:
-            """
-            Args:
-                messages (list[dict]): a list of message, which is supposed to be
-                    the output of `preprocess`
-
-            Returns:
-                list[dict]: a list of dicts passed to pytorch engine_instance's forward.
-                    Each dict has the following structure::
-
-                        {
-                            'prompt': 'the prompt after applying chat template',
-                            'input_ids': [],
-                            'multimodal': {
-                                'pixel_values': torch.Tensor,
-                                ...
-                            },
-                        }
-            """
-            has_input_ids = self.model.has_input_ids(messages)
-            if not has_input_ids:
-                result = self.model.to_pytorch(messages,
-                                            chat_template,
-                                            tokenizer,
-                                            sequence_start,
-                                            tools=tools,
-                                            chat_template_kwargs=chat_template_kwargs)
-            else:
-                result = self.model.to_pytorch_with_input_ids(messages)
-            # clear data
-            for i, message in enumerate(messages):
-                if isinstance(message['content'], list):
-                    messages[i]['preprocess'] = None
-            return result
+        self,
+        messages: list[dict],
+        chat_template,
+        tokenizer,
+        sequence_start,
+        tools: list[object] | None = None,
+        chat_template_kwargs: dict | None = None,
+    ) -> list[dict]:
+        """
+        Args:
+            messages (list[dict]): a list of message, which is supposed to be
+                the output of `preprocess`
+
+        Returns:
+            list[dict]: a list of dicts passed to pytorch engine_instance's forward.
+                Each dict has the following structure::
+
+                    {
+                        'prompt': 'the prompt after applying chat template',
+                        'input_ids': [],
+                        'multimodal': {
+                            'pixel_values': torch.Tensor,
+                            ...
+                        },
+                    }
+        """
+        has_input_ids = self.model.has_input_ids(messages)
+        if not has_input_ids:
+            result = self.model.to_pytorch(messages,
+                                        chat_template,
+                                        tokenizer,
+                                        sequence_start,
+                                        tools=tools,
+                                        chat_template_kwargs=chat_template_kwargs)
+        else:
+            result = self.model.to_pytorch_with_input_ids(messages)
+        # clear data
+        for i, message in enumerate(messages):
+            if isinstance(message['content'], list):
+                messages[i]['preprocess'] = None
+        return result
 
     async def wrap_for_turbomind(
         self,

From 70d417890dfc5d5a76a58ddcae5355b580d4de40 Mon Sep 17 00:00:00 2001
From: zxy <zhou0493@e.ntu.edu.sg>
Date: Thu, 16 Apr 2026 21:37:00 +0800
Subject: [PATCH 13/21] minor

---
 lmdeploy/vl/engine.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/lmdeploy/vl/engine.py b/lmdeploy/vl/engine.py
index 67cfbb82f0..8379e7708a 100644
--- a/lmdeploy/vl/engine.py
+++ b/lmdeploy/vl/engine.py
@@ -104,11 +104,11 @@ async def wrap_for_pytorch(
         has_input_ids = self.model.has_input_ids(messages)
         if not has_input_ids:
             result = self.model.to_pytorch(messages,
-                                        chat_template,
-                                        tokenizer,
-                                        sequence_start,
-                                        tools=tools,
-                                        chat_template_kwargs=chat_template_kwargs)
+                                           chat_template,
+                                           tokenizer,
+                                           sequence_start,
+                                           tools=tools,
+                                           chat_template_kwargs=chat_template_kwargs)
         else:
             result = self.model.to_pytorch_with_input_ids(messages)
         # clear data

From 006a8ca6ac5ec40c6fcea6317efddc11d9769335 Mon Sep 17 00:00:00 2001
From: zxy <zhou0493@e.ntu.edu.sg>
Date: Fri, 17 Apr 2026 11:16:53 +0800
Subject: [PATCH 14/21] fix ut

---
 .../test_vl/test_hf_chat_template.py          | 48 +++++++++++++++++--
 .../test_vl/test_qwen3vl_processor.py         | 36 ++++++++------
 2 files changed, 66 insertions(+), 18 deletions(-)

diff --git a/tests/test_lmdeploy/test_vl/test_hf_chat_template.py b/tests/test_lmdeploy/test_vl/test_hf_chat_template.py
index fdd3d193af..9a7a488081 100644
--- a/tests/test_lmdeploy/test_vl/test_hf_chat_template.py
+++ b/tests/test_lmdeploy/test_vl/test_hf_chat_template.py
@@ -117,6 +117,46 @@ def models(self):
             'Qwen/Qwen2.5-VL-7B-Instruct',
             'Qwen/Qwen2.5-VL-32B-Instruct',
             'Qwen/Qwen2.5-VL-72B-Instruct',
+        ]
+        models = [get_model_and_chat_template(model_path) for model_path in model_list]
+        return models
+
+    def test_proc_messages(self, models, mock_messages):
+        for model, chat_template in models:
+            model.build_preprocessor()
+            reference = model.processor.apply_chat_template(mock_messages,
+                                                            add_generation_prompt=True,
+                                                            tokenize=False,
+                                                            return_dict=True)
+            prompt, _ = model.proc_messages(mock_messages, chat_template, sequence_start=True)
+            assert prompt == reference
+
+    def test_pure_img_messages(self, models, mock_pure_img_messages):
+        for model, chat_template in models:
+            model.build_preprocessor()
+            reference = model.processor.apply_chat_template(mock_pure_img_messages,
+                                                            add_generation_prompt=True,
+                                                            tokenize=False,
+                                                            return_dict=True)
+            prompt, _ = model.proc_messages(mock_pure_img_messages, chat_template, sequence_start=True)
+            assert prompt == reference
+
+    def test_pure_text_messages(self, models, mock_pure_text_messages):
+        for model, chat_template in models:
+            model.build_preprocessor()
+            reference = model.processor.apply_chat_template(mock_pure_text_messages,
+                                                            add_generation_prompt=True,
+                                                            tokenize=False,
+                                                            return_dict=True)
+            prompt, _ = model.proc_messages(mock_pure_text_messages, chat_template, sequence_start=True)
+            assert prompt == reference
+
+
+class TestQwen3VLChatTemplate:
+
+    @pytest.fixture(scope='module')
+    def models(self):
+        model_list = [
             'Qwen/Qwen3-VL-2B-Instruct',
             'Qwen/Qwen3-VL-2B-Thinking',
             'Qwen/Qwen3-VL-4B-Instruct',
@@ -133,14 +173,14 @@ def models(self):
         models = [get_model_and_chat_template(model_path) for model_path in model_list]
         return models
 
-    def test_proc_messages(self, models, mock_messages):
+    def test_apply_chat_template(self, models, mock_messages):
         for model, chat_template in models:
             model.build_preprocessor()
             reference = model.processor.apply_chat_template(mock_messages,
                                                             add_generation_prompt=True,
                                                             tokenize=False,
                                                             return_dict=True)
-            prompt, _ = model.proc_messages(mock_messages, chat_template, sequence_start=True)
+            prompt = model.apply_chat_template(mock_messages, chat_template, sequence_start=True)
             assert prompt == reference
 
     def test_pure_img_messages(self, models, mock_pure_img_messages):
@@ -150,7 +190,7 @@ def test_pure_img_messages(self, models, mock_pure_img_messages):
                                                             add_generation_prompt=True,
                                                             tokenize=False,
                                                             return_dict=True)
-            prompt, _ = model.proc_messages(mock_pure_img_messages, chat_template, sequence_start=True)
+            prompt = model.apply_chat_template(mock_pure_img_messages, chat_template, sequence_start=True)
             assert prompt == reference
 
     def test_pure_text_messages(self, models, mock_pure_text_messages):
@@ -160,5 +200,5 @@ def test_pure_text_messages(self, models, mock_pure_text_messages):
                                                             add_generation_prompt=True,
                                                             tokenize=False,
                                                             return_dict=True)
-            prompt, _ = model.proc_messages(mock_pure_text_messages, chat_template, sequence_start=True)
+            prompt = model.apply_chat_template(mock_pure_text_messages, chat_template, sequence_start=True)
             assert prompt == reference
diff --git a/tests/test_lmdeploy/test_vl/test_qwen3vl_processor.py b/tests/test_lmdeploy/test_vl/test_qwen3vl_processor.py
index 3d7b0d4458..759de3fc47 100644
--- a/tests/test_lmdeploy/test_vl/test_qwen3vl_processor.py
+++ b/tests/test_lmdeploy/test_vl/test_qwen3vl_processor.py
@@ -39,9 +39,17 @@ def sample_video_messages(video_data):
     return [{'role': 'user', 'content': [{'type': 'video', 'data': frames, 'video_metadata': metadata}]}]
 
 
-def _preprocess(model, messages, **kwargs):
-    result = model.preprocess(messages=list(messages), **kwargs)
-    return result[-1]['content'][0]
+def _preprocess(model, messages, mm_processor_kwargs=None):
+    """Call model.preprocess following the same flow as the engine:
+
+    apply_chat_template → input_prompt → preprocess.
+    """
+    from lmdeploy.model import MODELS
+    chat_template = MODELS.module_dict['hf'](model_path=model.model_path)
+    input_prompt = model.apply_chat_template(messages, chat_template, sequence_start=True)
+    result = model.preprocess(messages=list(messages), input_prompt=input_prompt,
+                              mm_processor_kwargs=mm_processor_kwargs)
+    return result['multimodal'][0]
 
 
 def test_image_with_custom_pixels(qwen3vl_model, sample_messages):
@@ -56,14 +64,14 @@ def test_image_with_custom_pixels(qwen3vl_model, sample_messages):
     default_shape = _preprocess(qwen3vl_model, sample_messages)['pixel_values'].shape
 
     # [60, 1536]
+    small_kwargs = {'image': {'min_pixels': 10 * 32 * 32, 'max_pixels': 20 * 32 * 32}}
     small_shape = _preprocess(qwen3vl_model, sample_messages,
-                              mm_processor_kwargs={'min_pixels': 10 * 32 * 32,
-                                                   'max_pixels': 20 * 32 * 32})['pixel_values'].shape
+                              mm_processor_kwargs=small_kwargs)['pixel_values'].shape
 
     # [468, 1536]
+    large_kwargs = {'image': {'min_pixels': 100 * 32 * 32, 'max_pixels': 20000 * 32 * 32}}
     large_shape = _preprocess(qwen3vl_model, sample_messages,
-                              mm_processor_kwargs={'min_pixels': 100 * 32 * 32,
-                                                   'max_pixels': 20000 * 32 * 32})['pixel_values'].shape
+                              mm_processor_kwargs=large_kwargs)['pixel_values'].shape
 
     assert small_shape[0] < default_shape[0] < large_shape[0]
 
@@ -72,20 +80,20 @@ def test_video_with_custom_pixels(qwen3vl_model, sample_video_messages):
     """Test that mm_processor_kwargs min/max pixels affect video preprocessing.
 
     Videos process at native resolution by default, so we compare two constrained ranges rather than comparing against
-    the default.
+    the default. Per-frame shapes are compared (each multimodal item is one frame).
     """
 
     # [28160, 1536]
     default_shape = _preprocess(qwen3vl_model, sample_video_messages)['pixel_values_videos'].shape
 
-    # [32, 1536]
+    # [4, 1536]
+    small_kwargs = {'video': {'min_pixels': 10 * 32 * 32, 'max_pixels': 20 * 32 * 32}}
     small_shape = _preprocess(qwen3vl_model, sample_video_messages,
-                              mm_processor_kwargs={'min_pixels': 10 * 32 * 32,
-                                                   'max_pixels': 20 * 32 * 32})['pixel_values_videos'].shape
+                              mm_processor_kwargs=small_kwargs)['pixel_values_videos'].shape
 
-    # [256, 1536]
+    # [32, 1536]
+    medium_kwargs = {'video': {'min_pixels': 50 * 32 * 32, 'max_pixels': 200 * 32 * 32}}
     medium_shape = _preprocess(qwen3vl_model, sample_video_messages,
-                               mm_processor_kwargs={'min_pixels': 50 * 32 * 32,
-                                                    'max_pixels': 200 * 32 * 32})['pixel_values_videos'].shape
+                               mm_processor_kwargs=medium_kwargs)['pixel_values_videos'].shape
 
     assert small_shape[0] < medium_shape[0] <= default_shape[0]

From 7922cc3cf3d0b79990661ffe33e9309269fa55a0 Mon Sep 17 00:00:00 2001
From: zxy <zhou0493@e.ntu.edu.sg>
Date: Fri, 17 Apr 2026 11:48:27 +0800
Subject: [PATCH 15/21] fix qwen3vl moe

---
 lmdeploy/pytorch/models/qwen3_vl_moe.py |  9 ++++-----
 lmdeploy/serve/processors/multimodal.py | 17 ++++++-----------
 lmdeploy/vl/engine.py                   |  2 ++
 3 files changed, 12 insertions(+), 16 deletions(-)

diff --git a/lmdeploy/pytorch/models/qwen3_vl_moe.py b/lmdeploy/pytorch/models/qwen3_vl_moe.py
index 8de27b1b3b..9dd8263c4a 100644
--- a/lmdeploy/pytorch/models/qwen3_vl_moe.py
+++ b/lmdeploy/pytorch/models/qwen3_vl_moe.py
@@ -146,7 +146,7 @@ def forward(
         pixel_values: torch.Tensor = None,
         vis_cu_seqlens: torch.Tensor = None,
         vis_pos_emb: torch.Tensor = None,
-        image_mask: torch.Tensor = None,
+        multimodal_mask: torch.Tensor = None,
         pos_embeds: torch.Tensor = None,
         grid_thw: torch.Tensor = None,
         **kwargs,
@@ -175,10 +175,9 @@ def forward(
                 image_embeds = torch.cat(image_embeds, dim=0).to(inputs_embeds.device, dtype)
 
                 # mask and scatter to create final input embeddings
-                expanded_image_mask = image_mask.unsqueeze(-1).expand_as(inputs_embeds)
-                inputs_embeds = inputs_embeds.masked_scatter(expanded_image_mask, image_embeds)
-
-                visual_pos_masks = expanded_image_mask
+                multimodal_mask = multimodal_mask.unsqueeze(-1).expand_as(inputs_embeds)
+                inputs_embeds = inputs_embeds.masked_scatter(multimodal_mask, image_embeds)
+                visual_pos_masks = multimodal_mask
 
         # router replay
         all_routed_experts = None
diff --git a/lmdeploy/serve/processors/multimodal.py b/lmdeploy/serve/processors/multimodal.py
index 56cbfe365a..5be6b91241 100644
--- a/lmdeploy/serve/processors/multimodal.py
+++ b/lmdeploy/serve/processors/multimodal.py
@@ -38,10 +38,8 @@ def __init__(self,
         self.chat_template = chat_template
         self.vl_encoder = vl_encoder
         self.backend = backend
-        self._uses_new_preprocess = (
-            vl_encoder is not None and
-            'mm_processor_kwargs' in inspect.signature(vl_encoder.model.preprocess).parameters
-        )
+        _sig = inspect.signature(vl_encoder.model.preprocess).parameters if vl_encoder else {}
+        self._uses_new_preprocess = 'input_prompt' in _sig and 'mm_processor_kwargs' in _sig
 
     @staticmethod
     def merge_message_content(msg: dict) -> dict:
@@ -373,13 +371,10 @@ async def _get_multimodal_prompt_input(self,
                                                                chat_template_kwargs=chat_template_kwargs)
         elif self.backend == 'pytorch':
             if self._uses_new_preprocess:
-                if self.vl_encoder.model.has_input_ids(messages):
-                    input_prompt = messages[0]['content'][0]['text']
-                else:
-                    input_prompt = self.vl_encoder.apply_chat_template(messages=messages,
-                                                                       chat_template=chat_template,
-                                                                       sequence_start=sequence_start,
-                                                                       chat_template_kwargs=chat_template_kwargs)
+                input_prompt = self.vl_encoder.apply_chat_template(messages=messages,
+                                                                   chat_template=chat_template,
+                                                                   sequence_start=sequence_start,
+                                                                   chat_template_kwargs=chat_template_kwargs)
                 results = await self.vl_encoder.preprocess(messages, input_prompt, mm_processor_kwargs)
             else:
                 results = await self.vl_encoder.preprocess(messages, mm_processor_kwargs)
diff --git a/lmdeploy/vl/engine.py b/lmdeploy/vl/engine.py
index 8379e7708a..b284b6caac 100644
--- a/lmdeploy/vl/engine.py
+++ b/lmdeploy/vl/engine.py
@@ -43,6 +43,8 @@ def __init__(
         torch.cuda.empty_cache()
 
     def apply_chat_template(self, messages, chat_template, sequence_start, chat_template_kwargs=None):
+        if self.model.has_input_ids(messages):
+            return messages[0]['content'][0]['text']
         return self.model.apply_chat_template(
             messages, chat_template, sequence_start, chat_template_kwargs
         )

From 2f992c94e2915663b136ebe76512df98b66a118c Mon Sep 17 00:00:00 2001
From: zxy <zhou0493@e.ntu.edu.sg>
Date: Fri, 17 Apr 2026 14:23:36 +0800
Subject: [PATCH 16/21] allow modality-specific kwargs, add ut

---
 lmdeploy/vl/model/base.py                     | 59 ++++++++++-------
 .../test_vl/test_qwen3vl_processor.py         | 63 +++++++++++++++++++
 2 files changed, 98 insertions(+), 24 deletions(-)

diff --git a/lmdeploy/vl/model/base.py b/lmdeploy/vl/model/base.py
index b95122e696..e960254588 100644
--- a/lmdeploy/vl/model/base.py
+++ b/lmdeploy/vl/model/base.py
@@ -129,22 +129,27 @@ def get_mm_items_offset(
         end_positions += 1 # convert to exclusive end index, compatible with legacy pytorch implementation
         return list(zip(start_positions.tolist(), end_positions.tolist()))
 
-    def get_override_size(self, processor, mm_processor_kwargs: dict[str, Any] | None = None):
-        default_min = processor.size['shortest_edge']
-        default_max = processor.size['longest_edge']
+    def get_override_size(self, processor, mm_processor_kwargs: dict[str, Any] | None = None, modality: str = ''):
         if not mm_processor_kwargs:
-            return {'shortest_edge': default_min, 'longest_edge': default_max}
-
+            return None
+        try:
+            default_min = processor.size['shortest_edge']
+            default_max = processor.size['longest_edge']
+        except (AttributeError, KeyError, TypeError):
+            tag = f'[{modality}] ' if modality else ''
+            logger.warning(f'{tag}processor does not expose size[shortest_edge/longest_edge], '
+                           f'mm_processor_kwargs size override will be skipped.')
+            return None
         override_min = mm_processor_kwargs.get('min_pixels', default_min)
         override_max = mm_processor_kwargs.get('max_pixels', default_max)
+        tag = f'[{modality}] ' if modality else ''
         if override_min > override_max:
-            logger.info(
-                f'Overriding min_pixels {override_min} > max_pixels {override_max}, ' \
+            logger.warning(
+                f'{tag}Overriding min_pixels {override_min} > max_pixels {override_max}, ' \
                 f'falling back to defaults, min_pixels={default_min} and max_pixels={default_max}.'
             )
-            return {'shortest_edge': default_min, 'longest_edge': default_max}
-
-        logger.info(f'Overriding processor size with min_pixels={override_min} and max_pixels={override_max}.')
+            return None
+        logger.warning(f'{tag}Overriding processor size with min_pixels={override_min} and max_pixels={override_max}.')
         return {'shortest_edge': override_min, 'longest_edge': override_max}
 
     def get_expanded_input_ids(self, input_prompt, collected_mm_items) -> torch.Tensor:
@@ -331,27 +336,33 @@ def preprocess(self,
             else:
                 raise ValueError(f'unsupported modality {modality}')
 
-        # get kwrags for processor
+        # get kwargs for processor
         kwargs = {}
+        images_kwargs = {}
+        videos_kwargs = {}
         mm_processor_kwargs = mm_processor_kwargs or {}
         if raw_images:
             kwargs['images'] = raw_images
-            kwargs['size'] = self.get_override_size(self.processor.image_processor,
-                                                     mm_processor_kwargs.get('image', None))
+            image_size = self.get_override_size(self.processor.image_processor,
+                                                 mm_processor_kwargs.get('image'),
+                                                 modality='image')
+            if image_size is not None:
+                images_kwargs['size'] = image_size
         if raw_videos:
             kwargs['videos'] = raw_videos
-            kwargs['video_metadata'] = video_metadatas
-            kwargs['size'] = self.get_override_size(self.processor.video_processor,
-                                                     mm_processor_kwargs.get('video', None))
+            videos_kwargs['video_metadata'] = video_metadatas
             # perform resize in hf processor, while sample frames has been done in video loader
-            kwargs['do_resize'] = True
-            kwargs['do_sample_frames'] = False
-        if raw_images and raw_videos and kwargs.get('size', None) is not None:
-            logger.warning(
-                'Overriding mm processor kwargs for mixed modality can be problematic, '
-                'skip kwargs setting for both processors to avoid potential issues.'
-            )
-            kwargs.pop('size')
+            videos_kwargs['do_resize'] = True
+            videos_kwargs['do_sample_frames'] = False
+            video_size = self.get_override_size(self.processor.video_processor,
+                                                 mm_processor_kwargs.get('video'),
+                                                 modality='video')
+            if video_size is not None:
+                videos_kwargs['size'] = video_size
+        if images_kwargs:
+            kwargs['images_kwargs'] = images_kwargs
+        if videos_kwargs:
+            kwargs['videos_kwargs'] = videos_kwargs
         if raw_time_series:
             assert hasattr(self, 'time_series_processor'), \
                 'time series processor is not defined for time series input'
diff --git a/tests/test_lmdeploy/test_vl/test_qwen3vl_processor.py b/tests/test_lmdeploy/test_vl/test_qwen3vl_processor.py
index 759de3fc47..1ed9a8f790 100644
--- a/tests/test_lmdeploy/test_vl/test_qwen3vl_processor.py
+++ b/tests/test_lmdeploy/test_vl/test_qwen3vl_processor.py
@@ -1,6 +1,7 @@
 import pytest
 
 from lmdeploy.vl import load_image, load_video
+from lmdeploy.vl.constants import Modality
 from lmdeploy.vl.model.qwen3 import Qwen3VLModel
 
 QWEN3VL_MODELS = [
@@ -97,3 +98,65 @@ def test_video_with_custom_pixels(qwen3vl_model, sample_video_messages):
                                mm_processor_kwargs=medium_kwargs)['pixel_values_videos'].shape
 
     assert small_shape[0] < medium_shape[0] <= default_shape[0]
+
+
+@pytest.fixture
+def sample_mixed_messages(pil_image, video_data):
+    frames, metadata = video_data
+    return [{
+        'role': 'user',
+        'content': [
+            {'type': 'image', 'data': pil_image},
+            {'type': 'video', 'data': frames, 'video_metadata': metadata},
+        ]
+    }]
+
+
+def _preprocess_by_modality(model, messages, mm_processor_kwargs=None):
+    """Like _preprocess but returns all multimodal items grouped by
+    modality."""
+    from lmdeploy.model import MODELS
+    chat_template = MODELS.module_dict['hf'](model_path=model.model_path)
+    input_prompt = model.apply_chat_template(messages, chat_template, sequence_start=True)
+    result = model.preprocess(messages=list(messages), input_prompt=input_prompt,
+                              mm_processor_kwargs=mm_processor_kwargs)
+    by_modality = {}
+    for item in result['multimodal']:
+        by_modality.setdefault(item['modality'], []).append(item)
+    return by_modality
+
+
+def test_mixed_image_video_independent_size(qwen3vl_model, sample_mixed_messages):
+    """Per-modality mm_processor_kwargs must not bleed across image and video.
+
+    Shrinking image budget must not change video token count, and vice versa.
+    """
+    default = _preprocess_by_modality(qwen3vl_model, sample_mixed_messages)
+    default_image_patches = default[Modality.IMAGE][0]['pixel_values'].shape[0]
+    default_video_patches = sum(item['pixel_values_videos'].shape[0] for item in default[Modality.VIDEO])
+
+    # shrink image only — video must be unchanged
+    small_image = _preprocess_by_modality(qwen3vl_model, sample_mixed_messages,
+                                          mm_processor_kwargs={'image': {'min_pixels': 10 * 32 * 32,
+                                                                         'max_pixels': 20 * 32 * 32}})
+    assert small_image[Modality.IMAGE][0]['pixel_values'].shape[0] < default_image_patches
+    assert sum(item['pixel_values_videos'].shape[0]
+               for item in small_image[Modality.VIDEO]) == default_video_patches
+
+    # shrink video only — image must be unchanged
+    small_video = _preprocess_by_modality(qwen3vl_model, sample_mixed_messages,
+                                          mm_processor_kwargs={'video': {'min_pixels': 10 * 32 * 32,
+                                                                         'max_pixels': 20 * 32 * 32}})
+    assert small_video[Modality.IMAGE][0]['pixel_values'].shape[0] == default_image_patches
+    assert sum(item['pixel_values_videos'].shape[0]
+               for item in small_video[Modality.VIDEO]) < default_video_patches
+
+    # shrink both simultaneously — both must decrease independently
+    small_both = _preprocess_by_modality(qwen3vl_model, sample_mixed_messages,
+                                         mm_processor_kwargs={
+                                             'image': {'min_pixels': 10 * 32 * 32, 'max_pixels': 20 * 32 * 32},
+                                             'video': {'min_pixels': 10 * 32 * 32, 'max_pixels': 20 * 32 * 32},
+                                         })
+    assert small_both[Modality.IMAGE][0]['pixel_values'].shape[0] < default_image_patches
+    assert sum(item['pixel_values_videos'].shape[0]
+               for item in small_both[Modality.VIDEO]) < default_video_patches

From 0787a8ae6361374bbb7495ab315e5378866caf0f Mon Sep 17 00:00:00 2001
From: zxy <zhou0493@e.ntu.edu.sg>
Date: Fri, 17 Apr 2026 14:50:54 +0800
Subject: [PATCH 17/21] docs: add multi-modal input format reference (EN + ZH)

Add multimodal_inputs.md covering all modalities (text, image, video,
audio, time series, mixed) with OpenAI-style examples, local file /
base64 usage via lmdeploy.vl.utils helpers, and mm_processor_kwargs /
media_io_kwargs guidance. Link from vl_pipeline.md and index.rst.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 docs/en/multi_modal/index.rst               |   6 +
 docs/en/multi_modal/multimodal_inputs.md    | 621 ++++++++++++++++++++
 docs/en/multi_modal/vl_pipeline.md          |   2 +
 docs/zh_cn/multi_modal/index.rst            |   6 +
 docs/zh_cn/multi_modal/multimodal_inputs.md | 620 +++++++++++++++++++
 docs/zh_cn/multi_modal/vl_pipeline.md       |   2 +
 lmdeploy/vl/model/base.py                   |   8 +-
 7 files changed, 1261 insertions(+), 4 deletions(-)
 create mode 100644 docs/en/multi_modal/multimodal_inputs.md
 create mode 100644 docs/zh_cn/multi_modal/multimodal_inputs.md

diff --git a/docs/en/multi_modal/index.rst b/docs/en/multi_modal/index.rst
index a041172edb..ac0e649244 100644
--- a/docs/en/multi_modal/index.rst
+++ b/docs/en/multi_modal/index.rst
@@ -1,6 +1,12 @@
 Vision-Language Models
 =================================
 
+.. toctree::
+   :maxdepth: 2
+   :caption: Guides
+
+   multimodal_inputs.md
+
 .. toctree::
    :maxdepth: 2
    :caption: Examples
diff --git a/docs/en/multi_modal/multimodal_inputs.md b/docs/en/multi_modal/multimodal_inputs.md
new file mode 100644
index 0000000000..0f6b8af1b3
--- /dev/null
+++ b/docs/en/multi_modal/multimodal_inputs.md
@@ -0,0 +1,621 @@
+# Multi-Modal Inputs
+
+LMDeploy uses the OpenAI message format for all modalities. Each content item in a message is a dict with a `type` field that determines how it is decoded.
+
+**Quick reference:**
+
+| Modality    | `type` key        | URL field             |
+| ----------- | ----------------- | --------------------- |
+| Text        | `text`            | —                     |
+| Image       | `image_url`       | `image_url.url`       |
+| Video       | `video_url`       | `video_url.url`       |
+| Audio       | `audio_url`       | `audio_url.url`       |
+| Time Series | `time_series_url` | `time_series_url.url` |
+
+All examples below target the lmdeploy OpenAI-compatible API server. Start it with:
+
+```bash
+lmdeploy serve api_server <model_path> --server-port 23333
+```
+
+______________________________________________________________________
+
+## Text
+
+<details>
+<summary>Complete example</summary>
+
+```python
+from openai import OpenAI
+
+client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
+model_name = client.models.list().data[0].id
+
+response = client.chat.completions.create(
+    model=model_name,
+    messages=[{
+        'role': 'user',
+        'content': [{
+            'type': 'text',
+            'text': 'Who are you?',
+        }],
+    }],
+    temperature=0.8,
+    top_p=0.8,
+)
+print(response.choices[0].message.content)
+```
+
+</details>
+
+______________________________________________________________________
+
+## Single Image
+
+<details>
+<summary>Complete example</summary>
+
+```python
+from openai import OpenAI
+
+client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
+model_name = client.models.list().data[0].id
+
+response = client.chat.completions.create(
+    model=model_name,
+    messages=[{
+        'role': 'user',
+        'content': [
+            {
+                'type': 'image_url',
+                'image_url': {
+                    'url': 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg',
+                },
+            },
+            {
+                'type': 'text',
+                'text': 'Describe this image.',
+            },
+        ],
+    }],
+    temperature=0.8,
+    top_p=0.8,
+)
+print(response.choices[0].message.content)
+```
+
+</details>
+
+______________________________________________________________________
+
+## Multiple Images
+
+<details>
+<summary>Complete example</summary>
+
+```python
+from openai import OpenAI
+
+client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
+model_name = client.models.list().data[0].id
+
+response = client.chat.completions.create(
+    model=model_name,
+    messages=[{
+        'role': 'user',
+        'content': [
+            {
+                'type': 'image_url',
+                'image_url': {
+                    'url': 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg',
+                },
+            },
+            {
+                'type': 'image_url',
+                'image_url': {
+                    'url': 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg',
+                },
+            },
+            {
+                'type': 'text',
+                'text': 'Compare these two images. What are the similarities and differences?',
+            },
+        ],
+    }],
+    temperature=0.8,
+    top_p=0.8,
+)
+print(response.choices[0].message.content)
+```
+
+</details>
+
+______________________________________________________________________
+
+## Single Video
+
+> **Note:** Native video input is currently supported for **Qwen3-VL**, **Qwen3.5-VL**, and **InternS1-Pro** models only.
+
+<details>
+<summary>Complete example</summary>
+
+```python
+from openai import OpenAI
+
+client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
+model_name = client.models.list().data[0].id
+
+video_url = 'http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4'
+
+response = client.chat.completions.create(
+    model=model_name,
+    messages=[{
+        'role': 'user',
+        'content': [
+            {
+                'type': 'video_url',
+                'video_url': {
+                    'url': video_url,
+                },
+            },
+            {
+                'type': 'text',
+                'text': "What's in this video?",
+            },
+        ],
+    }],
+    temperature=0.8,
+    top_p=0.8,
+    max_completion_tokens=256,
+)
+print(response.choices[0].message.content)
+```
+
+</details>
+
+______________________________________________________________________
+
+## Multiple Videos
+
+> **Note:** Native video input is currently supported for **Qwen3-VL**, **Qwen3.5-VL**, and **InternS1-Pro** models only.
+
+<details>
+<summary>Complete example</summary>
+
+```python
+from openai import OpenAI
+
+client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
+model_name = client.models.list().data[0].id
+
+video_url_1 = 'http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4'
+video_url_2 = 'http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerBlazes.mp4'
+
+response = client.chat.completions.create(
+    model=model_name,
+    messages=[{
+        'role': 'user',
+        'content': [
+            {
+                'type': 'video_url',
+                'video_url': {'url': video_url_1},
+            },
+            {
+                'type': 'video_url',
+                'video_url': {'url': video_url_2},
+            },
+            {
+                'type': 'text',
+                'text': 'Compare these two videos. What are the similarities and differences?',
+            },
+        ],
+    }],
+    temperature=0.8,
+    top_p=0.8,
+    max_completion_tokens=256,
+)
+print(response.choices[0].message.content)
+```
+
+</details>
+
+______________________________________________________________________
+
+## Single Audio
+
+<details>
+<summary>Complete example</summary>
+
+```python
+from openai import OpenAI
+
+client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
+model_name = client.models.list().data[0].id
+
+response = client.chat.completions.create(
+    model=model_name,
+    messages=[{
+        'role': 'user',
+        'content': [
+            {
+                'type': 'audio_url',
+                'audio_url': {
+                    'url': 'https://upload.wikimedia.org/wikipedia/commons/4/4e/BWV_543-fugue.ogg',
+                },
+            },
+            {
+                'type': 'text',
+                'text': 'Describe this audio.',
+            },
+        ],
+    }],
+    temperature=0.8,
+    top_p=0.8,
+)
+print(response.choices[0].message.content)
+```
+
+</details>
+
+______________________________________________________________________
+
+## Multiple Audios
+
+<details>
+<summary>Complete example</summary>
+
+```python
+from openai import OpenAI
+
+client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
+model_name = client.models.list().data[0].id
+
+audio_url_1 = 'https://upload.wikimedia.org/wikipedia/commons/4/4e/BWV_543-fugue.ogg'
+audio_url_2 = 'https://upload.wikimedia.org/wikipedia/commons/4/4e/BWV_543-fugue.ogg'
+
+response = client.chat.completions.create(
+    model=model_name,
+    messages=[{
+        'role': 'user',
+        'content': [
+            {'type': 'audio_url', 'audio_url': {'url': audio_url_1}},
+            {'type': 'audio_url', 'audio_url': {'url': audio_url_2}},
+            {
+                'type': 'text',
+                'text': 'Compare these two audios. What are the similarities and differences?',
+            },
+        ],
+    }],
+    temperature=0.8,
+    top_p=0.8,
+)
+print(response.choices[0].message.content)
+```
+
+</details>
+
+______________________________________________________________________
+
+## Mixed Image and Video
+
+> **Note:** Native video input is currently supported for **Qwen3-VL**, **Qwen3.5-VL**, and **InternS1-Pro** models only.
+
+<details>
+<summary>Complete example</summary>
+
+```python
+from openai import OpenAI
+
+client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
+model_name = client.models.list().data[0].id
+
+image_url = 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg'
+video_url = 'http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4'
+
+response = client.chat.completions.create(
+    model=model_name,
+    messages=[{
+        'role': 'user',
+        'content': [
+            {
+                'type': 'image_url',
+                'image_url': {'url': image_url},
+            },
+            {
+                'type': 'video_url',
+                'video_url': {'url': video_url},
+            },
+            {
+                'type': 'text',
+                'text': 'Describe both the image and the video.',
+            },
+        ],
+    }],
+    temperature=0.8,
+    top_p=0.8,
+    max_completion_tokens=256,
+)
+print(response.choices[0].message.content)
+```
+
+</details>
+
+______________________________________________________________________
+
+## Time Series
+
+> **Note:** Time series input is currently supported for the **InternS1-Pro** model only.
+
+The `time_series_url` content item requires a `sampling_rate` field (in Hz) alongside the URL.
+
+<details>
+<summary>Complete example</summary>
+
+```python
+from openai import OpenAI
+
+client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
+model_name = client.models.list().data[0].id
+
+response = client.chat.completions.create(
+    model=model_name,
+    messages=[{
+        'role': 'user',
+        'content': [
+            {
+                'type': 'text',
+                'text': ('Please determine whether an Earthquake event has occurred. '
+                         'If so, specify P-wave and S-wave starting indices.'),
+            },
+            {
+                'type': 'time_series_url',
+                'time_series_url': {
+                    'url': 'https://raw.githubusercontent.com/CUHKSZzxy/Online-Data/main/0092638_seism.npy',
+                    'sampling_rate': 100,
+                },
+            },
+        ],
+    }],
+    temperature=0.8,
+    top_p=0.8,
+    max_completion_tokens=256,
+)
+print(response.choices[0].message.content)
+```
+
+</details>
+
+______________________________________________________________________
+
+## Local Files and Base64
+
+In addition to HTTP URLs, lmdeploy accepts:
+
+- **Local file paths** via `file://` scheme: `file:///absolute/path/to/file.jpg`
+- **Base64-encoded data** via data URLs: `data:<mime>;base64,<encoded_data>`
+
+Use the helpers in `lmdeploy.vl.utils` to encode local files:
+
+<details>
+<summary>Local file path example</summary>
+
+```python
+from openai import OpenAI
+
+client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
+model_name = client.models.list().data[0].id
+
+response = client.chat.completions.create(
+    model=model_name,
+    messages=[{
+        'role': 'user',
+        'content': [
+            {
+                'type': 'image_url',
+                'image_url': {
+                    'url': 'file:///path/to/your/image.jpg',
+                },
+            },
+            {'type': 'text', 'text': 'Describe this image.'},
+        ],
+    }],
+)
+print(response.choices[0].message.content)
+```
+
+</details>
+
+<details>
+<summary>Base64 encoding example (image)</summary>
+
+```python
+from openai import OpenAI
+from lmdeploy.vl.utils import encode_image_base64
+
+client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
+model_name = client.models.list().data[0].id
+
+b64 = encode_image_base64('/path/to/your/image.jpg')
+image_url = f'data:image/jpeg;base64,{b64}'
+
+response = client.chat.completions.create(
+    model=model_name,
+    messages=[{
+        'role': 'user',
+        'content': [
+            {
+                'type': 'image_url',
+                'image_url': {'url': image_url},
+            },
+            {'type': 'text', 'text': 'Describe this image.'},
+        ],
+    }],
+)
+print(response.choices[0].message.content)
+```
+
+</details>
+
+<details>
+<summary>Base64 encoding example (video)</summary>
+
+```python
+from openai import OpenAI
+from lmdeploy.vl.utils import encode_video_base64
+
+client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
+model_name = client.models.list().data[0].id
+
+# num_frames controls how many frames to sample before encoding
+b64 = encode_video_base64('/path/to/your/video.mp4', num_frames=16)
+video_url = f'data:video/mp4;base64,{b64}'
+
+response = client.chat.completions.create(
+    model=model_name,
+    messages=[{
+        'role': 'user',
+        'content': [
+            {
+                'type': 'video_url',
+                'video_url': {'url': video_url},
+            },
+            {'type': 'text', 'text': 'Describe this video.'},
+        ],
+    }],
+)
+print(response.choices[0].message.content)
+```
+
+</details>
+
+<details>
+<summary>Base64 encoding example (time series)</summary>
+
+```python
+from openai import OpenAI
+from lmdeploy.vl.utils import encode_time_series_base64
+
+client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
+model_name = client.models.list().data[0].id
+
+b64 = encode_time_series_base64('/path/to/your/data.npy')
+ts_url = f'data:application/octet-stream;base64,{b64}'
+
+response = client.chat.completions.create(
+    model=model_name,
+    messages=[{
+        'role': 'user',
+        'content': [
+            {'type': 'text', 'text': 'Analyze this time series.'},
+            {
+                'type': 'time_series_url',
+                'time_series_url': {
+                    'url': ts_url,
+                    'sampling_rate': 100,
+                },
+            },
+        ],
+    }],
+)
+print(response.choices[0].message.content)
+```
+
+</details>
+
+______________________________________________________________________
+
+## Processor and IO kwargs
+
+Two optional parameters let you control media processing:
+
+- **`mm_processor_kwargs`**: controls vision token resolution (min/max pixels per image or video frame)
+- **`media_io_kwargs`**: controls how media is loaded (e.g. video frame sampling rate and count)
+
+Both are passed as extra fields in the API request body via `extra_body`, or directly to `pipe()` when using the pipeline API.
+
+<details>
+<summary>API server example (extra_body)</summary>
+
+```python
+from openai import OpenAI
+
+client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
+model_name = client.models.list().data[0].id
+
+video_url = 'http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4'
+
+response = client.chat.completions.create(
+    model=model_name,
+    messages=[{
+        'role': 'user',
+        'content': [
+            {'type': 'video_url', 'video_url': {'url': video_url}},
+            {'type': 'text', 'text': 'Describe this video.'},
+        ],
+    }],
+    max_completion_tokens=256,
+    extra_body={
+        'mm_processor_kwargs': {
+            'image': {
+                'min_pixels': 128 * 32 * 32,
+                'max_pixels': 256 * 32 * 32,
+            },
+            'video': {
+                'min_pixels': 4 * 32 * 32,
+                'max_pixels': 256 * 32 * 32,
+            },
+        },
+        'media_io_kwargs': {
+            'video': {
+                'num_frames': 16,
+                'fps': 2,
+            },
+        },
+    },
+)
+print(response.choices[0].message.content)
+```
+
+</details>
+
+<details>
+<summary>Pipeline API equivalent</summary>
+
+```python
+from lmdeploy import pipeline, PytorchEngineConfig
+
+pipe = pipeline('<model_path>', backend_config=PytorchEngineConfig(tp=1))
+
+video_url = 'http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4'
+
+messages = [{
+    'role': 'user',
+    'content': [
+        {'type': 'video_url', 'video_url': {'url': video_url}},
+        {'type': 'text', 'text': 'Describe this video.'},
+    ],
+}]
+
+response = pipe(
+    messages,
+    mm_processor_kwargs={
+        'image': {
+            'min_pixels': 128 * 32 * 32,
+            'max_pixels': 256 * 32 * 32,
+        },
+        'video': {
+            'min_pixels': 4 * 32 * 32,
+            'max_pixels': 256 * 32 * 32,
+        },
+    },
+    media_io_kwargs={
+        'video': {
+            'num_frames': 16,
+            'fps': 2,
+        },
+    },
+)
+print(response)
+```
+
+</details>
diff --git a/docs/en/multi_modal/vl_pipeline.md b/docs/en/multi_modal/vl_pipeline.md
index 1e73fb2e06..4972ba91d5 100644
--- a/docs/en/multi_modal/vl_pipeline.md
+++ b/docs/en/multi_modal/vl_pipeline.md
@@ -10,6 +10,8 @@ Moreover, we will provide practical inference examples tailored to scenarios wit
 
 Using the pipeline interface to infer other VLM models is similar, with the main difference being the configuration and installation dependencies of the models. You can read [here](https://lmdeploy.readthedocs.io/en/latest/multi_modal/index.html) for environment installation and configuration methods for different models.
 
+> **See also:** [Multi-Modal Inputs](multimodal_inputs.md) — message format reference for all modalities (image, video, audio, time series) with OpenAI-style examples.
+
 ## A 'Hello, world' example
 
 ```python
diff --git a/docs/zh_cn/multi_modal/index.rst b/docs/zh_cn/multi_modal/index.rst
index 9a61f6efdb..ed33bba8d3 100644
--- a/docs/zh_cn/multi_modal/index.rst
+++ b/docs/zh_cn/multi_modal/index.rst
@@ -1,6 +1,12 @@
 视觉语言模型
 =================================
 
+.. toctree::
+   :maxdepth: 2
+   :caption: 指南
+
+   multimodal_inputs.md
+
 .. toctree::
    :maxdepth: 2
    :caption: 示例
diff --git a/docs/zh_cn/multi_modal/multimodal_inputs.md b/docs/zh_cn/multi_modal/multimodal_inputs.md
new file mode 100644
index 0000000000..c5e54abe75
--- /dev/null
+++ b/docs/zh_cn/multi_modal/multimodal_inputs.md
@@ -0,0 +1,620 @@
+# 多模态输入
+
+LMDeploy 使用 OpenAI 消息格式处理所有模态。消息中的每个内容项都是一个包含 `type` 字段的字典，该字段决定了数据的解码方式。
+
+**快速参考：**
+
+| 模态     | `type` 字段       | URL 字段              |
+| -------- | ----------------- | --------------------- |
+| 文本     | `text`            | —                     |
+| 图像     | `image_url`       | `image_url.url`       |
+| 视频     | `video_url`       | `video_url.url`       |
+| 音频     | `audio_url`       | `audio_url.url`       |
+| 时序数据 | `time_series_url` | `time_series_url.url` |
+
+以下示例均面向 lmdeploy 兼容 OpenAI 的 API 服务。启动服务：
+
+```bash
+lmdeploy serve api_server <model_path> --server-port 23333
+```
+
+______________________________________________________________________
+
+## 纯文本
+
+<details>
+<summary>完整示例</summary>
+
+```python
+from openai import OpenAI
+
+client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
+model_name = client.models.list().data[0].id
+
+response = client.chat.completions.create(
+    model=model_name,
+    messages=[{
+        'role': 'user',
+        'content': [{
+            'type': 'text',
+            'text': '你是谁？',
+        }],
+    }],
+    temperature=0.8,
+    top_p=0.8,
+)
+print(response.choices[0].message.content)
+```
+
+</details>
+
+______________________________________________________________________
+
+## 单张图像
+
+<details>
+<summary>完整示例</summary>
+
+```python
+from openai import OpenAI
+
+client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
+model_name = client.models.list().data[0].id
+
+response = client.chat.completions.create(
+    model=model_name,
+    messages=[{
+        'role': 'user',
+        'content': [
+            {
+                'type': 'image_url',
+                'image_url': {
+                    'url': 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg',
+                },
+            },
+            {
+                'type': 'text',
+                'text': '描述这张图片。',
+            },
+        ],
+    }],
+    temperature=0.8,
+    top_p=0.8,
+)
+print(response.choices[0].message.content)
+```
+
+</details>
+
+______________________________________________________________________
+
+## 多张图像
+
+<details>
+<summary>完整示例</summary>
+
+```python
+from openai import OpenAI
+
+client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
+model_name = client.models.list().data[0].id
+
+response = client.chat.completions.create(
+    model=model_name,
+    messages=[{
+        'role': 'user',
+        'content': [
+            {
+                'type': 'image_url',
+                'image_url': {
+                    'url': 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg',
+                },
+            },
+            {
+                'type': 'image_url',
+                'image_url': {
+                    'url': 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg',
+                },
+            },
+            {
+                'type': 'text',
+                'text': '比较这两张图片，有哪些相似点和不同点？',
+            },
+        ],
+    }],
+    temperature=0.8,
+    top_p=0.8,
+)
+print(response.choices[0].message.content)
+```
+
+</details>
+
+______________________________________________________________________
+
+## 单个视频
+
+> **注意：** 原生视频输入目前仅支持 **Qwen3-VL**、**Qwen3.5-VL** 和 **InternS1-Pro** 模型。
+
+<details>
+<summary>完整示例</summary>
+
+```python
+from openai import OpenAI
+
+client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
+model_name = client.models.list().data[0].id
+
+video_url = 'http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4'
+
+response = client.chat.completions.create(
+    model=model_name,
+    messages=[{
+        'role': 'user',
+        'content': [
+            {
+                'type': 'video_url',
+                'video_url': {
+                    'url': video_url,
+                },
+            },
+            {
+                'type': 'text',
+                'text': '这个视频里有什么？',
+            },
+        ],
+    }],
+    temperature=0.8,
+    top_p=0.8,
+    max_completion_tokens=256,
+)
+print(response.choices[0].message.content)
+```
+
+</details>
+
+______________________________________________________________________
+
+## 多个视频
+
+> **注意：** 原生视频输入目前仅支持 **Qwen3-VL**、**Qwen3.5-VL** 和 **InternS1-Pro** 模型。
+
+<details>
+<summary>完整示例</summary>
+
+```python
+from openai import OpenAI
+
+client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
+model_name = client.models.list().data[0].id
+
+video_url_1 = 'http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4'
+video_url_2 = 'http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerBlazes.mp4'
+
+response = client.chat.completions.create(
+    model=model_name,
+    messages=[{
+        'role': 'user',
+        'content': [
+            {
+                'type': 'video_url',
+                'video_url': {'url': video_url_1},
+            },
+            {
+                'type': 'video_url',
+                'video_url': {'url': video_url_2},
+            },
+            {
+                'type': 'text',
+                'text': '比较这两个视频，有哪些相似点和不同点？',
+            },
+        ],
+    }],
+    temperature=0.8,
+    top_p=0.8,
+    max_completion_tokens=256,
+)
+print(response.choices[0].message.content)
+```
+
+</details>
+
+______________________________________________________________________
+
+## 单个音频
+
+<details>
+<summary>完整示例</summary>
+
+```python
+from openai import OpenAI
+
+client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
+model_name = client.models.list().data[0].id
+
+response = client.chat.completions.create(
+    model=model_name,
+    messages=[{
+        'role': 'user',
+        'content': [
+            {
+                'type': 'audio_url',
+                'audio_url': {
+                    'url': 'https://upload.wikimedia.org/wikipedia/commons/4/4e/BWV_543-fugue.ogg',
+                },
+            },
+            {
+                'type': 'text',
+                'text': '描述这段音频。',
+            },
+        ],
+    }],
+    temperature=0.8,
+    top_p=0.8,
+)
+print(response.choices[0].message.content)
+```
+
+</details>
+
+______________________________________________________________________
+
+## 多个音频
+
+<details>
+<summary>完整示例</summary>
+
+```python
+from openai import OpenAI
+
+client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
+model_name = client.models.list().data[0].id
+
+audio_url_1 = 'https://upload.wikimedia.org/wikipedia/commons/4/4e/BWV_543-fugue.ogg'
+audio_url_2 = 'https://upload.wikimedia.org/wikipedia/commons/4/4e/BWV_543-fugue.ogg'
+
+response = client.chat.completions.create(
+    model=model_name,
+    messages=[{
+        'role': 'user',
+        'content': [
+            {'type': 'audio_url', 'audio_url': {'url': audio_url_1}},
+            {'type': 'audio_url', 'audio_url': {'url': audio_url_2}},
+            {
+                'type': 'text',
+                'text': '比较这两段音频，有哪些相似点和不同点？',
+            },
+        ],
+    }],
+    temperature=0.8,
+    top_p=0.8,
+)
+print(response.choices[0].message.content)
+```
+
+</details>
+
+______________________________________________________________________
+
+## 图像与视频混合
+
+> **注意：** 原生视频输入目前仅支持 **Qwen3-VL**、**Qwen3.5-VL** 和 **InternS1-Pro** 模型。
+
+<details>
+<summary>完整示例</summary>
+
+```python
+from openai import OpenAI
+
+client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
+model_name = client.models.list().data[0].id
+
+image_url = 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg'
+video_url = 'http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4'
+
+response = client.chat.completions.create(
+    model=model_name,
+    messages=[{
+        'role': 'user',
+        'content': [
+            {
+                'type': 'image_url',
+                'image_url': {'url': image_url},
+            },
+            {
+                'type': 'video_url',
+                'video_url': {'url': video_url},
+            },
+            {
+                'type': 'text',
+                'text': '描述这张图片和这个视频。',
+            },
+        ],
+    }],
+    temperature=0.8,
+    top_p=0.8,
+    max_completion_tokens=256,
+)
+print(response.choices[0].message.content)
+```
+
+</details>
+
+______________________________________________________________________
+
+## 时序数据
+
+> **注意：** 时序数据输入目前仅支持 **InternS1-Pro** 模型。
+
+`time_series_url` 内容项需要在 URL 之外额外提供 `sampling_rate` 字段（单位：Hz）。
+
+<details>
+<summary>完整示例</summary>
+
+```python
+from openai import OpenAI
+
+client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
+model_name = client.models.list().data[0].id
+
+response = client.chat.completions.create(
+    model=model_name,
+    messages=[{
+        'role': 'user',
+        'content': [
+            {
+                'type': 'text',
+                'text': '请判断是否发生了地震事件，若有请指出P波和S波的起始索引。',
+            },
+            {
+                'type': 'time_series_url',
+                'time_series_url': {
+                    'url': 'https://raw.githubusercontent.com/CUHKSZzxy/Online-Data/main/0092638_seism.npy',
+                    'sampling_rate': 100,
+                },
+            },
+        ],
+    }],
+    temperature=0.8,
+    top_p=0.8,
+    max_completion_tokens=256,
+)
+print(response.choices[0].message.content)
+```
+
+</details>
+
+______________________________________________________________________
+
+## 本地文件与 Base64
+
+除 HTTP URL 外，lmdeploy 还支持：
+
+- **本地文件路径**，使用 `file://` 协议：`file:///absolute/path/to/file.jpg`
+- **Base64 编码数据**，使用 data URL：`data:<mime>;base64,<encoded_data>`
+
+可使用 `lmdeploy.vl.utils` 中的工具函数对本地文件进行编码：
+
+<details>
+<summary>本地文件路径示例</summary>
+
+```python
+from openai import OpenAI
+
+client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
+model_name = client.models.list().data[0].id
+
+response = client.chat.completions.create(
+    model=model_name,
+    messages=[{
+        'role': 'user',
+        'content': [
+            {
+                'type': 'image_url',
+                'image_url': {
+                    'url': 'file:///path/to/your/image.jpg',
+                },
+            },
+            {'type': 'text', 'text': '描述这张图片。'},
+        ],
+    }],
+)
+print(response.choices[0].message.content)
+```
+
+</details>
+
+<details>
+<summary>Base64 编码示例（图像）</summary>
+
+```python
+from openai import OpenAI
+from lmdeploy.vl.utils import encode_image_base64
+
+client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
+model_name = client.models.list().data[0].id
+
+b64 = encode_image_base64('/path/to/your/image.jpg')
+image_url = f'data:image/jpeg;base64,{b64}'
+
+response = client.chat.completions.create(
+    model=model_name,
+    messages=[{
+        'role': 'user',
+        'content': [
+            {
+                'type': 'image_url',
+                'image_url': {'url': image_url},
+            },
+            {'type': 'text', 'text': '描述这张图片。'},
+        ],
+    }],
+)
+print(response.choices[0].message.content)
+```
+
+</details>
+
+<details>
+<summary>Base64 编码示例（视频）</summary>
+
+```python
+from openai import OpenAI
+from lmdeploy.vl.utils import encode_video_base64
+
+client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
+model_name = client.models.list().data[0].id
+
+# num_frames 控制编码前采样的帧数
+b64 = encode_video_base64('/path/to/your/video.mp4', num_frames=16)
+video_url = f'data:video/mp4;base64,{b64}'
+
+response = client.chat.completions.create(
+    model=model_name,
+    messages=[{
+        'role': 'user',
+        'content': [
+            {
+                'type': 'video_url',
+                'video_url': {'url': video_url},
+            },
+            {'type': 'text', 'text': '描述这个视频。'},
+        ],
+    }],
+)
+print(response.choices[0].message.content)
+```
+
+</details>
+
+<details>
+<summary>Base64 编码示例（时序数据）</summary>
+
+```python
+from openai import OpenAI
+from lmdeploy.vl.utils import encode_time_series_base64
+
+client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
+model_name = client.models.list().data[0].id
+
+b64 = encode_time_series_base64('/path/to/your/data.npy')
+ts_url = f'data:application/octet-stream;base64,{b64}'
+
+response = client.chat.completions.create(
+    model=model_name,
+    messages=[{
+        'role': 'user',
+        'content': [
+            {'type': 'text', 'text': '分析这段时序数据。'},
+            {
+                'type': 'time_series_url',
+                'time_series_url': {
+                    'url': ts_url,
+                    'sampling_rate': 100,
+                },
+            },
+        ],
+    }],
+)
+print(response.choices[0].message.content)
+```
+
+</details>
+
+______________________________________________________________________
+
+## 处理器与 IO 参数
+
+两个可选参数用于控制媒体处理行为：
+
+- **`mm_processor_kwargs`**：控制视觉 token 的分辨率（每张图片或视频帧的最小/最大像素数）
+- **`media_io_kwargs`**：控制媒体加载方式（如视频帧采样率和帧数）
+
+两者均通过 `extra_body` 作为请求体中的额外字段传入 API，或在使用 pipeline API 时直接传给 `pipe()`。
+
+<details>
+<summary>API 服务示例（extra_body）</summary>
+
+```python
+from openai import OpenAI
+
+client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
+model_name = client.models.list().data[0].id
+
+video_url = 'http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4'
+
+response = client.chat.completions.create(
+    model=model_name,
+    messages=[{
+        'role': 'user',
+        'content': [
+            {'type': 'video_url', 'video_url': {'url': video_url}},
+            {'type': 'text', 'text': '描述这个视频。'},
+        ],
+    }],
+    max_completion_tokens=256,
+    extra_body={
+        'mm_processor_kwargs': {
+            'image': {
+                'min_pixels': 128 * 32 * 32,
+                'max_pixels': 256 * 32 * 32,
+            },
+            'video': {
+                'min_pixels': 4 * 32 * 32,
+                'max_pixels': 256 * 32 * 32,
+            },
+        },
+        'media_io_kwargs': {
+            'video': {
+                'num_frames': 16,
+                'fps': 2,
+            },
+        },
+    },
+)
+print(response.choices[0].message.content)
+```
+
+</details>
+
+<details>
+<summary>Pipeline API 等价写法</summary>
+
+```python
+from lmdeploy import pipeline, PytorchEngineConfig
+
+pipe = pipeline('<model_path>', backend_config=PytorchEngineConfig(tp=1))
+
+video_url = 'http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4'
+
+messages = [{
+    'role': 'user',
+    'content': [
+        {'type': 'video_url', 'video_url': {'url': video_url}},
+        {'type': 'text', 'text': '描述这个视频。'},
+    ],
+}]
+
+response = pipe(
+    messages,
+    mm_processor_kwargs={
+        'image': {
+            'min_pixels': 128 * 32 * 32,
+            'max_pixels': 256 * 32 * 32,
+        },
+        'video': {
+            'min_pixels': 4 * 32 * 32,
+            'max_pixels': 256 * 32 * 32,
+        },
+    },
+    media_io_kwargs={
+        'video': {
+            'num_frames': 16,
+            'fps': 2,
+        },
+    },
+)
+print(response)
+```
+
+</details>
diff --git a/docs/zh_cn/multi_modal/vl_pipeline.md b/docs/zh_cn/multi_modal/vl_pipeline.md
index 67ff082964..9662bcc569 100644
--- a/docs/zh_cn/multi_modal/vl_pipeline.md
+++ b/docs/zh_cn/multi_modal/vl_pipeline.md
@@ -10,6 +10,8 @@ LMDeploy 把视觉-语言模型（VLM）复杂的推理过程，抽象为简单
 
 使用 pipeline 接口推理其他 VLM 模型，大同小异，主要区别在于模型依赖的配置和安装。你可以阅读[此处](https://lmdeploy.readthedocs.io/zh-cn/latest/multi_modal/)，查看不同模型的环境安装和配置方式
 
+> **另请参阅：** [多模态输入](multimodal_inputs.md) — 涵盖所有模态（图像、视频、音频、时序数据）的消息格式参考，包含 OpenAI 风格示例。
+
 ## "Hello, world" 示例
 
 ```python
diff --git a/lmdeploy/vl/model/base.py b/lmdeploy/vl/model/base.py
index e960254588..8bc61a6318 100644
--- a/lmdeploy/vl/model/base.py
+++ b/lmdeploy/vl/model/base.py
@@ -344,8 +344,8 @@ def preprocess(self,
         if raw_images:
             kwargs['images'] = raw_images
             image_size = self.get_override_size(self.processor.image_processor,
-                                                 mm_processor_kwargs.get('image'),
-                                                 modality='image')
+                                                mm_processor_kwargs.get('image'),
+                                                modality='image')
             if image_size is not None:
                 images_kwargs['size'] = image_size
         if raw_videos:
@@ -355,8 +355,8 @@ def preprocess(self,
             videos_kwargs['do_resize'] = True
             videos_kwargs['do_sample_frames'] = False
             video_size = self.get_override_size(self.processor.video_processor,
-                                                 mm_processor_kwargs.get('video'),
-                                                 modality='video')
+                                                mm_processor_kwargs.get('video'),
+                                                modality='video')
             if video_size is not None:
                 videos_kwargs['size'] = video_size
         if images_kwargs:

From 8611115b2fa2109ad55f47572e1bd8069d54fc75 Mon Sep 17 00:00:00 2001
From: zxy <zhou0493@e.ntu.edu.sg>
Date: Fri, 17 Apr 2026 15:04:13 +0800
Subject: [PATCH 18/21] docs: update video/audio URLs to official Qwen assets

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 docs/en/multi_modal/multimodal_inputs.md    | 26 +++++++--------------
 docs/zh_cn/multi_modal/multimodal_inputs.md | 26 +++++++--------------
 2 files changed, 18 insertions(+), 34 deletions(-)

diff --git a/docs/en/multi_modal/multimodal_inputs.md b/docs/en/multi_modal/multimodal_inputs.md
index 0f6b8af1b3..8c34957aa4 100644
--- a/docs/en/multi_modal/multimodal_inputs.md
+++ b/docs/en/multi_modal/multimodal_inputs.md
@@ -145,7 +145,7 @@ from openai import OpenAI
 client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
 model_name = client.models.list().data[0].id
 
-video_url = 'http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4'
+video_url = 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-VL/space_woaudio.mp4'
 
 response = client.chat.completions.create(
     model=model_name,
@@ -188,8 +188,8 @@ from openai import OpenAI
 client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
 model_name = client.models.list().data[0].id
 
-video_url_1 = 'http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4'
-video_url_2 = 'http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerBlazes.mp4'
+video_url_1 = 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-VL/space_woaudio.mp4'
+video_url_2 = 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-VL/space_woaudio.mp4'
 
 response = client.chat.completions.create(
     model=model_name,
@@ -240,7 +240,7 @@ response = client.chat.completions.create(
             {
                 'type': 'audio_url',
                 'audio_url': {
-                    'url': 'https://upload.wikimedia.org/wikipedia/commons/4/4e/BWV_543-fugue.ogg',
+                    'url': 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/cough.wav',
                 },
             },
             {
@@ -270,8 +270,8 @@ from openai import OpenAI
 client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
 model_name = client.models.list().data[0].id
 
-audio_url_1 = 'https://upload.wikimedia.org/wikipedia/commons/4/4e/BWV_543-fugue.ogg'
-audio_url_2 = 'https://upload.wikimedia.org/wikipedia/commons/4/4e/BWV_543-fugue.ogg'
+audio_url_1 = 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/cough.wav'
+audio_url_2 = 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/cough.wav'
 
 response = client.chat.completions.create(
     model=model_name,
@@ -310,7 +310,7 @@ client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
 model_name = client.models.list().data[0].id
 
 image_url = 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg'
-video_url = 'http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4'
+video_url = 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-VL/space_woaudio.mp4'
 
 response = client.chat.completions.create(
     model=model_name,
@@ -542,7 +542,7 @@ from openai import OpenAI
 client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
 model_name = client.models.list().data[0].id
 
-video_url = 'http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4'
+video_url = 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-VL/space_woaudio.mp4'
 
 response = client.chat.completions.create(
     model=model_name,
@@ -556,10 +556,6 @@ response = client.chat.completions.create(
     max_completion_tokens=256,
     extra_body={
         'mm_processor_kwargs': {
-            'image': {
-                'min_pixels': 128 * 32 * 32,
-                'max_pixels': 256 * 32 * 32,
-            },
             'video': {
                 'min_pixels': 4 * 32 * 32,
                 'max_pixels': 256 * 32 * 32,
@@ -586,7 +582,7 @@ from lmdeploy import pipeline, PytorchEngineConfig
 
 pipe = pipeline('<model_path>', backend_config=PytorchEngineConfig(tp=1))
 
-video_url = 'http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4'
+video_url = 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-VL/space_woaudio.mp4'
 
 messages = [{
     'role': 'user',
@@ -599,10 +595,6 @@ messages = [{
 response = pipe(
     messages,
     mm_processor_kwargs={
-        'image': {
-            'min_pixels': 128 * 32 * 32,
-            'max_pixels': 256 * 32 * 32,
-        },
         'video': {
             'min_pixels': 4 * 32 * 32,
             'max_pixels': 256 * 32 * 32,
diff --git a/docs/zh_cn/multi_modal/multimodal_inputs.md b/docs/zh_cn/multi_modal/multimodal_inputs.md
index c5e54abe75..091779ddea 100644
--- a/docs/zh_cn/multi_modal/multimodal_inputs.md
+++ b/docs/zh_cn/multi_modal/multimodal_inputs.md
@@ -145,7 +145,7 @@ from openai import OpenAI
 client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
 model_name = client.models.list().data[0].id
 
-video_url = 'http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4'
+video_url = 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-VL/space_woaudio.mp4'
 
 response = client.chat.completions.create(
     model=model_name,
@@ -188,8 +188,8 @@ from openai import OpenAI
 client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
 model_name = client.models.list().data[0].id
 
-video_url_1 = 'http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4'
-video_url_2 = 'http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerBlazes.mp4'
+video_url_1 = 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-VL/space_woaudio.mp4'
+video_url_2 = 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-VL/space_woaudio.mp4'
 
 response = client.chat.completions.create(
     model=model_name,
@@ -240,7 +240,7 @@ response = client.chat.completions.create(
             {
                 'type': 'audio_url',
                 'audio_url': {
-                    'url': 'https://upload.wikimedia.org/wikipedia/commons/4/4e/BWV_543-fugue.ogg',
+                    'url': 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/cough.wav',
                 },
             },
             {
@@ -270,8 +270,8 @@ from openai import OpenAI
 client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
 model_name = client.models.list().data[0].id
 
-audio_url_1 = 'https://upload.wikimedia.org/wikipedia/commons/4/4e/BWV_543-fugue.ogg'
-audio_url_2 = 'https://upload.wikimedia.org/wikipedia/commons/4/4e/BWV_543-fugue.ogg'
+audio_url_1 = 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/cough.wav'
+audio_url_2 = 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/cough.wav'
 
 response = client.chat.completions.create(
     model=model_name,
@@ -310,7 +310,7 @@ client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
 model_name = client.models.list().data[0].id
 
 image_url = 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg'
-video_url = 'http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4'
+video_url = 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-VL/space_woaudio.mp4'
 
 response = client.chat.completions.create(
     model=model_name,
@@ -541,7 +541,7 @@ from openai import OpenAI
 client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
 model_name = client.models.list().data[0].id
 
-video_url = 'http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4'
+video_url = 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-VL/space_woaudio.mp4'
 
 response = client.chat.completions.create(
     model=model_name,
@@ -555,10 +555,6 @@ response = client.chat.completions.create(
     max_completion_tokens=256,
     extra_body={
         'mm_processor_kwargs': {
-            'image': {
-                'min_pixels': 128 * 32 * 32,
-                'max_pixels': 256 * 32 * 32,
-            },
             'video': {
                 'min_pixels': 4 * 32 * 32,
                 'max_pixels': 256 * 32 * 32,
@@ -585,7 +581,7 @@ from lmdeploy import pipeline, PytorchEngineConfig
 
 pipe = pipeline('<model_path>', backend_config=PytorchEngineConfig(tp=1))
 
-video_url = 'http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4'
+video_url = 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-VL/space_woaudio.mp4'
 
 messages = [{
     'role': 'user',
@@ -598,10 +594,6 @@ messages = [{
 response = pipe(
     messages,
     mm_processor_kwargs={
-        'image': {
-            'min_pixels': 128 * 32 * 32,
-            'max_pixels': 256 * 32 * 32,
-        },
         'video': {
             'min_pixels': 4 * 32 * 32,
             'max_pixels': 256 * 32 * 32,

From de923d747e156b170427b6521260c490efe83841 Mon Sep 17 00:00:00 2001
From: zxy <zhou0493@e.ntu.edu.sg>
Date: Fri, 17 Apr 2026 15:06:46 +0800
Subject: [PATCH 19/21] docs: fix model name Qwen3.5-VL -> Qwen3.5

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 docs/en/multi_modal/multimodal_inputs.md    | 6 +++---
 docs/zh_cn/multi_modal/multimodal_inputs.md | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/docs/en/multi_modal/multimodal_inputs.md b/docs/en/multi_modal/multimodal_inputs.md
index 8c34957aa4..958888ebbc 100644
--- a/docs/en/multi_modal/multimodal_inputs.md
+++ b/docs/en/multi_modal/multimodal_inputs.md
@@ -134,7 +134,7 @@ ______________________________________________________________________
 
 ## Single Video
 
-> **Note:** Native video input is currently supported for **Qwen3-VL**, **Qwen3.5-VL**, and **InternS1-Pro** models only.
+> **Note:** Native video input is currently supported for **Qwen3-VL**, **Qwen3.5**, and **InternS1-Pro** models only.
 
 <details>
 <summary>Complete example</summary>
@@ -177,7 +177,7 @@ ______________________________________________________________________
 
 ## Multiple Videos
 
-> **Note:** Native video input is currently supported for **Qwen3-VL**, **Qwen3.5-VL**, and **InternS1-Pro** models only.
+> **Note:** Native video input is currently supported for **Qwen3-VL**, **Qwen3.5**, and **InternS1-Pro** models only.
 
 <details>
 <summary>Complete example</summary>
@@ -298,7 +298,7 @@ ______________________________________________________________________
 
 ## Mixed Image and Video
 
-> **Note:** Native video input is currently supported for **Qwen3-VL**, **Qwen3.5-VL**, and **InternS1-Pro** models only.
+> **Note:** Native video input is currently supported for **Qwen3-VL**, **Qwen3.5**, and **InternS1-Pro** models only.
 
 <details>
 <summary>Complete example</summary>
diff --git a/docs/zh_cn/multi_modal/multimodal_inputs.md b/docs/zh_cn/multi_modal/multimodal_inputs.md
index 091779ddea..35a2769f14 100644
--- a/docs/zh_cn/multi_modal/multimodal_inputs.md
+++ b/docs/zh_cn/multi_modal/multimodal_inputs.md
@@ -134,7 +134,7 @@ ______________________________________________________________________
 
 ## 单个视频
 
-> **注意：** 原生视频输入目前仅支持 **Qwen3-VL**、**Qwen3.5-VL** 和 **InternS1-Pro** 模型。
+> **注意：** 原生视频输入目前仅支持 **Qwen3-VL**、**Qwen3.5** 和 **InternS1-Pro** 模型。
 
 <details>
 <summary>完整示例</summary>
@@ -177,7 +177,7 @@ ______________________________________________________________________
 
 ## 多个视频
 
-> **注意：** 原生视频输入目前仅支持 **Qwen3-VL**、**Qwen3.5-VL** 和 **InternS1-Pro** 模型。
+> **注意：** 原生视频输入目前仅支持 **Qwen3-VL**、**Qwen3.5** 和 **InternS1-Pro** 模型。
 
 <details>
 <summary>完整示例</summary>
@@ -298,7 +298,7 @@ ______________________________________________________________________
 
 ## 图像与视频混合
 
-> **注意：** 原生视频输入目前仅支持 **Qwen3-VL**、**Qwen3.5-VL** 和 **InternS1-Pro** 模型。
+> **注意：** 原生视频输入目前仅支持 **Qwen3-VL**、**Qwen3.5** 和 **InternS1-Pro** 模型。
 
 <details>
 <summary>完整示例</summary>

From 52a46b711c32171f7cb5e59046383f0355a4cd73 Mon Sep 17 00:00:00 2001
From: zxy <zhou0493@e.ntu.edu.sg>
Date: Fri, 17 Apr 2026 15:29:36 +0800
Subject: [PATCH 20/21] fix: address PR #4531 review comments

- glm4_1v: guard chat_template_kwargs against None before ** expansion
- base: use local time_series_processor to avoid mutating self.processor
- base: fix preprocess return type annotation list[dict] -> dict[str, Any]
- base: lower valid size-override log from WARNING to INFO

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 lmdeploy/vl/model/base.py    | 12 ++++++------
 lmdeploy/vl/model/glm4_1v.py |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/lmdeploy/vl/model/base.py b/lmdeploy/vl/model/base.py
index 8bc61a6318..8c3ab4aee6 100644
--- a/lmdeploy/vl/model/base.py
+++ b/lmdeploy/vl/model/base.py
@@ -149,7 +149,7 @@ def get_override_size(self, processor, mm_processor_kwargs: dict[str, Any] | Non
                 f'falling back to defaults, min_pixels={default_min} and max_pixels={default_max}.'
             )
             return None
-        logger.warning(f'{tag}Overriding processor size with min_pixels={override_min} and max_pixels={override_max}.')
+        logger.info(f'{tag}Overriding processor size with min_pixels={override_min} and max_pixels={override_max}.')
         return {'shortest_edge': override_min, 'longest_edge': override_max}
 
     def get_expanded_input_ids(self, input_prompt, collected_mm_items) -> torch.Tensor:
@@ -313,9 +313,9 @@ def get_expanded_mm_items(self, collected_mm_items):
     def preprocess(self,
                    messages: list[dict],
                    input_prompt: str | list[int],
-                   mm_processor_kwargs: dict[str, Any] | None = None) -> list[dict]:
-        """Preprocess multimodal data and return input_ids + multimodal
-        features.
+                   mm_processor_kwargs: dict[str, Any] | None = None) -> dict[str, Any]:
+        """Preprocess multimodal data and return a dict with ``input_ids`` and
+        multimodal features.
 
         New-style models inherit this implementation. Legacy models override with `def preprocess(self, messages)`.
         """
@@ -369,13 +369,13 @@ def preprocess(self,
             assert not raw_images and not raw_videos, \
                 'time series is not compatible with image/video input'
             self.tokenizer = self.processor.tokenizer
-            self.processor = self.time_series_processor
+            time_series_processor = self.time_series_processor
             kwargs['time_series'] = raw_time_series
             kwargs['sampling_rate'] = sampling_rates
 
         # process raw items with hf processor
         input_text = input_prompt if isinstance(input_prompt, str) else ''
-        processor_outputs = self.processor(
+        processor_outputs = (time_series_processor if raw_time_series else self.processor)(
             text=[input_text],
             padding=True,
             return_tensors='pt',
diff --git a/lmdeploy/vl/model/glm4_1v.py b/lmdeploy/vl/model/glm4_1v.py
index fc997d9ac3..feeed84145 100644
--- a/lmdeploy/vl/model/glm4_1v.py
+++ b/lmdeploy/vl/model/glm4_1v.py
@@ -36,4 +36,4 @@ def build_preprocessor(self):
 
     def apply_chat_template(self, messages, chat_template, sequence_start, chat_template_kwargs=None):
         """Apply chat template to get the prompt."""
-        return chat_template.messages2prompt(messages, sequence_start, **chat_template_kwargs)
+        return chat_template.messages2prompt(messages, sequence_start, **(chat_template_kwargs or {}))

From 3df15927694ce8d1a0ecf0c025240ea144114b41 Mon Sep 17 00:00:00 2001
From: zxy <zhou0493@e.ntu.edu.sg>
Date: Fri, 17 Apr 2026 20:47:27 +0800
Subject: [PATCH 21/21] refactor: rename interns1_pro_ts.py to
 interns1_pro_time_series.py

---
 lmdeploy/pytorch/models/interns1_pro.py                         | 2 +-
 .../models/{interns1_pro_ts.py => interns1_pro_time_series.py}  | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename lmdeploy/pytorch/models/{interns1_pro_ts.py => interns1_pro_time_series.py} (100%)

diff --git a/lmdeploy/pytorch/models/interns1_pro.py b/lmdeploy/pytorch/models/interns1_pro.py
index 4467fc0bba..734a10f480 100644
--- a/lmdeploy/pytorch/models/interns1_pro.py
+++ b/lmdeploy/pytorch/models/interns1_pro.py
@@ -12,7 +12,7 @@
 from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight
 from lmdeploy.vl.constants import Modality
 
-from .interns1_pro_ts import InternS1ProTimeSeriesModel
+from .interns1_pro_time_series import InternS1ProTimeSeriesModel
 from .patch import add_prefix, get_build_model_context
 from .qwen3_moe import Qwen3MoeModel
 from .qwen3_vl import Qwen3VLVisionModel
diff --git a/lmdeploy/pytorch/models/interns1_pro_ts.py b/lmdeploy/pytorch/models/interns1_pro_time_series.py
similarity index 100%
rename from lmdeploy/pytorch/models/interns1_pro_ts.py
rename to lmdeploy/pytorch/models/interns1_pro_time_series.py