From e167f6bdcc87525dc746aeea2831f1ab5da05ec9 Mon Sep 17 00:00:00 2001 From: wangzaijun Date: Thu, 14 May 2026 06:16:51 +0000 Subject: [PATCH 1/3] feat(multimodal): add max_image_token_count guard with OOM risk guidance --- docs/CN/source/tutorial/api_server_args.rst | 6 ++++++ docs/EN/source/tutorial/api_server_args.rst | 6 ++++++ lightllm/server/api_cli.py | 6 ++++++ lightllm/server/core/objs/start_args_type.py | 1 + lightllm/server/httpserver/manager.py | 13 ++++++++++++- lightllm/server/httpserver_for_pd_master/manager.py | 8 +++++++- 6 files changed, 38 insertions(+), 2 deletions(-) diff --git a/docs/CN/source/tutorial/api_server_args.rst b/docs/CN/source/tutorial/api_server_args.rst index 6d372fee2..41e28a132 100644 --- a/docs/CN/source/tutorial/api_server_args.rst +++ b/docs/CN/source/tutorial/api_server_args.rst @@ -272,6 +272,12 @@ PD 分离模式参数 多模态资源的缓存服务器容量,默认为 ``200`` +.. option:: --max_image_token_count + + 单张图片在转换为 token 后允许的最大 token 数量,默认为 ``4096`` + + 当任意图片超过该阈值时,请求会被拒绝。 + .. option:: --visual_infer_batch_size 每次推理批次中处理的图像数量,默认为 ``1`` diff --git a/docs/EN/source/tutorial/api_server_args.rst b/docs/EN/source/tutorial/api_server_args.rst index ab5143a47..df484c2aa 100644 --- a/docs/EN/source/tutorial/api_server_args.rst +++ b/docs/EN/source/tutorial/api_server_args.rst @@ -270,6 +270,12 @@ Multimodal Parameters Cache server capacity for multimodal resources, default is ``200`` +.. option:: --max_image_token_count + + Maximum allowed token count for a single image after tokenization, default is ``4096`` + + Requests are rejected when any image exceeds this limit. + .. option:: --visual_infer_batch_size Number of images processed in each inference batch, default is ``1`` diff --git a/lightllm/server/api_cli.py b/lightllm/server/api_cli.py index 4a345000b..361576e04 100644 --- a/lightllm/server/api_cli.py +++ b/lightllm/server/api_cli.py @@ -442,6 +442,12 @@ def make_argument_parser() -> argparse.ArgumentParser: parser.add_argument( "--cache_capacity", type=int, default=200, help="cache server capacity for multimodal resources" ) + parser.add_argument( + "--max_image_token_count", + type=int, + default=4096, + help="maximum allowed token count for one image after tokenization", + ) parser.add_argument( "--embed_cache_storage_size", type=float, diff --git a/lightllm/server/core/objs/start_args_type.py b/lightllm/server/core/objs/start_args_type.py index ce613b105..ce7cf012a 100644 --- a/lightllm/server/core/objs/start_args_type.py +++ b/lightllm/server/core/objs/start_args_type.py @@ -95,6 +95,7 @@ class StartArgs: enable_decode_microbatch_overlap: bool = field(default=False) enable_prefill_microbatch_overlap: bool = field(default=False) cache_capacity: int = field(default=200) + max_image_token_count: int = field(default=4096) embed_cache_storage_size: float = field(default=4) data_type: Optional[str] = field( default=None, metadata={"choices": ["fp16", "float16", "bf16", "bfloat16", "fp32", "float32"]} diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py index d54de63f3..bdf898b4b 100644 --- a/lightllm/server/httpserver/manager.py +++ b/lightllm/server/httpserver/manager.py @@ -181,6 +181,14 @@ async def _alloc_resource(self, items, md5sums, token_nums, datas): self.cache_client.root.set_items_data(update_data_ids) return + def _assert_image_token_count(self, token_num: int): + assert token_num <= self.args.max_image_token_count, ( + f"single image token count {token_num} exceeds max_image_token_count {self.args.max_image_token_count}. " + f"You can increase this limit by setting --max_image_token_count to a larger value when starting " + f"LightLLM. Warning: increasing this limit raises runtime OOM risk." + ) + return + async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams, sampling_params: SamplingParams): # 只有 P 和 NORMAL 节点需要真的管理多模态资源 if self.pd_mode.is_P_or_NORMAL(): @@ -190,6 +198,7 @@ async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams, data = img.read() # must after init_imageitem_extral_params token_num = self.tokenizer.get_image_token_length(img) + self._assert_image_token_count(token_num) md5sum = hashlib.md5(data).hexdigest() + "_" + str(hash(frozendict(img.extra_params))) md5sums.append(md5sum) img.md5 = md5sum @@ -245,7 +254,9 @@ def tokens(self, prompt, multimodal_params, samping_params: SamplingParams, kwar for img in multimodal_params.images: img_count += 1 self.tokenizer.init_imageitem_extral_params(img, multimodal_params, samping_params) - image_tokens += self.tokenizer.get_image_token_length(img) + token_num = self.tokenizer.get_image_token_length(img) + self._assert_image_token_count(token_num) + image_tokens += token_num for audio in multimodal_params.audios: audio_count += 1 self.tokenizer.init_audioitem_extral_params(audio, multimodal_params, samping_params) diff --git a/lightllm/server/httpserver_for_pd_master/manager.py b/lightllm/server/httpserver_for_pd_master/manager.py index af7a1e29f..406d83759 100644 --- a/lightllm/server/httpserver_for_pd_master/manager.py +++ b/lightllm/server/httpserver_for_pd_master/manager.py @@ -81,7 +81,13 @@ def tokens(self, prompt, multimodal_params, samping_params: SamplingParams, kwar for img in multimodal_params.images: img_count += 1 self.tokenizer.init_imageitem_extral_params(img, multimodal_params, samping_params) - image_tokens += self.tokenizer.get_image_token_length(img) + token_num = self.tokenizer.get_image_token_length(img) + assert token_num <= self.args.max_image_token_count, ( + f"single image token count {token_num} exceeds max_image_token_count {self.args.max_image_token_count}." + f"You can increase this limit by setting --max_image_token_count to a larger value when starting " + f"LightLLM. Warning: increasing this limit raises runtime OOM risk." + ) + image_tokens += token_num for audio in multimodal_params.audios: audio_count += 1 self.tokenizer.init_audioitem_extral_params(audio, multimodal_params, samping_params) From 17aec9f31a2027070cf7a610dd80d13cf39c2b81 Mon Sep 17 00:00:00 2001 From: wangzaijun Date: Thu, 14 May 2026 06:36:53 +0000 Subject: [PATCH 2/3] fix log --- lightllm/server/router/manager.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lightllm/server/router/manager.py b/lightllm/server/router/manager.py index 24f8da6e6..045723d07 100644 --- a/lightllm/server/router/manager.py +++ b/lightllm/server/router/manager.py @@ -436,9 +436,11 @@ def _generate_new_batch(self): new_batch = self.req_queue.generate_new_batch( Batch.merge_two_batch(self.running_batch, self.schedule_new_batch) ) + + if new_batch is not None and len(new_batch.reqs) > 0: + logger.info(f"generate new batch, {new_batch.simple_log()}") + self.schedule_new_batch = Batch.merge_two_batch(self.schedule_new_batch, new_batch) - if self.schedule_new_batch is not None: - logger.info(f"gen new batch, {self.schedule_new_batch.simple_log()}") return def _multinode_tp_generate_new_batch(self): From a7a1128c99b0292a14049240ea1f0cd5a403a042 Mon Sep 17 00:00:00 2001 From: wangzaijun Date: Thu, 14 May 2026 06:53:30 +0000 Subject: [PATCH 3/3] fix --- docs/CN/source/tutorial/api_server_args.rst | 2 +- docs/EN/source/tutorial/api_server_args.rst | 2 +- lightllm/server/api_cli.py | 2 +- lightllm/server/core/objs/start_args_type.py | 2 +- lightllm/server/httpserver/manager.py | 13 ++++++++----- lightllm/server/httpserver_for_pd_master/manager.py | 13 ++++++++----- 6 files changed, 20 insertions(+), 14 deletions(-) diff --git a/docs/CN/source/tutorial/api_server_args.rst b/docs/CN/source/tutorial/api_server_args.rst index 41e28a132..42ab04e4f 100644 --- a/docs/CN/source/tutorial/api_server_args.rst +++ b/docs/CN/source/tutorial/api_server_args.rst @@ -274,7 +274,7 @@ PD 分离模式参数 .. option:: --max_image_token_count - 单张图片在转换为 token 后允许的最大 token 数量,默认为 ``4096`` + 单张图片在转换为 token 后允许的最大 token 数量,默认为 ``6128`` 当任意图片超过该阈值时,请求会被拒绝。 diff --git a/docs/EN/source/tutorial/api_server_args.rst b/docs/EN/source/tutorial/api_server_args.rst index df484c2aa..7f3f8f208 100644 --- a/docs/EN/source/tutorial/api_server_args.rst +++ b/docs/EN/source/tutorial/api_server_args.rst @@ -272,7 +272,7 @@ Multimodal Parameters .. option:: --max_image_token_count - Maximum allowed token count for a single image after tokenization, default is ``4096`` + Maximum allowed token count for a single image after tokenization, default is ``6128`` Requests are rejected when any image exceeds this limit. diff --git a/lightllm/server/api_cli.py b/lightllm/server/api_cli.py index 361576e04..26c651b15 100644 --- a/lightllm/server/api_cli.py +++ b/lightllm/server/api_cli.py @@ -445,7 +445,7 @@ def make_argument_parser() -> argparse.ArgumentParser: parser.add_argument( "--max_image_token_count", type=int, - default=4096, + default=6128, help="maximum allowed token count for one image after tokenization", ) parser.add_argument( diff --git a/lightllm/server/core/objs/start_args_type.py b/lightllm/server/core/objs/start_args_type.py index ce7cf012a..b02094eed 100644 --- a/lightllm/server/core/objs/start_args_type.py +++ b/lightllm/server/core/objs/start_args_type.py @@ -95,7 +95,7 @@ class StartArgs: enable_decode_microbatch_overlap: bool = field(default=False) enable_prefill_microbatch_overlap: bool = field(default=False) cache_capacity: int = field(default=200) - max_image_token_count: int = field(default=4096) + max_image_token_count: int = field(default=6128) embed_cache_storage_size: float = field(default=4) data_type: Optional[str] = field( default=None, metadata={"choices": ["fp16", "float16", "bf16", "bfloat16", "fp32", "float32"]} diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py index bdf898b4b..02ed716f1 100644 --- a/lightllm/server/httpserver/manager.py +++ b/lightllm/server/httpserver/manager.py @@ -182,11 +182,14 @@ async def _alloc_resource(self, items, md5sums, token_nums, datas): return def _assert_image_token_count(self, token_num: int): - assert token_num <= self.args.max_image_token_count, ( - f"single image token count {token_num} exceeds max_image_token_count {self.args.max_image_token_count}. " - f"You can increase this limit by setting --max_image_token_count to a larger value when starting " - f"LightLLM. Warning: increasing this limit raises runtime OOM risk." - ) + if token_num > self.args.max_image_token_count: + err_msg = ( + f"single image token count {token_num} exceeds max_image_token_count {self.args.max_image_token_count}." + f"You can increase this limit by setting --max_image_token_count to a larger value when starting " + f"LightLLM. Warning: increasing this limit raises runtime OOM risk." + ) + logger.warning(err_msg) + raise ValueError(err_msg) return async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams, sampling_params: SamplingParams): diff --git a/lightllm/server/httpserver_for_pd_master/manager.py b/lightllm/server/httpserver_for_pd_master/manager.py index 406d83759..307a3d48a 100644 --- a/lightllm/server/httpserver_for_pd_master/manager.py +++ b/lightllm/server/httpserver_for_pd_master/manager.py @@ -82,11 +82,14 @@ def tokens(self, prompt, multimodal_params, samping_params: SamplingParams, kwar img_count += 1 self.tokenizer.init_imageitem_extral_params(img, multimodal_params, samping_params) token_num = self.tokenizer.get_image_token_length(img) - assert token_num <= self.args.max_image_token_count, ( - f"single image token count {token_num} exceeds max_image_token_count {self.args.max_image_token_count}." - f"You can increase this limit by setting --max_image_token_count to a larger value when starting " - f"LightLLM. Warning: increasing this limit raises runtime OOM risk." - ) + if token_num > self.args.max_image_token_count: + err_msg = ( + f"the image token count {token_num} > max_image_token_count {self.args.max_image_token_count}. " + f"You can increase this limit by setting --max_image_token_count to a larger value when starting " + f"LightLLM. Warning: increasing this limit raises runtime OOM risk." + ) + logger.warning(err_msg) + raise ValueError(err_msg) image_tokens += token_num for audio in multimodal_params.audios: audio_count += 1