From 95a7340382876469c60888393023cc43c7b6ed1f Mon Sep 17 00:00:00 2001 From: sangchengmeng Date: Tue, 13 May 2025 15:22:48 +0800 Subject: [PATCH 1/4] debug cache --- lightllm/server/core/objs/req.py | 2 ++ lightllm/server/embed_cache/impl/naive_memory_cache.py | 5 +++++ lightllm/server/httpserver/manager.py | 3 ++- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/lightllm/server/core/objs/req.py b/lightllm/server/core/objs/req.py index c8d8476e51..294fd72c5b 100644 --- a/lightllm/server/core/objs/req.py +++ b/lightllm/server/core/objs/req.py @@ -200,9 +200,11 @@ def can_release(self): can_released_mark = self.can_released_mark if self.is_aborted and can_released_mark and ref_count_ok: + print("because of aborted, can release") return True if self.finish_status.is_finished() and can_released_mark and ref_count_ok and self.out_tokens_queue.is_empty(): + print("because of finished, can release") return True return False diff --git a/lightllm/server/embed_cache/impl/naive_memory_cache.py b/lightllm/server/embed_cache/impl/naive_memory_cache.py index c03b084c47..36007cbada 100644 --- a/lightllm/server/embed_cache/impl/naive_memory_cache.py +++ b/lightllm/server/embed_cache/impl/naive_memory_cache.py @@ -78,6 +78,10 @@ def _clear(self): t = time.time() for id, record in items: if record.ref <= 0 or t - record.visittime >= self.expired_secs: + if record.ref <= 0: + logger.info(f"id {id}'s record ref is 0") + if t - record.visittime >= self.expired_secs: + logger.info(f"id {id}'s record expired, because of time_expired") if record.data: free_shm(get_shm_name_data(id)) if record.embed: @@ -129,6 +133,7 @@ def alloc(self, md5sum: str, token_num: int) -> dict: return {"id": record.id, "token_id": record.token_id, "token_num": record.token_num} def release(self, id: int) -> None: + logger.info(f"Releasing id {id}") with self.lock: self._records[id].ref -= 1 diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py index 5ef4112473..5076cdd7e6 100644 --- a/lightllm/server/httpserver/manager.py +++ b/lightllm/server/httpserver/manager.py @@ -161,6 +161,7 @@ async def _release_multimodal_resources(self, multimodal_params: MultimodalParam if multimodal_params is not None: for img in multimodal_params.images: if img.uuid is not None: + logger.info(f"Releasing id {img.uuid}") self.cache_client.root.release(img.uuid) # 将 uuid 等 赋值为 None, 防止因为abort等异常情况造成重复释放异常 img.uuid = None @@ -593,8 +594,8 @@ async def recycle_resource_loop(self): release_req_status: List[ReqStatus] = [] for req_status in self.req_id_to_out_inf.values(): if req_status.can_release(): + logger.info(f"req_status {req_status.group_req_objs.group_req_id} can release") release_req_status.append(req_status) - for req_status in release_req_status: self.req_id_to_out_inf.pop(req_status.group_req_objs.group_req_id, None) for req in req_status.group_req_objs.shm_req_objs: From 7e8f6d7ed80a23df94b4bc2c4c15618330330950 Mon Sep 17 00:00:00 2001 From: sangchengmeng Date: Thu, 15 May 2025 21:13:21 +0800 Subject: [PATCH 2/4] [FIX]fix deadlock in shm --- lightllm/server/httpserver/manager.py | 28 ++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py index 5076cdd7e6..03b755dc19 100644 --- a/lightllm/server/httpserver/manager.py +++ b/lightllm/server/httpserver/manager.py @@ -52,6 +52,7 @@ def __init__( self.multinode_req_manager = None self.nnodes = args.nnodes + self.lock = asyncio.Lock() self.node_rank = args.node_rank self.transfer_lock = asyncio.Lock() # the lock for transfer to next module in multi node mode. self.disable_abort = args.nnodes > 1 and args.dp == 1 # mulitnode dp=1 mode, disable abort @@ -141,19 +142,20 @@ async def _alloc_resource(self, item: Union[ImageItem, AudioItem]): async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams, sampling_params: SamplingParams): # 只有 P 和 NORMAL 节点需要真的管理多模态资源 if self.pd_mode.is_P_or_NORMAL(): - for img in multimodal_params.images: - self.tokenizer.init_imageitem_extral_params(img, multimodal_params, sampling_params) - record = await self._alloc_resource(img) - img.uuid = record["id"] - img.token_id = record["token_id"] - img.token_num = record["token_num"] - for audio in multimodal_params.audios: - self.tokenizer.init_audioitem_extral_params(audio, multimodal_params, sampling_params) - record = await self._alloc_resource(audio) - audio.uuid = record["id"] - audio.token_id = record["token_id"] - audio.token_num = record["token_num"] - return + async with self.lock: + for img in multimodal_params.images: + self.tokenizer.init_imageitem_extral_params(img, multimodal_params, sampling_params) + record = await self._alloc_resource(img) + img.uuid = record["id"] + img.token_id = record["token_id"] + img.token_num = record["token_num"] + for audio in multimodal_params.audios: + self.tokenizer.init_audioitem_extral_params(audio, multimodal_params, sampling_params) + record = await self._alloc_resource(audio) + audio.uuid = record["id"] + audio.token_id = record["token_id"] + audio.token_num = record["token_num"] + return async def _release_multimodal_resources(self, multimodal_params: MultimodalParams): # 只有 P 和 NORMAL 节点需要真的管理多模态资源 From 2862e1633525d0c45ab93be60ebb4a66442ad065 Mon Sep 17 00:00:00 2001 From: sangchengmeng Date: Fri, 16 May 2025 17:09:19 +0800 Subject: [PATCH 3/4] [FIX]fix dead lock when alloc resource --- lightllm/server/httpserver/manager.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py index 03b755dc19..2dc753e2e1 100644 --- a/lightllm/server/httpserver/manager.py +++ b/lightllm/server/httpserver/manager.py @@ -52,7 +52,7 @@ def __init__( self.multinode_req_manager = None self.nnodes = args.nnodes - self.lock = asyncio.Lock() + self._resource_lock = asyncio.Lock() self.node_rank = args.node_rank self.transfer_lock = asyncio.Lock() # the lock for transfer to next module in multi node mode. self.disable_abort = args.nnodes > 1 and args.dp == 1 # mulitnode dp=1 mode, disable abort @@ -142,7 +142,13 @@ async def _alloc_resource(self, item: Union[ImageItem, AudioItem]): async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams, sampling_params: SamplingParams): # 只有 P 和 NORMAL 节点需要真的管理多模态资源 if self.pd_mode.is_P_or_NORMAL(): - async with self.lock: + # Acquire the lock so that two concurrent requests cannot both + # allocate more records than the cache_capacity. + # For example, if cache_capacity is 10 and each request has 6 images, + # without the lock one request might allocate 5 images, + # then another request allocates 5 more images, filling cache_capacity, + # and both wait for space to free, causing a deadlock. + async with self._resource_lock: for img in multimodal_params.images: self.tokenizer.init_imageitem_extral_params(img, multimodal_params, sampling_params) record = await self._alloc_resource(img) From bd2df3ddb4aea8ae6a0cd1ff1f0feb7a9f5e4401 Mon Sep 17 00:00:00 2001 From: sangchengmeng Date: Fri, 16 May 2025 17:16:12 +0800 Subject: [PATCH 4/4] [FIX]fix dead lock when alloc resource --- lightllm/server/core/objs/req.py | 2 -- .../server/embed_cache/impl/naive_memory_cache.py | 5 ----- lightllm/server/httpserver/manager.py | 11 +++-------- 3 files changed, 3 insertions(+), 15 deletions(-) diff --git a/lightllm/server/core/objs/req.py b/lightllm/server/core/objs/req.py index 294fd72c5b..c8d8476e51 100644 --- a/lightllm/server/core/objs/req.py +++ b/lightllm/server/core/objs/req.py @@ -200,11 +200,9 @@ def can_release(self): can_released_mark = self.can_released_mark if self.is_aborted and can_released_mark and ref_count_ok: - print("because of aborted, can release") return True if self.finish_status.is_finished() and can_released_mark and ref_count_ok and self.out_tokens_queue.is_empty(): - print("because of finished, can release") return True return False diff --git a/lightllm/server/embed_cache/impl/naive_memory_cache.py b/lightllm/server/embed_cache/impl/naive_memory_cache.py index 36007cbada..c03b084c47 100644 --- a/lightllm/server/embed_cache/impl/naive_memory_cache.py +++ b/lightllm/server/embed_cache/impl/naive_memory_cache.py @@ -78,10 +78,6 @@ def _clear(self): t = time.time() for id, record in items: if record.ref <= 0 or t - record.visittime >= self.expired_secs: - if record.ref <= 0: - logger.info(f"id {id}'s record ref is 0") - if t - record.visittime >= self.expired_secs: - logger.info(f"id {id}'s record expired, because of time_expired") if record.data: free_shm(get_shm_name_data(id)) if record.embed: @@ -133,7 +129,6 @@ def alloc(self, md5sum: str, token_num: int) -> dict: return {"id": record.id, "token_id": record.token_id, "token_num": record.token_num} def release(self, id: int) -> None: - logger.info(f"Releasing id {id}") with self.lock: self._records[id].ref -= 1 diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py index 2dc753e2e1..e02eaaf796 100644 --- a/lightllm/server/httpserver/manager.py +++ b/lightllm/server/httpserver/manager.py @@ -142,12 +142,9 @@ async def _alloc_resource(self, item: Union[ImageItem, AudioItem]): async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams, sampling_params: SamplingParams): # 只有 P 和 NORMAL 节点需要真的管理多模态资源 if self.pd_mode.is_P_or_NORMAL(): - # Acquire the lock so that two concurrent requests cannot both - # allocate more records than the cache_capacity. - # For example, if cache_capacity is 10 and each request has 6 images, - # without the lock one request might allocate 5 images, - # then another request allocates 5 more images, filling cache_capacity, - # and both wait for space to free, causing a deadlock. + # 这里的锁是为了 防止多个含有多张图片的请求 同时申请的record数量 大于cache_capacity,从而造成死锁的问题。 + # 如果不加任何锁,假如请求1和请求2都有6张图片,而cache_capacity为10, + # 那么如果某一时刻shm中存在请求1的5张图和请求2的5张图,将会资源竞争产生死锁。 async with self._resource_lock: for img in multimodal_params.images: self.tokenizer.init_imageitem_extral_params(img, multimodal_params, sampling_params) @@ -169,7 +166,6 @@ async def _release_multimodal_resources(self, multimodal_params: MultimodalParam if multimodal_params is not None: for img in multimodal_params.images: if img.uuid is not None: - logger.info(f"Releasing id {img.uuid}") self.cache_client.root.release(img.uuid) # 将 uuid 等 赋值为 None, 防止因为abort等异常情况造成重复释放异常 img.uuid = None @@ -602,7 +598,6 @@ async def recycle_resource_loop(self): release_req_status: List[ReqStatus] = [] for req_status in self.req_id_to_out_inf.values(): if req_status.can_release(): - logger.info(f"req_status {req_status.group_req_objs.group_req_id} can release") release_req_status.append(req_status) for req_status in release_req_status: self.req_id_to_out_inf.pop(req_status.group_req_objs.group_req_id, None)