Merged · Changes from all commits (48 commits)
5a10eb2  rebase  (Oct 14, 2025)
135e4b6  rebase1  (Oct 14, 2025)
3565793  fix  (Oct 14, 2025)
9158f3a  fix  (Oct 15, 2025)
cbad99a  fix  (Oct 15, 2025)
8559841  fix  (Oct 15, 2025)
255f391  fix  (Oct 15, 2025)
b72252e  fix  (Oct 15, 2025)
6cb5282  fix hugepage_num  (Oct 15, 2025)
94d0e22  reformat  (Oct 15, 2025)
4a9d0f3  router addd cpu_cache_client  (Oct 15, 2025)
13aa1af  fix cpu cache aborted  (Oct 15, 2025)
53aa5a1  fix  (Oct 15, 2025)
291e49d  fix  (Oct 15, 2025)
dd42001  refractor multi_kv_level_cache  (Oct 15, 2025)
8b3a1b3  remove compute_sequence_hash  (Oct 16, 2025)
070443c  diverse_copy support cpu cache pipline  (Oct 16, 2025)
649b4a8  fix wait event  (Oct 16, 2025)
71f414a  fix  (Oct 16, 2025)
542cb48  fix  (Oct 16, 2025)
7ae1d0c  diversemode fix  (Oct 17, 2025)
a829095  fix names  (Oct 17, 2025)
9bce1e2  fix cpu cache load kernel  (Oct 17, 2025)
590782f  fix method name  (Oct 17, 2025)
bca4e35  fix  (Oct 17, 2025)
da76a05  fix  (Oct 20, 2025)
1911b34  fix  (Oct 20, 2025)
f23113f  fix  (Oct 20, 2025)
7b7f85e  fix  (Oct 20, 2025)
505f860  fix lock range  (Oct 20, 2025)
f320a7a  fix  (Oct 20, 2025)
2c36e20  fix  (Oct 20, 2025)
658c40e  fix ill error  (Oct 20, 2025)
b6455df  improve kernel  (Oct 21, 2025)
be62703  fix  (Oct 21, 2025)
3b51af8  fix radix cache  (Oct 22, 2025)
f902b18  fix  (Oct 22, 2025)
0528bea  fix cpu kv kernel bug  (Oct 22, 2025)
c7d7a1b  improve cpu kv kernel setting  (Oct 22, 2025)
7f4606d  fix  (Oct 22, 2025)
067cf29  fix  (Oct 22, 2025)
7b07245  fix  (Oct 22, 2025)
33cd95d  fix  (Oct 22, 2025)
6b58763  fix  (Oct 22, 2025)
3e08d57  fix fa3 sync and add env  (Oct 22, 2025)
a480437  fix format  (hiworldwzj, Oct 22, 2025)
a99315c  fix unittest  (hiworldwzj, Oct 22, 2025)
23ee96e  fix  (Oct 23, 2025)
21 changes: 20 additions & 1 deletion lightllm/common/basemodel/basemodel.py
@@ -61,6 +61,8 @@ def __init__(self, kvargs):
self.finetune_config = kvargs.get("finetune_config", None)
self.max_req_num = kvargs.get("max_req_num", 1000)
self.max_seq_length = kvargs.get("max_seq_length", 1024 * 5)
# Used to wait for peripheral modules to finish initialization (e.g. CPU KV Cache registration).
self.wait_events = kvargs.get("wait_events", [])
# is_token_healing and return_all_prompt_logics are mutually exclusive modes; only one can be active at a time.
# They mainly control how many tokens the prefill stage returns for downstream processing.
self.is_token_healing = kvargs.get("is_token_healing", False)
@@ -110,12 +112,19 @@ def __init__(self, kvargs):
self._init_inferstate_cls()
self._autotune_warmup()
self._init_padded_req()
# The wait must happen before cudagraph init, to avoid erroneous graph capture.
self._wait_other_modules_ready()
self._init_cudagraph()
self._check_max_len_infer()
torch.cuda.empty_cache()
set_model_init_status(True)
return

def _wait_other_modules_ready(self):
for event in self.wait_events:
event.wait()
return

def _init_config(self):
with open(os.path.join(self.weight_dir_, "config.json"), "r") as json_file:
self.config = json.load(json_file)
@@ -352,8 +361,13 @@ def _prefill(
alloc_mem_index=infer_state.mem_index,
max_q_seq_len=infer_state.max_q_seq_len,
)
prefill_mem_indexes_ready_event = torch.cuda.Event()
prefill_mem_indexes_ready_event.record()

infer_state.init_some_extra_state(self, model_input.input_ids)
return self._context_forward(model_input.input_ids, infer_state)
model_output = self._context_forward(model_input.input_ids, infer_state)
model_output.prefill_mem_indexes_ready_event = prefill_mem_indexes_ready_event
return model_output

def _decode(
self,
@@ -505,13 +519,18 @@ def microbatch_overlap_prefill(self, model_input0: ModelInput, model_input1: Mod
)
infer_state1.init_some_extra_state(self, input_ids1)

prefill_mem_indexes_ready_event = torch.cuda.Event()
prefill_mem_indexes_ready_event.record()

model_output0, model_output1 = self._overlap_tpsp_context_forward(
input_ids0, infer_state0, input_ids1=input_ids1, infer_state1=infer_state1
)

# When deepep is enabled, clear_deepep_buffer must be called to release resources;
# when it is not enabled, the call is a no-op.
dist_group_manager.clear_deepep_buffer()
model_output0.prefill_mem_indexes_ready_event = prefill_mem_indexes_ready_event
model_output1.prefill_mem_indexes_ready_event = prefill_mem_indexes_ready_event
return model_output0, model_output1

@torch.no_grad()
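For context, a minimal sketch of how a caller might feed the new `wait_events` hook in `kvargs`; the background-thread registrar, the `cpu_cache_ready` name, and the `TpPartBaseModel` reference are illustrative assumptions, only the `wait_events` kvarg itself comes from this diff:

```python
import threading


def start_cpu_kv_cache_registration(ready_event: threading.Event):
    """Hypothetical helper: register the CPU KV cache in a background thread
    and signal readiness through the event the model will wait on."""

    def _register():
        # ... allocate / pin / register the CPU KV cache buffers here ...
        ready_event.set()  # unblocks _wait_other_modules_ready() in the model

    threading.Thread(target=_register, daemon=True).start()


cpu_cache_ready = threading.Event()
start_cpu_kv_cache_registration(cpu_cache_ready)

kvargs = {
    # ... the usual model init arguments go here ...
    # The model calls event.wait() on every entry before cudagraph init,
    # so the registration work can never be captured into a graph.
    "wait_events": [cpu_cache_ready],
}
# model = TpPartBaseModel(kvargs)  # class name assumed for illustration
```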
2 changes: 2 additions & 0 deletions lightllm/common/basemodel/batch_objs.py
@@ -58,6 +58,8 @@ def to_cuda(self):
class ModelOutput:
# Common fields
logits: torch.Tensor
# Event object used to tell whether mem_indexes have been successfully written into the req manager.
prefill_mem_indexes_ready_event: torch.Event = None

# Model-specific fields, used by some special models in special modes to pass extra
# output variables. They are only used and take effect under those specific model modes.
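Similarly, a minimal sketch of how a consumer could use the new `prefill_mem_indexes_ready_event` on `ModelOutput`, e.g. to make a CPU KV cache offload stream wait until the prefill `mem_indexes` writes have completed; the offload stream and the `copy_kv_to_cpu_cache` helper are assumptions for illustration:

```python
import torch

# Assumed: a dedicated CUDA stream used to offload KV data to the CPU cache.
offload_stream = torch.cuda.Stream()


def offload_prefill_kv(model_output, copy_kv_to_cpu_cache):
    """Queue the (assumed) CPU-cache copy on the offload stream, but only after
    the event recorded in _prefill() has completed on the GPU."""
    with torch.cuda.stream(offload_stream):
        if model_output.prefill_mem_indexes_ready_event is not None:
            # Makes the current (offload) stream wait for the recorded event;
            # this does not block the host.
            model_output.prefill_mem_indexes_ready_event.wait()
        copy_kv_to_cpu_cache()  # hypothetical helper reading the req manager indexes
```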