ModelTC · hiworldwzj · Dec 5, 2025 · Dec 5, 2025 · gemini-code-assist · Dec 5, 2025
diff --git a/lightllm/server/router/model_infer/mode_backend/chunked_prefill/impl.py b/lightllm/server/router/model_infer/mode_backend/chunked_prefill/impl.py
@@ -46,6 +46,8 @@ def __init__(self) -> None:
         else:
             self.prefill = self.prefill_normal
             self.decode = self.decode_normal
+
+        self.classed_req_strict_prefill = False
-        self.classed_req_strict_prefill = False
+        # Override base class setting. For chunked prefill, a request that has
+        # finished prefilling the prompt and is ready for the first token generation
+        # should be classified as a decode request, not a prefill one.
+        self.classed_req_strict_prefill = False
         return
 
     def infer_loop(self):

diff --git a/lightllm/server/router/model_infer/mode_backend/dp_backend/impl.py b/lightllm/server/router/model_infer/mode_backend/dp_backend/impl.py
@@ -58,6 +58,8 @@ def __init__(self) -> None:
                 self.decode = self.decode_overlap
             else:
                 self.decode = self.decode_normal
+
+        self.classed_req_strict_prefill = False
-        self.classed_req_strict_prefill = False
+        # Override base class setting. For this backend, a request that has
+        # finished prefilling the prompt and is ready for the first token generation
+        # should be classified as a decode request, not a prefill one.
+        self.classed_req_strict_prefill = False
         return
 
     def infer_loop(self):