Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/source/user-guide/sparse-attention/esa.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ ESA provides developers with an intuitive example of how to implement their own
### Basic Usage
ESA can be launched using the following command:
```shell
export ENABLE_SPARSE=TRUE
export MODEL_PATH="/path/to/model" # For example: /home/models/Qwen2.5-14B-Instruct
export DATASET_PATH="/path/to/longbench/multifieldqa_zh.jsonl" # For example: /home/data/Longbench/data/multifieldqa_zh.jsonl
python examples/offline_inference_esa.py
Expand Down
2 changes: 2 additions & 0 deletions docs/source/user-guide/sparse-attention/gsa.md
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,8 @@ ktc = KVTransferConfig(
Thus, an example command for launching the online LLM service is as follows:

```shell
export ENABLE_SPARSE=TRUE

vllm serve /home/models/DeepSeek-R1-Distill-Qwen-32B \
--served-model-name DeepSeek-R1-Distill-Qwen-32B \
--max-model-len 131000 \
Expand Down
1 change: 1 addition & 0 deletions docs/source/user-guide/sparse-attention/kvcomp.md
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ This design ensures both **efficiency** and **accuracy** by preserving essential
KVComp is part of the UCM Sparse Attention module. For installation instructions, please refer to the [UCM's top-level README](https://github.com/ModelEngine-Group/unified-cache-management). Once UCM is installed, KVComp can be used by running the following example Python script.

```bash
export ENABLE_SPARSE=TRUE
python ucm/sandbox/sparse/kvcomp/offline_inference_kvcomp.py
```

Expand Down
1 change: 1 addition & 0 deletions docs/source/user-guide/sparse-attention/kvstar.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ For long-sequence inference, KVstar achieves the following with minimal accuracy
### Basic Usage
KVstar can be launched using the following command:
```shell
export ENABLE_SPARSE=TRUE
export MODEL_PATH="/path/to/model" # For example: /home/models/Qwen2.5-14B-Instruct
export DATASET_PATH="/path/to/longbench/multifieldqa_zh.jsonl" # For example: /home/data/Longbench/data/multifieldqa_zh.jsonl
export DATA_DIR="/path/to/data"
Expand Down
1 change: 1 addition & 0 deletions examples/offline_inference_esa.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
def setup_environment_variables():
os.environ["VLLM_USE_V1"] = "1"
os.environ["PYTHONHASHSEED"] = "123456"
os.environ["ENABLE_SPARSE"] = "true"

global model, path_to_dataset, data_dir, tokenizer
model = os.getenv("MODEL_PATH", "/home/models/Qwen2.5-14B-Instruct")
Expand Down
1 change: 1 addition & 0 deletions examples/offline_inference_kvcomp.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
def setup_environment_variables():
os.environ["VLLM_USE_V1"] = "1"
os.environ["PYTHONHASHSEED"] = "123456"
os.environ["ENABLE_SPARSE"] = "true"

global model, path_to_dataset, data_dir, tokenizer
model = os.getenv("MODEL_PATH", "/home/models/Qwen2.5-14B-Instruct")
Expand Down
1 change: 1 addition & 0 deletions examples/offline_inference_kvstar.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
def setup_environment_variables():
os.environ["VLLM_USE_V1"] = "1"
os.environ["PYTHONHASHSEED"] = "123456"
os.environ["ENABLE_SPARSE"] = "true"

global model, path_to_dataset, data_dir, tokenizer
model = os.getenv("MODEL_PATH", "/home/models/Qwen2.5-14B-Instruct")
Expand Down
23 changes: 19 additions & 4 deletions ucm/integration/vllm/patch/patch_funcs/v092/vllm_ascend_patch.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,17 +24,32 @@

from __future__ import annotations

import os

from ucm.logger import init_logger

logger = init_logger(__name__)

ENABLE_SPARSE = os.getenv("ENABLE_SPARSE")


def _enable_sparse() -> bool:
return ENABLE_SPARSE is not None and ENABLE_SPARSE.lower() == "true"


def _apply_ascend_patch() -> None:
    """Apply UCM sparse-attention patches to vLLM-Ascend.

    Patches are applied only when the ENABLE_SPARSE environment variable is
    set to "true" (case-insensitive); otherwise this function is a no-op.

    Raises:
        Exception: re-raises whatever a patch helper raised, after logging.
    """
    try:
        if _enable_sparse():
            _patch_attention_v1()
            _patch_mla_v1()
            _patch_model_runner_v1()
            _patch_worker_v1()
            logger.info("UCM sparse adapt patches applied successfully")
    except Exception:
        # logger.exception records the traceback; bare `raise` preserves it
        # (``raise e`` would reset the traceback's origin to this line).
        logger.exception("Could not apply sparse adapt patches")
        raise


# ========================= vllm_ascend/attention/attention_v1.py =========================
Expand Down
28 changes: 19 additions & 9 deletions ucm/integration/vllm/patch/patch_funcs/v092/vllm_patch.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,23 +23,33 @@
#
from __future__ import annotations

import os

from ucm.logger import init_logger

logger = init_logger(__name__)

ENABLE_SPARSE = os.getenv("ENABLE_SPARSE")


def _enable_sparse() -> bool:
return ENABLE_SPARSE is not None and ENABLE_SPARSE.lower() == "true"


def _apply_sparse_adapt() -> None:
    """Apply UCM sparse adapt patches to vLLM.

    All patches are applied only when the ENABLE_SPARSE environment variable
    is set to "true" (case-insensitive); otherwise this function is a no-op.

    Raises:
        Exception: re-raises whatever a patch helper raised, after logging.
    """
    try:
        if _enable_sparse():
            _patch_block_table()
            _patch_kv_cache_manager()
            _patch_shared_storage_connector()
            _patch_attention_layer()
            _patch_mla_common()
            _patch_gpu_model_runner()
            _patch_gpu_worker()
            _patch_scheduler_output()
            _patch_scheduler()
            logger.info("UCM sparse adapt patches applied successfully")
    except Exception:
        # logger.exception records the traceback; bare `raise` preserves it
        # (``raise e`` would reset the traceback's origin to this line).
        logger.exception("Could not apply sparse adapt patches")
        raise
Expand Down
1 change: 1 addition & 0 deletions ucm/sparse/kvcomp/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ This design ensures both **efficiency** and **accuracy** by preserving essential
KVComp is part of the UCM Sparse Attention module. For installation instructions, please refer to the [UCM's top-level README](../../../../README.md). Once UCM is installed, KVComp can be used by running the following example Python script.

```bash
export ENABLE_SPARSE=TRUE
python ucm/sandbox/sparse/kvcomp/offline_inference_kvcomp.py
```

Expand Down