From e04c0555adcf380ea9275cfdd8de34b8564a43c5 Mon Sep 17 00:00:00 2001
From: lijiachen19 <lijiachen19@huawei.com>
Date: Fri, 28 Nov 2025 01:38:40 -0800
Subject: [PATCH] add env variable ENABLE_SPARSE

---
 .../source/user-guide/sparse-attention/esa.md |  1 +
 .../source/user-guide/sparse-attention/gsa.md |  2 ++
 .../user-guide/sparse-attention/kvcomp.md     |  1 +
 .../user-guide/sparse-attention/kvstar.md     |  1 +
 examples/offline_inference_esa.py             |  1 +
 examples/offline_inference_kvcomp.py          |  1 +
 examples/offline_inference_kvstar.py          |  1 +
 .../patch_funcs/v092/vllm_ascend_patch.py     | 23 ++++++++++++---
 .../vllm/patch/patch_funcs/v092/vllm_patch.py | 28 +++++++++++++------
 ucm/sparse/kvcomp/README.md                   |  1 +
 10 files changed, 47 insertions(+), 13 deletions(-)

diff --git a/docs/source/user-guide/sparse-attention/esa.md b/docs/source/user-guide/sparse-attention/esa.md
index aee584c43..53beadf10 100644
--- a/docs/source/user-guide/sparse-attention/esa.md
+++ b/docs/source/user-guide/sparse-attention/esa.md
@@ -9,6 +9,7 @@ ESA provides developers with an intuitive example of how to implement their own
 ### Basic Usage
 ESA can be launched using the following command:
 ```shell
+export ENABLE_SPARSE=TRUE
 export MODEL_PATH="/path/to/model" # For example: /home/models/Qwen2.5-14B-Instruct
 export DATASET_PATH="/path/to/longbench/multifieldqa_zh.jsonl" # For example: /home/data/Longbench/data/multifieldqa_zh.jsonl
 python examples/offline_inference_esa.py
diff --git a/docs/source/user-guide/sparse-attention/gsa.md b/docs/source/user-guide/sparse-attention/gsa.md
index 327fe0769..5a96287a3 100644
--- a/docs/source/user-guide/sparse-attention/gsa.md
+++ b/docs/source/user-guide/sparse-attention/gsa.md
@@ -107,6 +107,8 @@ ktc = KVTransferConfig(
 Thus, an example command for launching the online LLM service is as follows:
 
 ```shell
+export ENABLE_SPARSE=TRUE
+
 vllm serve /home/models/DeepSeek-R1-Distill-Qwen-32B \
 --served-model-name DeepSeek-R1-Distill-Qwen-32B \
 --max-model-len 131000 \
diff --git a/docs/source/user-guide/sparse-attention/kvcomp.md b/docs/source/user-guide/sparse-attention/kvcomp.md
index 3c1d0b238..4e0cbc715 100644
--- a/docs/source/user-guide/sparse-attention/kvcomp.md
+++ b/docs/source/user-guide/sparse-attention/kvcomp.md
@@ -97,6 +97,7 @@ This design ensures both **efficiency** and **accuracy** by preserving essential
 KVComp is part of the UCM Sparse Attention module. For installation instructions, please refer to the [UCM's top-level README](https://github.com/ModelEngine-Group/unified-cache-management). Once UCM is installed, KVComp is naturally supported by running the following example python scripts.
 
 ```bash
+export ENABLE_SPARSE=TRUE
 python ucm/sandbox/sparse/kvcomp/offline_inference_kvcomp.py
 ```
 
diff --git a/docs/source/user-guide/sparse-attention/kvstar.md b/docs/source/user-guide/sparse-attention/kvstar.md
index cf6222158..41d13358b 100644
--- a/docs/source/user-guide/sparse-attention/kvstar.md
+++ b/docs/source/user-guide/sparse-attention/kvstar.md
@@ -32,6 +32,7 @@ For long-sequence inference, KVstar achieves the following with minimal accuracy
 ### Basic Usage
 KVstar can be launched using the following command:
 ```shell
+export ENABLE_SPARSE=TRUE
 export MODEL_PATH="/path/to/model" # For example: /home/models/Qwen2.5-14B-Instruct
 export DATASET_PATH="/path/to/longbench/multifieldqa_zh.jsonl" # For example: /home/data/Longbench/data/multifieldqa_zh.jsonl
 export DATA_DIR="/path/to/data"
diff --git a/examples/offline_inference_esa.py b/examples/offline_inference_esa.py
index 02908fb7b..c420e9b9e 100644
--- a/examples/offline_inference_esa.py
+++ b/examples/offline_inference_esa.py
@@ -24,6 +24,7 @@
 def setup_environment_variables():
     os.environ["VLLM_USE_V1"] = "1"
     os.environ["PYTHONHASHSEED"] = "123456"
+    os.environ["ENABLE_SPARSE"] = "true"
 
     global model, path_to_dataset, data_dir, tokenizer
     model = os.getenv("MODEL_PATH", "/home/models/Qwen2.5-14B-Instruct")
diff --git a/examples/offline_inference_kvcomp.py b/examples/offline_inference_kvcomp.py
index 595850bec..6aa2ed310 100644
--- a/examples/offline_inference_kvcomp.py
+++ b/examples/offline_inference_kvcomp.py
@@ -24,6 +24,7 @@
 def setup_environment_variables():
     os.environ["VLLM_USE_V1"] = "1"
     os.environ["PYTHONHASHSEED"] = "123456"
+    os.environ["ENABLE_SPARSE"] = "true"
 
     global model, path_to_dataset, data_dir, tokenizer
     model = os.getenv("MODEL_PATH", "/home/models/Qwen2.5-14B-Instruct")
diff --git a/examples/offline_inference_kvstar.py b/examples/offline_inference_kvstar.py
index e26113993..702175423 100644
--- a/examples/offline_inference_kvstar.py
+++ b/examples/offline_inference_kvstar.py
@@ -24,6 +24,7 @@
 def setup_environment_variables():
     os.environ["VLLM_USE_V1"] = "1"
     os.environ["PYTHONHASHSEED"] = "123456"
+    os.environ["ENABLE_SPARSE"] = "true"
 
     global model, path_to_dataset, data_dir, tokenizer
     model = os.getenv("MODEL_PATH", "/home/models/Qwen2.5-14B-Instruct")
diff --git a/ucm/integration/vllm/patch/patch_funcs/v092/vllm_ascend_patch.py b/ucm/integration/vllm/patch/patch_funcs/v092/vllm_ascend_patch.py
index 2b63838b0..6abccf1b8 100644
--- a/ucm/integration/vllm/patch/patch_funcs/v092/vllm_ascend_patch.py
+++ b/ucm/integration/vllm/patch/patch_funcs/v092/vllm_ascend_patch.py
@@ -24,17 +24,32 @@
 
 from __future__ import annotations
 
+import os
+
 from ucm.logger import init_logger
 
 logger = init_logger(__name__)
 
+ENABLE_SPARSE = os.getenv("ENABLE_SPARSE")  # read once, when this module is imported
+
+
+def _enable_sparse() -> bool:  # True only for "true" (any case, surrounding whitespace ignored)
+    return (ENABLE_SPARSE or "").strip().lower() == "true"
+
 
 def _apply_ascend_patch() -> None:
     """Apply patch for vLLM-Ascend."""
-    _patch_attention_v1()
-    _patch_mla_v1()
-    _patch_model_runner_v1()
-    _patch_worker_v1()
+    if not _enable_sparse():  # opt-in: patch only when ENABLE_SPARSE is "true"
+        return
+    try:
+        _patch_attention_v1()
+        _patch_mla_v1()
+        _patch_model_runner_v1()
+        _patch_worker_v1()
+        logger.info("UCM sparse adapt patches applied successfully")
+    except Exception as e:
+        logger.error(f"Could not apply sparse adapt patches: {e}")
+        raise  # bare raise re-raises the active exception unchanged
 
 
 # ========================= vllm_ascend/attention/attention_v1.py =========================
diff --git a/ucm/integration/vllm/patch/patch_funcs/v092/vllm_patch.py b/ucm/integration/vllm/patch/patch_funcs/v092/vllm_patch.py
index 9f532b1ec..2a697efb0 100644
--- a/ucm/integration/vllm/patch/patch_funcs/v092/vllm_patch.py
+++ b/ucm/integration/vllm/patch/patch_funcs/v092/vllm_patch.py
@@ -23,23 +23,33 @@
 #
 from __future__ import annotations
 
+import os
+
 from ucm.logger import init_logger
 
 logger = init_logger(__name__)
 
+ENABLE_SPARSE = os.getenv("ENABLE_SPARSE")  # read once, when this module is imported
+
+
+def _enable_sparse() -> bool:  # True only for "true" (any case, surrounding whitespace ignored)
+    return (ENABLE_SPARSE or "").strip().lower() == "true"
+
 
 def _apply_sparse_adapt() -> None:
     """Apply sparse adapt patches."""
     try:
-        _patch_block_table()
-        _patch_kv_cache_manager()
-        _patch_shared_storage_connector()
-        _patch_attention_layer()
-        _patch_mla_common()
-        _patch_gpu_model_runner()
-        _patch_gpu_worker()
-        _patch_scheduler_output()
-        _patch_scheduler()
+        if _enable_sparse():  # opt-in: skipped unless ENABLE_SPARSE is "true"
+            _patch_block_table()
+            _patch_kv_cache_manager()
+            _patch_shared_storage_connector()
+            _patch_attention_layer()
+            _patch_mla_common()
+            _patch_gpu_model_runner()
+            _patch_gpu_worker()
+            _patch_scheduler_output()
+            _patch_scheduler()
+            logger.info("UCM sparse adapt patches applied successfully")  # reached only if no patch call above raised
     except Exception as e:
         logger.error(f"Could not apply sparse adapt patches: {e}")
         raise e
diff --git a/ucm/sparse/kvcomp/README.md b/ucm/sparse/kvcomp/README.md
index b010e7d91..76283551c 100644
--- a/ucm/sparse/kvcomp/README.md
+++ b/ucm/sparse/kvcomp/README.md
@@ -97,6 +97,7 @@ This design ensures both **efficiency** and **accuracy** by preserving essential
 KVComp is part of the UCM Sparse Attention module. For installation instructions, please refer to the [UCM's top-level README](../../../../README.md). Once UCM is installed, KVComp is naturally supported by running the following example python scripts.
 
 ```bash
+export ENABLE_SPARSE=TRUE
 python ucm/sandbox/sparse/kvcomp/offline_inference_kvcomp.py
 ```