From e04c0555adcf380ea9275cfdd8de34b8564a43c5 Mon Sep 17 00:00:00 2001 From: lijiachen19 Date: Fri, 28 Nov 2025 01:38:40 -0800 Subject: [PATCH] add env variable ENABLE_SPARSE --- .../source/user-guide/sparse-attention/esa.md | 1 + .../source/user-guide/sparse-attention/gsa.md | 2 ++ .../user-guide/sparse-attention/kvcomp.md | 1 + .../user-guide/sparse-attention/kvstar.md | 1 + examples/offline_inference_esa.py | 1 + examples/offline_inference_kvcomp.py | 1 + examples/offline_inference_kvstar.py | 1 + .../patch_funcs/v092/vllm_ascend_patch.py | 23 ++++++++++++--- .../vllm/patch/patch_funcs/v092/vllm_patch.py | 28 +++++++++++++------ ucm/sparse/kvcomp/README.md | 1 + 10 files changed, 47 insertions(+), 13 deletions(-) diff --git a/docs/source/user-guide/sparse-attention/esa.md b/docs/source/user-guide/sparse-attention/esa.md index aee584c43..53beadf10 100644 --- a/docs/source/user-guide/sparse-attention/esa.md +++ b/docs/source/user-guide/sparse-attention/esa.md @@ -9,6 +9,7 @@ ESA provides developers with an intuitive example of how to implement their own ### Basic Usage ESA can be launched using the following command: ```shell +export ENABLE_SPARSE=TRUE export MODEL_PATH="/path/to/model" # For example: /home/models/Qwen2.5-14B-Instruct export DATASET_PATH="/path/to/longbench/multifieldqa_zh.jsonl" # For example: /home/data/Longbench/data/multifieldqa_zh.jsonl python examples/offline_inference_esa.py diff --git a/docs/source/user-guide/sparse-attention/gsa.md b/docs/source/user-guide/sparse-attention/gsa.md index 327fe0769..5a96287a3 100644 --- a/docs/source/user-guide/sparse-attention/gsa.md +++ b/docs/source/user-guide/sparse-attention/gsa.md @@ -107,6 +107,8 @@ ktc = KVTransferConfig( Thus, an example command for launching the online LLM service is as follows: ```shell +export ENABLE_SPARSE=TRUE + vllm serve /home/models/DeepSeek-R1-Distill-Qwen-32B \ --served-model-name DeepSeek-R1-Distill-Qwen-32B \ --max-model-len 131000 \ diff --git a/docs/source/user-guide/sparse-attention/kvcomp.md b/docs/source/user-guide/sparse-attention/kvcomp.md index 3c1d0b238..4e0cbc715 100644 --- a/docs/source/user-guide/sparse-attention/kvcomp.md +++ b/docs/source/user-guide/sparse-attention/kvcomp.md @@ -97,6 +97,7 @@ This design ensures both **efficiency** and **accuracy** by preserving essential KVComp is part of the UCM Sparse Attention module. For installation instructions, please refer to the [UCM's top-level README](https://github.com/ModelEngine-Group/unified-cache-management). Once UCM is installed, KVComp is naturally supported by running the following example python scripts. ```bash +export ENABLE_SPARSE=TRUE python ucm/sandbox/sparse/kvcomp/offline_inference_kvcomp.py ``` diff --git a/docs/source/user-guide/sparse-attention/kvstar.md b/docs/source/user-guide/sparse-attention/kvstar.md index cf6222158..41d13358b 100644 --- a/docs/source/user-guide/sparse-attention/kvstar.md +++ b/docs/source/user-guide/sparse-attention/kvstar.md @@ -32,6 +32,7 @@ For long-sequence inference, KVstar achieves the following with minimal accuracy ### Basic Usage KVstar can be launched using the following command: ```shell +export ENABLE_SPARSE=TRUE export MODEL_PATH="/path/to/model" # For example: /home/models/Qwen2.5-14B-Instruct export DATASET_PATH="/path/to/longbench/multifieldqa_zh.jsonl" # For example: /home/data/Longbench/data/multifieldqa_zh.jsonl export DATA_DIR="/path/to/data" diff --git a/examples/offline_inference_esa.py b/examples/offline_inference_esa.py index 02908fb7b..c420e9b9e 100644 --- a/examples/offline_inference_esa.py +++ b/examples/offline_inference_esa.py @@ -24,6 +24,7 @@ def setup_environment_variables(): os.environ["VLLM_USE_V1"] = "1" os.environ["PYTHONHASHSEED"] = "123456" + os.environ["ENABLE_SPARSE"] = "true" global model, path_to_dataset, data_dir, tokenizer model = os.getenv("MODEL_PATH", "/home/models/Qwen2.5-14B-Instruct") diff --git a/examples/offline_inference_kvcomp.py b/examples/offline_inference_kvcomp.py index 595850bec..6aa2ed310 100644 --- a/examples/offline_inference_kvcomp.py +++ b/examples/offline_inference_kvcomp.py @@ -24,6 +24,7 @@ def setup_environment_variables(): os.environ["VLLM_USE_V1"] = "1" os.environ["PYTHONHASHSEED"] = "123456" + os.environ["ENABLE_SPARSE"] = "true" global model, path_to_dataset, data_dir, tokenizer model = os.getenv("MODEL_PATH", "/home/models/Qwen2.5-14B-Instruct") diff --git a/examples/offline_inference_kvstar.py b/examples/offline_inference_kvstar.py index e26113993..702175423 100644 --- a/examples/offline_inference_kvstar.py +++ b/examples/offline_inference_kvstar.py @@ -24,6 +24,7 @@ def setup_environment_variables(): os.environ["VLLM_USE_V1"] = "1" os.environ["PYTHONHASHSEED"] = "123456" + os.environ["ENABLE_SPARSE"] = "true" global model, path_to_dataset, data_dir, tokenizer model = os.getenv("MODEL_PATH", "/home/models/Qwen2.5-14B-Instruct") diff --git a/ucm/integration/vllm/patch/patch_funcs/v092/vllm_ascend_patch.py b/ucm/integration/vllm/patch/patch_funcs/v092/vllm_ascend_patch.py index 2b63838b0..6abccf1b8 100644 --- a/ucm/integration/vllm/patch/patch_funcs/v092/vllm_ascend_patch.py +++ b/ucm/integration/vllm/patch/patch_funcs/v092/vllm_ascend_patch.py @@ -24,17 +24,32 @@ from __future__ import annotations +import os + from ucm.logger import init_logger logger = init_logger(__name__) +ENABLE_SPARSE = os.getenv("ENABLE_SPARSE") + + +def _enable_sparse() -> bool: + return ENABLE_SPARSE is not None and ENABLE_SPARSE.lower() == "true" + def _apply_ascend_patch() -> None: """Apply patch for vLLM-Ascend.""" - _patch_attention_v1() - _patch_mla_v1() - _patch_model_runner_v1() - _patch_worker_v1() + try: + if _enable_sparse(): + _patch_attention_v1() + _patch_mla_v1() + _patch_model_runner_v1() + _patch_worker_v1() + logger.info("UCM sparse adapt patches applied successfully") + + except Exception as e: + logger.error(f"Could not apply sparse adapt patches: {e}") + raise e # ========================= vllm_ascend/attention/attention_v1.py ========================= diff --git a/ucm/integration/vllm/patch/patch_funcs/v092/vllm_patch.py b/ucm/integration/vllm/patch/patch_funcs/v092/vllm_patch.py index 9f532b1ec..2a697efb0 100644 --- a/ucm/integration/vllm/patch/patch_funcs/v092/vllm_patch.py +++ b/ucm/integration/vllm/patch/patch_funcs/v092/vllm_patch.py @@ -23,23 +23,33 @@ # from __future__ import annotations +import os + from ucm.logger import init_logger logger = init_logger(__name__) +ENABLE_SPARSE = os.getenv("ENABLE_SPARSE") + + +def _enable_sparse() -> bool: + return ENABLE_SPARSE is not None and ENABLE_SPARSE.lower() == "true" + def _apply_sparse_adapt() -> None: """Apply sparse adapt patches.""" try: - _patch_block_table() - _patch_kv_cache_manager() - _patch_shared_storage_connector() - _patch_attention_layer() - _patch_mla_common() - _patch_gpu_model_runner() - _patch_gpu_worker() - _patch_scheduler_output() - _patch_scheduler() + if _enable_sparse(): + _patch_block_table() + _patch_kv_cache_manager() + _patch_shared_storage_connector() + _patch_attention_layer() + _patch_mla_common() + _patch_gpu_model_runner() + _patch_gpu_worker() + _patch_scheduler_output() + _patch_scheduler() + logger.info("UCM sparse adapt patches applied successfully") except Exception as e: logger.error(f"Could not apply sparse adapt patches: {e}") raise e diff --git a/ucm/sparse/kvcomp/README.md b/ucm/sparse/kvcomp/README.md index b010e7d91..76283551c 100644 --- a/ucm/sparse/kvcomp/README.md +++ b/ucm/sparse/kvcomp/README.md @@ -97,6 +97,7 @@ This design ensures both **efficiency** and **accuracy** by preserving essential KVComp is part of the UCM Sparse Attention module. For installation instructions, please refer to the [UCM's top-level README](../../../../README.md). Once UCM is installed, KVComp is naturally supported by running the following example python scripts. ```bash +export ENABLE_SPARSE=TRUE python ucm/sandbox/sparse/kvcomp/offline_inference_kvcomp.py ```