Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/source/user-guide/sparse-attention/esa.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ ESA provides developers with an intuitive example of how to implement their own
### Basic Usage
ESA can be launched using the following command:
```shell
export ENABLE_SPARSE=TRUE
export MODEL_PATH="/path/to/model" # For example: /home/models/Qwen2.5-14B-Instruct
export DATASET_PATH="/path/to/longbench/multifieldqa_zh.jsonl" # For example: /home/data/Longbench/data/multifieldqa_zh.jsonl
python examples/offline_inference_esa.py
Expand Down
2 changes: 2 additions & 0 deletions docs/source/user-guide/sparse-attention/gsa.md
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,8 @@ ktc = KVTransferConfig(
Thus, an example command for launching the online LLM service is as follows:

```shell
export ENABLE_SPARSE=TRUE

vllm serve /home/models/DeepSeek-R1-Distill-Qwen-32B \
--served-model-name DeepSeek-R1-Distill-Qwen-32B \
--max-model-len 131000 \
Expand Down
1 change: 1 addition & 0 deletions docs/source/user-guide/sparse-attention/kvcomp.md
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ This design ensures both **efficiency** and **accuracy** by preserving essential
KVComp is part of the UCM Sparse Attention module. For installation instructions, please refer to the [UCM's top-level README](https://github.com/ModelEngine-Group/unified-cache-management). Once UCM is installed, KVComp can be used by running the following example Python script.

```bash
export ENABLE_SPARSE=TRUE
python ucm/sandbox/sparse/kvcomp/offline_inference_kvcomp.py
```

Expand Down
1 change: 1 addition & 0 deletions docs/source/user-guide/sparse-attention/kvstar.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ For long-sequence inference, KVstar achieves the following with minimal accuracy
### Basic Usage
KVstar can be launched using the following command:
```shell
export ENABLE_SPARSE=TRUE
export MODEL_PATH="/path/to/model" # For example: /home/models/Qwen2.5-14B-Instruct
export DATASET_PATH="/path/to/longbench/multifieldqa_zh.jsonl" # For example: /home/data/Longbench/data/multifieldqa_zh.jsonl
export DATA_DIR="/path/to/data"
Expand Down
1 change: 1 addition & 0 deletions examples/offline_inference_esa.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
def setup_environment_variables():
os.environ["VLLM_USE_V1"] = "1"
os.environ["PYTHONHASHSEED"] = "123456"
os.environ["ENABLE_SPARSE"] = "true"

global model, path_to_dataset, data_dir, tokenizer
model = os.getenv("MODEL_PATH", "/home/models/Qwen2.5-14B-Instruct")
Expand Down
1 change: 1 addition & 0 deletions examples/offline_inference_kvcomp.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
def setup_environment_variables():
os.environ["VLLM_USE_V1"] = "1"
os.environ["PYTHONHASHSEED"] = "123456"
os.environ["ENABLE_SPARSE"] = "true"

global model, path_to_dataset, data_dir, tokenizer
model = os.getenv("MODEL_PATH", "/home/models/Qwen2.5-14B-Instruct")
Expand Down
1 change: 1 addition & 0 deletions examples/offline_inference_kvstar.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
def setup_environment_variables():
os.environ["VLLM_USE_V1"] = "1"
os.environ["PYTHONHASHSEED"] = "123456"
os.environ["ENABLE_SPARSE"] = "true"

global model, path_to_dataset, data_dir, tokenizer
model = os.getenv("MODEL_PATH", "/home/models/Qwen2.5-14B-Instruct")
Expand Down
23 changes: 19 additions & 4 deletions ucm/integration/vllm/patch/patch_funcs/v092/vllm_ascend_patch.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,17 +24,32 @@

from __future__ import annotations

import os

from ucm.logger import init_logger

logger = init_logger(__name__)

ENABLE_SPARSE = os.getenv("ENABLE_SPARSE")


def _enable_sparse() -> bool:
return ENABLE_SPARSE is not None and ENABLE_SPARSE.lower() == "true"


def _apply_ascend_patch() -> None:
    """Apply UCM sparse-attention patches to vLLM-Ascend.

    Patches are applied only when the ENABLE_SPARSE environment variable is
    set to "true" (case-insensitive); otherwise this function is a no-op.

    Raises:
        Exception: re-raises whatever a patch helper raised, after logging.
    """
    try:
        if _enable_sparse():
            _patch_attention_v1()
            _patch_mla_v1()
            _patch_model_runner_v1()
            _patch_worker_v1()
            logger.info("UCM sparse adapt patches applied successfully")
    except Exception:
        # logger.exception records the traceback; bare `raise` preserves it
        # (``raise e`` would reset the traceback's origin to this line).
        logger.exception("Could not apply sparse adapt patches")
        raise


# ========================= vllm_ascend/attention/attention_v1.py =========================
Expand Down
28 changes: 19 additions & 9 deletions ucm/integration/vllm/patch/patch_funcs/v092/vllm_patch.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,23 +23,33 @@
#
from __future__ import annotations

import os

from ucm.logger import init_logger

logger = init_logger(__name__)

ENABLE_SPARSE = os.getenv("ENABLE_SPARSE")


def _enable_sparse() -> bool:
return ENABLE_SPARSE is not None and ENABLE_SPARSE.lower() == "true"


def _apply_sparse_adapt() -> None:
    """Apply UCM sparse adapt patches to vLLM.

    All patches are applied only when the ENABLE_SPARSE environment variable
    is set to "true" (case-insensitive); otherwise this function is a no-op.

    Raises:
        Exception: re-raises whatever a patch helper raised, after logging.
    """
    try:
        if _enable_sparse():
            _patch_block_table()
            _patch_kv_cache_manager()
            _patch_shared_storage_connector()
            _patch_attention_layer()
            _patch_mla_common()
            _patch_gpu_model_runner()
            _patch_gpu_worker()
            _patch_scheduler_output()
            _patch_scheduler()
            logger.info("UCM sparse adapt patches applied successfully")
    except Exception:
        # logger.exception records the traceback; bare `raise` preserves it
        # (``raise e`` would reset the traceback's origin to this line).
        logger.exception("Could not apply sparse adapt patches")
        raise
Expand Down
1 change: 1 addition & 0 deletions ucm/sparse/kvcomp/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ This design ensures both **efficiency** and **accuracy** by preserving essential
KVComp is part of the UCM Sparse Attention module. For installation instructions, please refer to the [UCM's top-level README](../../../../README.md). Once UCM is installed, KVComp can be used by running the following example Python script.

```bash
export ENABLE_SPARSE=TRUE
python ucm/sandbox/sparse/kvcomp/offline_inference_kvcomp.py
```

Expand Down