4 changes: 2 additions & 2 deletions .github/workflows/unifiedcache_test.yml
@@ -45,8 +45,8 @@ jobs:
-c "
pip install -v -e . --no-build-isolation
cd \$(pip show vllm | grep Location | awk '{print \$2}') &&
git apply /workspace/unified-cache-management/ucm/patch/0.9.2/vllm-adapt.patch &&
git apply /workspace/unified-cache-management/ucm/patch/0.9.2/vllm-adapt-sparse.patch
git apply /workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt.patch &&
git apply /workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt-sparse.patch
cd /workspace/unified-cache-management
python3 -m unittest discover -s test
"
2 changes: 1 addition & 1 deletion MANIFEST.in
@@ -1 +1 @@
recursive-include unifiedcache/csrc *
recursive-include ucm/csrc *
3 changes: 1 addition & 2 deletions docker/Dockerfile
@@ -15,7 +15,6 @@ RUN export PLATFORM="cuda" && \

# Apply patch for vLLM
RUN cd $(pip show vllm | grep Location | awk '{print $2}') \
&& git apply /vllm-workspace/unified-cache-management/ucm/patch/0.9.2/vllm-adapt.patch \
&& git apply /vllm-workspace/unified-cache-management/ucm/patch/0.9.2/vllm-adapt-sparse.patch
&& git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt.patch

ENTRYPOINT ["/bin/bash"]
8 changes: 4 additions & 4 deletions docker/Dockerfile-NPU
@@ -16,12 +16,12 @@ RUN export PLATFORM="ascend" && \

# Apply patch for vLLM
RUN cd /vllm-workspace/vllm \
&& git apply /vllm-workspace/unified-cache-management/ucm/patch/0.9.2/vllm-adapt.patch \
&& git apply /vllm-workspace/unified-cache-management/ucm/patch/0.9.2/vllm-adapt-sparse.patch
&& git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt.patch \
&& git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt-sparse.patch

# Apply patch for vLLM-Ascend
RUN cd /vllm-workspace/vllm-ascend \
&& git apply /vllm-workspace/unified-cache-management/ucm/patch/0.9.2/vllm-ascend-adapt.patch \
&& git apply /vllm-workspace/unified-cache-management/ucm/patch/0.9.2/vllm-ascend-adapt-sparse.patch
&& git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-ascend-adapt.patch \
&& git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-ascend-adapt-sparse.patch

CMD ["/bin/bash"]
35 changes: 20 additions & 15 deletions docs/source/getting-started/installation_gpu.md
@@ -3,9 +3,9 @@ This document describes how to install unified-cache-management.

## Requirements
- OS: Linux
- Python: >= 3.9, < 3.12
- GPU: compute capability 8.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.)
- CUDA 12.8+
- Python: 3.12
- GPU: NVIDIA compute capability 8.0+ (e.g., L20, L40, H20)
- CUDA: 12.8

You have 2 ways to install for now:
- Setup from code: First, prepare vLLM environment, then install unified-cache-management from source code.
@@ -25,32 +25,37 @@ docker run \
--gpus all \
--network=host \
--ipc=host \
-v <path_to_your_models>:/app/model \
-v <path_to_your_storage>:/app/storage \
-v <path_to_your_models>:/home/model \
-v <path_to_your_storage>:/home/storage \
--entrypoint /bin/bash \
--name <name_of_your_container> \
-it vllm/vllm-openai:v0.9.2
```
Refer to [Set up using docker](https://docs.vllm.ai/en/latest/getting_started/installation/gpu.html#set-up-using-docker) for more information to run your own vLLM container. After installation, please apply patch to ensure uc_connector can be used:
```bash
cd /vllm-workspace/vllm
git apply /vllm-workspace/unified-cache-management/ucm/patch/0.9.2/vllm-adapt.patch
```
Refer to this [issue](https://github.com/vllm-project/vllm/issues/21702) to see details of this patch's changes.
Refer to [Set up using docker](https://docs.vllm.ai/en/latest/getting_started/installation/gpu.html#set-up-using-docker) for more information to run your own vLLM container.

### Build from source code
Follow the commands below to install unified-cache-management:

```bash
# Replace <branch_or_tag_name> with the branch or tag name needed
git clone --depth 1 --branch <branch_or_tag_name> https://github.com/ModelEngine-Group/unified-cache-management.git
cd unified-cache-management
export PLATFORM=cuda
pip install -v -e . --no-build-isolation
cd ..
```

After installation, please apply the patches to ensure uc_connector can be used:

```bash
cd $(pip show vllm | grep Location | awk '{print $2}')
git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt.patch
git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt-sparse.patch
```

Refer to this [issue](https://github.com/vllm-project/vllm/issues/21702) to see details of this patch's changes.
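
To confirm that the editable install is importable, you can run a minimal check like the sketch below (it assumes the package installs under the `ucm` name used in this repository; the connector module path matches the `kv_connector_module_path` used in the quick start):

```python
# Minimal import sanity check; module names are taken from this repo's docs
# (kv_connector_module_path = "ucm.integration.vllm.uc_connector").
import importlib

for mod in ("ucm", "ucm.integration.vllm.uc_connector"):
    importlib.import_module(mod)  # raises ImportError if the editable install did not succeed
    print(f"OK: {mod}")
```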

## Setup from docker
Download the pre-built docker image provided or build unified-cache-management docker image by commands below:
Download the pre-built `vllm/vllm-openai:v0.9.2` docker image and build the unified-cache-management docker image with the commands below:
```bash
# Build docker image using source code, replace <branch_or_tag_name> with the branch or tag name needed
git clone --depth 1 --branch <branch_or_tag_name> https://github.com/ModelEngine-Group/unified-cache-management.git
@@ -64,8 +69,8 @@ docker run --rm \
--gpus all \
--network=host \
--ipc=host \
-v <path_to_your_models>:/app/model \
-v <path_to_your_storage>:/app/storage \
-v <path_to_your_models>:/home/model \
-v <path_to_your_storage>:/home/storage \
--name <name_of_your_container> \
-it <image_id>
```
97 changes: 96 additions & 1 deletion docs/source/getting-started/quick_start.md
@@ -1 +1,96 @@
# Quickstart
# Quickstart
## Prerequisites

- OS: Linux
- Python: 3.12
- GPU: NVIDIA compute capability 8.0+ (e.g., L20, L40, H20)
- CUDA: 12.8
- vLLM: v0.9.2

## Installation
Before you start with UCM, please make sure that you have installed UCM correctly by following the [Installation](./installation_gpu.md) guide.

## Features Overview

UCM supports two key features: **Prefix Cache** and **GSA Sparsity**.

Each feature supports both **Offline Inference** and **Online API** modes.

For a quick start, just follow the [usage](#usage) guide below to launch your own inference run;

For further reading, click the links below to see more details of each feature:
- [Prefix Cache](../user-guide/prefix-cache/base.md)
- [GSA Sparsity](../user-guide/sparse-attention/gsa.md)

## Usage

<details open>
<summary><b>Offline Inference</b></summary>

You can use our official offline example script to run offline inference with the following commands:

```bash
cd examples/
python offline_inference.py
```
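
If you prefer to configure the engine inline rather than using the script, the sketch below mirrors the connector settings from `examples/offline_inference.py` (the connector name, module path, and config keys come from that file; the model path and sizes are illustrative):

```python
# Offline inference sketch with the UCM DRAM store connector.
# Connector/config names mirror examples/offline_inference.py; values are illustrative.
from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig

ktc = KVTransferConfig(
    kv_connector="UnifiedCacheConnectorV1",
    kv_connector_module_path="ucm.integration.vllm.uc_connector",
    kv_role="kv_both",
    kv_connector_extra_config={
        "ucm_connector_name": "UcmDramStore",
        "ucm_connector_config": {
            "max_cache_size": 5368709120,  # value from the example (5 * 1024**3)
            "kv_block_size": 262144,       # KV block granularity used by the store
        },
    },
)

llm = LLM(
    model="/home/models/Qwen2.5-14B-Instruct",
    kv_transfer_config=ktc,
    max_model_len=20000,
)

outputs = llm.generate(
    ["Shanghai is a"],
    SamplingParams(temperature=0, top_p=0.95, max_tokens=100),
)
print(outputs[0].outputs[0].text)
```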

</details>

<details>
<summary><b>OpenAI-Compatible Online API</b></summary>

For online inference, vLLM with our connector can also be deployed as a server that implements the OpenAI API protocol.

First, set the Python hash seed:
```bash
export PYTHONHASHSEED=123456
```

Run the following command to start the vLLM server with the Qwen/Qwen2.5-14B-Instruct model:

```bash
vllm serve /home/models/Qwen2.5-14B-Instruct \
--max-model-len 20000 \
--tensor-parallel-size 2 \
--gpu_memory_utilization 0.87 \
--trust-remote-code \
--port 7800 \
--kv-transfer-config \
'{
"kv_connector": "UnifiedCacheConnectorV1",
"kv_connector_module_path": "ucm.integration.vllm.uc_connector",
"kv_role": "kv_both",
"kv_connector_extra_config": {
"ucm_connector_name": "UcmDramStore",
"ucm_connector_config": {
"max_cache_size": 5368709120,
"kv_block_size": 262144
}
}
}'
```

If you see logs like the following:

```bash
INFO: Started server process [32890]
INFO: Waiting for application startup.
INFO: Application startup complete.
```

Congratulations, you have successfully started the vLLM server with UCM!

After the vLLM server has started successfully, you can interact with the API as follows:

```bash
curl http://localhost:7800/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "/home/models/Qwen2.5-14B-Instruct",
"prompt": "Shanghai is a",
"max_tokens": 7,
"temperature": 0
}'
```
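
The same request can be sent from Python with the official OpenAI client (a minimal sketch; it assumes the `openai` package is installed and the server above is listening on port 7800):

```python
# Query the UCM-enabled vLLM server through its OpenAI-compatible API.
from openai import OpenAI

# Any api_key value works unless the server was started with --api-key.
client = OpenAI(base_url="http://localhost:7800/v1", api_key="EMPTY")
completion = client.completions.create(
    model="/home/models/Qwen2.5-14B-Instruct",  # must match the served model path
    prompt="Shanghai is a",
    max_tokens=7,
    temperature=0,
)
print(completion.choices[0].text)
```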
</details>

41 changes: 13 additions & 28 deletions examples/offline_inference.py
@@ -14,7 +14,7 @@
from ucm.logger import init_logger

MODEL_PATH = "/home/models/Qwen2.5-14B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_chat_template=True)
logger = init_logger(__name__)


@@ -30,10 +30,10 @@ def build_llm_with_uc(module_path: str, name: str, model: str):
kv_connector_module_path=module_path,
kv_role="kv_both",
kv_connector_extra_config={
"ucm_connector_name": "UcmNfsStore",
"ucm_connector_name": "UcmDramStore",
"ucm_connector_config": {
"storage_backends": "/home/data",
"kv_block_size": 33554432,
"max_cache_size": 5368709120,
"kv_block_size": 262144
},
"ucm_sparse_config": {
"ESA": {
@@ -87,36 +87,21 @@ def main():

setup_environment_variables()

def get_prompt(prompt):
with build_llm_with_uc(module_path, name, model) as llm:
messages = [
{
"role": "system",
"content": "先读问题,再根据下面的文章内容回答问题,不要进行分析,不要重复问题,用简短的语句给出答案。\n\n例如:“全国美国文学研究会的第十八届年会在哪所大学举办的?”\n回答应该为:“xx大学”。\n\n",
"content": "You are a highly specialized assistant whose mission is to faithfully reproduce English literary texts verbatim, without any deviation, paraphrasing, or omission. Your primary responsibility is accuracy: every word, every punctuation mark, and every line must appear exactly as in the original source. Core Principles: Verbatim Reproduction: If the user asks for a passage, you must output the text word-for-word. Do not alter spelling, punctuation, capitalization, or line breaks. Do not paraphrase, summarize, modernize, or “improve” the language. Consistency: The same input must always yield the same output. Do not generate alternative versions or interpretations. Clarity of Scope: Your role is not to explain, interpret, or critique. You are not a storyteller or commentator, but a faithful copyist of English literary and cultural texts. Recognizability: Because texts must be reproduced exactly, they will carry their own cultural recognition. You should not add labels, introductions, or explanations before or after the text. Coverage: You must handle passages from classic literature, poetry, speeches, or cultural texts. Regardless of tone—solemn, visionary, poetic, persuasive—you must preserve the original form, structure, and rhythm by reproducing it precisely. Success Criteria: A human reader should be able to compare your output directly with the original and find zero differences. The measure of success is absolute textual fidelity. Your function can be summarized as follows: verbatim reproduction only, no paraphrase, no commentary, no embellishment, no omission.",
},
{
"role": "user",
"content": "Please reproduce verbatim the opening sentence of the United States Declaration of Independence (1776), starting with 'When in the Course of human events' and continuing word-for-word without paraphrasing.",
},
{"role": "user", "content": prompt},
]
return tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True,
add_special_tokens=True,
)

with build_llm_with_uc(module_path, name, model) as llm:
prompts = []

batch_size = 1

with open("/home/datasets/Longbench/data/multifieldqa_zh.jsonl", "r") as f:
for _ in range(batch_size):
line = f.readline()
if not line:
break
data = json.loads(line)
context = data["context"]
question = data["input"]
prompts.append(get_prompt(f"{context}\n\n{question}"))

prompts = tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=100)

print_output(llm, prompts, sampling_params, "first")
Empty file added ucm/store/__init__.py
Empty file.
Empty file.
6 changes: 3 additions & 3 deletions ucm/store/connector/nfsstore_connector.py
@@ -26,8 +26,8 @@
from typing import Dict, List, Tuple

import torch
from connector import ucmnfsstore
from connector.ucmstore import Task, UcmKVStoreBase
from ucm.store.connector import ucmnfsstore
from ucm.store.connector.ucmstore import Task, UcmKVStoreBase


@dataclass
@@ -48,7 +48,7 @@ def __init__(self, config: Dict):
)
if transfer_enable:
param.transferDeviceId = config["device"]
param.transferIoSize = config["io_size"]
param.transferIoSize = config["transferIoSize"]
ret = ucmnfsstore.Setup(param)
if ret != 0:
msg = f"Failed to initialize ucmnfsstore, errcode: {ret}."
2 changes: 1 addition & 1 deletion ucm/ucm_sparse/esa.py
@@ -22,7 +22,7 @@
UcmSparseMetadata,
UcmSparseRole,
)
from ucm.store.base import Task, UcmKVStoreBase
from ucm.store.connector.ucmstore import Task, UcmKVStoreBase
from ucm.ucm_sparse.retrieval import retrieval_backend
from ucm.ucm_sparse.retrieval.retrieval_worker import RetrievalWorker

Expand Down