-
-
-
-
-
diff --git a/docs/EN/source/_static/openapi.json b/docs/EN/source/_static/openapi.json
deleted file mode 100755
index d591ecf0a..000000000
--- a/docs/EN/source/_static/openapi.json
+++ /dev/null
@@ -1,536 +0,0 @@
-{
- "openapi": "3.0.2",
- "info": {
- "title": "FastAPI",
- "version": "0.1.0"
- },
- "paths": {
- "/liveness": {
- "get": {
- "summary": "Liveness",
- "operationId": "liveness_liveness_get",
- "responses": {
- "200": {
- "description": "Successful Response",
- "content": {
- "application/json": {
- "schema": {}
- }
- }
- }
- }
- },
- "post": {
- "summary": "Liveness",
- "operationId": "liveness_liveness_post",
- "responses": {
- "200": {
- "description": "Successful Response",
- "content": {
- "application/json": {
- "schema": {}
- }
- }
- }
- }
- }
- },
- "/readiness": {
- "get": {
- "summary": "Readiness",
- "operationId": "readiness_readiness_get",
- "responses": {
- "200": {
- "description": "Successful Response",
- "content": {
- "application/json": {
- "schema": {}
- }
- }
- }
- }
- },
- "post": {
- "summary": "Readiness",
- "operationId": "readiness_readiness_post",
- "responses": {
- "200": {
- "description": "Successful Response",
- "content": {
- "application/json": {
- "schema": {}
- }
- }
- }
- }
- }
- },
- "/health": {
- "get": {
- "summary": "Check server health",
- "operationId": "healthcheck_health_get",
- "responses": {
- "200": {
- "description": "Successful Response",
- "content": {
- "application/json": {
- "schema": {}
- }
- }
- }
- }
- },
- "head": {
- "summary": "Check server health",
- "operationId": "healthcheck_health_head",
- "responses": {
- "200": {
- "description": "Successful Response",
- "content": {
- "application/json": {
- "schema": {}
- }
- }
- }
- }
- }
- },
- "/healthz": {
- "get": {
- "summary": "Check server health",
- "operationId": "healthcheck_healthz_get",
- "responses": {
- "200": {
- "description": "Successful Response",
- "content": {
- "application/json": {
- "schema": {}
- }
- }
- }
- }
- }
- },
- "/token_load": {
- "get": {
- "summary": "Get the current server's load on tokens",
- "operationId": "token_load_token_load_get",
- "responses": {
- "200": {
- "description": "Successful Response",
- "content": {
- "application/json": {
- "schema": {}
- }
- }
- }
- }
- }
- },
- "/generate": {
- "post": {
- "summary": "Generate",
- "operationId": "generate_generate_post",
- "responses": {
- "200": {
- "description": "Successful Response",
- "content": {
- "application/json": {
- "schema": {}
- }
- }
- }
- }
- }
- },
- "/generate_stream": {
- "post": {
- "summary": "Generate Stream",
- "operationId": "generate_stream_generate_stream_post",
- "responses": {
- "200": {
- "description": "Successful Response",
- "content": {
- "application/json": {
- "schema": {}
- }
- }
- }
- }
- }
- },
- "/get_score": {
- "post": {
- "summary": "Get Score",
- "operationId": "get_score_get_score_post",
- "responses": {
- "200": {
- "description": "Successful Response",
- "content": {
- "application/json": {
- "schema": {}
- }
- }
- }
- }
- }
- },
- "/": {
- "post": {
- "summary": "Compat Generate",
- "operationId": "compat_generate__post",
- "responses": {
- "200": {
- "description": "Successful Response",
- "content": {
- "application/json": {
- "schema": {}
- }
- }
- }
- }
- }
- },
- "/v1/chat/completions": {
- "post": {
- "summary": "Chat Completions",
- "operationId": "chat_completions_v1_chat_completions_post",
- "requestBody": {
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/ChatCompletionRequest"
- }
- }
- },
- "required": true
- },
- "responses": {
- "200": {
- "description": "Successful Response",
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/ChatCompletionResponse"
- }
- }
- }
- },
- "422": {
- "description": "Validation Error",
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/HTTPValidationError"
- }
- }
- }
- }
- }
- }
- },
- "/tokens": {
- "get": {
- "summary": "Tokens",
- "operationId": "tokens_tokens_get",
- "responses": {
- "200": {
- "description": "Successful Response",
- "content": {
- "application/json": {
- "schema": {}
- }
- }
- }
- }
- },
- "post": {
- "summary": "Tokens",
- "operationId": "tokens_tokens_post",
- "responses": {
- "200": {
- "description": "Successful Response",
- "content": {
- "application/json": {
- "schema": {}
- }
- }
- }
- }
- }
- },
- "/metrics": {
- "get": {
- "summary": "Metrics",
- "operationId": "metrics_metrics_get",
- "responses": {
- "200": {
- "description": "Successful Response",
- "content": {
- "application/json": {
- "schema": {}
- }
- }
- }
- }
- }
- }
- },
- "components": {
- "schemas": {
- "ChatCompletionRequest": {
- "title": "ChatCompletionRequest",
- "required": [
- "model",
- "messages"
- ],
- "type": "object",
- "properties": {
- "model": {
- "title": "Model",
- "type": "string"
- },
- "messages": {
- "title": "Messages",
- "type": "array",
- "items": {
- "type": "object",
- "additionalProperties": {
- "type": "string"
- }
- }
- },
- "function_call": {
- "title": "Function Call",
- "type": "string",
- "default": "none"
- },
- "temperature": {
- "title": "Temperature",
- "type": "number",
- "default": 1
- },
- "top_p": {
- "title": "Top P",
- "type": "number",
- "default": 1.0
- },
- "n": {
- "title": "N",
- "type": "integer",
- "default": 1
- },
- "stream": {
- "title": "Stream",
- "type": "boolean",
- "default": false
- },
- "stop": {
- "title": "Stop",
- "anyOf": [
- {
- "type": "string"
- },
- {
- "type": "array",
- "items": {
- "type": "string"
- }
- }
- ]
- },
- "max_tokens": {
- "title": "Max Tokens",
- "type": "integer",
- "default": 16
- },
- "presence_penalty": {
- "title": "Presence Penalty",
- "type": "number",
- "default": 0.0
- },
- "frequency_penalty": {
- "title": "Frequency Penalty",
- "type": "number",
- "default": 0.0
- },
- "logit_bias": {
- "title": "Logit Bias",
- "type": "object",
- "additionalProperties": {
- "type": "number"
- }
- },
- "user": {
- "title": "User",
- "type": "string"
- },
- "do_sample": {
- "title": "Do Sample",
- "type": "boolean",
- "default": false
- },
- "top_k": {
- "title": "Top K",
- "type": "integer",
- "default": -1
- },
- "ignore_eos": {
- "title": "Ignore Eos",
- "type": "boolean",
- "default": false
- }
- }
- },
- "ChatCompletionResponse": {
- "title": "ChatCompletionResponse",
- "required": [
- "model",
- "choices",
- "usage"
- ],
- "type": "object",
- "properties": {
- "id": {
- "title": "Id",
- "type": "string"
- },
- "object": {
- "title": "Object",
- "type": "string",
- "default": "chat.completion"
- },
- "created": {
- "title": "Created",
- "type": "integer"
- },
- "model": {
- "title": "Model",
- "type": "string"
- },
- "choices": {
- "title": "Choices",
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/ChatCompletionResponseChoice"
- }
- },
- "usage": {
- "$ref": "#/components/schemas/UsageInfo"
- }
- }
- },
- "ChatCompletionResponseChoice": {
- "title": "ChatCompletionResponseChoice",
- "required": [
- "index",
- "message"
- ],
- "type": "object",
- "properties": {
- "index": {
- "title": "Index",
- "type": "integer"
- },
- "message": {
- "$ref": "#/components/schemas/ChatMessage"
- },
- "finish_reason": {
- "title": "Finish Reason",
- "enum": [
- "stop",
- "length",
- "function_call"
- ],
- "type": "string"
- }
- }
- },
- "ChatMessage": {
- "title": "ChatMessage",
- "required": [
- "role",
- "content"
- ],
- "type": "object",
- "properties": {
- "role": {
- "title": "Role",
- "type": "string"
- },
- "content": {
- "title": "Content",
- "type": "string"
- }
- }
- },
- "HTTPValidationError": {
- "title": "HTTPValidationError",
- "type": "object",
- "properties": {
- "detail": {
- "title": "Detail",
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/ValidationError"
- }
- }
- }
- },
- "UsageInfo": {
- "title": "UsageInfo",
- "type": "object",
- "properties": {
- "prompt_tokens": {
- "title": "Prompt Tokens",
- "type": "integer",
- "default": 0
- },
- "completion_tokens": {
- "title": "Completion Tokens",
- "type": "integer",
- "default": 0
- },
- "total_tokens": {
- "title": "Total Tokens",
- "type": "integer",
- "default": 0
- }
- }
- },
- "ValidationError": {
- "title": "ValidationError",
- "required": [
- "loc",
- "msg",
- "type"
- ],
- "type": "object",
- "properties": {
- "loc": {
- "title": "Location",
- "type": "array",
- "items": {
- "anyOf": [
- {
- "type": "string"
- },
- {
- "type": "integer"
- }
- ]
- }
- },
- "msg": {
- "title": "Message",
- "type": "string"
- },
- "type": {
- "title": "Error Type",
- "type": "string"
- }
- }
- }
- }
- }
-}
\ No newline at end of file
diff --git a/docs/EN/source/assets/logos/lightllm-logo.png b/docs/EN/source/assets/logos/lightllm-logo.png
index 1a9794bf8..5b3b63917 100755
Binary files a/docs/EN/source/assets/logos/lightllm-logo.png and b/docs/EN/source/assets/logos/lightllm-logo.png differ
diff --git a/docs/EN/source/dev/performance.rst b/docs/EN/source/dev/performance.rst
deleted file mode 100755
index 3ecfb58d5..000000000
--- a/docs/EN/source/dev/performance.rst
+++ /dev/null
@@ -1,48 +0,0 @@
-.. _Performance_Benchmark:
-
-Performance
-===========
-
-Service Performance
--------------------
-
-We compared the service performance of LightLLM and vLLM==0.1.2 on LLaMA-7B using an A800 with 80G GPU memory.
-
-To begin, prepare the data as follows:
-
-.. code-block:: shell
-
- wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-
-Launch the service:
-
-.. code-block:: shell
-
- python -m lightllm.server.api_server --model_dir /path/llama-7b --tp 1 --max_total_token_num 121060 --tokenizer_mode auto
-
-Evaluation:
-
-.. code-block:: shell
-
- cd test
- python benchmark_serving.py --tokenizer /path/llama-7b --dataset /path/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 2000 --request-rate 200
-
-The performance comparison results are presented below:
-
-+-------------------+-------------------+
-| vLLM | LightLLM |
-+===================+===================+
-| Total time: 361.79| Total time: 188.85|
-| Throughput: 5.53 | Throughput: 10.59 |
-| requests/s | requests/s |
-+-------------------+-------------------+
-
-Static Inference Performance
-----------------------------
-
-For debugging, we offer static performance testing scripts for various models. For instance, you can evaluate the inference performance of the LLaMA model by:
-
-.. code-block:: shell
-
- cd test/model
- python test_llama.py
\ No newline at end of file
diff --git a/docs/EN/source/dev/router.rst b/docs/EN/source/dev/router.rst
deleted file mode 100755
index 8b7bb9361..000000000
--- a/docs/EN/source/dev/router.rst
+++ /dev/null
@@ -1,61 +0,0 @@
-.. _Efficient_Router:
-
-Efficient Router
-===================
-
-The Efficient Router is introduced to manage incoming requests and dynamically determine whether the request can be fused with a batch that is already running for inference. The merging criterion is to estimate whether the maximum token occupancy in the merged inference process is smaller than the maximum capacity that the hardware can be accommodated. Here, we set this maximum capacity as max_total_token_num. With the support of Token Attention, we can accurately manage the usage of tokens, and can ensure that there is never a risk of out-of-memory (OOM).
-
-.. image:: ../assets/lightllm/ER1.png
- :alt: Efficient_Router1
- :align: center
-
-
-As shown in the figure above, each row represents the current running state of a request, the yellow colour represents the historical kv cache tokens that have been run, each grid represents a token, the grey colour represents the tokens to be generated. The number of tokens to be generated is determined by the maximum output length set for each request and the number of tokens that have been generated. In the above figure, the second row of the green grid represents a newly arrived request, and the figure lists all the requests in ascending order according to the length of the output to be generated.
-
-If we assume that the new requests are fused into a Batch for inference, the maximum token usage will inevitably occur at one of the time points, Time 1, Time 2, or Time 3. We only need to calculate if the token usage at these three time points does not exceed the max_total_token_num, which indicates that the new request can be added to the Batch for fused inference.
-
-The total used tokens of Time 1 is equal to the number of yellow cells plus the number of green cells (see the figure below)
-
-.. image:: ../assets/lightllm/ER2.png
- :alt: Efficient_Router1
- :align: center
-
-
-The total used tokens of Time 2 is equal to the number of yellow squares plus the number of green squares (see the figure below)
-
-.. image:: ../assets/lightllm/ER3.png
- :alt: Efficient_Router1
- :align: center
-
-The total used tokens of Time 3 is equal to the number of yellow squares (see the figure below)
-
-.. image:: ../assets/lightllm/ER4.png
- :alt: Efficient_Router1
- :align: center
-
-The actual maximum token usage is always one of Time 1, Time 2, or Time 3.
-
-As long as the maximum token usage during the dynamic inference process is lower than max_total_token_num, it indicates that new requests can be batched for inference.
-
-To quickly calculate the maximum token usage required for all requests in a batch, we have implemented an efficient example using numpy.
-
-
-.. code-block:: python
-
- import numpy as np
-
- def demo():
- max_total_token_num = 100
- req_list = [(5, 4), (4, 3), (5, 3), (3, 2), (4, 2)] # (run_len, left_output_len)
- req_list.sort(key=lambda x: -x[1])
-
- left_out_len_array = np.array([e[1] for e in req_list])
- has_run_len_array = np.array([e[0] for e in req_list])
- cum_run_len_array = np.cumsum(has_run_len_array)
- size_array = np.arange(1, len(req_list) + 1, 1)
- need_max_token_num = (left_out_len_array * size_array + cum_run_len_array).max()
-
- if need_max_token_num <= max_total_token_num:
- print("ok")
- else:
- print("oom")
\ No newline at end of file
diff --git a/docs/EN/source/dev/token_attention.rst b/docs/EN/source/dev/token_attention.rst
deleted file mode 100755
index bb2ca24a7..000000000
--- a/docs/EN/source/dev/token_attention.rst
+++ /dev/null
@@ -1,87 +0,0 @@
-.. _TokenAttention:
-
-TokenAttention
-=======================
-
-Transformers form the basis of modern large language models. During autoregressive decoding, these models cache key-value tensors of context tokens into GPU memory to facilitate fast generation of the next token. However, these caches occupy significant GPU memory. The unpredictable nature of cache size, due to the variability in the length of each request, exacerbates the issue, resulting in significant memory fragmentation in the absence of a suitable memory management mechanism.
-
-To alleviate this issue, PagedAttention was proposed to store the KV cache in non-contiguous memory spaces. It partitions the KV cache of each sequence into multiple blocks, with each block containing the keys and values for a fixed number of tokens. This approach effectively controls memory waste within the last block during attention computation. While PagedAttention alleviates memory fragmentation to some extent, it still leaves room for memory waste. Additionally, when handling multiple high-concurrency requests, the allocation and deallocation of memory blocks fall short of efficiency, leading to suboptimal memory utilization.
-
-To address the above challenges, we introduce TokenAttention, an attention mechanism that manages key and value caching at the token level. Compared to PagedAttention, our TokenAttention not only minimizes memory fragmentation and enables efficient memory sharing but also facilitates efficient memory allocation and deallocation. It allows for more precise and fine-grained memory management, thus optimizing memory utilization.
-
-.. list-table:: Feature Comparison
- :widths: 30 15 15
- :header-rows: 1
-
- * - Features
- - PagedAttention
- - TokenAttention
- * - Low memory fragmentation
- - ✓
- - ✓
- * - Efficient memory sharing
- - ✓
- - ✓
- * - Efficient memory allocation and deallocation
- - ✗
- - ✓
- * - Fine-grained memory management
- - ✗
- - ✓
-
-
-The operation mechanism of TokenAttention is illustrated in the figure below:
-
-.. figure:: ../assets/lightllm/token_attn.gif
- :width: 100%
- :align: center
- :alt: Lightllm
- :class: no-scaled-link
-
-
-During model initialization, the KV cache is pre-allocated based on the user-set **max_total_token_num** and a Token Table is created to record the actual storage locations of input tokens.
-
-When handling new requests, the system first checks for available contiguous space in the pre-allocated Token cache for storing the key-value (KV) cache. TokenAttention favors assigning contiguous graphics memory space for requests to minimize memory access during the inference process. Only when contiguous space is insufficient does it allocate non-contiguous graphics memory for the requests. Since memory management is conducted on a token-by-token basis, TokenAttention achieves nearly zero waste, yielding higher throughput compared to vllm.
-
-We have implemented an efficient TokenAttention operator using OpenAI Triton. When provided with a query vector, this operator can efficiently retrieve the corresponding KV cache based on the Token Table and conduct the attention computation.
-
-Upon completion of requests, the corresponding graphics memory can be quickly freed by deleting their records on the Token Table, which makes way for scheduling new requests. Given that TokenAttention pre-allocates all KV cache space during model initialization, it can efficiently release memory for completed requests and merge different batches of requests during dynamic scheduling, thereby effectively maximizing GPU utilization.
-
-The specific steps are as follows:
-
-
-1. During model initialization, the KV cache is pre-allocated based on the user-set max_total_token_num and a Token Table is created to record the actual storage locations of input tokens.
-2. When handling new requests, the system first checks for available contiguous space in the pre-allocated Token cache for storing the key-value (KV) cache. TokenAttention favors assigning contiguous graphics memory space for requests to minimize memory access during the inference process. Only when contiguous space is insufficient does it allocate non-contiguous graphics memory for the requests. The allocated space is recorded in the Token Table for subsequent attention calculations.
-3. For cache of newly generated tokens, it is only necessary to find unused space from the pre-allocated token cache and add the corresponding entry to the Token Table. Moreover, to efficiently allocate and release the Cache, we utilize the parallel computing capabilities of torch Tensor on the GPU to manage the state of the pre-allocated Token Cache. First, we define the states as follows:
-
- .. code-block:: python
-
- self.mem_state = torch.ones((size,), dtype=torch.bool, device="cuda")
- self._mem_cum_sum = torch.empty((size,), dtype=torch.int32, device="cuda")
- self.indexes = torch.arange(0, size, dtype=torch.long, device="cuda")
- self.can_use_mem_size = size
-
-
- The mem_state records the usage status of the cache, where 1 represents unused and 0 represents used. The _mem_cum_sum is used for the cumulative sum of mem_state which is used to efficiently identify and select unused space for cache allocation. The allocation process is as follows:
-
- .. code-block:: python
-
- torch.cumsum(self.mem_state, dim=0, dtype=torch.int32, out=self._mem_cum_sum)
- #
- select_index = torch.logical_and(self._mem_cum_sum <= need_size, self.mem_state == 1)
- select_index = self.indexes[select_index]
- self.mem_state[select_index] = 0
- self.can_use_mem_size -= len(select_index)
-
-
- It can be observed that our cache state management is all done on the GPU, fully utilizing the parallel capabilities of torc, thereby allowing the system to efficiently allocate cache space for each request.
-
-4. Upon completion of requests, the corresponding graphics memory can be quickly freed by deleting their records on the Token Table, which makes way for scheduling new requests.
-
- .. code-block:: python
-
- self.can_use_mem_size += free_index.shape[0]
- self.mem_state[free_index] = 1
-
-5. Token Attention allows for zero wastage of GPU memory, due to its GPU memory management at the token level. It can accurately calculate how many new tokens the system can accommodate for computation. Therefore, when combined with a high-performance router to manage requests, it can continuously add new requests during the inference process, fully utilizing every piece of GPU memory and maximizing GPU utilization.
-
diff --git a/docs/EN/source/lightllm/lightllm_impl.rst b/docs/EN/source/framework/framework.rst
old mode 100755
new mode 100644
similarity index 100%
rename from docs/EN/source/lightllm/lightllm_impl.rst
rename to docs/EN/source/framework/framework.rst
diff --git a/docs/EN/source/framework/router.rst b/docs/EN/source/framework/router.rst
new file mode 100644
index 000000000..4f05716ed
--- /dev/null
+++ b/docs/EN/source/framework/router.rst
@@ -0,0 +1,62 @@
+.. _Efficient_Router:
+
+Efficient Router
+================
+
+We introduce an efficient router to manage incoming requests and dynamically determine whether a request can be merged into a batch that is already running inference.
+The merge criterion is whether the estimated maximum token usage during merged inference is less than the maximum capacity that the hardware can accommodate.
+Here, we set this maximum capacity to ``max_total_token_num``. With the support of **Token Attention**, we can accurately manage token usage and ensure that out-of-memory situations never occur.
+
+.. image:: ../assets/lightllm/ER1.png
+ :alt: Efficient_Router1
+ :align: center
+
+As shown in the figure above, each row represents the current running state of a request: yellow cells are historical KV-cache tokens that have already been computed, each cell represents one token, and gray cells are tokens yet to be generated.
+The number of tokens to be generated is determined by the maximum output length set for each request and the number of tokens already generated.
+In the figure above, the green cells in the second row represent a newly arrived request, and all requests are listed in ascending order of the output length still to be generated.
+
+If we assume the new request is merged into the batch for inference, the maximum token usage will necessarily occur at one of Time 1, Time 2, or Time 3. We only need to check the token usage at these three time points: if none of them exceeds ``max_total_token_num``, the new request can be added to the batch for merged inference.
+
+Total token usage at Time 1 equals the number of yellow cells plus the number of green cells (see the figure below).
+
+.. image:: ../assets/lightllm/ER2.png
+ :alt: Efficient_Router1
+ :align: center
+
+Total token usage at Time 2 equals the number of yellow cells plus the number of green cells (see the figure below).
+
+.. image:: ../assets/lightllm/ER3.png
+ :alt: Efficient_Router1
+ :align: center
+
+Total token usage at Time 3 equals the number of yellow cells (see the figure below).
+
+.. image:: ../assets/lightllm/ER4.png
+ :alt: Efficient_Router1
+ :align: center
+
+The actual maximum token usage always occurs at one of Time 1, Time 2, or Time 3.
+
+As long as the maximum token usage during dynamic inference is below max_total_token_num, it means new requests can be batched for inference.
+
+To quickly calculate the maximum token usage required by all requests in the batch, we implemented an efficient example using numpy.
+
+.. code-block:: python
+
+ import numpy as np
+
+ def demo():
+ max_total_token_num = 100
+ req_list = [(5, 4), (4, 3), (5, 3), (3, 2), (4, 2)] # (run_len, left_output_len)
+ req_list.sort(key=lambda x: -x[1])
+
+ left_out_len_array = np.array([e[1] for e in req_list])
+ has_run_len_array = np.array([e[0] for e in req_list])
+ cum_run_len_array = np.cumsum(has_run_len_array)
+ size_array = np.arange(1, len(req_list) + 1, 1)
+ need_max_token_num = (left_out_len_array * size_array + cum_run_len_array).max()
+
+ if need_max_token_num <= max_total_token_num:
+ print("ok")
+ else:
+ print("oom")
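+
+As a usage sketch (the helper name ``can_add_request`` is hypothetical, not part of the LightLLM API), the same check can be wrapped into a function that decides whether a newly arrived request fits into the currently running batch:
+
+.. code-block:: python
+
+   import numpy as np
+
+   def can_add_request(running_reqs, new_req, max_total_token_num):
+       # Each request is a (run_len, left_output_len) tuple, as in the demo above.
+       reqs = sorted(running_reqs + [new_req], key=lambda x: -x[1])
+       left = np.array([r[1] for r in reqs])
+       cum_run = np.cumsum([r[0] for r in reqs])
+       sizes = np.arange(1, len(reqs) + 1)
+       need_max_token_num = (left * sizes + cum_run).max()
+       return need_max_token_num <= max_total_token_num
+
+   # For the five requests in the demo above, the peak requirement is 31 tokens,
+   # well below max_total_token_num = 100, so a further request may still be admitted
+   # as long as the recomputed peak stays within the limit.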
\ No newline at end of file
diff --git a/docs/EN/source/framework/token_attention.rst b/docs/EN/source/framework/token_attention.rst
new file mode 100644
index 000000000..0799f62fd
--- /dev/null
+++ b/docs/EN/source/framework/token_attention.rst
@@ -0,0 +1,83 @@
+.. _TokenAttention:
+
+TokenAttention
+==============
+
+Transformers form the foundation of modern large language models. During autoregressive decoding, these models cache key-value tensors of context tokens in GPU memory for fast generation of the next token. However, these caches occupy a large amount of GPU memory. Due to the variability in request lengths, the unpredictability of cache sizes exacerbates this problem, leading to severe memory fragmentation in the absence of appropriate memory management mechanisms.
+
+To alleviate this issue, PagedAttention was proposed to store KV cache in non-contiguous memory spaces. It divides the KV cache of each sequence into multiple blocks, with each block containing keys and values for a fixed number of tokens. This method effectively controls memory waste within the last block during attention computation. While PagedAttention alleviates memory fragmentation to some extent, it still leaves room for memory waste. Additionally, when handling multiple high-concurrency requests, the efficiency of memory block allocation and deallocation is low, resulting in poor memory utilization.
+
+To address the above challenges, we introduced TokenAttention, an attention mechanism that manages key and value cache at the token level. Compared to PagedAttention, our TokenAttention can not only minimize memory fragmentation and achieve efficient memory sharing, but also promote efficient memory allocation and deallocation. It allows for more precise and fine-grained memory management, thereby optimizing memory utilization.
+
+.. list-table:: Feature Comparison
+ :widths: 30 15 15
+ :header-rows: 1
+
+ * - Features
+ - PagedAttention
+ - TokenAttention
+ * - Low Memory Fragmentation
+ - ✓
+ - ✓
+ * - Efficient Memory Sharing
+ - ✓
+ - ✓
+ * - Efficient Memory Allocation and Deallocation
+ - ✗
+ - ✓
+ * - Fine-grained Memory Management
+ - ✗
+ - ✓
+
+The operation mechanism of TokenAttention is shown in the figure below:
+
+.. figure:: ../assets/lightllm/token_attn.gif
+ :width: 100%
+ :align: center
+ :alt: Lightllm
+ :class: no-scaled-link
+
+During model initialization, KV cache is pre-allocated according to the user-set **max_total_token_num**, and a Token Table is created to record the actual storage location of input tokens.
+
+When processing new requests, the system first checks whether there is contiguous space available in the pre-allocated token cache for storing the key-value (KV) cache. TokenAttention prefers to allocate contiguous GPU memory for a request to minimize memory access during inference; only when contiguous space is insufficient does it allocate non-contiguous memory. Since memory is managed token by token, TokenAttention achieves almost zero waste and delivers higher throughput than vLLM.
+
+We implemented an efficient TokenAttention operator using OpenAI Triton. When provided with query vectors, this operator can efficiently retrieve the corresponding KV cache based on the Token Table and perform attention computation.
+
+After request completion, the corresponding memory can be quickly released by deleting records on the token table, making way for scheduling new requests. Since TokenAttention pre-allocates all KV cache space during model initialization, it can efficiently release memory for completed requests and merge requests from different batches during dynamic scheduling, effectively maximizing GPU utilization.
+
+Specific steps are as follows:
+
+1. During model initialization, the system pre-allocates KV cache memory according to the user-set ``max_total_token_num`` and creates a Token Table to record the actual storage location of input tokens.
+
+2. When processing new requests, the system first checks if there is available contiguous space in the pre-allocated token cache for storing KV Cache. TokenAttention tends to allocate contiguous memory for requests to minimize memory access during inference. Only when contiguous space is insufficient will non-contiguous memory be allocated for requests. The allocated space is recorded in the Token Table for subsequent attention computation.
+
+3. For caching newly generated tokens, it's only necessary to find unused space from the pre-allocated token cache and add the corresponding entries to the token table. Additionally, to efficiently allocate and deallocate cache, we utilize Torch Tensor's parallel computing capabilities on GPU to manage the state of pre-allocated token cache. First, we define the state as follows:
+
+ .. code-block:: python
+
+ self.mem_state = torch.ones((size,), dtype=torch.bool, device="cuda")
+ self._mem_cum_sum = torch.empty((size,), dtype=torch.int32, device="cuda")
+ self.indexes = torch.arange(0, size, dtype=torch.long, device="cuda")
+ self.can_use_mem_size = size
+
+ ``mem_state`` records the usage state of the cache, where 1 represents unused and 0 represents used. ``_mem_cum_sum`` is used for the cumulative sum of ``mem_state``, used to efficiently identify and select unused space for cache allocation. The allocation process is as follows:
+
+ .. code-block:: python
+
+ torch.cumsum(self.mem_state, dim=0, dtype=torch.int32, out=self._mem_cum_sum)
+ #
+ select_index = torch.logical_and(self._mem_cum_sum <= need_size, self.mem_state == 1)
+ select_index = self.indexes[select_index]
+ self.mem_state[select_index] = 0
+ self.can_use_mem_size -= len(select_index)
+
+   It can be observed that our cache state management is completed entirely on the GPU, fully utilizing torch's parallel capabilities, which allows the system to efficiently allocate cache space for each request. A consolidated sketch combining allocation and release is given after this list.
+
+4. After request completion, the corresponding memory can be quickly released by deleting records on the ``Token Table``, making way for scheduling new requests.
+
+ .. code-block:: python
+
+ self.can_use_mem_size += free_index.shape[0]
+ self.mem_state[free_index] = 1
+
+5. Due to token-level GPU memory management, TokenAttention can achieve zero waste of GPU memory. It can accurately calculate how many new tokens the system can accommodate for computation. Therefore, when combined with ``Efficient Router`` to manage requests, it can continuously add new requests during inference, fully utilizing every piece of GPU memory and maximizing GPU utilization.
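+
+Putting steps 3 and 4 together, the bookkeeping can be wrapped into a small helper class. The sketch below is illustrative only (the class name ``TokenCacheState`` is hypothetical, not LightLLM's actual implementation) and assumes PyTorch with a CUDA device:
+
+.. code-block:: python
+
+   import torch
+
+   class TokenCacheState:
+       """Illustrative token-level cache bookkeeping (not the real LightLLM class)."""
+
+       def __init__(self, size: int):
+           self.mem_state = torch.ones((size,), dtype=torch.bool, device="cuda")
+           self._mem_cum_sum = torch.empty((size,), dtype=torch.int32, device="cuda")
+           self.indexes = torch.arange(0, size, dtype=torch.long, device="cuda")
+           self.can_use_mem_size = size
+
+       def alloc(self, need_size: int) -> torch.Tensor:
+           # Rank unused slots via the cumulative sum and take the first `need_size` of them.
+           torch.cumsum(self.mem_state, dim=0, dtype=torch.int32, out=self._mem_cum_sum)
+           mask = torch.logical_and(self._mem_cum_sum <= need_size, self.mem_state == 1)
+           select_index = self.indexes[mask]
+           self.mem_state[select_index] = 0
+           self.can_use_mem_size -= len(select_index)
+           return select_index
+
+       def free(self, free_index: torch.Tensor) -> None:
+           # Mark the slots as unused again; no data movement is required.
+           self.can_use_mem_size += free_index.shape[0]
+           self.mem_state[free_index] = 1
+
+   # Example usage: reserve 8 token slots for a request, then release them on completion.
+   # state = TokenCacheState(size=1024)
+   # idx = state.alloc(8)
+   # state.free(idx)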
\ No newline at end of file
diff --git a/docs/EN/source/getting_started/benchmark.rst b/docs/EN/source/getting_started/benchmark.rst
new file mode 100644
index 000000000..87caaa06a
--- /dev/null
+++ b/docs/EN/source/getting_started/benchmark.rst
@@ -0,0 +1,199 @@
+Benchmark Testing Guide
+=======================
+
+LightLLM provides multiple performance testing tools, covering both service performance testing and static inference performance testing. This document describes in detail how to use these tools for performance evaluation.
+
+Service Performance Testing (Service Benchmark)
+------------------------------------------------
+
+Service performance testing is mainly used to evaluate LightLLM's performance in real service scenarios, including key metrics such as throughput and latency.
+
+QPS Testing (benchmark_qps.py)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+QPS (Queries Per Second) testing is the core tool for evaluating service performance, and it supports both the LightLLM and OpenAI-compatible API formats.
+
+**Usage:**
+
+.. code-block:: bash
+
+ python test/benchmark/service/benchmark_qps.py \
+ --url http://127.0.0.1:8000/generate_stream \
+ --tokenizer_path /path/to/tokenizer \
+ --num_clients 100 \
+ --input_num 2000 \
+ --input_qps 30.0 \
+ --input_len 1024 \
+ --output_len 128 \
+ --server_api lightllm \
+ --dump_file results.json
+
+**Main Parameter Description:**
+
+- ``--url``: Service address, supports LightLLM and OpenAI formats
+- ``--tokenizer_path``: Tokenizer path
+- ``--input_num``: Total number of test requests
+- ``--input_qps``: Input QPS limit
+- ``--input_len``: Input sequence length
+- ``--output_len``: Output sequence length
+- ``--server_api``: Service API type (lightllm/openai)
+- ``--data_path``: Custom dataset path
+- ``--continuous_send``: Whether to send continuously (0/1)
+- ``--force_terminate``: Force termination mode (0/1)
+
+**Output Metrics:**
+
+- Total QPS: Overall queries per second
+- Sender QPS: Queries per second on the request-sending side
+- Avg Input Length: Average input length
+- Avg Output Length: Average output length
+- Total Throughput: Overall throughput (token/s)
+- Input Throughput: Input throughput (token/s)
+- Output Throughput: Output throughput (token/s)
+- request_time P{25,50,75,90,95,99,100}: Request latency percentiles
+- first_token_time P{25,50,75,90,95,99,100}: First token latency percentiles
+- decode_token_time P{25,50,75,90,95,99,100}: Decode token latency percentiles
+
+Fixed Concurrency Testing (benchmark_client.py)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Used to evaluate performance under different client concurrency levels.
+
+**Usage:**
+
+.. code-block:: bash
+
+ python test/benchmark/service/benchmark_client.py \
+ --url http://127.0.0.1:8000/generate_stream \
+ --tokenizer_path /path/to/tokenizer \
+ --num_clients 100 \
+ --input_num 2000 \
+ --input_len 1024 \
+ --output_len 128 \
+ --server_api lightllm
+
+ShareGPT Dataset Testing (benchmark_sharegpt.py)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Performance testing using ShareGPT real conversation data.
+
+**Usage:**
+
+.. code-block:: bash
+
+ $ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+
+.. code-block:: bash
+
+ python test/benchmark/service/benchmark_sharegpt.py \
+ --dataset /path/to/sharegpt_dataset.json \
+ --tokenizer /path/to/tokenizer \
+ --num_prompts 1000 \
+ --request_rate 10.0
+
+**Main Parameters:**
+
+- ``--dataset``: ShareGPT format dataset path
+- ``--tokenizer``: Tokenizer path
+- ``--num_prompts``: Number of test prompts
+- ``--request_rate``: Request rate (requests/s)
+
+Prompt Cache Testing
+~~~~~~~~~~~~~~~~~~~~~
+
+Evaluate prompt cache performance under different hit rates by adjusting ``--first_input_len``, ``--output_len``, and ``--subsequent_input_len`` to control the hit rate.
+The per-round hit rate is ``(first_input_len + (output_len + subsequent_input_len) * (num_turns - 1)) / (first_input_len + (output_len + subsequent_input_len) * num_turns)``.
+Note: control the concurrency and the number of users according to ``max_total_token_num`` so that all requests fit, which guarantees that the actual hit rate matches the preset one. A worked example is given after the parameter list below.
+
+.. code-block:: bash
+
+ python test/benchmark/service/benchmark_prompt_cache.py \
+ --model_url http://127.0.0.1:8000/generate_stream \
+ --model_name model \
+ --num_workers 10 \
+ --first_input_len 512 \
+ --subsequent_input_len 512 \
+ --output_len 128 \
+ --num_turns 10 \
+ --num_users 10
+
+Parameter Description:
+
+- ``--model_url``: Service address
+- ``--model_name``: Result save filename
+- ``--num_workers``: Concurrency number
+- ``--first_input_len``: First round input length
+- ``--subsequent_input_len``: Subsequent round input length
+- ``--output_len``: Output length
+- ``--num_turns``: Number of rounds
+- ``--num_users``: Number of users
+
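+For instance, with the values from the sample command above (``first_input_len=512``, ``subsequent_input_len=512``, ``output_len=128``, ``num_turns=10``), the per-round hit rate works out to roughly 90.7%. A minimal check in Python:
+
+.. code-block:: python
+
+   first_input_len, subsequent_input_len, output_len, num_turns = 512, 512, 128, 10
+
+   per_round = output_len + subsequent_input_len        # tokens appended per extra round
+   hit = first_input_len + per_round * (num_turns - 1)  # prefix reused by the final round
+   total = first_input_len + per_round * num_turns      # full prompt length of the final round
+   print(hit / total)                                   # ~0.907
+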
+Static Inference Performance Testing (Static Inference Benchmark)
+------------------------------------------------------------------
+
+Static inference testing is used to evaluate model inference performance under fixed input conditions, mainly evaluating operator quality.
+
+Model Inference Testing (model_infer.py)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+**Main Features:**
+
+- Supports prefill and decode stage performance testing
+- Supports microbatch overlap optimization
+- Supports multi-GPU parallel inference
+- Provides detailed throughput statistics
+
+**Usage:**
+
+.. code-block:: bash
+
+ python test/benchmark/static_inference/test_model.py \
+ --model_dir /path/to/model \
+ --batch_size 32 \
+ --input_len 1024 \
+ --output_len 128 \
+ --tp 2 \
+ --data_type bf16
+
+**Main Parameters:**
+
+- ``--model_dir``: Model path
+- ``--batch_size``: Batch size
+- ``--input_len``: Input sequence length
+- ``--output_len``: Output sequence length
+- ``--tp``: Tensor Parallel degree
+- ``--data_type``: Data type (bf16/fp16/fp32)
+- ``--enable_prefill_microbatch_overlap``: Enable prefill microbatch overlap, only applicable to DeepSeek model EP mode
+- ``--enable_decode_microbatch_overlap``: Enable decode microbatch overlap, only applicable to DeepSeek model EP mode
+- ``--torch_profile``: Enable torch profiler for performance analysis
+
+.. note::
+   The full set of launch parameters is not listed here; the static test scripts also share LightLLM's server launch parameters. For more launch options, please refer to :ref:`tutorial/api_server_args_zh`.
+
+**Output Metrics:**
+
+- Prefill stage throughput (tokens/s)
+- Decode stage throughput (tokens/s)
+- Latency statistics for each stage
+
+Multi-Token Prediction Performance Testing (model_infer_mtp.py)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Static performance test for multi-token prediction. By default it assumes a 100% acceptance rate and is used to evaluate the upper-bound performance of multi-token prediction. Currently only DeepSeek series models are supported.
+
+**Usage:**
+
+.. code-block:: bash
+
+ python test/benchmark/static_inference/test_model.py \
+ --model_dir /path/to/main_model \
+ --mtp_mode deepseekv3 \
+ --mtp_step 1 \
+ --mtp_draft_model_dir /path/to/draft_model \
+ --batch_size 32 \
+ --input_len 1024 \
+ --output_len 128
+
+Parameter Description:
+
+- ``--model_dir``: Main model path
\ No newline at end of file
diff --git a/docs/EN/source/getting_started/faq.rst b/docs/EN/source/getting_started/faq.rst
deleted file mode 100644
index 74845e1a0..000000000
--- a/docs/EN/source/getting_started/faq.rst
+++ /dev/null
@@ -1,15 +0,0 @@
-.. _faq:
-
-- The LLaMA tokenizer fails to load.
- - Consider resolving this by running the command:
-
- .. code-block:: shell
-
- pip install protobuf==3.20.0
-
-- ``error : PTX .version 7.4 does not support .target sm_89``
- - Launch with:
-
- .. code-block:: shell
-
- bash tools/resolve_ptx_version python -m lightllm.server.api_server ...
\ No newline at end of file
diff --git a/docs/EN/source/getting_started/installation.rst b/docs/EN/source/getting_started/installation.rst
index a76671d4d..35b398287 100755
--- a/docs/EN/source/getting_started/installation.rst
+++ b/docs/EN/source/getting_started/installation.rst
@@ -1,62 +1,60 @@
.. _installation:
-Installation
-============
+Installation Guide
+==================
-Lightllm is a Python-based inference framework, with operators implemented in Triton.
+Lightllm is a pure Python-based inference framework with operators written in Triton.
-Requirements
-------------
+Environment Requirements
+------------------------
* Operating System: Linux
* Python: 3.9
-* GPU: Compute Capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.).
-
+* GPU: Compute Capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.)
.. _build_from_docker:
-Installing with Docker
+Installation via Docker
-----------------------
-The easiest way to install Lightllm is by using the official image. You can directly pull and run the official image:
+The easiest way to install Lightllm is using the official image. You can directly pull the official image and run it:
.. code-block:: console
$ # Pull the official image
$ docker pull ghcr.io/modeltc/lightllm:main
$
- $ # Run the image
+ $ # Run
$ docker run -it --gpus all -p 8080:8080 \
- $ --shm-size 32g -v your_local_path:/data/ \
+ $ --shm-size 1g -v your_local_path:/data/ \
$ ghcr.io/modeltc/lightllm:main /bin/bash
-You can also manually build and run the image from the source:
-
+You can also manually build the image from source and run it:
.. code-block:: console
$ # Manually build the image
$ docker build -t .
$
- $ # Run the image
+ $ # Run
$ docker run -it --gpus all -p 8080:8080 \
- $ --shm-size 32g -v your_local_path:/data/ \
+ $ --shm-size 1g -v your_local_path:/data/ \
$ /bin/bash
-Alternatively, you can use a script to automatically build and run the image:
-
+Or you can directly use the script to launch the image and run it with one click:
.. code-block:: console
-
+
$ # View script parameters
$ python tools/quick_launch_docker.py --help
.. note::
- If you are using multiple GPUs, you may need to increase the --shm-size parameter setting above.
+ If you use multiple GPUs, you may need to increase the --shm-size parameter setting above. If you need to run DeepSeek models in EP mode, please use the image
+ ghcr.io/modeltc/lightllm:main-deepep.
.. _build_from_source:
-Installing from Source
------------------------
+Installation from Source
+------------------------
You can also install Lightllm from source:
@@ -66,23 +64,31 @@ You can also install Lightllm from source:
$ conda create -n lightllm python=3.9 -y
$ conda activate lightllm
$
- $ # Download the latest source code for Lightllm
+ $ # Download the latest Lightllm source code
$ git clone https://github.com/ModelTC/lightllm.git
$ cd lightllm
$
- $ # Install Lightllm's dependencies
+ $ # Install Lightllm dependencies (cuda 12.4)
$ pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu124
$
$ # Install Lightllm
$ python setup.py install
-NOTE: If you are using torch with cuda 11.x instead, run `pip install nvidia-nccl-cu12==2.20.5` to support torch cuda graph.
+NOTE: If you use torch with cuda 11.x for some reason, please run `pip install nvidia-nccl-cu12==2.20.5` to support torch cuda graph.
.. note::
- The Lightllm code has been tested on various GPUs, including V100, A100, A800, 4090, and H800.
- If you are using A100, A800, or similar GPUs, it is recommended to install triton==3.1.0:
+ Lightllm code has been tested on various GPUs including V100, A100, A800, 4090, and H800.
+ If you use A100, A800 and other graphics cards, it is recommended to install triton==3.0.0:
+
+ .. code-block:: console
+
+ $ pip install triton==3.0.0 --no-deps
+
+ If you use H800, V100 and other graphics cards, it is recommended to install triton-nightly:
.. code-block:: console
- $ pip install triton==3.1.0 --no-deps
\ No newline at end of file
+ $ pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly --no-deps
+
+ For specific reasons, please refer to: `issue `_ and `fix PR `_
\ No newline at end of file
diff --git a/docs/EN/source/getting_started/quickstart.rst b/docs/EN/source/getting_started/quickstart.rst
index bf2335658..2a8b9ad42 100755
--- a/docs/EN/source/getting_started/quickstart.rst
+++ b/docs/EN/source/getting_started/quickstart.rst
@@ -3,27 +3,26 @@
Quick Start
===========
-Deploying a model with Lightllm is very straightforward and requires only two steps:
+Deploying models with Lightllm is very simple, requiring only two steps at minimum:
-1. Prepare the weight file for a model supported by Lightllm.
-2. Start the model service using the command line.
+1. Prepare model weight files supported by Lightllm.
+2. Use command line to start the model service.
3. (Optional) Test the model service.
.. note::
- Before continuing with this tutorial, please ensure you have completed the :ref:`installation guide `.
+ Before continuing with this tutorial, please ensure you have completed the :ref:`Installation Guide `.
-1. Prepare the Model File
--------------------------
+1. Prepare Model Files
+----------------------
-The following content will demonstrate Lightllm's support for large language models using `Llama-2-7b-chat `_. You can refer to the article: `How to Quickly Download Hugging Face Models — A Summary of Methods `_ for methods to download models.
+Download `Qwen3-8B `_ first.
+Below is an example of how to download the model:
-Here is an example of how to download the model:
-
-(1) (Optional) Create a directory
+(1) (Optional) Create a folder
.. code-block:: console
- $ mkdir -p ~/models && cd ~/models
+    $ mkdir -p ~/models && cd ~/models
(2) Install ``huggingface_hub``
@@ -31,132 +30,37 @@ Here is an example of how to download the model:
$ pip install -U huggingface_hub
-(3) Download the model file
+(3) Download model files
.. code-block:: console
- $ huggingface-cli download meta-llama/Llama-2-7b-chat-hf --local-dir Llama-2-7b-chat
-
-.. tip::
- The above code for downloading the model requires a stable internet connection and may take some time. You can use alternative download methods or other supported models as substitutes. For the latest list of supported models, please refer to the `project homepage `_.
-
+ $ huggingface-cli download Qwen/Qwen3-8B --local-dir Qwen3-8B
-2. Start the Model Service
----------------------------
+2. Start Model Service
+----------------------
-After downloading the Llama-2-7b-chat model, use the following command in the terminal to deploy the API service:
+After downloading the Qwen3-8B model, use the following command in the terminal to deploy the API service:
.. code-block:: console
- $ python -m lightllm.server.api_server --model_dir ~/models/Llama-2-7b-chat
+ $ python -m lightllm.server.api_server --model_dir ~/models/Qwen3-8B
.. note::
- The ``--model_dir`` parameter in the above command should be changed to the actual path of your model on your machine.
-
-For the DeepSeek-R1 model on single H200, it can be launched with the following command:
-
-.. code-block:: console
-
- $ LOADWORKER=8 python -m lightllm.server.api_server --model_dir ~/models/DeepSeek-R1 --tp 8 --graph_max_batch_size 100
-
-.. note::
- LOADWORKER specifies the thread for model loading, which can enhance the speed of model loading. The --graph_max_batch_size parameter specifies the number of cudagraphs to be captured, which will capture graphs for batch sizes ranging from 1 to 100.
-
-For the DeepSeek-R1 model on two H100, it can be launched with the following command:
-
-.. code-block:: console
-
- $ # Node 0
- $ LOADWORKER=8 python -m lightllm.server.api_server --model_dir ~/models/DeepSeek-R1 --tp 16 --graph_max_batch_size 100 --nccl_host master_addr --nnodes 2 --node_rank 0
- $ # Node 1
- $ LOADWORKER=8 python -m lightllm.server.api_server --model_dir ~/models/DeepSeek-R1 --tp 16 --graph_max_batch_size 100 --nccl_host master_addr --nnodes 2 --node_rank 1
-
-3. Start Model Service - Disaggregating Prefill and Decoding
-------------------------------------------------------------
-
-Find Local IP
-
-.. code-block:: console
-
- $ hostname -i
-
-Run MPS (Optional)
-
-.. code-block:: console
-
- $ nvidia-cuda-mps-control -d
-
-Run pd_master Service
-
-.. code-block:: console
-
- $ CUDA_VISIBLE_DEVICES=0 python -m lightllm.server.api_server \
- $ --model_dir /your/model/path \
- $ --run_mode "pd_master" \
- $ --host /your/host/ip \
- $ --port 60011
-
-Open a new terminal and run the prefill service
-
-.. code-block:: console
-
- $ CUDA_VISIBLE_DEVICES=0,1 KV_TRANS_USE_P2P=1 LOADWORKER=1 python -m lightllm.server.api_server --model_dir /data/fengdahu/model/Qwen2-7B/ \
- $ --run_mode "prefill" \
- $ --host /your/host/ip \
- $ --port 8017 \
- $ --tp 2 \
- $ --nccl_port 2732 \
- $ --max_total_token_num 400000 \
- $ --tokenizer_mode fast \
- $ --pd_master_ip /your/host/ip \
- $ --pd_master_port 60011 \
- $ --max_req_total_len 16000 \
- $ --running_max_req_size 128 \
- $ --disable_cudagraph
-
-Open a new terminal and run the decoding service
-
-.. code-block:: console
-
- $ CUDA_VISIBLE_DEVICES=2,3 KV_TRANS_USE_P2P=1 LOADWORKER=10 python -m lightllm.server.api_server --model_dir /data/fengdahu/model/Qwen2-7B/ \
- $ --run_mode "decode" \
- $ --host /your/host/ip \
- $ --port 8118 \
- $ --nccl_port 12322 \
- $ --tp 2 \
- $ --max_total_token_num 400000 \
- $ --graph_max_len_in_batch 2048 \
- $ --graph_max_batch_size 16 \
- $ --tokenizer_mode fast \
- $ --pd_master_ip /your/host/ip \
- $ --pd_master_port 60011
-
-.. note::
- The tp size for the prefill and decoding stages should remain consistent.
-
-4. (Optional) Test the Model Service
---------------------------------------
-
-In a new terminal, use the following command to test the model service:
-
-.. code-block:: console
-
- $ curl http://server_ip:server_port/generate \
- $ -H "Content-Type: application/json" \
- $ -d '{
- $ "inputs": "What is AI?",
- $ "parameters":{
- $ "max_new_tokens":17,
- $ "frequency_penalty":1
- $ }
- $ }'
-
+   The ``--model_dir`` parameter in the command above should be changed to the actual local path of your model.
-For DeepSeek-R1 benchmark, use the following command to test the model service:
+3. Test Model Service
+---------------------
.. code-block:: console
- $ cd test
- $ python benchmark_client.py --num_clients 100 --input_num 2000 --tokenizer_path /nvme/DeepSeek-R1/ --url http://127.0.01:8000/generate_stream
+ $ curl http://127.0.0.1:8000/generate \
+ -H "Content-Type: application/json" \
+ -d '{
+ "inputs": "What is AI?",
+ "parameters":{
+ "max_new_tokens":17,
+ "frequency_penalty":1
+ }
+ }'
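+
+Equivalently, the same request can be sent from Python. This is a minimal sketch using the third-party ``requests`` library (install it with ``pip install requests`` if needed); adjust the host and port to your deployment:
+
+.. code-block:: python
+
+   import requests
+
+   resp = requests.post(
+       "http://127.0.0.1:8000/generate",
+       json={
+           "inputs": "What is AI?",
+           "parameters": {"max_new_tokens": 17, "frequency_penalty": 1},
+       },
+   )
+   print(resp.json())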
diff --git a/docs/EN/source/index.rst b/docs/EN/source/index.rst
index 81a0e490d..db4f1d3d1 100755
--- a/docs/EN/source/index.rst
+++ b/docs/EN/source/index.rst
@@ -1,5 +1,5 @@
-Welcome Lightllm!
-==================
+Welcome to Lightllm!
+====================
.. figure:: ./assets/logos/lightllm-logo.png
:width: 100%
@@ -10,7 +10,7 @@ Welcome Lightllm!
.. raw:: html
- A Light and Fast inference Services for LLM
+ A Lightweight and High-Performance Large Language Model Service Framework
@@ -22,94 +22,52 @@ Welcome Lightllm!
-LightLLM is a Python-based LLM (Large Language Model) inference and serving framework, notable for its lightweight design, easy scalability, and high-speed performance. LightLLM harnesses the strengths of numerous well-regarded open-source implementations, including but not limited to FasterTransformer, TGI, vLLM, and FlashAttention.
-
-**Features**:
-
-* Tri-process asynchronous collaboration: tokenization, model inference, and detokenization are performed asynchronously, leading to a considerable improvement in GPU utilization.
-* Nopad (Unpad): offers support for nopad attention operations across multiple models to efficiently handle requests with large length disparities.
-* Dynamic Batch: enables dynamic batch scheduling of requests
-* FlashAttention: incorporates FlashAttention to improve speed and reduce GPU memory footprint during inference.
-* Tensor Parallelism: utilizes tensor parallelism over multiple GPUs for faster inference.
-* Token Attention: implements token-wise's KV cache memory management mechanism, allowing for zero memory waste during inference.
-* High-performance Router: collaborates with Token Attention to meticulously manage the GPU memory of each token, thereby optimizing system throughput.
-* Int8KV Cache: This feature will increase the capacity of tokens to almost twice as much. only llama support.
-
-**Supported Model List**:
-
-- `BLOOM `_
-- `LLaMA `_
-- `LLaMA V2 `_
-- `StarCoder `_
-- `Qwen-7b `_
-- `ChatGLM2-6b `_
-- `Baichuan-7b `_
-- `Baichuan2-7b `_
-- `Baichuan2-13b `_
-- `Baichuan-13b `_
-- `InternLM-7b `_
-- `Yi-34b `_
-- `Qwen-VL `_
-- `Qwen-VL-Chat `_
-- `Llava-7b `_
-- `Llava-13b `_
-- `Mixtral `_
-- `Stablelm `_
-- `MiniCPM `_
-- `Phi-3 `_
-- `CohereForAI `_
-- `DeepSeek-V2-Lite `_
-- `DeepSeek-V2 `_
-
-
-Docs List
--------------
+Lightllm is a pure Python-based large language model inference and serving framework, featuring lightweight design, easy extensibility, and high performance.
+Lightllm integrates the advantages of numerous open-source solutions, including but not limited to FasterTransformer, TGI, vLLM, SGLang, and FlashAttention.
-.. toctree::
- :maxdepth: 1
- :caption: Getting started
-
- getting_started/installation
- getting_started/quickstart
+**Key Features**:
-.. toctree::
- :maxdepth: 1
- :caption: Lightllm
+* Multi-process Collaboration: Input text encoding, language model inference, visual model inference, and output decoding are performed asynchronously, significantly improving GPU utilization.
+* Cross-process Request Object Sharing: Through shared memory, cross-process request object sharing is achieved, reducing inter-process communication latency.
+* Efficient Scheduling Strategy: Peak memory scheduling strategy with prediction, maximizing GPU memory utilization while reducing request eviction.
+* High-performance Inference Backend: Efficient operator implementation, support for multiple parallelization methods (tensor parallelism, data parallelism, and expert parallelism), dynamic KV cache, rich quantization support (int8, fp8, int4), structured output, and multi-result prediction.
- lightllm/lightllm_intro
- lightllm/lightllm_impl
+Documentation List
+------------------
.. toctree::
:maxdepth: 1
- :caption: Model
+ :caption: Quick Start
- Supported Model
- Examples
- Add new models
+ Installation Guide
+ Quick Start
+ Performance Benchmark
.. toctree::
:maxdepth: 1
- :caption: Launching Server
-
- Server Args
- Benchmark
-
-
+ :caption: Deployment Tutorials
+
+ DeepSeek R1 Deployment
+ Multimodal Deployment
+ Reward Model Deployment
+ OpenAI api Usage
+ APIServer Parameters
+ Lightllm API Introduction
+
.. toctree::
:maxdepth: 1
- :caption: Using Server
+ :caption: Model Support
- user/api_param
- user/openapi_docs
-
+ Supported Models List
+ Adding New Models
.. toctree::
:maxdepth: 1
- :caption: development docs
+ :caption: Architecture Introduction
- dev/token_attention
- dev/router
- dev/performance
+ Architecture Overview
+ Token Attention
+ Efficient Router
.. Indices and tables
.. ==================
diff --git a/docs/EN/source/lightllm/lightllm_intro.rst b/docs/EN/source/lightllm/lightllm_intro.rst
deleted file mode 100644
index a073d5941..000000000
--- a/docs/EN/source/lightllm/lightllm_intro.rst
+++ /dev/null
@@ -1,78 +0,0 @@
-.. _lightllm:
-
-LightLLM Overview
-===========================
-
-With the popularity of ChatGPT, large language model, abbreviated as LLM, has received increasing attention. The emergence of such models has greatly improved people's work efficiency. However, the key to further widespread adoption lies in how to deploy models with billons of parameters at low cost and high throughput. To improve the throughput of large model services and enable more interested researchers to quickly get involved, a lightweight LLM inference service framework called LightLLM has emerged. LightLLM introduces a more fine-grained kv cache management algorithm called TokenAttention and designs an Efficient Router scheduling implementation that works efficiently with TokenAttention. Through the interaction of TokenAttention and Efficient Router, LightLLM achieves higher throughput than vLLM and Text Generation Inference in most scenarios, with performance improvements of around 4 times in some cases. LightLLM is flexible, user-friendly, and efficient. Interested friends may want to click on the link below to try it out.
-
-Project:https://github.com/ModelTC/lightllm
-
-.. _challenge:
-
-The challenge of LLM Serving
--------------------------------
-
-Large language models have garnered significant attention from researchers due to their excellent performance. These models not only engage in everyday conversations with humans but also assist in completing various daily tasks, thereby enhancing productivity. However, despite the remarkable performance demonstrated by these models, deploying large-scale models to improve service performance poses the following challenges:
-
-* **Severe fragmentation of memory**: Network weights ranging from tens to hundreds of gigabytes, as well as the constantly dynamic growing KV Cache during inference, easily leads to low memory utilization.
-* **Low efficiency in request scheduling**: The length of requests dynamically changes over time, which can result in GPU idling or low utilization issues.
-* **High difficulty in kernel customization**: Customizing kernels for networks is necessary to efficiently utilize memory and improve service throughput. However, it will require a significant amount of effort from researchers.
-
-.. _solutions_and_problems:
-
-Existing solutions and problems
--------------------------------------
-
-To address the aforementioned challenges, many excellent LLM inference frameworks have emerged, such as FasterTransformer, Text-Generation-Inference (referred to as TGI), vLLM, etc. The core features and capability matrices of these frameworks are shown in the table below:
-
-.. list-table:: Comparison of various frameworks
- :header-rows: 1
-
- * - Framework
- - NV Triton + FasterTransformer
- - TGI
- - vLLM
- - LightLLM
- * - core feature
- - Efficient kernel
- - `Continuous batch `_, Token streaming
- - `PageAttention `_
- - Tri-process asynchronous collaboration,:ref:`TokenAttention`,:ref:`Efficient_Router`
- * - Memory fragmentation
- - low
- - high
- - low
- - low
- * - Request scheduling efficiency
- - low
- - middle
- - middle
- - high
- * - Difficulty of kernel customization
- - high
- - middle
- - middle
- - low
-
-These frameworks all have their own unique features. For example, FasterTransformer has excellent static inference performance but lacks robust service scheduling and is primarily developed in C++, resulting in high secondary development costs. TGI has excellent service interfaces and scheduling features such as Continuous Batch, but its inference performance, scheduling strategy, and memory management have some shortcomings. vLLM has excellent memory management but lacks efficiency in request scheduling, and its overall implementation details are more suitable for deploying small models.
-
-Lightllm
-----------------------
-
-Therefore, to address these issues, we have developed a LLM deployment framework called LightLLM, which is based on the pure Python language. It enables researchers to easily deploy and customize lightweight models locally, allowing for rapid expansion of different models and integration of various excellent open-source features. The core features of LightLLM are as follows:
-
-* Tri-process asynchronous collaboration: tokenization, model inference, and detokenization are performed asynchronously, leading to a considerable improvement in GPU utilization.
-* :ref:`TokenAttention`: implements token-wise's KV cache memory management mechanism, allowing for zero memory waste during inference.
-* :ref:`Efficient_Router`: collaborates with Token Attention to meticulously manage the GPU memory of each token, thereby optimizing system throughput.
-
-With the highly coordinated efficient kernels developed based on OpenAI Triton and service scheduling, LightLLM achieves excellent throughput performance
-
-.. figure:: ../assets/lightllm/arch.png
- :width: 100%
- :align: center
- :alt: Lightllm
- :class: no-scaled-link
-
-
-
-LightLLM is committed to enabling more people to participate, allowing flexible and efficient exploration of various LLM deployment and inference solutions. It also serves as a reference for hardware manufacturers to promote the development of the field. We hope that everyone can give it more stars, fork the project, and contribute. We believe that in the future, more technologies and solutions (such as TensorRT) will emerge, continuously reducing deployment costs and making AGI more accessible to ordinary households.
\ No newline at end of file
diff --git a/docs/EN/source/models/add_new_model.md b/docs/EN/source/models/add_new_model.md
index cc819864f..6127dffaf 100755
--- a/docs/EN/source/models/add_new_model.md
+++ b/docs/EN/source/models/add_new_model.md
@@ -1,25 +1,25 @@
# How to Add New Model Support
-## 1. Introduction of inference architecture
+## 1. Current Inference Architecture Introduction
-In the lightllm/common/basemodel directory, you will find the base class implementation for the entire inference architecture.
+Under the ***lightllm/common/basemodel*** directory is the base class implementation of the entire inference architecture.
~~~shell
-├── basemodel.py # Model architecture class
-├── infer_struct.py # State class for inference
+├── basemodel.py # Model framework class
+├── infer_struct.py # Inference state class
├── __init__.py
-├── layer_infer # Inference layer base class
+├── layer_infer # Base class implementation of inference layers
│ ├── base_layer_infer.py
│ ├── __init__.py
│ ├── post_layer_infer.py
│ ├── pre_layer_infer.py
-│ ├── template # Template implementation of the inference layer.
+│   ├── template # Template implementations of inference layers; inheriting from templates reduces development effort and duplicate code
│ │ ├── __init__.py
│ │ ├── post_layer_infer_template.py
│ │ ├── pre_layer_infer_template.py
│ │ └── transformer_layer_infer_template.py
│ └── transformer_layer_infer.py
-├── layer_weights # base class of weight
+├── layer_weights # Weight base class implementation
│ ├── base_layer_weight.py
│ ├── hf_load_utils.py
│ ├── __init__.py
@@ -31,41 +31,41 @@ In the lightllm/common/basemodel directory, you will find the base class impleme
└── __init__.py
~~~
-As shown above, the current model inference architecture mainly consists of two parts: weight and inference.
+As shown above, the current model inference architecture mainly consists of two parts: weights and inference.
-### Weight
+### Weights
-The layer_weights directory contains weight-related codes. In theory, a newly added model needs to inherit the PreAndPostLayerWeight and TransformerLayerWeight classes in pre_and_post_layer_weight.py and transformer_layer_weight.py to load weights.
+Under the layer_weights directory is the weight-related code. Theoretically, for a newly added model, you need to inherit and implement the PreAndPostLayerWeight and TransformerLayerWeight classes in pre_and_post_layer_weight.py and transformer_layer_weight.py to implement weight loading.
-| Weight base class | Responsibilities |
-| ---------------------- | ------------------------------------------------------------ |
-| PreAndPostLayerWeight | Responsible for loading the weights of the first Embedding layer and the last post-processing layer of the LLM model and splitting the weights according to the tp parameters used |
-| TransformerLayerWeight | Responsible for loading the weights of the LLM model transformer layer and splitting the weights according to the tp parameters used |
+| Weight Base Class | Responsibilities |
+| ---------------------------- | ------------------------------------------------------------ |
+| PreAndPostLayerWeight | Responsible for loading weights of the first Embedding layer and the last post-processing layer of LLM models, and splitting weights according to the tp parameter used |
+| TransformerLayerWeight | Responsible for loading weights of transformer layers of LLM models and splitting weights according to the tp parameter used |
### Inference
-The layer_infer directory contains the base classes for inference processing, and some templates are provided in the template directory. Inheriting from the template class can reduce some unnecessary duplication of code and simplify the implementation. There are three inference classes that need to be inherited in this directory.
+Under the layer_infer directory are the relevant base classes for inference processing, and some templates are provided under the template directory. Inheriting from template classes can reduce some unnecessary duplicate code and simplify implementation. There are three inference classes that need to be inherited and implemented under this directory.
-| Inference base class | Responsibilities |
-| --------------------- | ------------------------------------------ |
-| PreLayerInfer | Responsible for inference of the Embedding layer |
-| TransformerLayerInfer | Responsible for inference of th transformer layer |
-| PostLayerInfer | Responsible for inference of converting the final hidden layer output of the network into logits |
+| Inference Base Class | Responsibilities |
+| ---------------------------- | -------------------------------------------- |
+| PreLayerInfer | Responsible for inference of Embedding layer |
+| TransformerLayerInfer | Responsible for inference of transformer layer |
+| PostLayerInfer                | Responsible for the inference that converts the final hidden layer output of the network into logits |
-The base class BaseLayerInfer of the above three classes provides two most important external service function interfaces. All inference behaviors will be entered through these two interfaces.
+The base class BaseLayerInfer of the above three classes provides two most important external service function interfaces. All inference behaviors will enter through these two interfaces.
-| interface | Responsibilities |
-| ------------------------------------------------------------ | ---------------------------------------------- |
-| def context_forward(self, input_ids, infer_state: InferStateInfo, layer_weight: BaseLayerWeight): | the first inference of batch(prefill) |
-| def token_forward(self, input_ids, infer_state: InferStateInfo, layer_weight: BaseLayerWeight): | the inference of decode |
+| Interface | Responsibilities |
+| ------------------------------------------------------------ | ------------------------------------------------------------ |
+| def context_forward(self, input_ids, infer_state: InferStateInfo, layer_weight: BaseLayerWeight): | First inference of batch (also called prefill in code) |
+| def token_forward(self, input_ids, infer_state: InferStateInfo, layer_weight: BaseLayerWeight): | Single step decode stage inference |
-### Operator
+### Operators
-The triton_kernel directory contains some operators needed for inference implemented using openai triton.
+Under the triton_kernel directory are some operators needed for inference, implemented with OpenAI Triton.
-### State class
+### State Class
-The InferStateInfo class in infer_struct.py is a state class that passes some important information between layers when performing a model inference. Different models can inherit and implement this class to add unique state information that each model needs to pass. The InferStateInfo class provides an inherited init_some_extra_state interface for initializing the transmission of additional unique information.
+The InferStateInfo class in infer_struct.py is a state class that passes some important information between layers during a model inference. Different models can inherit and implement this class to add unique state information that each model needs to pass. The InferStateInfo class provides an inheritable init_some_extra_state interface for initializing additional unique information.
~~~python
def init_some_extra_state(self,
@@ -81,9 +81,9 @@ The InferStateInfo class in infer_struct.py is a state class that passes some im
pass
~~~
-### Model class
+### Model Framework Class
-The TpPartBaseModel class in basemodel.py is the entry point of the entire model. Each type of model needs to inherit and implement this class. This class uses the inference class, weight class, and state class to complete the model loading and inference functions in a similar way to building blocks. Many of its interfaces can be inherited and implemented to complete the unique operations of each model type.
+The TpPartBaseModel class in basemodel.py is the entry point of the entire model. Each type of model needs to inherit and implement this class. This class uses inference classes, weight classes, and state classes in a building block-like manner to complete model loading and inference functions. There are many interfaces that can be inherited and implemented to complete unique operations for each model type.
~~~python
class TpPartBaseModel:
@@ -99,9 +99,7 @@ class TpPartBaseModel:
# infer state class
infer_state_class = InferStateInfo
- def __init__(self, tp_rank, world_size, weight_dir, max_total_token_num, load_way="HF", mode=[]):
- self.tp_rank_ = tp_rank
- self.tp_world_size_ = world_size
+ def __init__(self, weight_dir, max_total_token_num, load_way="HF", mode=[]):
self.weight_dir_ = weight_dir
self.max_total_token_num = max_total_token_num
self.load_way = load_way
@@ -120,21 +118,21 @@ class TpPartBaseModel:
...
~~~
-Common interfaces that need to be inherited and implemented
+Commonly used interfaces that need to be inherited and implemented
-| interfaces | effect |
+| Interface | Function |
| ---------------------------- | ------------------------------------------------------------ |
-| def _init_config(self): | Read the config.json of the initialization model and perform some key name legalization operations |
-| def _verify_params(self): | Verification parameters |
-| def _init_mem_manager(self): | Initialize the mem manager object used by token attention |
-| def _init_some_value(self): | Initialize the values of some member variables used by the inference framework |
-| def _init_custom(self): | Some models have their own personalized initialization, such as llama initializing its own Rotary value |
+| def _init_config(self):     | Read the model's config.json during initialization and normalize some config key names |
+| def _verify_params(self): | Validate parameters |
+| def _init_mem_manager(self): | Initialize the mem manager object used by token attention |
+| def _init_some_value(self): | Initialize values of some member variables that the inference framework will use |
+| def _init_custom(self): | Some personalized initialization of the model itself, such as llama initializing its own Rotary values |
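+
+For orientation, the following is a minimal sketch of what a new model class could look like. Only `TpPartBaseModel`, `InferStateInfo`, and the `_init_*` interface names come from the text above; the import paths are inferred from the directory listing, and the commented-out `*_class` attribute names are illustrative assumptions rather than the framework's exact names.
+
+~~~python
+from lightllm.common.basemodel.basemodel import TpPartBaseModel
+from lightllm.common.basemodel.infer_struct import InferStateInfo
+
+
+class MyNewModel(TpPartBaseModel):
+    # Wire the model's own weight/inference classes into the framework here.
+    # (Attribute names below are illustrative; check basemodel.py for the real ones.)
+    # pre_and_post_weight_class = MyPreAndPostLayerWeight
+    # transformer_weight_class = MyTransformerLayerWeight
+    # pre_layer_infer_class = MyPreLayerInfer
+    # transformer_layer_infer_class = MyTransformerLayerInfer
+    # post_layer_infer_class = MyPostLayerInfer
+    infer_state_class = InferStateInfo  # the default state class is enough for many models
+
+    def _init_config(self):
+        super()._init_config()
+        # normalize model-specific config key names here
+
+    def _init_custom(self):
+        super()._init_custom()
+        # model-specific setup, e.g. precomputing rotary embedding values
+~~~
+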
-## 2. the example of adding bloom model
+## 2. Example of Adding Bloom Model
-The specific implementation is in the ***lightllm/models/bloom*** directory. Please read the corresponding source code for the following code snippets. The triton_kernel directory contains some kernels used by the inference class, which will not be introduced in detail below. At the same time, the bloom model uses the default state class because it does not need to pass special state information. If you want to understand the entire framework more deeply, you can further refer to the access implementation source code of models such as llama and llama2.
+The specific implementation is under the ***lightllm/models/bloom*** directory. Please read the source code for the code snippets below. The triton_kernel directory contains some kernels used by inference classes, which will not be introduced in detail in this article. At the same time, the bloom model uses the default state class because it doesn't need to pass special state information. For a deeper understanding of the entire framework, you can further refer to the source code implementation of llama and llama2 model integration.
-### (1) Add implementation weight class
+### (1) Add Implementation Weight Classes
***pre_and_post_layer_weight.py***
diff --git a/docs/EN/source/models/supported_models.rst b/docs/EN/source/models/supported_models.rst
index 761348329..bc1945666 100755
--- a/docs/EN/source/models/supported_models.rst
+++ b/docs/EN/source/models/supported_models.rst
@@ -1,24 +1,23 @@
-Supported Models
-================
+Supported Models List
+=====================
-lightllm supports most mainstream open source large language models and multimodal models, and will continue to expand the list of supported models. In later versions, lightllm will support more types of models (such as reward models).
+Lightllm supports most mainstream open-source large language models and multimodal models, and will continue to expand the list of supported models. In future versions, lightllm will support more types of models (such as reward models).
.. note::
- Due to its lightweight design, Lightllm is highly extensible, which means that adding new model support is very simple. For more information, please refer to the **How to Add New Model Support** section.
+ Due to its lightweight design, Lightllm is highly extensible, which means that adding new model support is very simple. For more information, please refer to the **Adding New Models** section.
-----
-LLM
-^^^^^^^^^^^^^^^^^^^^^^
-
+Large Language Models
+^^^^^^^^^^^^^^^^^^^^^
.. list-table::
:widths: 25 25
:header-rows: 1
- * - model
- - note
+ * - Model
+ - Notes
* - `BLOOM `_
-
* - `LLaMA `_
@@ -42,29 +41,29 @@ LLM
* - `MiniCPM `_
-
* - `Phi-3 `_
- - only supports Mini and Small.
+ - Only supports Mini and Small.
* - `CohereForAI `_
- :code:`--data_type bfloat16`
* - `DeepSeek-V2-Lite `_
- :code:`--data_type bfloat16`
* - `DeepSeek-V2 `_
- :code:`--data_type bfloat16`
+ * - `DeepSeek-V3 `_
+ -
* - `Qwen3 `_
-
* - `Qwen3-Moe `_
-
-
-
-VLM
+Multimodal Models
^^^^^^^^^^^^^^^^^
.. list-table::
:widths: 25 25
:header-rows: 1
- * - model
- - note
+ * - Model
+ - Notes
* - `Qwen-VL `_
- :code:`--trust_remote_code --enable_multimodal`
* - `Qwen-VL-Chat `_
@@ -73,19 +72,22 @@ VLM
- :code:`--enable_multimodal`
* - `Llava-13b `_
- :code:`--enable_multimodal`
+ * - `Qwen2-VL `_
+ - :code:`--enable_multimodal`
* - `Google Gemma3 `_
- :code:`--enable_multimodal`
-
-Reward Model
-^^^^^^^^^^^^^^^^^
+Reward Models
+^^^^^^^^^^^^^
.. list-table::
:widths: 25 25
:header-rows: 1
- * - model
- - note
+ * - Model
+ - Notes
* - `internLM-reward `_
- :code:`--use_reward_model`
+ * - `Qwen2-Reward `_
+ - :code:`--use_reward_model`
diff --git a/docs/EN/source/models/test.rst b/docs/EN/source/models/test.rst
deleted file mode 100755
index b599bc72f..000000000
--- a/docs/EN/source/models/test.rst
+++ /dev/null
@@ -1,273 +0,0 @@
-Examples
-================
-
-LLaMA
-^^^^^^^^^^^^^^^^^^^^^
-
-**Launching Server**
-
-.. code-block:: console
-
- $ python -m lightllm.server.api_server --model_dir /path/llama-7B \
- $ --host 0.0.0.0 \
- $ --port 8080 \
- $ --tp 1 \
- $ --max_total_token_num 120000
-
-.. tip::
-
- The parameter `max_total_token_num` is influenced by the GPU memory of the deployment environment. You can also specify `--mem_faction` to have it calculated automatically.
-
-.. code-block:: console
-
- $ python -m lightllm.server.api_server --model_dir /path/llama-7B \
- $ --host 0.0.0.0 \
- $ --port 8080 \
- $ --tp 1 \
- $ --mem_faction 0.9
-
-**Test Server**
-
-.. code-block:: console
-
- $ curl http://127.0.0.1:8080/generate \
- $ -X POST \
- $ -d '{"inputs":"What is AI?","parameters":{"max_new_tokens":17, "frequency_penalty":1}}' \
- $ -H 'Content-Type: application/json'
-
-.. code-block:: python
-
- import time
- import requests
- import json
-
- url = 'http://localhost:8080/generate'
- headers = {'Content-Type': 'application/json'}
- data = {
- 'inputs': 'What is AI?',
- "parameters": {
- 'do_sample': False,
- 'ignore_eos': False,
- 'max_new_tokens': 1024,
- }
- }
- response = requests.post(url, headers=headers, data=json.dumps(data))
- if response.status_code == 200:
- print(response.json())
- else:
- print('Error:', response.status_code, response.text)
-
-Qwen2-0.5B
-^^^^^^^^^^^^^^^^^^^^^
-
-**Launching Server**
-
-.. code-block:: console
-
- $ python -m lightllm.server.api_server --model_dir ~/models/Qwen2-0.5B \
- $ --trust_remote_code
-
-**Test Server**
-
-.. code-block:: console
-
- $ curl http://localhost:8000/generate \
- $ -H "Content-Type: application/json" \
- $ -d '{
- $ "inputs": "What is AI?",
- $ "parameters":{
- $ "max_new_tokens":17,
- $ "frequency_penalty":1
- $ }
- $ }'
-
-
-Qwen-VL-Chat
-^^^^^^^^^^^^^^^^^
-
-**Launching Server**
-
-.. code-block:: console
-
- $ python -m lightllm.server.api_server --model_dir ~/models/Qwen-VL-Chat \
- $ --trust_remote_code \
- $ --enable_multimodal
-
-**Test Server**
-
-.. code-block:: python
-
- import json
- import requests
- import base64
-
- def run(query, uris):
- images = []
- for uri in uris:
- if uri.startswith("http"):
- images.append({"type": "url", "data": uri})
- else:
- with open(uri, 'rb') as fin:
- b64 = base64.b64encode(fin.read()).decode("utf-8")
- images.append({'type': "base64", "data": b64})
-
- data = {
- "inputs": query,
- "parameters": {
- "max_new_tokens": 200,
- # The space before <|endoftext|> is important,
- # the server will remove the first bos_token_id,
- # but QWen tokenizer does not has bos_token_id
- "stop_sequences": [" <|endoftext|>", " <|im_start|>", " <|im_end|>"],
- },
- "multimodal_params": {
- "images": images,
- }
- }
-
- url = "http://127.0.0.1:8000/generate"
- headers = {'Content-Type': 'application/json'}
- response = requests.post(url, headers=headers, data=json.dumps(data))
- return response
-
- query = """
- <|im_start|>system
- You are a helpful assistant.<|im_end|>
- <|im_start|>user
-
- what is this?<|im_end|>
- <|im_start|>assistant
- """
-
- response = run(
- uris = [
- "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
- ],
- query = query
- )
-
- if response.status_code == 200:
- print(f"Result: {response.json()}")
- else:
- print(f"Error: {response.status_code}, {response.text}")
-
-Llava
-^^^^^^^^^^^^^^^^^
-
-**Launching Server**
-
-.. code-block:: console
-
- $ python -m lightllm.server.api_server \
- $ --host 0.0.0.0 \
- $ --port 8080 \
- $ --tp 1 \
- $ --max_total_token_num 12000 \
- $ --trust_remote_code \
- $ --enable_multimodal \
- $ --cache_capacity 1000 \
- $ --model_dir /path/of/llava-v1.5-7b or /path/of/llava-v1.5-13b
-
-**Test Server**
-
-.. code-block:: python
-
- import time
- import requests
- import json
- import base64
-
- url = 'http://localhost:8080/generate'
- headers = {'Content-Type': 'application/json'}
-
- uri = "/local/path/of/image" # or "/http/path/of/image"
- if uri.startswith("http"):
- images = [{"type": "url", "data": uri}]
- else:
- with open(uri, 'rb') as fin:
- b64 = base64.b64encode(fin.read()).decode("utf-8")
- images=[{'type': "base64", "data": b64}]
-
- data = {
- "inputs": "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: \nPlease explain the picture. ASSISTANT:",
- "parameters": {
- "max_new_tokens": 200,
- },
- "multimodal_params": {
- "images": images,
- }
- }
-
- response = requests.post(url, headers=headers, data=json.dumps(data))
- if response.status_code == 200:
- print(response.json())
- else:
- print('Error:', response.status_code, response.text)
-
-
-internlm2-1_8b
-^^^^^^^^^^^^^^^^^^^^^^^
-
-**Launching Server**
-
-.. code-block:: console
-
- $ python -m lightllm.server.api_server --model_dir ~/models/internlm2-1_8b \
- $ --trust_remote_code
-
-
-**Test Server**
-
-.. code-block:: console
-
- $ curl http://localhost:8000/generate \
- $ -H "Content-Type: application/json" \
- $ -d '{
- $ "inputs": "What is LLM?",
- $ "parameters":{
- $ "max_new_tokens":170,
- $ "frequency_penalty":1
- $ }
- $ }'
-
-
-internlm2-1_8b-reward
-^^^^^^^^^^^^^^^^^^^^^^^
-
-**Launching Server**
-
-.. code-block:: console
-
- $ python -m lightllm.server.api_server --model_dir ~/models/internlm2-1_8b-reward \
- $ --use_reward_model \
- $ --trust_remote_code
-
-.. tip::
-
- ``--use_reward_model`` Indicates options that must be turned on to use the reward model.
-
-
-**Test Server**
-
-.. code-block:: python
-
- import json
- import requests
-
- query = "<|im_start|>user\nHello! What's your name?<|im_end|>\n<|im_start|>assistant\nMy name is InternLM2! A helpful AI assistant. What can I do for you?<|im_end|>\n<|reward|>"
-
- url = "http://127.0.0.1:8000/get_score"
- headers = {'Content-Type': 'application/json'}
-
- data = {
- "chat": query,
- "parameters": {
- "frequency_penalty":1
- }
- }
- response = requests.post(url, headers=headers, data=json.dumps(data))
-
- if response.status_code == 200:
- print(f"Result: {response.json()}")
- else:
- print(f"Error: {response.status_code}, {response.text}")
\ No newline at end of file
diff --git a/docs/EN/source/server/api_server_args.rst b/docs/EN/source/server/api_server_args.rst
deleted file mode 100755
index 98c1c61bb..000000000
--- a/docs/EN/source/server/api_server_args.rst
+++ /dev/null
@@ -1,12 +0,0 @@
-APIServer Args
-=============================
-
-
-Usage
-++++++++++++
-
-.. argparse::
- :module: lightllm.server.api_cli
- :func: make_argument_parser
- :prog: python -m lightllm.server.api_server
- :nodefaultconst:
\ No newline at end of file
diff --git a/docs/EN/source/server/benchmark.rst b/docs/EN/source/server/benchmark.rst
deleted file mode 100755
index 8487da111..000000000
--- a/docs/EN/source/server/benchmark.rst
+++ /dev/null
@@ -1,43 +0,0 @@
-Benchmark
-==================
-
-After deploying the model, it is very important to evaluate the service performance. By adjusting the configuration based on the service performance, the graphics card resources can be better utilized.
-In this article, we use the LLaMA-7B model to compare the performance of lightllm and vLLM==0.1.2 on an 80G A800 graphics card.
-For the specific comparison method, please refer to the following steps:
-
-1. Download datasets
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. code-block:: console
-
- $ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-
-
-2. Launching Server
-^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. code-block:: console
-
- $ python -m lightllm.server.api_server --model_dir /path/llama-7b --tp 1 --max_total_token_num 121060 --tokenizer_mode auto
-
-
-3. Benchmark
-^^^^^^^^^^^^^^^^
-
-.. code-block:: console
-
- $ cd test
- $ python benchmark_serving.py --tokenizer /path/llama-7b --dataset /path/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 2000 --request-rate 200
-
-
-output:
-
-.. code-block:: console
-
- read data set finish
- total tokens: 494250
- Total time: 111.37 s
- Throughput: 8.98 requests/s
- Average latency: 43.52 s
- Average latency per token: 0.15 s
- Average latency per output token: 0.73 s
\ No newline at end of file
diff --git a/docs/EN/source/user/api_param.rst b/docs/EN/source/tutorial/api_param.rst
old mode 100755
new mode 100644
similarity index 76%
rename from docs/EN/source/user/api_param.rst
rename to docs/EN/source/tutorial/api_param.rst
index 96577ede2..89474e617
--- a/docs/EN/source/user/api_param.rst
+++ b/docs/EN/source/tutorial/api_param.rst
@@ -1,56 +1,50 @@
-API parameter
-==========================
-
+API Call Details
+================
:code:`GET /health`
-~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~
:code:`HEAD /health`
-~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~
:code:`GET /healthz`
-~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~
Get the current server running status
-**Usage Examples**:
+**Call Example**:
.. code-block:: console
$ curl http://0.0.0.0:8080/health
-
-**Output Examples**:
+**Output Example**:
.. code-block:: python
{"message":"Ok"}
-
-
:code:`GET /token_load`
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~
-Get the current server token usage
+Get the current server token usage status
-**Usage Examples**:
+**Call Example**:
.. code-block:: console
$ curl http://0.0.0.0:8080/token_load
-
-**Output Examples**:
+**Output Example**:
.. code-block:: python
{"current_load":0.0,"logical_max_load":0.0,"dynamic_max_load":0.0}
-
:code:`POST /generate`
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~
-Calling the model to implement text completion
+Call the model to implement text completion
-**Usage Examples**:
+**Call Example**:
.. code-block:: console
@@ -65,21 +59,18 @@ Calling the model to implement text completion
$ "multimodal_params":{}
$ }'
-
-**Output Examples**:
+**Output Example**:
.. code-block:: python
{"generated_text": [" What is the difference between AI and ML? What are the differences between AI and ML"], "count_output_tokens": 17, "finish_reason": "length", "prompt_tokens": 4}
-
:code:`POST /generate_stream`
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Streaming returns text completion results
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Return text completion results as a stream
-**Usage Examples**:
+**Call Example**:
.. code-block:: console
@@ -94,7 +85,7 @@ Streaming returns text completion results
$ "multimodal_params":{}
$ }'
-**Output Examples**:
+**Output Example**:
::
@@ -104,12 +95,11 @@ Streaming returns text completion results
data:{"token": {"id": 279, "text": " the", "logprob": -1.5594439506530762, "special": false, "count_output_tokens": 3, "prompt_tokens": 4}, "generated_text": null, "finished": true, "finish_reason": "length", "details": null}
-
:code:`POST /get_score`
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Reward model, get the dialogue score.
+~~~~~~~~~~~~~~~~~~~~~~~
+Reward model, get conversation score
-**Usage Examples**:
+**Call Example**:
.. code-block:: python
@@ -134,14 +124,8 @@ Reward model, get the dialogue score.
else:
print(f"Error: {response.status_code}, {response.text}")
-**Output Examples**:
+**Output Example**:
::
- Result: {'score': 0.4892578125, 'prompt_tokens': 39, 'finish_reason': 'stop'}
-
-
-:code:`POST /v1/chat/completions`
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-openai type api, see `openai API docs `_ for details.
+ Result: {'score': 0.4892578125, 'prompt_tokens': 39, 'finish_reason': 'stop'}
\ No newline at end of file
diff --git a/docs/EN/source/tutorial/api_server_args_zh.rst b/docs/EN/source/tutorial/api_server_args_zh.rst
new file mode 100644
index 000000000..a409777e8
--- /dev/null
+++ b/docs/EN/source/tutorial/api_server_args_zh.rst
@@ -0,0 +1,199 @@
+APIServer Parameter Details
+===========================
+
+This document provides detailed information about all startup parameters and their usage for LightLLM APIServer.
+
+Basic Configuration Parameters
+------------------------------
+
+.. option:: --run_mode
+
+ Set the running mode, optional values:
+
+ * ``normal``: Single server mode (default)
+ * ``prefill``: Prefill mode (for pd separation running mode)
+ * ``decode``: Decode mode (for pd separation running mode)
+ * ``pd_master``: pd master node mode (for pd separation running mode)
+ * ``config_server``: Configuration server mode (for pd separation mode, used to register pd_master nodes and get pd_master node list), specifically designed for large-scale, high-concurrency scenarios, used when `pd_master` encounters significant CPU bottlenecks.
+
+.. option:: --host
+
+ Server listening address, default is ``127.0.0.1``
+
+.. option:: --port
+
+ Server listening port, default is ``8000``
+
+.. option:: --httpserver_workers
+
+ HTTP server worker process count, default is ``1``
+
+.. option:: --zmq_mode
+
+ ZMQ communication mode, optional values:
+
+ * ``tcp://``: TCP mode
+ * ``ipc:///tmp/``: IPC mode (default)
+
+ Can only choose from ``['tcp://', 'ipc:///tmp/']``
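+
+A minimal launch sketch combining the basic parameters above with ``--model_dir`` from the model configuration section below (the model path is a placeholder):
+
+.. code-block:: bash
+
+    python -m lightllm.server.api_server \
+        --model_dir /path/to/your/model \
+        --host 0.0.0.0 \
+        --port 8000 \
+        --httpserver_workers 1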
+
+PD Separation Mode Parameters
+-----------------------------
+
+.. option:: --pd_master_ip
+
+ PD master node IP address, default is ``0.0.0.0``
+
+ This parameter needs to be set when run_mode is set to prefill or decode
+
+.. option:: --pd_master_port
+
+ PD master node port, default is ``1212``
+
+ This parameter needs to be set when run_mode is set to prefill or decode
+
+.. option:: --pd_decode_rpyc_port
+
+ Port used by decode nodes for kv move manager rpyc server in PD mode, default is ``42000``
+
+.. option:: --config_server_host
+
+ Host address in configuration server mode
+
+.. option:: --config_server_port
+
+ Port number in configuration server mode
+
+Model Configuration Parameters
+------------------------------
+
+.. option:: --model_name
+
+ Model name, used to distinguish internal model names, default is ``default_model_name``
+
+ Can be obtained via ``host:port/get_model_name``
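+
+For example, the configured name can be queried from a running server (assuming it listens on localhost:8000):
+
+.. code-block:: console
+
+   $ curl http://localhost:8000/get_model_name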
+
+.. option:: --model_dir
+
+ Model weight directory path, the application will load configuration, weights, and tokenizer from this directory
+
+.. option:: --tokenizer_mode
+
+ Tokenizer loading mode, optional values:
+
+ * ``slow``: Slow mode, loads fast but runs slow, suitable for debugging and testing
+ * ``fast``: Fast mode (default), achieves best performance
+ * ``auto``: Auto mode, tries to use fast mode, falls back to slow mode if it fails
+
+.. option:: --load_way
+
+ Model weight loading method, default is ``HF`` (Huggingface format)
+
+ Llama models also support ``DS`` (Deepspeed) format
+
+.. option:: --trust_remote_code
+
+ Whether to allow using custom model definition files on Hub
+
+Memory and Batch Processing Parameters
+--------------------------------------
+
+.. option:: --max_total_token_num
+
+ Total token count supported by GPU and model, equals max_batch * (input_len + output_len)
+
+ If not specified, will be automatically calculated based on mem_fraction
+
+.. option:: --mem_fraction
+
+ Memory usage ratio, default is ``0.9``
+
+ If OOM occurs during runtime, you can specify a smaller value
+
+.. option:: --batch_max_tokens
+
+ Maximum token count for new batches, controls prefill batch size to prevent OOM
+
+.. option:: --running_max_req_size
+
+ Maximum number of requests for simultaneous forward inference, default is ``1000``
+
+.. option:: --max_req_total_len
+
+ Maximum value of request input length + request output length, default is ``16384``
+
+.. option:: --eos_id
+
+ End stop token ID, can specify multiple values. If None, will be loaded from config.json
+
+.. option:: --tool_call_parser
+
+ OpenAI interface tool call parser type, optional values:
+
+ * ``qwen25``
+ * ``llama3``
+ * ``mistral``
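+
+For illustration, a launch sketch that adjusts the memory and batching parameters above (the model path and the concrete values are placeholders to adapt to your hardware and model):
+
+.. code-block:: bash
+
+    python -m lightllm.server.api_server \
+        --model_dir /path/to/your/model \
+        --mem_fraction 0.85 \
+        --running_max_req_size 512 \
+        --max_req_total_len 16384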
+
+Different Parallel Mode Setting Parameters
+------------------------------------------
+
+.. option:: --nnodes
+
+ Number of nodes, default is ``1``
+
+.. option:: --node_rank
+
+ Current node rank, default is ``0``
+
+.. option:: --multinode_httpmanager_port
+
+ Multi-node HTTP manager port, default is ``12345``
+
+.. option:: --multinode_router_gloo_port
+
+ Multi-node router gloo port, default is ``20001``
+
+.. option:: --tp
+
+ Model tensor parallelism size, default is ``1``
+
+.. option:: --dp
+
+ Data parallelism size, default is ``1``
+
+   This parameter is mainly useful for deepseekv2. When running a deepseekv2 model, set dp equal to the tp parameter.
+   In other cases, do not set it and keep the default value of 1.
+
+.. option:: --nccl_host
+
+ nccl_host used to build PyTorch distributed environment, default is ``127.0.0.1``
+
+ For multi-node deployment, should be set to the master node's IP
+
+.. option:: --nccl_port
+
+ nccl_port used to build PyTorch distributed environment, default is ``28765``
+
+.. option:: --use_config_server_to_init_nccl
+
+ Use tcp store server started by config_server to initialize nccl, default is False
+
+   When set to True, --nccl_host must equal config_server_host and --nccl_port must be unique for each
+   inference node registered with the config_server; reusing the same nccl_port across different inference nodes is a serious error.
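+
+For example, a two-node tensor-parallel launch sketch using the parameters above (the IP, port, and model path are placeholders; run one command on each node and change only ``--node_rank``):
+
+.. code-block:: bash
+
+    # node 0
+    python -m lightllm.server.api_server --model_dir /path/to/your/model \
+        --tp 16 --nnodes 2 --node_rank 0 \
+        --nccl_host 192.168.0.1 --nccl_port 28765
+
+    # node 1
+    python -m lightllm.server.api_server --model_dir /path/to/your/model \
+        --tp 16 --nnodes 2 --node_rank 1 \
+        --nccl_host 192.168.0.1 --nccl_port 28765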
+
+Attention Type Selection Parameters
+-----------------------------------
+
+.. option:: --mode
+
+ Model inference mode, can specify multiple values:
+
+ * ``triton_int8kv``: Use int8 to store kv cache, can increase token capacity, uses triton kernel
+ * ``ppl_int8kv``: Use int8 to store kv cache, uses ppl fast kernel
+ * ``ppl_fp16``: Use ppl fast fp16 decode attention kernel
+ * ``triton_flashdecoding``: Flashdecoding mode for long context, currently supports llama llama2 qwen
+ * ``triton_gqa_attention``: Fast kernel for models using GQA
+ * ``triton_gqa_flashdecoding``: Fast flashdecoding kernel for models using GQA
+ * ``triton_fp8kv``: Use float8 to store kv cache, currently only used for deepseek2
+
+    You need to read the source code to confirm which modes each model supports
\ No newline at end of file
diff --git a/docs/EN/source/tutorial/deepseek_deployment.rst b/docs/EN/source/tutorial/deepseek_deployment.rst
new file mode 100644
index 000000000..35f54ea1a
--- /dev/null
+++ b/docs/EN/source/tutorial/deepseek_deployment.rst
@@ -0,0 +1,200 @@
+.. _deepseek_deployment:
+
+DeepSeek Model Deployment Guide
+===============================
+
+LightLLM supports various deployment solutions for DeepSeek models, including DeepSeek-R1, DeepSeek-V2, DeepSeek-V3, etc. This document provides detailed information on various deployment modes and configuration solutions.
+
+Deployment Mode Overview
+------------------------
+
+LightLLM supports the following deployment modes:
+
+1. **Single Machine TP Mode**: Deploy using tensor parallelism on a single machine
+2. **Single Machine EP Mode**: Deploy using expert parallelism on a single machine
+3. **Multi-Machine TP Mode**: Use tensor parallelism across multiple machines
+4. **Multi-Machine EP Mode**: Use expert parallelism across multiple machines
+5. **PD Separation Mode**: Separate prefill and decode deployment
+6. **Multi PD Master Mode**: Support multiple PD Master nodes
+
+1. Single Machine Deployment Solutions
+--------------------------------------
+
+1.1 Single Machine TP Mode (Tensor Parallel)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Suitable for deploying DeepSeek-R1 model on a single H200 machine.
+
+**Launch Command:**
+
+.. code-block:: bash
+
+ # H200 Single Machine DeepSeek-R1 TP Mode
+ LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \
+ --model_dir /path/DeepSeek-R1 \
+ --tp 8 \
+ --enable_fa3
+
+**Parameter Description:**
+
+- `LOADWORKER=18`: Model loading thread count, improves loading speed
+- `--tp 8`: Tensor parallelism degree, using 8 GPUs
+- `--enable_fa3`: Enable Flash Attention 3.0
+- `--port 8088`: Service port
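+
+Once the server is up, a quick sanity check against the ``/generate`` endpoint (a minimal sketch; adjust the host and port to your launch command):
+
+.. code-block:: bash
+
+    curl http://127.0.0.1:8088/generate \
+        -X POST \
+        -H 'Content-Type: application/json' \
+        -d '{"inputs": "What is AI?", "parameters": {"max_new_tokens": 17}}'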
+
+1.2 Single Machine DP + EP Mode (Data Parallel + Expert Parallel)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Suitable for expert parallelism deployment of MoE models like DeepSeek-V2/V3.
+
+**Launch Command:**
+
+.. code-block:: bash
+
+ # H200 Single Machine DeepSeek-R1 DP + EP Mode
+ MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \
+ --model_dir /path/DeepSeek-R1 \
+ --tp 8 \
+ --dp 8 \
+ --enable_fa3
+
+**Parameter Description:**
+
+- `MOE_MODE=EP`: Set expert parallelism mode
+- `--tp 8`: Tensor parallelism degree
+- `--dp 8`: Data parallelism degree, usually set to the same value as tp
+- `--enable_fa3`: Enable Flash Attention 3.0
+
+**Optional Optimization Parameters:**
+
+- `--enable_prefill_microbatch_overlap`: Enable prefill microbatch overlap
+- `--enable_decode_microbatch_overlap`: Enable decode microbatch overlap
+
+2. Multi-Machine Deployment Solutions
+-------------------------------------
+
+2.1 Multi-Machine TP Mode
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Suitable for deployment across multiple H200/H100 machines.
+
+**Node 0 Launch Command:**
+
+.. code-block:: bash
+
+ # H200/H100 Multi-Machine DeepSeek-R1 TP Mode Node 0
+ # Usage: sh multi_node_tp_node0.sh
+ export nccl_host=$1
+ LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \
+ --model_dir /path/DeepSeek-R1 \
+ --tp 16 \
+ --enable_fa3 \
+ --nnodes 2 \
+ --node_rank 0 \
+ --nccl_host $nccl_host \
+ --nccl_port 2732
+
+**Node 1 Launch Command:**
+
+.. code-block:: bash
+
+ # H200/H100 Multi-Machine DeepSeek-R1 TP Mode Node 1
+ # Usage: sh multi_node_tp_node1.sh
+ export nccl_host=$1
+ LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \
+ --model_dir /path/DeepSeek-R1 \
+ --tp 16 \
+ --enable_fa3 \
+ --nnodes 2 \
+ --node_rank 1 \
+ --nccl_host $nccl_host \
+ --nccl_port 2732
+
+**Parameter Description:**
+
+- `--nnodes 2`: Total number of nodes
+- `--node_rank 0/1`: Current node rank
+- `--nccl_host`: NCCL communication host address
+- `--nccl_port 2732`: NCCL communication port
+
+2.2 Multi-Machine EP Mode
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Suitable for deploying MoE models across multiple machines.
+
+**Node 0 Launch Command:**
+
+.. code-block:: bash
+
+ # H200 Multi-Machine DeepSeek-R1 EP Mode Node 0
+ # Usage: sh multi_node_ep_node0.sh
+ export nccl_host=$1
+ MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \
+ --model_dir /path/DeepSeek-R1 \
+ --tp 16 \
+ --dp 16 \
+ --enable_fa3 \
+ --nnodes 2 \
+ --node_rank 0 \
+ --nccl_host $nccl_host \
+ --nccl_port 2732
+
+**Node 1 Launch Command:**
+
+.. code-block:: bash
+
+ # H200 Multi-Machine DeepSeek-R1 EP Mode Node 1
+ # Usage: sh multi_node_ep_node1.sh
+ export nccl_host=$1
+ MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \
+ --model_dir /path/DeepSeek-R1 \
+ --tp 16 \
+ --dp 16 \
+ --enable_fa3 \
+ --nnodes 2 \
+ --node_rank 1 \
+ --nccl_host $nccl_host \
+ --nccl_port 2732
+
+**Optional Optimization Parameters:**
+
+- `--enable_prefill_microbatch_overlap`: Enable prefill microbatch overlap
+- `--enable_decode_microbatch_overlap`: Enable decode microbatch overlap
+
+3. PD Separation Deployment Solutions
+-------------------------------------
+
+PD (Prefill-Decode) separation mode deploys the prefill and decode stages separately, which makes better use of hardware resources.
+
+3.1 Single PD Master Mode
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+**Step 1: Launch PD Master Service**
+
+.. code-block:: bash
+
+ # PD Master for DeepSeek-R1
+ # Usage: sh pd_master.sh
+ export pd_master_ip=$1
+ python -m lightllm.server.api_server --model_dir /path/DeepSeek-R1 \
+ --run_mode "pd_master" \
+ --host $pd_master_ip \
+ --port 60011
+
+**Step 2: Launch Prefill Service**
+
+.. code-block:: bash
+
+ # PD prefill mode for DeepSeek-R1 (DP+EP) on H200
+ # Usage: sh pd_prefill.sh
+    # Run MPS first (optional): nvidia-cuda-mps-control -d
+    # Performance is much better with MPS enabled, but some GPUs and driver versions
+    # hit errors when MPS is on; upgrading to a newer driver is recommended, especially for H-series cards.
+
+ export host=$1
+ export pd_master_ip=$2
+ nvidia-cuda-mps-control -d
+ MOE_MODE=EP KV_TRANS_USE_P2P=1 LOADWORKER=18 python -m lightllm.server.api_server \
+ --model_dir /path/DeepSeek-R1 \
+ --run_mode "prefill" \
+ --tp 8 \
+ --dp 8 \
+ --host $host \
+ --port 8019 \
+ --nccl_port 2732 \
+ --enable_fa3 \
+ --disable_cudagraph \
+ --pd_master_ip $pd_master_ip
\ No newline at end of file
diff --git a/docs/EN/source/tutorial/multimodal.rst b/docs/EN/source/tutorial/multimodal.rst
new file mode 100644
index 000000000..1b25fae88
--- /dev/null
+++ b/docs/EN/source/tutorial/multimodal.rst
@@ -0,0 +1,139 @@
+Multimodal Model Launch Configuration
+=====================================
+
+LightLLM supports inference for various multimodal models. Below, using InternVL as an example, we explain the launch commands for multimodal services.
+
+Basic Launch Command
+--------------------
+
+.. code-block:: bash
+
+ INTERNVL_IMAGE_LENGTH=256 \
+ LOADWORKER=12 \
+ python -m lightllm.server.api_server \
+ --port 8080 \
+ --tp 2 \
+ --model_dir ${MODEL_PATH} \
+ --mem_fraction 0.8 \
+ --trust_remote_code \
+ --enable_multimodal
+
+Core Parameter Description
+--------------------------
+
+Environment Variables
+^^^^^^^^^^^^^^^^^^^^^
+
+- **INTERNVL_IMAGE_LENGTH**: Set the image token length for InternVL model, default is 256
+- **LOADWORKER**: Set the number of worker processes for model loading
+
+Basic Service Parameters
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+- **--port 8080**: API server listening port
+- **--tp 2**: Tensor parallelism degree
+- **--model_dir**: InternVL model file path
+- **--mem_fraction 0.8**: GPU memory usage ratio
+- **--trust_remote_code**: Allow loading custom model code
+- **--enable_multimodal**: Enable multimodal functionality
+
+Advanced Configuration Parameters
+---------------------------------
+
+.. code-block:: bash
+
+ --visual_infer_batch_size 2 \
+ --cache_capacity 500 \
+ --visual_dp dp_size \
+ --visual_tp tp_size
+
+- **--visual_infer_batch_size 2**: Visual inference batch size
+- **--cache_capacity 500**: Image embedding cache capacity
+- **--visual_dp dp_size**: Visual model data parallelism degree
+- **--visual_tp tp_size**: Visual model tensor parallelism degree
+
+.. note:: To ensure equal memory load on each GPU, visual_dp * visual_tp = tp is required. For example, if tp=2, then visual_dp=1, visual_tp=2.
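+
+Putting it together, a full launch sketch that combines the basic and advanced parameters under this constraint (here ``tp=2``, so ``visual_dp=1`` and ``visual_tp=2``; ``${MODEL_PATH}`` is a placeholder):
+
+.. code-block:: bash
+
+    INTERNVL_IMAGE_LENGTH=256 \
+    LOADWORKER=12 \
+    python -m lightllm.server.api_server \
+        --port 8080 \
+        --tp 2 \
+        --model_dir ${MODEL_PATH} \
+        --mem_fraction 0.8 \
+        --trust_remote_code \
+        --enable_multimodal \
+        --visual_infer_batch_size 2 \
+        --cache_capacity 500 \
+        --visual_dp 1 \
+        --visual_tp 2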
+
+ViT Deployment Methods
+----------------------
+
+ViT TP (Tensor Parallel)
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+- Default usage
+- --visual_tp tp_size enables tensor parallelism
+
+ViT DP (Data Parallel)
+^^^^^^^^^^^^^^^^^^^^^^
+
+- Distribute different image batches to multiple GPUs
+- Each GPU runs a complete ViT model copy
+- --visual_dp dp_size enables data parallelism
+
+Image Caching Mechanism
+-----------------------
+LightLLM caches embeddings of input images. In multi-turn conversations, if the images are the same, cached embeddings can be used directly, avoiding repeated inference.
+
+- **--cache_capacity**: Controls the number of cached image embeddings
+- Images are matched by their MD5 hash
+- Uses an LRU (Least Recently Used) eviction policy
+- On a cache hit, ViT inference is skipped entirely
+
+Testing
+-------
+
+.. code-block:: python
+
+ import json
+ import requests
+ import base64
+
+ def run(query, uris):
+ images = []
+ for uri in uris:
+ if uri.startswith("http"):
+ images.append({"type": "url", "data": uri})
+ else:
+ with open(uri, 'rb') as fin:
+ b64 = base64.b64encode(fin.read()).decode("utf-8")
+ images.append({'type': "base64", "data": b64})
+
+ data = {
+ "inputs": query,
+ "parameters": {
+ "max_new_tokens": 200,
+ # The space before <|endoftext|> is important,
+ # the server will remove the first bos_token_id,
+ # but QWen tokenizer does not has bos_token_id
+ "stop_sequences": [" <|endoftext|>", " <|im_start|>", " <|im_end|>"],
+ },
+ "multimodal_params": {
+ "images": images,
+ }
+ }
+
+ url = "http://127.0.0.1:8000/generate"
+ headers = {'Content-Type': 'application/json'}
+ response = requests.post(url, headers=headers, data=json.dumps(data))
+ return response
+
+ query = """
+ <|im_start|>system
+ You are a helpful assistant.<|im_end|>
+ <|im_start|>user
+
+ What is this?<|im_end|>
+ <|im_start|>assistant
+ """
+
+ response = run(
+ uris = [
+ "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
+ ],
+ query = query
+ )
+
+ if response.status_code == 200:
+ print(f"Result: {response.json()}")
+ else:
+ print(f"Error: {response.status_code}, {response.text}")
\ No newline at end of file
diff --git a/docs/EN/source/tutorial/openai.rst b/docs/EN/source/tutorial/openai.rst
new file mode 100644
index 000000000..270e29802
--- /dev/null
+++ b/docs/EN/source/tutorial/openai.rst
@@ -0,0 +1,203 @@
+.. _openai_api:
+
+LightLLM OpenAI API Usage Examples
+==================================
+
+LightLLM provides an interface that is fully compatible with the OpenAI API, supporting standard OpenAI features including function calling. This document provides detailed information on how to use LightLLM's OpenAI-compatible interface.
+
+Basic Configuration
+-------------------
+
+First, ensure that the LightLLM service is started:
+
+.. code-block:: bash
+
+ # Start LightLLM service
+ python -m lightllm.server.api_server \
+ --model_dir /path/to/your/model \
+ --port 8088 \
+ --tp 1
+
+Basic Conversation Examples
+---------------------------
+
+1. Simple Conversation
+~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+ import requests
+ import json
+
+ # Configuration
+ url = "http://localhost:8088/v1/chat/completions"
+ headers = {"Content-Type": "application/json"}
+
+ # Request data
+ data = {
+ "model": "your_model_name",
+ "messages": [
+ {"role": "user", "content": "Hello, please introduce yourself"}
+ ],
+ "temperature": 0.7,
+ "max_tokens": 1000
+ }
+
+ # Send request
+ response = requests.post(url, headers=headers, json=data)
+
+ if response.status_code == 200:
+ result = response.json()
+ print("Reply:", result["choices"][0]["message"]["content"])
+ else:
+ print("Error:", response.status_code, response.text)
+
+2. Streaming Conversation
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+ import requests
+ import json
+
+ url = "http://localhost:8088/v1/chat/completions"
+ headers = {"Content-Type": "application/json"}
+
+ data = {
+ "model": "your_model_name",
+ "messages": [
+ {"role": "user", "content": "Please write a short essay about artificial intelligence"}
+ ],
+ "stream": True,
+ "temperature": 0.7,
+ "max_tokens": 1000
+ }
+
+ # Streaming request
+ response = requests.post(url, headers=headers, json=data, stream=True)
+
+ if response.status_code == 200:
+ for line in response.iter_lines():
+ if line:
+ line = line.decode('utf-8')
+ if line.startswith('data: '):
+ data_str = line[6:] # Remove "data: " prefix
+ if data_str == '[DONE]':
+ break
+ try:
+ chunk = json.loads(data_str)
+ if chunk['choices'][0]['delta'].get('content'):
+ print(chunk['choices'][0]['delta']['content'], end='', flush=True)
+ except json.JSONDecodeError:
+ continue
+ else:
+ print("Error:", response.status_code, response.text)
+
+Function Calling Examples
+-------------------------
+
+LightLLM supports OpenAI's function calling feature and provides function-call parsers for three model families. Choose one by specifying the --tool_call_parser parameter when starting the service. The service launch command is:
+
+.. code-block:: bash
+
+ python -m lightllm.server.api_server \
+ --model_dir /path/to/your/model \
+ --port 8088 \
+ --tp 1 \
+ --tool_call_parser qwen25
+ # Optional parameters are qwen25, llama3, mistral
+
+1. Basic Function Calling
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+ import requests
+ import json
+
+ url = "http://localhost:8088/v1/chat/completions"
+ headers = {"Content-Type": "application/json"}
+
+ # Define functions
+ tools = [
+ {
+ "type": "function",
+ "function": {
+ "name": "get_current_weather",
+ "description": "Get current weather information for a specified city",
+ "parameters": {
+ "type": "object",
+ "properties": {
+ "city": {
+ "type": "string",
+ "description": "City name, e.g.: Beijing, Shanghai"
+ },
+ "unit": {
+ "type": "string",
+ "enum": ["celsius", "fahrenheit"],
+ "description": "Temperature unit"
+ }
+ },
+ "required": ["city"]
+ }
+ }
+ }
+ ]
+
+ # Request data
+ data = {
+ "model": "your_model_name",
+ "messages": [
+ {"role": "user", "content": "What's the weather like in Beijing today?"}
+ ],
+ "tools": tools,
+ "tool_choice": "auto", # Let the model automatically decide whether to call functions
+ "temperature": 0.7,
+ "max_tokens": 1000
+ }
+
+ # Send request
+ response = requests.post(url, headers=headers, json=data)
+
+ if response.status_code == 200:
+ result = response.json()
+ message = result["choices"][0]["message"]
+
+ # Check if there are function calls
+ if message.get("tool_calls"):
+ print("Model decided to call functions:")
+ for tool_call in message["tool_calls"]:
+ print(f"Function name: {tool_call['function']['name']}")
+ print(f"Arguments: {tool_call['function']['arguments']}")
+ else:
+ print("Reply:", message["content"])
+ else:
+ print("Error:", response.status_code, response.text)
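+
+If the model did request a function call, the usual OpenAI-style follow-up is to execute the function yourself and send its result back as a ``tool`` message so the model can produce a final answer. The sketch below assumes the server accepts the standard OpenAI tool-result message format and reuses ``url``, ``headers``, ``tools``, and ``message`` from the example above:
+
+.. code-block:: python
+
+    # Pretend we executed get_current_weather and obtained this result
+    tool_call = message["tool_calls"][0]
+    tool_result = json.dumps({"city": "Beijing", "temperature": 25, "unit": "celsius"})
+
+    followup = {
+        "model": "your_model_name",
+        "messages": [
+            {"role": "user", "content": "What's the weather like in Beijing today?"},
+            message,  # the assistant message containing tool_calls
+            {"role": "tool", "tool_call_id": tool_call["id"], "content": tool_result},
+        ],
+        "tools": tools,
+    }
+
+    final = requests.post(url, headers=headers, json=followup)
+    if final.status_code == 200:
+        print("Final reply:", final.json()["choices"][0]["message"]["content"])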
+
+2. Streaming Function Calling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+ import requests
+ import json
+
+ url = "http://localhost:8088/v1/chat/completions"
+ headers = {"Content-Type": "application/json"}
+
+ tools = [
+ {
+ "type": "function",
+ "function": {
+ "name": "calculate",
+ "description": "Perform mathematical calculations",
+ "parameters": {
+ "type": "object",
+ "properties": {
+ "expression": {"type": "string", "description": "Mathematical expression"}
+ },
+ "required": ["expression"]
+ }
+ }
+ }
+ ]
\ No newline at end of file
diff --git a/docs/EN/source/tutorial/reward_model.rst b/docs/EN/source/tutorial/reward_model.rst
new file mode 100644
index 000000000..d00f8b9fd
--- /dev/null
+++ b/docs/EN/source/tutorial/reward_model.rst
@@ -0,0 +1,62 @@
+Reward Model Deployment Configuration
+=====================================
+
+LightLLM supports inference for various reward models, which are used to evaluate conversation quality and produce reward scores. Currently supported reward models include InternLM2 Reward and Qwen2 Reward.
+
+Basic Launch Command
+---------------------
+
+.. code-block:: bash
+
+ python -m lightllm.server.api_server \
+ --port 8080 \
+ --model_dir ${MODEL_PATH} \
+ --trust_remote_code \
+ --use_reward_model # Enable reward model functionality (required parameter)
+
+Testing Examples
+----------------
+
+Python Testing Code
+^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: python
+
+ import json
+ import requests
+
+ # InternLM2 Reward test
+ query = "<|im_start|>user\nHello! What's your name?<|im_end|>\n<|im_start|>assistant\nMy name is InternLM2! A helpful AI assistant. What can I do for you?<|im_end|>\n<|reward|>"
+
+ url = "http://127.0.0.1:8000/get_score"
+ headers = {'Content-Type': 'application/json'}
+
+ data = {
+ "chat": query,
+ "parameters": {
+ "frequency_penalty": 1
+ }
+ }
+
+ response = requests.post(url, headers=headers, data=json.dumps(data))
+
+ if response.status_code == 200:
+ result = response.json()
+ print(f"Reward score: {result['score']}")
+ print(f"Input tokens: {result['prompt_tokens']}")
+ else:
+ print(f"Error: {response.status_code}, {response.text}")
+
+cURL Testing Command
+^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: bash
+
+ curl http://localhost:8000/get_score \
+ -H "Content-Type: application/json" \
+ -d '{
+ "chat": "<|im_start|>user\nHello! What is AI?<|im_end|>\n<|im_start|>assistant\nAI stands for Artificial Intelligence, which refers to the simulation of human intelligence in machines.<|im_end|>\n<|reward|>",
+ "parameters": {
+ "frequency_penalty": 1
+ }
+ }'
\ No newline at end of file
diff --git a/docs/EN/source/user/openapi_docs.rst b/docs/EN/source/user/openapi_docs.rst
deleted file mode 100755
index 5af28ac4a..000000000
--- a/docs/EN/source/user/openapi_docs.rst
+++ /dev/null
@@ -1,43 +0,0 @@
-OpenApi docs
-=================================
-
-The following documentation is automatically generated by openapi. After deploying with Lightllm, you can open it using ``host:port/docs``
-
-.. raw:: html
-
-
-
-
-
- FastAPI - Swagger UI
-
-
-
-
-
-
-
-
-
-
diff --git a/lightllm/server/api_cli.py b/lightllm/server/api_cli.py
index e9943b05f..601b2a48a 100644
--- a/lightllm/server/api_cli.py
+++ b/lightllm/server/api_cli.py
@@ -375,7 +375,7 @@ def make_argument_parser() -> argparse.ArgumentParser:
type=str,
default=None,
help="""Path of quantization config. It can be used for mixed quantization.
- Examples can be found in lightllm/common/quantization/configs.""",
+ Examples can be found in test/advanced_config/mixed_quantization/llamacls-mix-down.yaml.""",
)
parser.add_argument(
"--vit_quant_type",
diff --git a/lightllm/common/quantization/configs/llamacls-mix-down.yaml b/test/advanced_config/mixed_quantization/llamacls-mix-down.yaml
similarity index 100%
rename from lightllm/common/quantization/configs/llamacls-mix-down.yaml
rename to test/advanced_config/mixed_quantization/llamacls-mix-down.yaml
diff --git a/test/test_redundancy_expert_config.json b/test/advanced_config/redundancy_expert/test_redundancy_expert_config.json
similarity index 100%
rename from test/test_redundancy_expert_config.json
rename to test/advanced_config/redundancy_expert/test_redundancy_expert_config.json
diff --git a/test/benchmark/kernel/benchmark_fused_moe_triton.py b/test/benchmark/kernel/benchmark_fused_moe_triton.py
new file mode 100644
index 000000000..6f7a5ee39
--- /dev/null
+++ b/test/benchmark/kernel/benchmark_fused_moe_triton.py
@@ -0,0 +1,330 @@
+# Adapted from
+# https://github.com/sgl-project/sglang/blob/v0.4.6.post5/benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_triton.py
+import argparse
+
+import torch
+import triton
+import vllm
+from transformers import AutoConfig
+from lightllm.common.fused_moe.topk_select import select_experts
+from lightllm.common.fused_moe.grouped_fused_moe import fused_experts_impl
+from vllm.model_executor.layers.fused_moe.fused_moe import fused_moe as fused_moe_vllm
+from sglang.srt.layers.moe.fused_moe_triton.fused_moe import (
+ fused_moe as fused_moe_sglang,
+)
+
+
+def get_model_config(model_name: str, tp_size: int):
+ """Get model configuration parameters"""
+ config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
+
+ if config.architectures[0] == "DbrxForCausalLM":
+ E = config.ffn_config.moe_num_experts
+ topk = config.ffn_config.moe_top_k
+ intermediate_size = config.ffn_config.ffn_hidden_size
+ shard_intermediate_size = 2 * intermediate_size // tp_size
+ elif config.architectures[0] == "JambaForCausalLM":
+ E = config.num_experts
+ topk = config.num_experts_per_tok
+ intermediate_size = config.intermediate_size
+ shard_intermediate_size = 2 * intermediate_size // tp_size
+ elif config.architectures[0] == "Qwen2MoeForCausalLM":
+ E = config.num_experts
+ topk = config.num_experts_per_tok
+ intermediate_size = config.moe_intermediate_size
+ shard_intermediate_size = 2 * intermediate_size // tp_size
+ elif config.architectures[0] == "Qwen3MoeForCausalLM":
+ E = config.num_experts
+ topk = config.num_experts_per_tok
+ intermediate_size = config.moe_intermediate_size
+ shard_intermediate_size = 2 * intermediate_size // tp_size
+ elif config.architectures[0] in ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]:
+ E = config.n_routed_experts
+ topk = config.num_experts_per_tok
+ intermediate_size = config.moe_intermediate_size
+ shard_intermediate_size = 2 * intermediate_size // tp_size
+ elif config.architectures[0] in [
+ "Grok1ForCausalLM",
+ "Grok1ImgGen",
+ "Grok1AForCausalLM",
+ ]:
+ E = config.num_local_experts
+ topk = config.num_experts_per_tok
+ intermediate_size = config.moe_intermediate_size
+ shard_intermediate_size = 2 * intermediate_size // tp_size
+ else:
+ # Default: Mixtral
+ E = config.num_local_experts
+ topk = config.num_experts_per_tok
+ intermediate_size = config.intermediate_size
+ shard_intermediate_size = 2 * intermediate_size // tp_size
+
+ vllm_version_num = vllm.__version_tuple__[0] * 100 + vllm.__version_tuple__[1] * 10 + vllm.__version_tuple__[2]
+ block_shape = None
+ if hasattr(config, "quantization_config") and "weight_block_size" in config.quantization_config:
+ block_shape = config.quantization_config["weight_block_size"]
+ assert len(block_shape) == 2
+ assert vllm_version_num >= 66, "Block-wise quantized fp8 fused_moe is only supported for VLLM>=0.6.6.post1"
+
+ shape_configs = {
+ "num_experts": E,
+ "topk": topk,
+ "hidden_size": config.hidden_size,
+ "shard_intermediate_size": shard_intermediate_size,
+ "dtype": config.torch_dtype,
+ "block_shape": block_shape,
+ }
+ print(f"{shape_configs=}")
+ return shape_configs
+
+
+def fused_moe_lightllm_api(
+ x,
+ w1,
+ w2,
+ input_gating,
+ topk,
+ use_fp8_w8a8=False,
+ w1_scale=None,
+ w2_scale=None,
+ a1_scale=None,
+ a2_scale=None,
+ block_shape=None,
+):
+
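+    # Route each token with LightLLM's softmax top-k selection, then run its grouped fused-MoE kernels in place.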
+ topk_weights, topk_ids = select_experts(
+ hidden_states=x,
+ router_logits=input_gating,
+ correction_bias=None,
+ use_grouped_topk=False,
+ top_k=topk,
+ renormalize=True,
+ topk_group=None,
+ num_expert_group=None,
+ scoring_func="softmax",
+ )
+
+ return fused_experts_impl(
+ hidden_states=x,
+ w1=w1,
+ w2=w2,
+ topk_weights=topk_weights,
+ topk_ids=topk_ids,
+ inplace=True,
+ use_fp8_w8a8=use_fp8_w8a8,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ )
+
+
+def fused_moe_vllm_api(
+ x,
+ w1,
+ w2,
+ input_gating,
+ topk,
+ use_fp8_w8a8=False,
+ w1_scale=None,
+ w2_scale=None,
+ a1_scale=None,
+ a2_scale=None,
+ block_shape=None,
+):
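+    # Pass block_shape to vLLM's fused_moe only when block-wise quantization is configured.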
+ if block_shape is not None:
+ return fused_moe_vllm(
+ x,
+ w1,
+ w2,
+ input_gating,
+ topk,
+ renormalize=True,
+ inplace=True,
+ use_fp8_w8a8=use_fp8_w8a8,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ a1_scale=a1_scale,
+ a2_scale=a2_scale,
+ block_shape=block_shape,
+ )
+ else:
+ return fused_moe_vllm(
+ x,
+ w1,
+ w2,
+ input_gating,
+ topk,
+ renormalize=True,
+ inplace=True,
+ use_fp8_w8a8=use_fp8_w8a8,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ a1_scale=a1_scale,
+ a2_scale=a2_scale,
+ )
+
+
+def fused_moe_sglang_api(
+ x,
+ w1,
+ w2,
+ input_gating,
+ topk,
+ use_fp8_w8a8=False,
+ w1_scale=None,
+ w2_scale=None,
+ a1_scale=None,
+ a2_scale=None,
+ block_shape=None,
+):
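+    # SGLang's fused_moe accepts block_shape directly, so a single call covers both quantized and unquantized paths.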
+ return fused_moe_sglang(
+ x,
+ w1,
+ w2,
+ input_gating,
+ topk,
+ renormalize=True,
+ inplace=True,
+ use_fp8_w8a8=use_fp8_w8a8,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ a1_scale=a1_scale,
+ a2_scale=a2_scale,
+ block_shape=block_shape,
+ )
+
+
+@triton.testing.perf_report(
+ triton.testing.Benchmark(
+ x_names=["batch_size"],
+ x_vals=[1, 8, 16, 32, 64, 128],
+ line_arg="provider",
+ line_vals=[
+ "vllm_fused_moe_triton",
+ "sglang_fused_moe_triton",
+ "lightllm_fused_moe_triton",
+ ],
+ line_names=[
+ "vllm_fused_moe_triton",
+ "sglang_fused_moe_triton",
+ "lightllm_fused_moe_triton",
+ ],
+ styles=[
+ ("blue", "-"),
+ ("green", "-"),
+ ("red", "-"),
+ ],
+ ylabel="Time (ms)",
+ plot_name="fused-moe-performance",
+ args={},
+ )
+)
+def benchmark(batch_size, provider, model_config, use_fp8=False):
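+    # Build random activations and expert weights sized from model_config, warm up, then time the selected provider.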
+ torch.set_default_device("cuda")
+ torch.cuda.manual_seed_all(0)
+
+ num_tokens = batch_size
+ num_experts = model_config["num_experts"]
+ hidden_size = model_config["hidden_size"]
+ shard_intermediate_size = model_config["shard_intermediate_size"]
+ topk = model_config["topk"]
+ dtype = model_config["dtype"]
+    block_shape = model_config.get("block_shape", None)
+ x = torch.randn(num_tokens, hidden_size, dtype=dtype)
+ w1_scale = w2_scale = a1_scale = a2_scale = None
+
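+    # fp8 path: cast weights to float8_e4m3fn with per-tensor scales, or per-block scales when block_shape is set.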
+ if use_fp8:
+ init_dtype = dtype
+ w1 = torch.randn(num_experts, shard_intermediate_size, hidden_size, dtype=init_dtype)
+ w2 = torch.randn(num_experts, hidden_size, shard_intermediate_size // 2, dtype=init_dtype)
+ w1 = w1.to(torch.float8_e4m3fn)
+ w2 = w2.to(torch.float8_e4m3fn)
+
+ if block_shape is None:
+ w1_scale = torch.randn(num_experts, dtype=torch.float32)
+ w2_scale = torch.randn(num_experts, dtype=torch.float32)
+ a1_scale = torch.randn(1, dtype=torch.float32)
+ a2_scale = torch.randn(1, dtype=torch.float32)
+ else:
+ block_n, block_k = block_shape[0], block_shape[1]
+ n_tiles_w1 = (shard_intermediate_size + block_n - 1) // block_n
+ n_tiles_w2 = (hidden_size + block_n - 1) // block_n
+ k_tiles_w1 = (hidden_size + block_k - 1) // block_k
+ k_tiles_w2 = (shard_intermediate_size // 2 + block_k - 1) // block_k
+ w1_scale = torch.rand((num_experts, n_tiles_w1, k_tiles_w1), dtype=torch.float32)
+ w2_scale = torch.rand((num_experts, n_tiles_w2, k_tiles_w2), dtype=torch.float32)
+ else:
+ w1 = torch.randn(num_experts, shard_intermediate_size, hidden_size, dtype=dtype)
+ w2 = torch.randn(num_experts, hidden_size, shard_intermediate_size // 2, dtype=dtype)
+
+ input_gating = torch.randn(num_tokens, num_experts, dtype=torch.float32)
+
+    api_func = (
+        fused_moe_vllm_api
+        if provider == "vllm_fused_moe_triton"
+        else fused_moe_sglang_api
+        if provider == "sglang_fused_moe_triton"
+        else fused_moe_lightllm_api
+    )
+
+    # Warmup
+    for _ in range(10):
+ api_func(
+ x,
+ w1,
+ w2,
+ input_gating,
+ topk,
+ use_fp8_w8a8=use_fp8,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ a1_scale=a1_scale,
+ a2_scale=a2_scale,
+ block_shape=block_shape,
+ )
+ torch.cuda.synchronize()
+
+ quantiles = [0.5, 0.2, 0.8]
+ ms, min_ms, max_ms = triton.testing.do_bench(
+ lambda: api_func(
+ x,
+ w1,
+ w2,
+ input_gating,
+ topk,
+ use_fp8_w8a8=use_fp8,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ a1_scale=a1_scale,
+ a2_scale=a2_scale,
+ block_shape=block_shape,
+ )[0],
+ quantiles=quantiles,
+ )
+ return ms, min_ms, max_ms
+
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model", type=str, default="mistralai/Mixtral-8x7B-Instruct-v0.1")
+ parser.add_argument("--tp-size", type=int, default=8)
+ parser.add_argument("--use-fp8", action="store_true")
+ parser.add_argument(
+ "--save-path",
+ type=str,
+ default="./configs/benchmark_ops/vllm_sglang_fused_moe/",
+ )
+ args = parser.parse_args()
+
+ model_config = get_model_config(args.model, args.tp_size)
+ benchmark.run(
+ show_plots=True,
+ print_data=True,
+ save_path=args.save_path,
+ model_config=model_config,
+ use_fp8=args.use_fp8,
+ )
+
+
+if __name__ == "__main__":
+ main()
diff --git a/test/benchmark_client.py b/test/benchmark/service/benchmark_client.py
similarity index 100%
rename from test/benchmark_client.py
rename to test/benchmark/service/benchmark_client.py
diff --git a/test/benchmark_mcq.py b/test/benchmark/service/benchmark_mcq.py
similarity index 68%
rename from test/benchmark_mcq.py
rename to test/benchmark/service/benchmark_mcq.py
index 51cdee830..828a970cc 100644
--- a/test/benchmark_mcq.py
+++ b/test/benchmark/service/benchmark_mcq.py
@@ -26,13 +26,13 @@
import aiohttp
import numpy as np
-from transformers import PreTrainedTokenizerBase
from transformers import AutoModelForCausalLM, PreTrainedTokenizerBase
-from transformers import (AutoTokenizer, PreTrainedTokenizer,
- PreTrainedTokenizerFast)
+from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
QUESTION = {}
+
+
def get_tokenizer(
tokenizer_name: str,
tokenizer_mode: str = "slow",
@@ -42,25 +42,21 @@ def get_tokenizer(
"""Gets a tokenizer for the given model name via Huggingface."""
if tokenizer_mode == "slow":
if kwargs.get("use_fast", False):
- raise ValueError(
- "Cannot use the fast tokenizer in slow tokenizer mode.")
+ raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")
kwargs["use_fast"] = True
if "llama" in tokenizer_name.lower() and kwargs.get("use_fast", True):
pass
try:
- tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, *args,
- **kwargs)
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, *args, **kwargs)
except TypeError as e:
- err_msg = (
- "Failed to load the tokenizer. If you are using a LLaMA-based "
- f"model, use '{_FAST_LLAMA_TOKENIZER}' instead of the original "
- "tokenizer.")
+            err_msg = f"Failed to load the tokenizer. {e}"
raise RuntimeError(err_msg) from e
if not isinstance(tokenizer, PreTrainedTokenizerFast):
pass
return tokenizer
+
# (prompt len, output len, latency)
REQUEST_LATENCY: List[Tuple[int, int, float]] = []
@@ -73,11 +69,10 @@ def sample_requests(
data = []
with open(dataset_path, "r") as f:
questions = f.readlines()
- gts = {}
for question in questions:
question = json.loads(question.strip())
file_name = question["file_name"].split(".")[0]
- data.append((file_name, question['question_id'], question['instruction'], question['answer']))
+ data.append((file_name, question["question_id"], question["instruction"], question["answer"]))
if file_name not in QUESTION:
QUESTION[file_name] = {}
QUESTION[file_name][question["question_id"]] = [question["answer"]]
@@ -107,25 +102,22 @@ async def send_request(
output_len: int,
port: int,
) -> None:
- request_start_time = time.time()
- headers = {'Content-Type': 'application/json'}
+ headers = {"Content-Type": "application/json"}
headers = {"User-Agent": "Benchmark Client"}
- file_name, question_id, inputs, answer = request
- prompt = f"<系统> <对话历史> <知识> <最新问题> 用户:给出以下问题的答案:\n{inputs} SenseChat:"
- print(prompt)
- # prompt= "[Round {}]\n\n问:{}\n\n答:".format(1, inputs)
- url = f'http://localhost:{port}/generate'
+ file_name, question_id, inputs, answer = request
+ prompt = "[Round {}]\n\n问:{}\n\n答:".format(1, inputs)
+ url = f"http://localhost:{port}/generate"
data = {
- 'inputs': prompt,
- 'parameters': {
- 'do_sample': False,
- 'ignore_eos': True,
- 'max_new_tokens': output_len,
- # 'do_sample':True,
+ "inputs": prompt,
+ "parameters": {
+ "do_sample": False,
+ "ignore_eos": True,
+ "max_new_tokens": output_len,
+ # 'do_sample':True,
# 'top_p':0.8,
# 'temperature':0.8
- # 'temperature': 0.1,
- }
+ # 'temperature': 0.1,
+ },
}
timeout = aiohttp.ClientTimeout(total=3 * 3600)
async with aiohttp.ClientSession(timeout=timeout) as session:
@@ -140,6 +132,7 @@ async def send_request(
if "error" not in output:
break
+
async def benchmark(
input_requests: List[Tuple[str, int, int]],
request_rate: float,
@@ -153,18 +146,18 @@ async def benchmark(
def IsOpen(ip, port):
- s = socket.socket(socket.AF_INET,socket.SOCK_STREAM)
- index=1
+ s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
try:
- s.connect((ip,int(port)))
+ s.connect((ip, int(port)))
s.shutdown(2)
- print('successfully launch model')
+            print("successfully launched model")
return True
except:
time.sleep(10)
return False
+
def main(args: argparse.Namespace):
print(args)
random.seed(args.seed)
@@ -172,7 +165,6 @@ def main(args: argparse.Namespace):
tokenizer = get_tokenizer(args.tokenizer, "slow")
input_requests = sample_requests(args.dataset, tokenizer)
- benchmark_start_time = time.time()
asyncio.run(benchmark(input_requests, args.request_rate, args.port))
rights, alls = 0, 0
for file_name in QUESTION:
@@ -186,19 +178,19 @@ def main(args: argparse.Namespace):
if __name__ == "__main__":
- parser = argparse.ArgumentParser(
- description="Benchmark the online serving throughput.")
- parser.add_argument("--dataset", type=str, required=True,
- help="Path to the dataset.")
- parser.add_argument("--tokenizer", type=str, required=True,
- help="Name or path of the tokenizer.")
- parser.add_argument("--request-rate", type=float, default=float("inf"),
- help="Number of requests per second. If this is inf, "
- "then all the requests are sent at time 0. "
- "Otherwise, we use Poisson process to synthesize "
- "the request arrival times.")
- parser.add_argument("--port", type=int, default=8000,
- help="port number")
+ parser = argparse.ArgumentParser(description="Benchmark the online serving throughput.")
+ parser.add_argument("--dataset", type=str, required=True, help="Path to the dataset.")
+ parser.add_argument("--tokenizer", type=str, required=True, help="Name or path of the tokenizer.")
+ parser.add_argument(
+ "--request-rate",
+ type=float,
+ default=float("inf"),
+ help="Number of requests per second. If this is inf, "
+ "then all the requests are sent at time 0. "
+ "Otherwise, we use Poisson process to synthesize "
+ "the request arrival times.",
+ )
+ parser.add_argument("--port", type=int, default=8000, help="port number")
parser.add_argument("--seed", type=int, default=0)
args = parser.parse_args()
main(args)
diff --git a/test/server/benchmark_prompt_cache.py b/test/benchmark/service/benchmark_prompt_cache.py
similarity index 87%
rename from test/server/benchmark_prompt_cache.py
rename to test/benchmark/service/benchmark_prompt_cache.py
index 7a52420b0..66fcb5296 100644
--- a/test/server/benchmark_prompt_cache.py
+++ b/test/benchmark/service/benchmark_prompt_cache.py
@@ -1,3 +1,25 @@
+"""
+This script benchmarks the performance of a large language model inference service via HTTP API,
+supporting multi-user and multi-turn dialogue scenarios.
+
+Main arguments:
+- --model_url: Service address
+- --model_name: Model name (for result file naming)
+- --num_workers: Number of concurrent processes
+- --first_input_len: Input length for the first turn
+- --subsequent_input_len: Input length for subsequent turns
+- --output_len: Number of tokens generated per turn
+- --num_turns: Number of dialogue turns per user
+- --num_users: Number of users
+- --result_dir: Directory to save results
+- --print: Whether to print the result
+- --cache: Whether to cache the result
+- --use_cache: Whether to use cached results
+
+Example usage:
+python benchmark_prompt_cache.py --model_url http://localhost:8090 --model_name llama \\
+--num_workers 1 --first_input_len 512 --subsequent_input_len 32 --output_len 32 --num_turns 5 --num_users 1
+"""
import requests
import json
import operator
diff --git a/test/server/test_settings.py b/test/benchmark/service/benchmark_prompt_cache_multi_server.py
similarity index 71%
rename from test/server/test_settings.py
rename to test/benchmark/service/benchmark_prompt_cache_multi_server.py
index 3acf17376..ac22a56cc 100644
--- a/test/server/test_settings.py
+++ b/test/benchmark/service/benchmark_prompt_cache_multi_server.py
@@ -1,3 +1,28 @@
+"""
+benchmark_prompt_cache_multi_server.py
+
+This script is used for automated benchmarking of multiple model services (e.g., llama-7b, llama-13b),
+evaluating their performance under different input lengths, output lengths, number of turns, concurrent users,
+and worker threads.
+
+Main features:
+- Supports automated testing for multiple models and parameter combinations.
+- Collects and outputs various performance metrics, including throughput, QPS, and latency.
+- Saves results as a Markdown table for easy analysis.
+
+Parameter description:
+- models: Model names and their service URLs to be tested.
+- first_input_lens: List of token lengths for the first input.
+- subsequent_input_lens: List of token lengths for subsequent inputs.
+- output_lens: List of output token lengths.
+- num_turns: List of dialogue turns.
+- num_workers: List of concurrent worker counts.
+- num_users: List of concurrent user counts.
+- result_dir: Directory to save results.
+
+Example:
+    python benchmark_prompt_cache_multi_server.py
+"""
import os
import itertools
from easydict import EasyDict
diff --git a/test/benchmark_qps.py b/test/benchmark/service/benchmark_qps.py
similarity index 100%
rename from test/benchmark_qps.py
rename to test/benchmark/service/benchmark_qps.py
diff --git a/test/benchmark_serving.py b/test/benchmark/service/benchmark_sharegpt.py
similarity index 71%
rename from test/benchmark_serving.py
rename to test/benchmark/service/benchmark_sharegpt.py
index 9cde7fd8d..c9f92f098 100644
--- a/test/benchmark_serving.py
+++ b/test/benchmark/service/benchmark_sharegpt.py
@@ -25,11 +25,10 @@
import aiohttp
import numpy as np
-from transformers import PreTrainedTokenizerBase
from transformers import AutoModelForCausalLM, PreTrainedTokenizerBase
-from transformers import (AutoTokenizer, PreTrainedTokenizer,
- PreTrainedTokenizerFast)
+from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
+
def get_tokenizer(
tokenizer_name: str,
@@ -40,26 +39,22 @@ def get_tokenizer(
"""Gets a tokenizer for the given model name via Huggingface."""
if tokenizer_mode == "slow":
if kwargs.get("use_fast", False):
- raise ValueError(
- "Cannot use the fast tokenizer in slow tokenizer mode.")
+ raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")
kwargs["use_fast"] = False
if "llama" in tokenizer_name.lower() and kwargs.get("use_fast", True):
pass
try:
- tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, *args,
- **kwargs)
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, *args, **kwargs)
except TypeError as e:
- err_msg = (
- "Failed to load the tokenizer. If you are using a LLaMA-based "
- f"model, use '{_FAST_LLAMA_TOKENIZER}' instead of the original "
- "tokenizer.")
+            err_msg = f"Failed to load the tokenizer. {e}"
raise RuntimeError(err_msg) from e
if not isinstance(tokenizer, PreTrainedTokenizerFast):
pass
return tokenizer
+
# (prompt len, output len, latency)
REQUEST_LATENCY: List[Tuple[int, int, float]] = []
@@ -73,23 +68,18 @@ def sample_requests(
with open(dataset_path) as f:
dataset = json.load(f)
# Filter out the conversations with less than 2 turns.
- dataset = [
- data for data in dataset
- if len(data["conversations"]) >= 2
- ]
+ dataset = [data for data in dataset if len(data["conversations"]) >= 2]
# Only keep the first two turns of each conversation.
- dataset = [
- (data["conversations"][0]["value"], data["conversations"][1]["value"])
- for data in dataset
- ]
-
+ dataset = [(data["conversations"][0]["value"], data["conversations"][1]["value"]) for data in dataset]
+
print("read data set finish")
# Tokenize the prompts and completions.
import random
+
dataset = random.sample(dataset, num_requests * 3)
prompts = [prompt for prompt, _ in dataset]
completions = [completion for _, completion in dataset]
-
+
prompt_token_ids = tokenizer(prompts).input_ids
completion_token_ids = tokenizer(completions).input_ids
tokenized_dataset = []
@@ -135,26 +125,21 @@ async def get_request(
await asyncio.sleep(interval)
-async def send_request(
- prompt: str,
- prompt_len: int,
- output_len: int
-) -> None:
+async def send_request(prompt: str, prompt_len: int, output_len: int) -> None:
request_start_time = time.time()
- headers = {'Content-Type': 'application/json'}
+ headers = {"Content-Type": "application/json"}
headers = {"User-Agent": "Benchmark Client"}
- url = 'http://localhost:8000/generate'
-
+ url = "http://localhost:8000/generate"
+
data = {
- 'inputs': prompt,
- 'parameters': {
- 'do_sample': False,
- 'ignore_eos': True,
- 'max_new_tokens': output_len,
- # 'temperature': 0.1,
- }
+ "inputs": prompt,
+ "parameters": {
+ "do_sample": False,
+ "ignore_eos": True,
+ "max_new_tokens": output_len,
+ # 'temperature': 0.1,
+ },
}
-
timeout = aiohttp.ClientTimeout(total=3 * 3600)
async with aiohttp.ClientSession(timeout=timeout) as session:
@@ -165,7 +150,7 @@ async def send_request(
chunks.append(chunk)
output = b"".join(chunks).decode("utf-8")
output = json.loads(output)
-
+
if "error" not in output:
break
@@ -181,8 +166,7 @@ async def benchmark(
tasks: List[asyncio.Task] = []
async for request in get_request(input_requests, request_rate):
prompt, prompt_len, output_len = request
- task = asyncio.create_task(send_request(prompt,
- prompt_len, output_len))
+ task = asyncio.create_task(send_request(prompt, prompt_len, output_len))
tasks.append(task)
await asyncio.gather(*tasks)
@@ -204,33 +188,28 @@ def main(args: argparse.Namespace):
# Compute the latency statistics.
avg_latency = np.mean([latency for _, _, latency in REQUEST_LATENCY])
print(f"Average latency: {avg_latency:.2f} s")
- avg_per_token_latency = np.mean([
- latency / (prompt_len + output_len)
- for prompt_len, output_len, latency in REQUEST_LATENCY
- ])
+ avg_per_token_latency = np.mean(
+ [latency / (prompt_len + output_len) for prompt_len, output_len, latency in REQUEST_LATENCY]
+ )
print(f"Average latency per token: {avg_per_token_latency:.2f} s")
- avg_per_output_token_latency = np.mean([
- latency / output_len
- for _, output_len, latency in REQUEST_LATENCY
- ])
- print("Average latency per output token: "
- f"{avg_per_output_token_latency:.2f} s")
+ avg_per_output_token_latency = np.mean([latency / output_len for _, output_len, latency in REQUEST_LATENCY])
+ print("Average latency per output token: " f"{avg_per_output_token_latency:.2f} s")
if __name__ == "__main__":
- parser = argparse.ArgumentParser(
- description="Benchmark the online serving throughput.")
- parser.add_argument("--dataset", type=str, required=True,
- help="Path to the dataset.")
- parser.add_argument("--tokenizer", type=str, required=True,
- help="Name or path of the tokenizer.")
- parser.add_argument("--request-rate", type=float, default=float("inf"),
- help="Number of requests per second. If this is inf, "
- "then all the requests are sent at time 0. "
- "Otherwise, we use Poisson process to synthesize "
- "the request arrival times.")
- parser.add_argument("--num-prompts", type=int, default=1000,
- help="Number of prompts to process.")
+ parser = argparse.ArgumentParser(description="Benchmark the online serving throughput.")
+ parser.add_argument("--dataset", type=str, required=True, help="Path to the dataset.")
+ parser.add_argument("--tokenizer", type=str, required=True, help="Name or path of the tokenizer.")
+ parser.add_argument(
+ "--request-rate",
+ type=float,
+ default=float("inf"),
+ help="Number of requests per second. If this is inf, "
+ "then all the requests are sent at time 0. "
+ "Otherwise, we use Poisson process to synthesize "
+ "the request arrival times.",
+ )
+ parser.add_argument("--num-prompts", type=int, default=1000, help="Number of prompts to process.")
parser.add_argument("--seed", type=int, default=0)
args = parser.parse_args()
main(args)
diff --git a/test/model/model_infer.py b/test/benchmark/static_inference/model_infer.py
similarity index 55%
rename from test/model/model_infer.py
rename to test/benchmark/static_inference/model_infer.py
index 3fe91d716..6cd97cfdf 100644
--- a/test/model/model_infer.py
+++ b/test/benchmark/static_inference/model_infer.py
@@ -7,7 +7,7 @@
from lightllm.utils.dist_utils import init_distributed_env, get_current_rank_in_dp
from lightllm.utils.envs_utils import get_env_start_args
from lightllm.models import get_model
-from lightllm.common.basemodel.microbatch_overlap_objs import DecodeMicroBatch, PrefillMicroBatch
+from lightllm.common.basemodel.batch_objs import ModelInput, ModelOutput
from torch.profiler import profile, record_function, ProfilerActivity
from lightllm.utils.log_utils import init_logger
import torch.cuda as cuda
@@ -35,9 +35,9 @@ def test_model_inference(args):
"max_total_token_num": args.max_total_token_num,
"graph_max_len_in_batch": args.max_req_total_len,
"graph_max_batch_size": args.graph_max_batch_size,
- "mem_faction": args.mem_fraction,
- "max_req_num": max(args.batch_size, 2048),
- "batch_max_tokens": args.batch_size * args.input_len,
+ "mem_fraction": args.mem_fraction,
+ "max_req_num": 2048,
+ "batch_max_tokens": 1024,
"run_mode": "normal",
"max_seq_length": args.max_req_total_len,
"disable_cudagraph": args.disable_cudagraph,
@@ -77,7 +77,7 @@ def overlap_prefill(
_0_b_req_idx = b_req_idx[: batch_size // 2]
_0_b_seq_len = b_seq_len[: batch_size // 2]
_o_b_ready_cache_len = b_ready_cache_len[: batch_size // 2]
- micro_batch1 = PrefillMicroBatch(
+ micro_batch1 = ModelInput(
_0_batch_size,
_0_total_token_num,
_0_max_len_in_batch,
@@ -85,6 +85,7 @@ def overlap_prefill(
_0_mem_indexes,
_0_b_req_idx,
_0_b_seq_len,
+ True,
_o_b_ready_cache_len,
{},
)
@@ -98,7 +99,7 @@ def overlap_prefill(
_1_b_seq_len = b_seq_len[batch_size // 2 :]
_1_b_ready_cache_len = b_ready_cache_len[batch_size // 2 :]
- micro_batch2 = PrefillMicroBatch(
+ micro_batch2 = ModelInput(
_1_batch_size,
_1_total_token_num,
_1_max_len_in_batch,
@@ -106,11 +107,14 @@ def overlap_prefill(
_1_mem_indexes,
_1_b_req_idx,
_1_b_seq_len,
+ True,
_1_b_ready_cache_len,
{},
)
- logits, logits1 = model_part.microbatch_overlap_prefill(micro_batch1, micro_batch2)
+ output, output1 = model_part.microbatch_overlap_prefill(micro_batch1, micro_batch2)
+ logits = output.logits
+ logits1 = output1.logits
return torch.cat((logits, logits1), dim=0)
@@ -124,7 +128,7 @@ def overlap_decode(
_0_mem_indexes = mem_indexes[: batch_size // 2]
_0_b_req_idx = b_req_idx[: batch_size // 2]
_0_b_seq_len = b_seq_len[: batch_size // 2]
- micro_batch1 = DecodeMicroBatch(
+ micro_batch1 = ModelInput(
_0_batch_size,
_0_total_token_num,
_0_max_len_in_batch,
@@ -142,7 +146,7 @@ def overlap_decode(
_1_b_req_idx = b_req_idx[batch_size // 2 :]
_1_b_seq_len = b_seq_len[batch_size // 2 :]
- micro_batch2 = DecodeMicroBatch(
+ micro_batch2 = ModelInput(
_1_batch_size,
_1_total_token_num,
_1_max_len_in_batch,
@@ -152,12 +156,40 @@ def overlap_decode(
_1_b_seq_len,
)
- logits, logits1 = model_part.microbatch_overlap_decode(micro_batch1, micro_batch2)
+ output, output1 = model_part.microbatch_overlap_decode(micro_batch1, micro_batch2)
+ logits = output.logits
+ logits1 = output1.logits
return torch.cat((logits, logits1), dim=0)
+def prefill(
+ model_part,
+ batch_size,
+ max_len_in_batch,
+ input_ids,
+ mem_indexes,
+ b_req_idx,
+ b_seq_len,
+ total_token_num,
+ b_ready_cache_len,
+):
+ model_input = ModelInput(
+ batch_size,
+ total_token_num,
+ max_len_in_batch,
+ input_ids,
+ mem_indexes,
+ b_req_idx,
+ b_seq_len,
+ is_prefill=True,
+ b_ready_cache_len=b_ready_cache_len,
+ )
+ model_output = model_part.forward(model_input)
+ return model_output.logits
+
+
def decode(model_part, batch_size, max_len_in_batch, input_ids, mem_indexes, b_req_idx, b_seq_len, total_token_num):
- logits = model_part.forward(
+ model_input = ModelInput(
batch_size,
total_token_num,
max_len_in_batch,
@@ -167,7 +199,8 @@ def decode(model_part, batch_size, max_len_in_batch, input_ids, mem_indexes, b_r
b_seq_len,
is_prefill=False,
)
- return logits
+ model_output = model_part.forward(model_input)
+ return model_output.logits
def torch_profile(fn, log_dir=None):
@@ -183,123 +216,25 @@ def torch_profile(fn, log_dir=None):
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
-def tppart_model_infer(args, model_kvargs, batch_size, input_len, output_len, ans_queue):
- args = get_env_start_args()
- import triton.profiler as proton
- import torch
- from lightllm.distributed import dist_group_manager
- from lightllm.utils.dist_utils import set_current_device_id
-
- import torch.distributed as dist
-
- enable_decode_overlap = args.enable_decode_microbatch_overlap
- group_size = 1
- if enable_decode_overlap or args.enable_prefill_microbatch_overlap:
- assert batch_size % 2 == 0, "batch size must be even number"
- group_size = 2
- init_distributed_env(model_kvargs)
- dist_group_manager.create_groups(group_size=group_size)
- model_cfg, _ = PretrainedConfig.get_config_dict(model_kvargs["weight_dir"])
- dist.barrier()
-
- torch.cuda.empty_cache()
-
- model_part, _ = get_model(model_cfg, model_kvargs)
-
- # warm up
- # test_data = np.vstack([np.arange(5, input_len + 5) for _ in range(batch_size)])
+def run_forward_once(
+    model_kvargs, input_len, output_len, batch_size, model_part, enable_overlap, use_torch_profile=False
+):
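+    # One benchmark pass: random prompts -> one prefill -> output_len decode steps, printing latency and throughput.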
test_data = np.vstack([np.random.randint(0, 50256, input_len) for _ in range(batch_size)])
test_data = test_data.reshape(-1)
test_data = torch.from_numpy(test_data).cuda()
-
- b_req_idx = torch.tensor(
- [model_part.req_manager.alloc() for _ in range(batch_size)], dtype=torch.int32, device="cuda"
- )
- b_seq_len = torch.zeros(batch_size, dtype=torch.int32, device="cuda")
- b_ready_cache_len = torch.zeros(batch_size, dtype=torch.int32, device="cuda")
- for i in range(batch_size):
- b_seq_len[i] = input_len
-
- total_token_num = input_len * batch_size
- mem_indexes = model_part.req_manager.mem_manager.alloc(test_data.shape[0]).cuda()
- if args.enable_prefill_microbatch_overlap:
- logics = overlap_prefill(
- model_part,
- batch_size,
- input_len,
- test_data,
- mem_indexes,
- b_req_idx,
- b_seq_len,
- total_token_num,
- b_ready_cache_len,
- )
- else:
- logics = model_part.forward(
- batch_size,
- total_token_num,
- input_len,
- test_data,
- mem_indexes,
- b_req_idx,
- b_seq_len,
- b_ready_cache_len=b_ready_cache_len,
- is_prefill=True,
- )
- prob_out = torch.softmax(logics, dim=-1)
- predict_ids = torch.argmax(prob_out, dim=1, keepdim=True)
- predict_ids = predict_ids.detach().cpu().numpy()
-
- for i in range(output_len):
- total_token_num += batch_size
- b_seq_len += 1
- mem_indexes = model_part.req_manager.mem_manager.alloc(predict_ids.shape[0]).cuda()
- max_len_in_batch = input_len + i + 1
- if enable_decode_overlap:
- logits = overlap_decode(
- model_part,
- batch_size,
- max_len_in_batch,
- torch.from_numpy(predict_ids).cuda().reshape(-1),
- mem_indexes,
- b_req_idx,
- b_seq_len,
- total_token_num,
- )
- else:
- logits = decode(
- model_part,
- batch_size,
- max_len_in_batch,
- torch.from_numpy(predict_ids).cuda().reshape(-1),
- mem_indexes,
- b_req_idx,
- b_seq_len,
- total_token_num,
- )
-
- prob_out = torch.softmax(logits, dim=-1)
- predict_ids = torch.argmax(prob_out, dim=1, keepdim=True)
- predict_ids = predict_ids.detach().cpu().numpy()
-
- model_part.mem_manager.free_all()
- model_part.req_manager.free_all()
-
- b_req_idx = None
- b_seq_len = None
+ import torch.distributed as dist
dist.barrier()
import time
- torch.cuda.synchronize()
- start_time = time.time()
+ dp_size = model_kvargs["dp_size"]
+ torch.cuda.synchronize()
prefill_start_time = time.time()
b_req_idx = torch.tensor(
[model_part.req_manager.alloc() for _ in range(batch_size)], dtype=torch.int32, device="cuda"
)
b_seq_len = torch.zeros(batch_size, dtype=torch.int32, device="cuda")
+ b_ready_cache_len = torch.zeros(batch_size, dtype=torch.int32, device="cuda")
for i in range(batch_size):
b_seq_len[i] = input_len
@@ -307,86 +242,59 @@ def tppart_model_infer(args, model_kvargs, batch_size, input_len, output_len, an
mem_indexes = model_part.req_manager.mem_manager.alloc(test_data.shape[0]).cuda()
rank_id = model_kvargs["rank_id"]
- if rank_id == 0:
- if args.profile:
- proton.start(name="forward_prefill", context="python")
- if args.enable_prefill_microbatch_overlap:
- logics = overlap_prefill(
- model_part,
- batch_size,
- input_len,
- test_data,
- mem_indexes,
- b_req_idx,
- b_seq_len,
- total_token_num,
- b_ready_cache_len,
- )
+ if enable_overlap:
+ prefill_fn = overlap_prefill
+ decode_fn = overlap_decode
else:
- logics = model_part.forward(
- batch_size,
- total_token_num,
- input_len,
- test_data,
- mem_indexes,
- b_req_idx,
- b_seq_len,
- b_ready_cache_len=b_ready_cache_len,
- is_prefill=True,
- )
- prob_out = torch.softmax(logics, dim=-1)
+ prefill_fn = prefill
+ decode_fn = decode
+
+ logits = prefill_fn(
+ model_part,
+ batch_size,
+ input_len,
+ test_data,
+ mem_indexes,
+ b_req_idx,
+ b_seq_len,
+ total_token_num,
+ b_ready_cache_len, # b_ready_cache_len
+ )
+
+ prob_out = torch.softmax(logits, dim=-1)
predict_ids = torch.argmax(prob_out, dim=1, keepdim=True)
- predict_ids = predict_ids.detach().cpu().numpy()
+ _ = predict_ids.detach().cpu().numpy()
torch.cuda.synchronize()
if rank_id == 0:
- if args.profile:
- proton.finalize()
- print("prefill time cost:", (time.time() - prefill_start_time) * 1000)
+ print(
+ f"prefill time cost: {(time.time() - prefill_start_time) * 1000}, "
+ f"prefill throughput: {dp_size * batch_size * input_len / (time.time() - prefill_start_time)} tokens/s"
+ )
- if args.torch_profile:
+    if use_torch_profile:
print("Profile Prefill")
try:
- if args.enable_prefill_microbatch_overlap:
- torch_profile(
- lambda: overlap_prefill(
- model_part,
- batch_size,
- input_len,
- test_data,
- mem_indexes,
- b_req_idx,
- b_seq_len,
- total_token_num,
- b_ready_cache_len,
- ),
- log_dir=f"./logs/forward_prefill_{model_kvargs['rank_id']}",
- )
- else:
- torch_profile(
- lambda: model_part.forward(
- batch_size,
- total_token_num,
- input_len,
- test_data,
- mem_indexes,
- b_req_idx,
- b_seq_len,
- b_ready_cache_len=b_ready_cache_len,
- is_prefill=True,
- ),
- log_dir=f"./logs/forward_prefill_{model_kvargs['rank_id']}",
- )
+ torch_profile(
+ lambda: prefill_fn(
+ model_part,
+ batch_size,
+ input_len,
+ test_data,
+ mem_indexes,
+ b_req_idx,
+ b_seq_len,
+ total_token_num,
+ b_ready_cache_len, # b_ready_cache_len
+ ),
+ log_dir=f"./logs/forward_prefill_{model_kvargs['rank_id']}",
+ )
except Exception as e:
print(str(e))
raise
- if rank_id == 0:
- if args.profile:
- proton.start(name="forward_decode", context="python")
-
for i in range(output_len):
torch.cuda.synchronize()
step_start = time.time()
@@ -394,49 +302,24 @@ def tppart_model_infer(args, model_kvargs, batch_size, input_len, output_len, an
b_seq_len += 1
mem_indexes = model_part.req_manager.mem_manager.alloc(predict_ids.shape[0]).cuda()
max_len_in_batch = input_len + i + 1
- if enable_decode_overlap:
- logits = overlap_decode(
- model_part,
- batch_size,
- max_len_in_batch,
- torch.from_numpy(predict_ids).cuda().reshape(-1),
- mem_indexes,
- b_req_idx,
- b_seq_len,
- total_token_num,
- )
- if i == 0 and args.torch_profile:
- torch_profile(
- lambda: overlap_decode(
- model_part,
- batch_size,
- max_len_in_batch,
- torch.from_numpy(predict_ids).cuda().reshape(-1),
- mem_indexes,
- b_req_idx,
- b_seq_len,
- total_token_num,
- ),
- log_dir=f"./logs/forward_decode_{model_kvargs['rank_id']}",
- )
- else:
- logits = decode(
- model_part,
- batch_size,
- max_len_in_batch,
- torch.from_numpy(predict_ids).cuda().reshape(-1),
- mem_indexes,
- b_req_idx,
- b_seq_len,
- total_token_num,
- )
- if i == 0 and args.torch_profile:
+ logits = decode_fn(
+ model_part,
+ batch_size,
+ max_len_in_batch,
+ predict_ids.view(-1),
+ mem_indexes,
+ b_req_idx,
+ b_seq_len,
+ total_token_num,
+ )
+            if use_torch_profile:
+ try:
torch_profile(
- lambda: decode(
+ lambda: decode_fn(
model_part,
batch_size,
max_len_in_batch,
- torch.from_numpy(predict_ids).cuda().reshape(-1),
+ predict_ids.view(-1),
mem_indexes,
b_req_idx,
b_seq_len,
@@ -444,26 +327,86 @@ def tppart_model_infer(args, model_kvargs, batch_size, input_len, output_len, an
),
log_dir=f"./logs/forward_decode_{model_kvargs['rank_id']}",
)
+ except Exception as e:
+ print(str(e))
+ raise
prob_out = torch.softmax(logits, dim=-1)
predict_ids = torch.argmax(prob_out, dim=1, keepdim=True)
- predict_ids = predict_ids.detach().cpu().numpy()
+ _ = predict_ids.detach().cpu().numpy()
torch.cuda.synchronize()
if i % 100 == 0 or i == output_len - 1:
if rank_id == 0:
- print(i, "step cost time:", (time.time() - step_start) * 1000)
+ print(
+ f"i: {i}, step cost time: {(time.time() - step_start) * 1000} ms, "
+ f"throughput: {dp_size * batch_size / (time.time() - step_start)} tokens/s"
+ )
+ model_part.mem_manager.free_all()
+ model_part.req_manager.free_all()
torch.cuda.synchronize()
- end_time = time.time()
+ torch.cuda.empty_cache()
+
+
+def tppart_model_infer(args, model_kvargs, batch_size, input_len, output_len, ans_queue):
+ args = get_env_start_args()
+ import triton.profiler as proton
+ import torch
+ from lightllm.distributed import dist_group_manager
+ from lightllm.utils.dist_utils import set_current_device_id
+
+ if isinstance(batch_size, int):
+ batch_size = [batch_size]
+ else:
+ batch_size = [2, 8, 16, 32, 64, 128]
+ print(batch_size)
+
+ import torch.distributed as dist
+
+ enable_decode_overlap = args.enable_decode_microbatch_overlap
+ group_size = 1
+ if enable_decode_overlap or args.enable_prefill_microbatch_overlap:
+        assert all(b % 2 == 0 for b in batch_size), "batch size must be an even number"
+ group_size = 2
+ init_distributed_env(model_kvargs)
+ dist_group_manager.create_groups(group_size=group_size)
+ model_cfg, _ = PretrainedConfig.get_config_dict(model_kvargs["weight_dir"])
+ dist.barrier()
+
+ torch.cuda.empty_cache()
+ enable_overlap = args.enable_decode_microbatch_overlap or args.enable_prefill_microbatch_overlap
+
+ model_part, _ = get_model(model_cfg, model_kvargs)
+
+ rank_id = model_kvargs["rank_id"]
+ for b in batch_size:
+ if rank_id == 0:
+ print(f"Testing batch size {b}")
+
+ # warm up
+ run_forward_once(
+ model_kvargs,
+ input_len,
+ output_len=10,
+ batch_size=b,
+ model_part=model_part,
+ enable_overlap=enable_overlap,
+            use_torch_profile=False,
+ )
+
+ # test
+ run_forward_once(
+ model_kvargs,
+ input_len,
+ output_len,
+ batch_size=b,
+ model_part=model_part,
+ enable_overlap=enable_overlap,
+            use_torch_profile=False,
+ )
+ if rank_id == 0:
+ print("=" * 50)
- if rank_id == 0:
- if args.profile:
- proton.finalize()
- # triton version need >= 3.2.0
- # pip install llnl-hatchet
- # proton-viewer -m time/ms,time/% forward_prefill.hatchet
- # proton-viewer -m time/ms,time/% forward_decode.hatchet
- print("time total cost(ms):", (end_time - start_time) * 1000)
ans_queue.put(True)
return
diff --git a/test/model/model_infer_mtp.py b/test/benchmark/static_inference/model_infer_mtp.py
similarity index 100%
rename from test/model/model_infer_mtp.py
rename to test/benchmark/static_inference/model_infer_mtp.py
diff --git a/test/model/test_model.py b/test/benchmark/static_inference/test_model.py
similarity index 89%
rename from test/model/test_model.py
rename to test/benchmark/static_inference/test_model.py
index bf7d0ac43..5b3751bcc 100644
--- a/test/model/test_model.py
+++ b/test/benchmark/static_inference/test_model.py
@@ -27,8 +27,8 @@ def test_model_infer(self):
import torch
parser = make_argument_parser()
- parser.add_argument("--batch_size", type=int, default=2, help="batch size")
- parser.add_argument("--input_len", type=int, default=4096, help="input sequence length")
+ parser.add_argument("--batch_size", type=int, default=None, help="batch size")
+ parser.add_argument("--input_len", type=int, default=64, help="input sequence length")
parser.add_argument("--output_len", type=int, default=128, help="output sequence length")
parser.add_argument(
"--profile",
diff --git a/test/model/model_infer_vit.py b/test/benchmark/static_inference/test_vit.py
similarity index 65%
rename from test/model/model_infer_vit.py
rename to test/benchmark/static_inference/test_vit.py
index 556795c8e..279542ddd 100644
--- a/test/model/model_infer_vit.py
+++ b/test/benchmark/static_inference/test_vit.py
@@ -6,9 +6,10 @@
from lightllm.models.vit.model import VisionTransformer
from lightllm.utils.dist_utils import init_vision_distributed_env
+import argparse
-def test_model_inference(world_size, weight_dir, quant_type=None):
+def test_model_inference(world_size, weight_dir, quant_type=None, batch_size=1, image_size=448):
workers = []
for rank_id in range(world_size):
kvargs = {
@@ -23,7 +24,7 @@ def test_model_inference(world_size, weight_dir, quant_type=None):
"quant_cfg": None,
}
- proc = multiprocessing.Process(target=tppart_model_infer, args=(kvargs,))
+ proc = multiprocessing.Process(target=tppart_model_infer, args=(kvargs, batch_size, image_size))
proc.start()
workers.append(proc)
@@ -32,7 +33,7 @@ def test_model_inference(world_size, weight_dir, quant_type=None):
return
-def tppart_model_infer(model_kvargs):
+def tppart_model_infer(model_kvargs, batch_size, image_size):
import torch
import torch.distributed as dist
@@ -41,7 +42,7 @@ def tppart_model_infer(model_kvargs):
torch.cuda.empty_cache()
model_part = VisionTransformer(model_kvargs)
- test_data = torch.randn((13, 3, 448, 448)).cuda().to(torch.bfloat16)
+ test_data = torch.randn((batch_size, 3, image_size, image_size)).cuda().to(torch.bfloat16)
# warm up
torch.cuda.synchronize()
for i in range(10):
@@ -56,6 +57,7 @@ def tppart_model_infer(model_kvargs):
end_time = time.time()
if rank_id == 0:
print("time total cost(ms):", (end_time - start_time) / 50 * 1000)
+        print("images per second:", batch_size * 50 / (end_time - start_time))
return
@@ -63,7 +65,13 @@ def tppart_model_infer(model_kvargs):
if __name__ == "__main__":
import torch
- world_size = 2
- weight_dir = "/nvme/models/InternVL2/InternVL2-8B/"
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model_dir", type=str, default="./InternVL2/InternVL2-8B/")
+ parser.add_argument("--world_size", type=int, default=2)
+ parser.add_argument("--quant_type", type=str, default="none")
+ parser.add_argument("--batch_size", type=int, default=1)
+ parser.add_argument("--image_size", type=int, default=448)
+ args = parser.parse_args()
+
torch.multiprocessing.set_start_method("spawn")
- test_model_inference(world_size, weight_dir, "none")
+ test_model_inference(args.world_size, args.model_dir, args.quant_type, args.batch_size, args.image_size)
diff --git a/test/compare_with_previous_commit.py b/test/compare_with_previous_commit.py
new file mode 100644
index 000000000..3004e6146
--- /dev/null
+++ b/test/compare_with_previous_commit.py
@@ -0,0 +1,198 @@
+"""
+This script starts the inference server, sends a set of prompts,
+collects the outputs, and supports comparing the results between
+the current commit and a specified historical commit for accuracy testing.
+
+The command is:
+python compare_with_previous_commit.py --tp 2 --model_dir /xx/xx --compare_commit_id xxxx
+
+"""
+import difflib
+import argparse
+import subprocess
+import time
+import os
+import requests
+import sys
+import json
+import shutil
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--tp", type=int, required=True, help="Number of GPUs to use.")
+ parser.add_argument("--model_dir", type=str, required=True, help="Directory of the model.")
+ parser.add_argument("--compare_commit_id", type=str, default=None, help="The commit id of the baseline.")
+ return parser.parse_args()
+
+
+def start_server(tp, model_dir):
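+    # Launch the LightLLM API server as a subprocess on port 8080 with a fixed set of serving options.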
+ cmd = [
+ "python",
+ "-m",
+ "lightllm.server.api_server",
+ "--tp",
+ str(tp),
+ "--model_dir",
+ model_dir,
+ "--data_type",
+ "fp16",
+ "--mode",
+ "triton_gqa_flashdecoding",
+ "--trust_remote_code",
+ "--tokenizer_mode",
+ "fast",
+ "--host",
+ "0.0.0.0",
+ "--port",
+ "8080",
+ ]
+ process = subprocess.Popen(cmd, stdout=sys.stdout, stderr=sys.stderr)
+ return process
+
+
+def check_health():
+ health_url = "http://localhost:8080/health"
+ try:
+ r = requests.get(health_url, timeout=2)
+ return r.status_code == 200
+ except Exception:
+ return False
+
+
+def send_prompts(prompts, output_file):
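+    # Block until the server reports healthy, then send each prompt and append the generated text to output_file.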
+ for prompt in prompts:
+ while not check_health():
+ time.sleep(1)
+
+ request_data = {
+ "inputs": prompt,
+ "parameters": {"max_new_tokens": 1024, "frequency_penalty": 1, "do_sample": False},
+ "multimodal_params": {},
+ }
+
+ try:
+ r = requests.post("http://localhost:8080/generate", json=request_data, timeout=10)
+ response_json = json.loads(r.text)
+ generated_text = (
+ response_json["generated_text"][0] if "generated_text" in response_json else "No generated_text."
+ )
+ except Exception as e:
+ generated_text = f"ERROR: {str(e)}"
+
+ with open(output_file, "a", encoding="utf-8") as f:
+ f.write(f"===== prompt: {prompt} =====\n")
+ f.write(f"{generated_text}\n\n")
+
+    print(f"===================Output saved to {output_file}===========================")
+
+
+def compare_files(file1, file2, diff_output_file="diff.txt"):
+ with open(file1, "r", encoding="utf-8") as f1, open(file2, "r", encoding="utf-8") as f2:
+ lines1 = f1.readlines()
+ lines2 = f2.readlines()
+
+ diff = difflib.unified_diff(lines1, lines2, fromfile=file1, tofile=file2, lineterm="")
+
+ diff_list = list(diff)
+ if not diff_list:
+        print("The two files are identical.")
+        return
+
+    # Print the diff to the terminal
+    for line in diff_list:
+        if line.startswith("+") and not line.startswith("+++"):
+            print("\033[32m" + line + "\033[0m", end="")  # green
+        elif line.startswith("-") and not line.startswith("---"):
+            print("\033[31m" + line + "\033[0m", end="")  # red
+        else:
+            print(line, end="")
+
+    # Save the diff to a file
+    with open(diff_output_file, "w", encoding="utf-8") as f:
+        for line in diff_list:
+            f.write(line + "\n")
+    print(f"\nDiff saved to {diff_output_file}")
+
+
+def run_and_save(tp, model_dir, output_file, prompts):
+ """
+ Start the server, send prompts, and save the results to output_file.
+ """
+ # Remove the old result file if it exists
+ if os.path.exists(output_file):
+ os.remove(output_file)
+
+ process = None
+ try:
+ # Start the inference server
+ process = start_server(tp, model_dir)
+ # Send prompts and save results
+ send_prompts(prompts, output_file)
+ finally:
+ # Shutdown the server
+ if process is not None:
+ process.terminate()
+ process.wait()
+
+
+def main():
+ # Parse arguments
+ args = parse_args()
+ tp = args.tp
+ model_dir = args.model_dir
+ compare_commit_id = args.compare_commit_id
+
+ # Prompts to test
+ prompts = [
+ "What is the machine learning?",
+ "1+1等于几",
+ "What role does attention play in transformer architectures?",
+ "西红柿炒鸡蛋怎么做?",
+ "Describe the concept of overfitting and underfitting.",
+ "CPU和GPU的区别是什么?",
+ "What is the role of a loss function in machine learning?",
+ ]
+
+ # Run and save results for the current commit
+ current_output_file = "test_results_current.txt"
+ run_and_save(tp, model_dir, current_output_file, prompts)
+
+ # If compare_commit_id is provided, run and save results for the baseline commit
+ if compare_commit_id:
+ # Get the absolute path of the current script
+ script_path = os.path.abspath(__file__)
+ script_name = os.path.basename(script_path)
+ tmp_script = f"/tmp/{script_name}"
+ # Copy the current script to /tmp to ensure it exists in the baseline commit
+ shutil.copy(script_path, tmp_script)
+ # Save current commit id
+ current_commit = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode().strip()
+ # Save current branch name (if any)
+ current_branch = subprocess.check_output(["git", "rev-parse", "--abbrev-ref", "HEAD"]).decode().strip()
+ # Stash any local changes
+ subprocess.run(["git", "stash"])
+ # Checkout the baseline commit
+ subprocess.run(["git", "checkout", compare_commit_id])
+ # Copy the script back to the original location in case it does not exist in the baseline commit
+ shutil.copy(tmp_script, script_path)
+ try:
+ compare_output_file = "test_results_compare.txt"
+ run_and_save(tp, model_dir, compare_output_file, prompts)
+ finally:
+ # Checkout back to the original branch or commit
+ if current_branch != "HEAD":
+ subprocess.run(["git", "checkout", current_branch])
+ else:
+ subprocess.run(["git", "checkout", current_commit])
+ # Pop the stashed changes
+ subprocess.run(["git", "stash", "pop"])
+ # Remove the temporary script file
+ if os.path.exists(tmp_script):
+ os.remove(tmp_script)
+ # Compare the results
+ compare_files(current_output_file, compare_output_file)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/test/deepseek.sh b/test/deepseek.sh
deleted file mode 100644
index 78e40a116..000000000
--- a/test/deepseek.sh
+++ /dev/null
@@ -1,87 +0,0 @@
-# 单机 deepseek V3 ep 运行模式启动示例, 启动参数中的tp含义发生了变化,代表使用的所有卡数量,并不是tp推理。
-# max_total_token_num 可以按照实际场景调节。
-MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server --port 8088 --model_dir /dev/shm/DeepSeek-R1 \
---tp 8 \
---dp 8 \
---max_total_token_num 200000 \
---graph_max_batch_size 64 \
---batch_max_tokens 8192 \
---enable_flashinfer_prefill \
---enable_flashinfer_decode \
---enable_prefill_microbatch_overlap \
---disable_aggressive_schedule
-
-# H800 双机 deepseek V3 ep 运行模式启动实列
-# 启动命令中的 nccl_host 和 nccl_port 两个节点的必须一致,一般nccl_host设置为 node 0的ip。
-# max_total_token_num 最佳设置需要按照使用场景和显存情况配置。
-# 启动后两个节点的8088端口都可以接收访问的请求
-# node 0
-MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server --port 8088 --model_dir /dev/shm/DeepSeek-R1 \
---tp 16 \
---dp 16 \
---max_total_token_num 200000 \
---graph_max_batch_size 64 \
---batch_max_tokens 8192 \
---enable_flashinfer_prefill \
---enable_flashinfer_decode \
---enable_prefill_microbatch_overlap \
---nnodes 2 \
---node_rank 0 \
---nccl_host \
---nccl_port 2732
-# node 1
-MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server --port 8088 --model_dir /dev/shm/DeepSeek-R1 \
---tp 16 \
---dp 16 \
---max_total_token_num 200000 \
---graph_max_batch_size 64 \
---batch_max_tokens 8192 \
---enable_flashinfer_prefill \
---enable_flashinfer_decode \
---enable_prefill_microbatch_overlap \
---nnodes 2 \
---node_rank 1 \
---nccl_host \
---nccl_port 2732
-
-# pd 分离启动示列, 单机 做 P 和 D, 也支持多机组成的D和单机的P混合。
-# 目前 P D 分离的 PD master可能存在并发处理问题,还需提升。
-
-# pd master 启动
-python -m lightllm.server.api_server --model_dir /dev/shm/DeepSeek-R1 --run_mode "pd_master" --host `hostname -i` --port 60011
-
-# p 启动
-nvidia-cuda-mps-control -d
-MOE_MODE=EP KV_TRANS_USE_P2P=1 LOADWORKER=18 python -m lightllm.server.api_server --model_dir /dev/shm/DeepSeek-R1 \
---run_mode "prefill" \
---tp 8 \
---dp 8 \
---host `hostname -i` \
---port 8019 \
---nccl_port 2732 \
---max_total_token_num 200000 \
---batch_max_tokens 8192 \
---enable_flashinfer_prefill \
---enable_flashinfer_decode \
---enable_prefill_microbatch_overlap \
---disable_cudagraph \
---pd_master_ip \
---pd_master_port 60011
-
-# d 启动
-nvidia-cuda-mps-control -d
-MOE_MODE=EP KV_TRANS_USE_P2P=1 LOADWORKER=18 python -m lightllm.server.api_server --model_dir /dev/shm/DeepSeek-R1 \
---run_mode "decode" \
---tp 8 \
---dp 8 \
---host `hostname -i` \
---port 8121 \
---nccl_port 12322 \
---max_total_token_num 200000 \
---graph_max_batch_size 64 \
---enable_flashinfer_prefill \
---enable_flashinfer_decode \
---enable_prefill_microbatch_overlap \
---pd_master_ip \
---pd_master_port 60011
-
diff --git a/test/kernel/alignment/llama_gqa_decode_vsm.py b/test/kernel/alignment/llama_gqa_decode_vsm.py
deleted file mode 100644
index f124a28eb..000000000
--- a/test/kernel/alignment/llama_gqa_decode_vsm.py
+++ /dev/null
@@ -1,104 +0,0 @@
-import unittest
-import random
-import torch
-from tqdm import tqdm
-from lightllm.common.basemodel.infer_struct import InferStateInfo
-from lightllm.common.req_manager import ReqManager
-from lightllm.models.llama.triton_kernel.gqa_flash_decoding_vsm import (
- gqa_token_decode_attention_flash_decoding_vsm,
-)
-from lightllm.models.llama.triton_kernel.gqa_flash_decoding import (
- gqa_token_decode_attention_flash_decoding,
-)
-
-
-class TestVSMGQADecoding(unittest.TestCase):
- def test_vsm_gqa_decoding_align(self):
- random.seed(0)
- torch.manual_seed(0)
- torch.cuda.manual_seed(0)
- torch.cuda.manual_seed_all(0)
- torch.backends.cudnn.deterministic = True
- torch.backends.cudnn.benchmark = False
-
- bs_list = [1, 8, 16, 32, 64, 128, 256]
- group_size_list = [16, 32, 64]
- seq_len_list = [128, 512, 1024, 2048, 4096, 8192]
- q_head_dim_list = [64, 128]
- q_head_num_list = [8, 16, 32]
-
- def get_test_configs():
- for bs in bs_list:
- for group_size in group_size_list:
- for seq_len_m in seq_len_list:
- for q_head_dim in q_head_dim_list:
- for q_head_num in q_head_num_list:
- if q_head_num < group_size:
- continue
- yield bs, group_size, seq_len_m, q_head_dim, q_head_num
-
- for bs, group_size, seq_len_m, q_head_dim, q_head_num in tqdm(list(get_test_configs())):
- kv_head_num = q_head_num // group_size
- q_head_dim = q_head_dim
- kv_head_dim = q_head_dim
- seq_len = (torch.zeros(bs, dtype=torch.int32) + seq_len_m).to(torch.int32)
- total_token_in_the_batch = seq_len.sum().item()
- rounded_total_token_in_the_batch = (total_token_in_the_batch + 128 - 1) // 128 * 128
-
- q_shape = [bs, q_head_num, q_head_dim]
- kv_shape = [
- rounded_total_token_in_the_batch,
- kv_head_num,
- kv_head_dim,
- ]
- qkv_dtype = torch.float16
-
- q, k, v = (
- torch.randn(q_shape, dtype=qkv_dtype, device="cuda"),
- torch.randn(kv_shape, dtype=qkv_dtype, device="cuda"),
- torch.randn(kv_shape, dtype=qkv_dtype, device="cuda"),
- )
- q, k, v = q / 10, k / 10, v / 10
-
- req_to_token_index = torch.zeros((bs, seq_len_m)) - 1
- token_index = torch.arange(rounded_total_token_in_the_batch)
-
- total_count = 0
- for i in range(bs):
- req_to_token_index[i, : seq_len[i]] = token_index[total_count : total_count + seq_len[i]]
- total_count += seq_len[i]
-
- req_to_token_index = req_to_token_index.long().cuda()
-
- b_req_idx = torch.arange(bs, device="cuda")
- infer_state = InferStateInfo()
- infer_state.req_manager = ReqManager(bs, 2048, None)
- infer_state.req_manager.req_to_token_indexs = req_to_token_index
- infer_state.b_req_idx = b_req_idx.cuda()
- infer_state.b_seq_len = seq_len.cuda()
- infer_state.max_len_in_batch = seq_len_m
- infer_state.batch_size = bs
- infer_state.q_head_num = q_head_num
- infer_state.q_head_dim = q_head_dim
- infer_state.kv_head_num = kv_head_num
- infer_state.softmax_scale = 1 / (q_head_dim ** 0.5)
- infer_state.total_token_num = torch.tensor([total_token_in_the_batch], dtype=torch.int32).cuda()
- new_out = gqa_token_decode_attention_flash_decoding_vsm(q, k, v, infer_state)
- old_out = gqa_token_decode_attention_flash_decoding(
- q,
- infer_state,
- infer_state.q_head_num,
- infer_state.q_head_dim,
- k,
- v,
- )
- cos_sim = torch.nn.functional.cosine_similarity(new_out, old_out, dim=-1).mean().cpu().item()
- self.assertGreaterEqual(
- cos_sim,
- 0.9,
- f"bs={bs},group_size={group_size},seq_len={seq_len_m},q_head_dim={q_head_dim},q_head_num={q_head_num}",
- )
-
-
-if __name__ == "__main__":
- unittest.main()
diff --git a/test/kernel/tuning/deepseekv2_gqa_decode_tuning.py b/test/kernel/deepseekv2_gqa_decode_tuning.py
similarity index 100%
rename from test/kernel/tuning/deepseekv2_gqa_decode_tuning.py
rename to test/kernel/deepseekv2_gqa_decode_tuning.py
diff --git a/test/kernel/fuse_moe_tuning_fp8.py b/test/kernel/fuse_moe_tuning.py
similarity index 80%
rename from test/kernel/fuse_moe_tuning_fp8.py
rename to test/kernel/fuse_moe_tuning.py
index a30de8d03..6e971573a 100644
--- a/test/kernel/fuse_moe_tuning_fp8.py
+++ b/test/kernel/fuse_moe_tuning.py
@@ -1,10 +1,12 @@
import os
+import argparse
import torch
import time
import torch.multiprocessing as mp
from lightllm.common.fused_moe.grouped_fused_moe import fused_experts_impl, moe_align, moe_align1, grouped_matmul
from typing import List
from lightllm.utils.log_utils import init_logger
+from transformers import AutoConfig
logger = init_logger(__name__)
@@ -58,14 +60,37 @@ def test_kernel(
test_count: int,
use_fp8_w8a8: bool,
is_up: bool,
+ block_shape,
**config,
):
set_seed()
input_tuples = []
a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
- w1 = torch.randn((expert_num, 2 * n, k), device="cuda", dtype=dtype) / 10
- w2 = torch.randn((expert_num, k, n), device="cuda", dtype=dtype) / 10
+ w1_scale = w2_scale = None
+
+ if use_fp8_w8a8:
+ init_dtype = dtype
+ w1 = torch.randn(expert_num, 2 * n, k, dtype=init_dtype).cuda()
+ w2 = torch.randn(expert_num, k, 2 * n // 2, dtype=init_dtype).cuda()
+ w1 = w1.to(torch.float8_e4m3fn)
+ w2 = w2.to(torch.float8_e4m3fn)
+
+ if block_shape is None:
+ w1_scale = torch.randn(expert_num, dtype=torch.float32).cuda()
+ w2_scale = torch.randn(expert_num, dtype=torch.float32).cuda()
+ else:
+ block_n, block_k = block_shape[0], block_shape[1]
+ n_tiles_w1 = (2 * n + block_n - 1) // block_n
+ n_tiles_w2 = (k + block_n - 1) // block_n
+ k_tiles_w1 = (k + block_k - 1) // block_k
+ k_tiles_w2 = (2 * n // 2 + block_k - 1) // block_k
+ w1_scale = torch.rand((expert_num, n_tiles_w1, k_tiles_w1), dtype=torch.float32).cuda()
+ w2_scale = torch.rand((expert_num, n_tiles_w2, k_tiles_w2), dtype=torch.float32).cuda()
+ else:
+ w1 = torch.randn(expert_num, 2 * n, k, dtype=dtype).cuda()
+ w2 = torch.randn(expert_num, k, 2 * n // 2, dtype=dtype).cuda()
+
rnd_logics = torch.randn(m, expert_num, device="cuda")
topk_values, topk_ids = torch.topk(rnd_logics, topk, dim=1)
topk_weights = torch.randn((m, topk), device="cuda", dtype=dtype) / 10
@@ -75,12 +100,6 @@ def test_kernel(
moe_align(topk_ids=topk_ids, out=expert_to_tokens)
expert_to_token_num = torch.empty((expert_num,), dtype=torch.int32, device="cuda")
moe_align1(expert_to_tokens, topk_weights, expert_to_weights, expert_to_token_num, topk=topk)
- if use_fp8_w8a8:
- w1, w1_scale = quantize_moe(w1)
- w2, w2_scale = quantize_moe(w2)
- else:
- w1_scale = torch.empty((0,))
- w2_scale = torch.empty((0,))
out1 = torch.zeros((m * topk, 2 * n), dtype=torch.bfloat16, device="cuda")
down_in = torch.zeros((m * topk, n), dtype=torch.bfloat16, device="cuda")
@@ -142,6 +161,7 @@ def test_kernel(
a, w1, w2, w1_scale, w2_scale, topk_ids, topk_weights, out1, out2, down_in = input_tuples[index]
if is_up:
grouped_matmul(
+ topk_ids.numel(),
a,
None,
expert_to_token_num,
@@ -158,6 +178,7 @@ def test_kernel(
)
else:
grouped_matmul(
+ topk_ids.numel(),
down_in,
None,
expert_to_token_num,
@@ -197,6 +218,7 @@ def worker(
test_count: int,
use_fp8_w8a8: bool,
is_up: bool,
+ block_shape,
test_configs,
queue,
):
@@ -212,6 +234,7 @@ def worker(
test_count=test_count,
use_fp8_w8a8=use_fp8_w8a8,
is_up=is_up,
+ block_shape=block_shape,
**test_configs[index],
)
queue.put(cost_time) # Put result in queue
@@ -278,6 +301,7 @@ def tuning_configs(
test_count: int,
use_fp8_w8a8: bool,
is_up: bool,
+ block_shape,
):
os.environ["CUDA_VISIBLE_DEVICES"] = str(device_id)
best_config, best_cost_time = None, 10000000
@@ -300,6 +324,7 @@ def tuning_configs(
test_count,
use_fp8_w8a8,
is_up,
+ block_shape,
test_configs,
queue,
),
@@ -333,6 +358,7 @@ def tuning_configs(
test_count,
use_fp8_w8a8,
is_up,
+ block_shape,
test_configs,
queue,
),
@@ -358,16 +384,30 @@ def tuning_configs(
return best_config, best_cost_time
-if __name__ == "__main__":
+def main(args):
torch.multiprocessing.set_start_method("spawn")
from lightllm.utils.tuning_utils import mp_tuning
from lightllm.common.fused_moe.moe_kernel_configs import MoeGroupedGemmKernelConfig
- # tuning to get deepseekv2 large configs and store in H800, tp 8
- expert_num = 160
- n = 192 # up is n * 2
- hidden_dim = 5120
- topk_num = 6
+ config = AutoConfig.from_pretrained(args.model_dir, trust_remote_code=True)
+ if config.architectures[0] == "Qwen3MoeForCausalLM":
+ expert_num = config.num_experts
+ topk_num = config.num_experts_per_tok
+ n = 2 * config.moe_intermediate_size // args.tp
+ elif config.architectures[0] in ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]:
+ expert_num = config.n_routed_experts
+ topk_num = config.num_experts_per_tok
+ n = 2 * config.moe_intermediate_size // args.tp
+    else:
+        raise ValueError(f"unsupported architecture for MoE tuning: {config.architectures[0]}")
+
+ hidden_dim = getattr(config, "hidden_size", None) or config.text_config.hidden_size
+ use_fp8_w8a8 = args.use_fp8_w8a8
+ block_shape = None
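+    # block-wise fp8 checkpoints carry weight_block_size = [block_n, block_k] in quantization_config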
+ if hasattr(config, "quantization_config") and "weight_block_size" in config.quantization_config:
+ block_shape = config.quantization_config["weight_block_size"]
+ assert len(block_shape) == 2
+ use_fp8_w8a8 = True
up_dict = {}
for m in [1, 8, 64, 128, 256, 512, 1024, 4096, 8192]:
@@ -381,8 +421,9 @@ def tuning_configs(
"topk": topk_num,
"dtype": torch.bfloat16,
"test_count": 20,
- "use_fp8_w8a8": True,
+ "use_fp8_w8a8": use_fp8_w8a8,
"is_up": True,
+ "block_shape": block_shape,
},
)
up_dict[m] = ans
@@ -392,7 +433,7 @@ def tuning_configs(
topk_num=topk_num,
expert_num=expert_num,
mul_routed_weight=False,
- use_fp8_w8a8=True,
+ use_fp8_w8a8=use_fp8_w8a8,
out_dtype=str(torch.bfloat16),
config_json=up_dict,
)
@@ -409,8 +450,9 @@ def tuning_configs(
"topk": topk_num,
"dtype": torch.bfloat16,
"test_count": 20,
- "use_fp8_w8a8": True,
+ "use_fp8_w8a8": use_fp8_w8a8,
"is_up": False,
+ "block_shape": block_shape,
},
)
down_dict[m] = ans
@@ -421,7 +463,16 @@ def tuning_configs(
topk_num=1,
expert_num=expert_num,
mul_routed_weight=True,
- use_fp8_w8a8=True,
+ use_fp8_w8a8=use_fp8_w8a8,
out_dtype=str(torch.bfloat16),
config_json=down_dict,
)
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model_dir", type=str, default="deepseek-ai/DeepSeek-R1")
+ parser.add_argument("--tp", type=int, default=8)
+ parser.add_argument("--use_fp8_w8a8", action="store_true")
+ args = parser.parse_args()
+ main(args)
diff --git a/test/kernel/fuse_moe_tuning_bf16.py b/test/kernel/fuse_moe_tuning_bf16.py
deleted file mode 100644
index 712f2ab29..000000000
--- a/test/kernel/fuse_moe_tuning_bf16.py
+++ /dev/null
@@ -1,423 +0,0 @@
-import os
-import torch
-import time
-import torch.multiprocessing as mp
-from lightllm.common.fused_moe.grouped_fused_moe import fused_experts_impl, moe_align, moe_align1, grouped_matmul
-from typing import List
-from lightllm.utils.log_utils import init_logger
-
-logger = init_logger(__name__)
-
-
-def set_seed():
- import torch
- import random
- import numpy as np
-
- seed = 42
- torch.manual_seed(seed)
- random.seed(seed)
- np.random.seed(seed)
- if torch.cuda.is_available():
- torch.cuda.manual_seed(seed)
- torch.cuda.manual_seed_all(seed)
- return
-
-
-def quantize_moe(weight):
-
- from lightllm.utils.vllm_utils import vllm_ops
-
- assert (
- vllm_ops is not None
- ), "vllm is not installed, you can't use the api of it. \
- You can solve it by running `pip install vllm`."
-
- num_experts = weight.shape[0]
- qweights = []
- weight_scales = []
- qweights = torch.empty_like(weight, dtype=torch.float8_e4m3fn).cuda()
- for i in range(num_experts):
- qweight, weight_scale = vllm_ops.scaled_fp8_quant(
- weight[i].contiguous().cuda(), scale=None, use_per_token_if_dynamic=False
- )
- qweights[i] = qweight
- weight_scales.append(weight_scale)
- weight_scale = torch.cat(weight_scales, dim=0).reshape(-1)
- return qweights, weight_scale
-
-
-@torch.no_grad()
-def test_kernel(
- expert_num: int,
- m: int,
- n: int,
- k: int,
- topk: int,
- dtype: torch.dtype,
- test_count: int,
- use_fp8_w8a8: bool,
- is_up: bool,
- **config,
-):
- set_seed()
- input_tuples = []
-
- a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
- w1 = torch.randn((expert_num, 2 * n, k), device="cuda", dtype=dtype) / 10
- w2 = torch.randn((expert_num, k, n), device="cuda", dtype=dtype) / 10
- rnd_logics = torch.randn(m, expert_num, device="cuda")
- topk_values, topk_ids = torch.topk(rnd_logics, topk, dim=1)
- topk_weights = torch.randn((m, topk), device="cuda", dtype=dtype) / 10
-
- expert_to_tokens = torch.empty((expert_num, topk * m), dtype=torch.int32, device="cuda")
- expert_to_weights = torch.empty((expert_num, topk * m), dtype=torch.float32, device="cuda")
- moe_align(topk_ids=topk_ids, out=expert_to_tokens)
- expert_to_token_num = torch.empty((expert_num,), dtype=torch.int32, device="cuda")
- moe_align1(expert_to_tokens, topk_weights, expert_to_weights, expert_to_token_num, topk=topk)
- if use_fp8_w8a8:
- w1, w1_scale = quantize_moe(w1)
- w2, w2_scale = quantize_moe(w2)
- else:
- w1_scale = torch.empty((0,))
- w2_scale = torch.empty((0,))
-
- out1 = torch.zeros((m * topk, 2 * n), dtype=torch.bfloat16, device="cuda")
- down_in = torch.zeros((m * topk, n), dtype=torch.bfloat16, device="cuda")
- out2 = torch.zeros((m * topk, k), dtype=torch.bfloat16, device="cuda")
-
- for _ in range(test_count):
- input_tuples.append(
- (
- a.clone(),
- w1.clone(),
- w2.clone(),
- w1_scale.clone(),
- w2_scale.clone(),
- topk_ids.clone(),
- topk_weights.clone(),
- out1.clone(),
- out2.clone(),
- down_in.clone(),
- )
- )
-
- if is_up:
- grouped_matmul(
- topk_ids.numel(),
- a,
- None,
- expert_to_token_num,
- expert_to_tokens,
- expert_to_weights=expert_to_weights,
- expert_weights=w1,
- expert_to_weights_scale=w1_scale,
- topk_num=topk,
- out=out1,
- mul_routed_weight=False,
- use_fp8_w8a8=use_fp8_w8a8,
- **config,
- )
- else:
- grouped_matmul(
- topk_ids.numel(),
- down_in,
- None,
- expert_to_token_num,
- expert_to_tokens,
- expert_to_weights=expert_to_weights,
- expert_weights=w2,
- expert_to_weights_scale=w2_scale,
- topk_num=1,
- out=out2,
- mul_routed_weight=True,
- use_fp8_w8a8=use_fp8_w8a8,
- **config,
- )
-
- graph = torch.cuda.CUDAGraph()
-
- with torch.cuda.graph(graph):
- for index in range(test_count):
- a, w1, w2, w1_scale, w2_scale, topk_ids, topk_weights, out1, out2, down_in = input_tuples[index]
- if is_up:
- grouped_matmul(
- a,
- None,
- expert_to_token_num,
- expert_to_tokens,
- expert_to_weights=expert_to_weights,
- expert_weights=w1,
- expert_to_weights_scale=w1_scale,
- topk_num=topk,
- out=out1,
- expert_token_limit=2 ** 31 - 1,
- mul_routed_weight=False,
- use_fp8_w8a8=use_fp8_w8a8,
- **config,
- )
- else:
- grouped_matmul(
- down_in,
- None,
- expert_to_token_num,
- expert_to_tokens,
- expert_to_weights=expert_to_weights,
- expert_weights=w2,
- expert_to_weights_scale=w2_scale,
- topk_num=1,
- out=out2,
- expert_token_limit=2 ** 31 - 1,
- mul_routed_weight=True,
- use_fp8_w8a8=use_fp8_w8a8,
- **config,
- )
-
- graph.replay()
-
- torch.cuda.synchronize()
- start = time.time()
- graph.replay()
- torch.cuda.synchronize()
-
- cost_time = (time.time() - start) * 1000
-
- logger.info(str(config))
- logger.info(f"bf16 {m} cost time: {cost_time} ms")
- return cost_time
-
-
-def worker(
- expert_num: int,
- m: int,
- n: int,
- k: int,
- topk: int,
- dtype: torch.dtype,
- test_count: int,
- use_fp8_w8a8: bool,
- is_up: bool,
- test_configs,
- queue,
-):
- try:
- for index in range(len(test_configs)):
- cost_time = test_kernel(
- expert_num=expert_num,
- m=m,
- n=n,
- k=k,
- topk=topk,
- dtype=dtype,
- test_count=test_count,
- use_fp8_w8a8=use_fp8_w8a8,
- is_up=is_up,
- **test_configs[index],
- )
- queue.put(cost_time) # Put result in queue
-
- except Exception as ex:
- logger.error(str(ex))
- logger.exception(str(ex))
- import sys
-
- sys.exit(-1)
- pass
-
-
-def get_test_configs(split_id, split_count):
- index = 0
- for num_stages in range(1, 6):
- for GROUP_SIZE_M in [
- 1,
- 2,
- 4,
- 8,
- ]:
- for num_warps in [
- 2,
- 4,
- 8,
- 16,
- ]:
- for BLOCK_SIZE_M in [
- 16,
- 32,
- 64,
- 128,
- ]:
- for BLOCK_SIZE_N in [16, 32, 64, 128]:
- for BLOCK_SIZE_K in [16, 32, 64, 128]:
- t_config = {
- "BLOCK_SIZE_M": BLOCK_SIZE_M,
- "BLOCK_SIZE_N": BLOCK_SIZE_N,
- "BLOCK_SIZE_K": BLOCK_SIZE_K,
- "GROUP_SIZE_M": GROUP_SIZE_M,
- "num_warps": num_warps,
- "num_stages": num_stages,
- }
- if index % split_count == split_id:
- yield t_config
- index += 1
- else:
- index += 1
-
-
-def tuning_configs(
- device_id: int, # use for mult mp tunning
- device_count: int,
- expert_num: int,
- m: int,
- n: int,
- k: int,
- topk: int,
- dtype: torch.dtype,
- test_count: int,
- use_fp8_w8a8: bool,
- is_up: bool,
-):
- os.environ["CUDA_VISIBLE_DEVICES"] = str(device_id)
- best_config, best_cost_time = None, 10000000
- queue = mp.Queue()
- test_configs = []
- for t_config in get_test_configs(device_id, device_count):
- test_configs.append(t_config)
- if len(test_configs) < 256:
- continue
-
- p = mp.Process(
- target=worker,
- args=(
- expert_num,
- m,
- n,
- k,
- topk,
- dtype,
- test_count,
- use_fp8_w8a8,
- is_up,
- test_configs,
- queue,
- ),
- )
- p.start()
- p.join()
- while len(test_configs) != 0:
- try:
- cost_time = queue.get_nowait()
- logger.info(f"get {test_configs[0]} cost_time: {cost_time}")
- if cost_time < best_cost_time:
- best_config = test_configs[0]
- best_cost_time = cost_time
- logger.info(f"cur best : {best_config} {best_cost_time}")
- del test_configs[0:1]
- except:
- del test_configs[0:16]
- logger.info(f"cur best : {best_config} {best_cost_time}")
- break
-
- while len(test_configs) != 0:
- p = mp.Process(
- target=worker,
- args=(
- expert_num,
- m,
- n,
- k,
- topk,
- dtype,
- test_count,
- use_fp8_w8a8,
- is_up,
- test_configs,
- queue,
- ),
- )
- p.start()
- p.join()
-
- while len(test_configs) != 0:
- try:
- cost_time = queue.get_nowait()
- logger.info(f"get {test_configs[0]} cost_time: {cost_time}")
- if cost_time < best_cost_time:
- best_config = test_configs[0]
- best_cost_time = cost_time
- logger.info(f"cur best : {best_config} {best_cost_time}")
- del test_configs[0:1]
- except:
- del test_configs[0:16]
- logger.info(f"cur best : {best_config} {best_cost_time}")
- break
-
- logger.info(f"{best_config} best cost: {best_cost_time}")
- return best_config, best_cost_time
-
-
-if __name__ == "__main__":
- torch.multiprocessing.set_start_method("spawn")
- from lightllm.utils.tuning_utils import mp_tuning
- from lightllm.common.fused_moe.moe_kernel_configs import MoeGroupedGemmKernelConfig
-
- # tuning to get deepseekv2 lite configs and store tp 1
- expert_num = 64
- n = 1408 # up is n * 2
- hidden_dim = 2048
- topk_num = 6
-
- up_dict = {}
- for m in [1, 8, 64, 128, 256, 512, 1024, 4096, 8192]:
- ans = mp_tuning(
- tuning_configs,
- {
- "expert_num": expert_num,
- "m": m,
- "n": n,
- "k": hidden_dim,
- "topk": topk_num,
- "dtype": torch.bfloat16,
- "test_count": 20,
- "use_fp8_w8a8": False,
- "is_up": True,
- },
- )
- up_dict[m] = ans
- MoeGroupedGemmKernelConfig.save_config(
- N=n * 2,
- K=hidden_dim,
- topk_num=topk_num,
- expert_num=expert_num,
- mul_routed_weight=False,
- use_fp8_w8a8=False,
- out_dtype=str(torch.bfloat16),
- config_json=up_dict,
- )
-
- down_dict = {}
- for m in [1, 8, 64, 128, 256, 512, 1024, 4096, 8192]:
- ans = mp_tuning(
- tuning_configs,
- {
- "expert_num": expert_num,
- "m": m,
- "n": n,
- "k": hidden_dim,
- "topk": topk_num,
- "dtype": torch.bfloat16,
- "test_count": 20,
- "use_fp8_w8a8": False,
- "is_up": False,
- },
- )
- down_dict[m] = ans
- MoeGroupedGemmKernelConfig.save_config(
- N=hidden_dim,
- K=n,
- topk_num=1,
- expert_num=expert_num,
- mul_routed_weight=True,
- use_fp8_w8a8=False,
- out_dtype=str(torch.bfloat16),
- config_json=down_dict,
- )
diff --git a/test/kernel/tuning/llama_gqa_decode_vsm_tuning.py b/test/kernel/llama_gqa_decode_vsm_tuning.py
similarity index 100%
rename from test/kernel/tuning/llama_gqa_decode_vsm_tuning.py
rename to test/kernel/llama_gqa_decode_vsm_tuning.py
diff --git a/test/model/test_script.sh b/test/model/test_script.sh
deleted file mode 100755
index 985868b59..000000000
--- a/test/model/test_script.sh
+++ /dev/null
@@ -1,82 +0,0 @@
-#!/bin/bash
-
-DATASET_PATH="/your/date/path" # 你的数据集路径
-FILE_PATH="/model/root" # 确保这里是所有模型的上级目录
-HOST="0.0.0.0"
-NCCL_PORT=28000
-CUDA_LIST=(0 1 2 3)
-PORT=8000
-MAX_PORT=65535
-NUM_PROMPTS=100
-REQUEST_RATE=20
-
-test_models() {
- local -a models=("${!1}")
- local -a modes=("${!2}")
- echo "models: ${models[@]}"
- echo "modes: ${modes[@]}"
- local model_num=${#models[@]}
- local loop_num=${#modes[@]}
-
- for model in "${models[@]}"; do
- local model_dir="${FILE_PATH}/${model}"
- # export CUDA_VISIBLE_DEVICES=${CUDA_LIST[i]}
-
- for ((i = 0; i <= loop_num; i++)); do
- local current_port=$PORT
- local current_nccl_port=$((NCCL_PORT+i))
-
- # 检查端口是否被占用
- while lsof -i:$current_nccl_port &>/dev/null || lsof -i:$current_port &>/dev/null; do
- current_nccl_port=$((current_nccl_port+1))
- current_port=$((current_port+1))
- if [ "$current_port" -gt "$MAX_PORT" ] || [ "$current_nccl_port" -gt "$MAX_PORT" ]; then
- echo "No available ports found."
- exit 1
- fi
- done
-
- echo "Start ${model_dir} on port ${current_port} with GPU ${CUDA_LIST[i]} and NCCL_PORT ${current_nccl_port} with mode ${modes[i]}"
- if [ "$i" -eq 0 ]; then
- nohup python -m lightllm.server.api_server --model_dir "${model_dir}" --host ${HOST} --port ${current_port} --tp 1 --trust_remote_code --nccl_port ${current_nccl_port} > server_output.log 2>&1 &
- else
- echo "idx:${i} with mode ${modes[i-1]}"
- nohup python -m lightllm.server.api_server --model_dir "${model_dir}" --mode "${modes[i-1]}" --host ${HOST} --port ${current_port} --tp 1 --trust_remote_code --nccl_port ${current_nccl_port} > server_output.log 2>&1 &
- fi
- local server_pid=$!
-
- # 等待服务器启动并监控输出
- echo "Waiting for server to start..."
- tail -f server_output.log | while read line; do
- echo "${line}"
- if [[ "${line}" == *"Uvicorn running on http://0.0.0.0"* ]]; then
- echo "Server is ready. Starting the client..."
- pkill -P $$ tail # 终止 tail 进程 继续执行后面的命令
- break
- fi
- done
-
- # 启动接收端程序
- echo "Starting the client to send requests..."
- python test/benchmark_serving.py --tokenizer "${model_dir}" --dataset "${DATASET_PATH}" --num-prompts ${NUM_PROMPTS} --request-rate ${REQUEST_RATE} --port ${current_port} --model "${model}"
- echo "Client finished."
-
- # 接收端程序完成后,关闭服务器
- echo "Shutting down the server: pid=${server_pid}"
- kill "${server_pid}"
- sleep 1
- # 检查进程是否仍然存在
- if ps -p "${server_pid}" > /dev/null; then # 尝试获取特定 PID 的进程信息
- echo "The server is still running."
- kill -9 "${server_pid}"
- else
- echo "The server has been stopped."
- fi
- done
- done
-}
-
-# 示例调用
-MODEL_ARRAY_LLAMA=("llama2-13b-chat")
-MODE_ARRAY_LLAMA=("triton_int8weight" "triton_int4weight")
-test_models MODEL_ARRAY_LLAMA[@] MODE_ARRAY_LLAMA[@]
\ No newline at end of file
diff --git a/test/model/test_settings/model_infer_batchs.py b/test/model/test_settings/model_infer_batchs.py
deleted file mode 100644
index 4a780ac69..000000000
--- a/test/model/test_settings/model_infer_batchs.py
+++ /dev/null
@@ -1,223 +0,0 @@
-import os
-import numpy as np
-from multiprocessing import Queue
-import multiprocessing
-
-
-def test_model_inference(world_size, model_dir, model_class, batch_sizes, input_len, output_len, mode, log_path):
- ans_queue = Queue()
- workers = []
- for rank_id in range(world_size):
- model_kvargs = {
- "run_mode": "normal",
- "tp_rank": rank_id,
- "world_size": world_size,
- "weight_dir": model_dir,
- "max_total_token_num": None,
- "mem_faction": 0.8,
- "load_way": "HF",
- "batch_max_tokens": (input_len + output_len),
- "mode": mode,
- "max_req_num": max(batch_sizes),
- "graph_max_batch_size": max(batch_sizes),
- "graph_max_len_in_batch": (input_len + output_len),
- "max_seq_length": (input_len + output_len),
- }
-
- proc = multiprocessing.Process(
- target=tppart_model_infer,
- args=(model_class, model_kvargs, batch_sizes, input_len, output_len, ans_queue, log_path),
- )
- proc.start()
- workers.append(proc)
-
- while True:
- import time
-
- exist_dead = any([not proc.is_alive() for proc in workers])
- if exist_dead:
- time.sleep(4)
- exist_err = any([proc.exitcode != 0 for proc in workers])
- if exist_err:
- return -1
- else:
- break
- time.sleep(1)
-
- while not ans_queue.empty():
- if not ans_queue.get():
- return -1
- return 0
-
-
-def tppart_model_infer(model_class, model_kvargs, batch_sizes, input_len, output_len, ans_queue, log_path):
- assert log_path is not None
- need_run_batch_sizes = []
- for batch_size in batch_sizes:
- new_log_path = log_path.replace("batch_size", str(batch_size))
- if os.path.exists(new_log_path):
- with open(new_log_path, "r") as fp_file:
- lines = fp_file.readlines()
- if len(lines) >= 2 and "time total cost(ms):" in lines[-1]: # 说明已经跑过了,直接过滤掉。
- continue
- else:
- need_run_batch_sizes.append(batch_size)
- else:
- need_run_batch_sizes.append(batch_size)
-
- if len(need_run_batch_sizes) == 0:
- return
-
- import torch
- import torch.distributed as dist
-
- rank_id = model_kvargs["tp_rank"]
- world_size = model_kvargs["world_size"]
-
- torch.cuda.set_device(rank_id)
- dist.init_process_group("nccl", init_method="tcp://127.0.0.1:28765", rank=rank_id, world_size=world_size)
- dist.barrier()
-
- torch.cuda.empty_cache()
-
- model_part = model_class(model_kvargs)
-
- for batch_size in need_run_batch_sizes:
- model_part.mem_manager.free_all()
- model_part.req_manager.free_all()
- model_part.mem_manager.resize_mem(batch_size * (input_len + output_len))
- # warm up
- test_data = np.vstack([np.arange(5, input_len + 5) for _ in range(batch_size)])
- test_data = test_data.reshape(-1)
- test_data = torch.from_numpy(test_data).cuda()
-
- b_req_idx = model_part.req_manager.alloc(batch_size).int()
- b_seq_len = torch.zeros(batch_size, dtype=torch.int32, device="cuda")
- for i in range(batch_size):
- b_seq_len[i] = input_len
-
- total_token_num = input_len * batch_size
- mem_indexes = model_part.req_manager.mem_manager.alloc(test_data.shape[0])
- logics = model_part.forward(
- batch_size,
- total_token_num,
- input_len,
- test_data,
- mem_indexes,
- b_req_idx,
- b_seq_len,
- is_prefill=True,
- )
- prob_out = torch.softmax(logics, dim=-1)
- predict_ids = torch.argmax(prob_out, dim=1, keepdim=True)
- predict_ids = predict_ids.detach().cpu().numpy()
-
- for i in range(output_len):
- total_token_num += batch_size
- b_seq_len += 1
- mem_indexes = model_part.req_manager.mem_manager.alloc(predict_ids.shape[0])
- logics = model_part.forward(
- batch_size,
- total_token_num,
- input_len + i + 1,
- torch.from_numpy(predict_ids).cuda().reshape(-1),
- mem_indexes,
- b_req_idx,
- b_seq_len,
- is_prefill=False,
- )
- prob_out = torch.softmax(logics, dim=-1)
- predict_ids = torch.argmax(prob_out, dim=1, keepdim=True)
- predict_ids = predict_ids.detach().cpu().numpy()
-
- model_part.mem_manager.free_all()
- model_part.req_manager.free_all()
-
- if rank_id == 0:
- print("can use mem size:", model_part.mem_manager.can_use_mem_size)
- print("can use req size:", model_part.req_manager.can_use_req_size)
-
- b_req_idx = None
- b_seq_len = None
-
- dist.barrier()
- if rank_id == 0:
- new_log_path = log_path.replace("batch_size", str(batch_size))
- fp_file = open(new_log_path, "w+")
-
- import time
-
- torch.cuda.synchronize()
- start_time = time.time()
-
- prefill_start_time = time.time()
-
- b_req_idx = model_part.req_manager.alloc(batch_size).int()
- b_seq_len = torch.zeros(batch_size, dtype=torch.int32, device="cuda")
- for i in range(batch_size):
- b_seq_len[i] = input_len
-
- total_token_num = batch_size * input_len
- mem_indexes = model_part.req_manager.mem_manager.alloc(test_data.shape[0])
- logics = model_part.forward(
- batch_size,
- total_token_num,
- input_len,
- test_data,
- mem_indexes,
- b_req_idx,
- b_seq_len,
- is_prefill=True,
- )
- prob_out = torch.softmax(logics, dim=-1)
- predict_ids = torch.argmax(prob_out, dim=1, keepdim=True)
- predict_ids = predict_ids.detach().cpu().numpy()
-
- torch.cuda.synchronize()
- if rank_id == 0:
- print("prefill time cost:", (time.time() - prefill_start_time) * 1000, file=fp_file)
-
- for i in range(output_len):
- torch.cuda.synchronize()
- step_start = time.time()
- total_token_num += batch_size
- b_seq_len += 1
- mem_indexes = model_part.req_manager.mem_manager.alloc(predict_ids.shape[0])
- logics = model_part.forward(
- batch_size,
- total_token_num,
- input_len + i + 1,
- torch.from_numpy(predict_ids).cuda().reshape(-1),
- mem_indexes,
- b_req_idx,
- b_seq_len,
- is_prefill=False,
- )
- prob_out = torch.softmax(logics, dim=-1)
- predict_ids = torch.argmax(prob_out, dim=1, keepdim=True)
- predict_ids = predict_ids.detach().cpu().numpy()
- torch.cuda.synchronize()
- if i % 100 == 0 or i == output_len - 1:
- if rank_id == 0:
- print(i, "step cost time:", (time.time() - step_start) * 1000, file=fp_file)
-
- torch.cuda.synchronize()
- end_time = time.time()
-
- if rank_id == 0:
- print("time total cost(ms):", (end_time - start_time) * 1000, file=fp_file)
- import sys
-
- if fp_file is not sys.stdout:
- fp_file.flush()
- fp_file.close()
- while not fp_file.closed:
- fp_file.close()
-
- b_req_idx = None
- b_seq_len = None
- test_data = None
-
- ans_queue.put(True)
-
- return
diff --git a/test/model/test_settings/process_utils.py b/test/model/test_settings/process_utils.py
deleted file mode 100644
index 352e6f03d..000000000
--- a/test/model/test_settings/process_utils.py
+++ /dev/null
@@ -1,33 +0,0 @@
-import subprocess
-import re
-
-
-def kill_gpu_processes():
- try:
- output = subprocess.check_output(["nvidia-smi", "-q", "-x"])
- output = output.decode("utf-8")
-
- # 使用正则表达式提取进程信息
- process_info = re.findall(r"(.*?)", output, re.DOTALL)
-
- if process_info:
- print("找到以下占用显卡的进程:")
- for info in process_info:
- pid = re.search(r"(.*?)", info).group(1)
- process_name = re.search(r"(.*?)", info).group(1)
- print("进程ID:", pid)
- print("进程名字:", process_name)
-
- for info in process_info:
- pid = re.search(r"(.*?)", info).group(1)
- subprocess.call(["sudo", "kill", "-9", pid])
- print("进程ID", pid, "被终止")
- else:
- print("没有找到占用显卡的进程")
-
- except subprocess.CalledProcessError:
- print("无法执行nvidia-smi命令")
-
-
-if __name__ == "__main__":
- kill_gpu_processes()
diff --git a/test/model/test_settings/test_settings.py b/test/model/test_settings/test_settings.py
deleted file mode 100644
index 2890c0c18..000000000
--- a/test/model/test_settings/test_settings.py
+++ /dev/null
@@ -1,131 +0,0 @@
-import os
-import sys
-from model_infer_batchs import test_model_inference
-from process_utils import kill_gpu_processes
-
-sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))))
-from datetime import datetime
-
-
-from lightllm.models.bloom.model import BloomTpPartModel
-from lightllm.models.llama.model import LlamaTpPartModel
-from lightllm.models.starcoder.model import StarcoderTpPartModel
-from lightllm.models.qwen.model import QWenTpPartModel
-from lightllm.models.chatglm2.model import ChatGlm2TpPartModel
-from lightllm.models.internlm.model import InternlmTpPartModel
-
-
-base_dir = "/nvme/"
-
-model_to_class_and_path = {
- "llama-7b": (LlamaTpPartModel, os.path.join(base_dir, "llama-7b")),
- "llama-13b": (LlamaTpPartModel, os.path.join(base_dir, "")),
- "internal-20b": (InternlmTpPartModel, os.path.join(base_dir, "")),
- "llama-65b": (LlamaTpPartModel, os.path.join(base_dir, "")),
- "llama2-70b": (LlamaTpPartModel, os.path.join(base_dir, "")),
- "chatglm2-6b": (ChatGlm2TpPartModel, os.path.join(base_dir, "")),
-}
-
-
-def test_all_setting(gpu_name, model_name, mode, log_dir, world_sizes, in_out_lens, batch_sizes):
- log_dir = os.path.join(log_dir, gpu_name, str(model_name))
- os.makedirs(log_dir, exist_ok=True)
-
- model_class, model_path = model_to_class_and_path[model_name]
- kill_gpu_processes()
- for world_size in world_sizes:
- for in_len, out_len in in_out_lens:
- kill_gpu_processes()
- mode_str = "_".join(mode)
- log_file_name = f"{model_name}##{mode_str}##{world_size}##{in_len}##{out_len}##batch_size##.log"
- log_path = os.path.join(log_dir, log_file_name)
- print(log_path)
- test_model_inference(world_size, model_path, model_class, batch_sizes, in_len, out_len, mode, log_path)
- log_md_file = log_dir + ".md"
- md_file = open(log_md_file, "w")
- # write head
- heads = [
- "mode",
- "world_size",
- "batch_size",
- "input_len",
- "output_len",
- "prefill_cost",
- "first_step_latency",
- "last_step_latency",
- "mean_latency",
- "prefill_throughput",
- "decode_throughput",
- "total_throughput",
- "card_num_per_qps",
- ]
- md_file.write(f"test model: {model_name} \r\n")
- md_file.write("|")
- for head in heads:
- md_file.write(head + "|")
- md_file.write("\r\n")
- md_file.write("|")
- for _ in range(len(heads)):
- md_file.write("------|")
- md_file.write("\r\n")
- log_files = list(os.listdir(log_dir))
- sorted(log_files, key=lambda x: tuple(map(int, x.split("##")[2:6])))
- for log_file in log_files:
- _, mode, world_size, input_len, output_len, batch_size, _ = log_file.split("##")
- fp_file = open(os.path.join(log_dir, log_file), "r")
- all_lines = fp_file.readlines()
- fp_file.close()
- if len(all_lines) <= 2:
- continue
- prefill_cost = float(all_lines[0].split(":")[1].strip())
- firststep_cost = float(all_lines[1].split(":")[1].strip())
- laststep_cost = float(all_lines[-2].split(":")[1].strip())
- all_step_cost = float(all_lines[-1].split(":")[1].strip())
- mean_step_cost = (all_step_cost - prefill_cost) / float(output_len)
- card_num_per_qps = float(world_size) / (float(batch_size) / (all_step_cost / 1000))
- prefill_throughput = float(batch_size) * float(input_len) / (prefill_cost / 1000)
- decode_throughput = float(batch_size) * float(output_len) / ((all_step_cost - prefill_cost) / 1000)
- total_throughput = float(batch_size) * (float(input_len) + float(output_len)) / (all_step_cost / 1000)
- md_file.write("|")
- infos = [
- mode,
- world_size,
- batch_size,
- input_len,
- output_len,
- prefill_cost,
- firststep_cost,
- laststep_cost,
- mean_step_cost,
- prefill_throughput,
- decode_throughput,
- total_throughput,
- card_num_per_qps,
- ]
- for info in infos:
- md_file.write(str(format(info, ".4f")) if isinstance(info, float) else str(info))
- md_file.write("|")
- md_file.write("\r\n")
- md_file.close()
-
-
-gpu_name = "A800"
-in_out_lens = [(128, 128), (256, 256)] # in_out_lens 中的数据必须以从短到长的顺序排列,否则可能有问题。
-batch_sizes = [1, 2] # batch_sizes 中的数字也必须从小到大排列。
-
-
-if __name__ == "__main__":
- import torch
-
- torch.multiprocessing.set_start_method("spawn")
-
- test_all_setting(
- gpu_name,
- "llama3-8b",
- # mode=["triton_int8weight", "ppl_fp16_flashdecoding"], # mode 为 【】 为普通 fp16 的格式。
- mode=["triton_gqa_flashdecoding"],
- log_dir="./",
- world_sizes=[1],
- in_out_lens=in_out_lens,
- batch_sizes=batch_sizes,
- )
diff --git a/test/server/readme.md b/test/server/readme.md
deleted file mode 100644
index 0b8d53903..000000000
--- a/test/server/readme.md
+++ /dev/null
@@ -1,15 +0,0 @@
-# prompt cache 测试:
-
-- benchmark_prompt_cache.py: 单次测试脚本。
-
- 例子:
- ```shell
- python benchmark_prompt_cache.py --address http://localhost:8090 --model_name llama --num_workers 1 --first_input_len 512 --subsequent_input_len 32 --output_len 32 --num_turns 5 --num_users 1
- ```
-
- 使用方法详细说明:
- ```shell
- python benchmark_prompt_cache.py -h
- ```
-
-- test_settings.py: 批量测试脚本,可测试多个配置并汇总为md
diff --git a/test/start_scripts/README.md b/test/start_scripts/README.md
new file mode 100644
index 000000000..aff1d973f
--- /dev/null
+++ b/test/start_scripts/README.md
@@ -0,0 +1,202 @@
+# LightLLM DeepSeek Model Startup Scripts
+
+This directory contains various startup scripts for deploying DeepSeek models with LightLLM, covering different deployment modes and hardware configurations.
+
+## Script Categories
+
+### Single Node Deployment Scripts
+
+- `single_node_tp.sh` - Single node tensor parallelism (TP) mode
+- `single_node_ep.sh` - Single node expert parallelism (EP) mode
+
+### Multi-Node Deployment Scripts
+
+- `multi_node_tp_node0.sh` - Multi-node tensor parallelism node 0
+- `multi_node_tp_node1.sh` - Multi-node tensor parallelism node 1
+- `multi_node_ep_node0.sh` - Multi-node expert parallelism node 0
+- `multi_node_ep_node1.sh` - Multi-node expert parallelism node 1
+
+### PD Separated Deployment Scripts
+
+#### Single PD Master Mode
+- `single_pd_master/pd_master.sh` - PD Master service
+- `single_pd_master/pd_prefill.sh` - Prefill service
+- `single_pd_master/pd_decode.sh` - Decode service
+
+#### Multi PD Master Mode
+- `multi_pd_master/config_server.sh` - Configuration server
+- `multi_pd_master/pd_master_1.sh` - PD Master 1
+- `multi_pd_master/pd_master_2.sh` - PD Master 2
+- `multi_pd_master/pd_prefill.sh` - Prefill service
+- `multi_pd_master/pd_decode.sh` - Decode service
+
+## Usage Instructions
+
+### 1. Single Node TP Mode
+
+```bash
+# Modify model path and run directly
+sh single_node_tp.sh
+```
+
+### 2. Single Node EP Mode
+
+```bash
+# Modify model path and run directly
+sh single_node_ep.sh
+```
+
+### 3. Multi-Node TP Mode
+
+```bash
+# Run on node 0
+sh multi_node_tp_node0.sh
+
+# Run on node 1
+sh multi_node_tp_node1.sh
+```
+
+### 4. Multi-Node EP Mode
+
+```bash
+# Run on node 0
+sh multi_node_ep_node0.sh
+
+# Run on node 1
+sh multi_node_ep_node1.sh
+```
+
+### 5. Single PD Master Mode
+
+```bash
+# Step 1: Start PD Master
+sh single_pd_master/pd_master.sh
+
+# Step 2: Start Prefill service
+sh single_pd_master/pd_prefill.sh
+
+# Step 3: Start Decode service
+sh single_pd_master/pd_decode.sh
+```
+
+### 6. Multi PD Master Mode
+
+```bash
+# Step 1: Start configuration server
+sh multi_pd_master/config_server.sh
+
+# Step 2: Start multiple PD Masters
+sh multi_pd_master/pd_master_1.sh
+sh multi_pd_master/pd_master_2.sh
+
+# Step 3: Start Prefill and Decode services
+sh multi_pd_master/pd_prefill.sh
+sh multi_pd_master/pd_decode.sh
+```
+
+## Configuration Guide
+
+### Environment Variables
+
+- `LOADWORKER`: Model loading thread count, recommended 8-18
+- `MOE_MODE`: Set to `EP` to enable expert parallelism for MoE models
+- `KV_TRANS_USE_P2P`: Enable P2P communication optimization
+- `CUDA_VISIBLE_DEVICES`: Specify GPU devices to use
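+
+For example, these variables are typically combined with the launch command like this (a minimal sketch of an 8-GPU EP launch; the model path and GPU list are placeholders to adapt to your setup):
+
+```bash
+MOE_MODE=EP LOADWORKER=18 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+python -m lightllm.server.api_server --port 8088 \
+--model_dir /path/DeepSeek-R1 \
+--tp 8 \
+--dp 8 \
+--enable_fa3
+```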
+
+### Important Parameters
+
+- `--model_dir`: Model file path
+- `--tp`: Tensor parallelism degree
+- `--dp`: Data parallelism degree
+- `--enable_fa3`: Enable Flash Attention 3.0
+- `--nnodes`: Total number of nodes
+- `--node_rank`: Current node rank
+- `--nccl_host`: NCCL communication host address
+- `--nccl_port`: NCCL communication port
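+
+For reference, the multi-node flags are used together like this on node 0 (a sketch mirroring `multi_node_tp_node0.sh`; substitute your own model path and NCCL host IP):
+
+```bash
+LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \
+--model_dir /path/DeepSeek-R1 \
+--tp 16 \
+--enable_fa3 \
+--nnodes 2 \
+--node_rank 0 \
+--nccl_host <nccl_host_ip> \
+--nccl_port 2732
+```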
+
+## Hardware Configuration Recommendations
+
+### H200 Single Node
+- Recommended 8 GPUs, TP=8
+- Memory: At least 128GB system memory
+
+### H100 Dual Node
+- Recommended 16 GPUs, TP=16
+- Network: High-bandwidth, low-latency interconnect between nodes
+
+### General Recommendations
+- Ensure GPU drivers and CUDA versions are compatible
+- Check network connectivity and firewall settings
+- Monitor GPU utilization and memory usage
+
+## Troubleshooting
+
+### Common Issues
+
+1. **NCCL Communication Errors**
+ - Check network connectivity
+ - Verify firewall settings
+ - Validate IP address configuration
+
+2. **Insufficient GPU Memory**
+ - Reduce batch_size
+ - Use more GPUs
+ - Enable KV cache optimization
+
+3. **Model Loading Failures**
+ - Check model path
+ - Verify file integrity
+ - Confirm permission settings
+
+### Performance Optimization
+
+1. **Enable MPS Service**
+ ```bash
+ nvidia-cuda-mps-control -d
+ ```
+
+2. **Enable Micro-batch Overlap**
+ ```bash
+ --enable_prefill_microbatch_overlap
+ --enable_decode_microbatch_overlap
+ ```
+
+3. **Adjust CUDA Graph Parameters**
+ ```bash
+ --graph_max_batch_size 100
+ ```
+
+## Testing and Validation
+
+### Basic Functionality Test
+
+```bash
+curl http://server_ip:server_port/generate \
+ -H "Content-Type: application/json" \
+ -d '{
+ "inputs": "What is AI?",
+ "parameters":{
+ "max_new_tokens":17,
+ "frequency_penalty":1
+ }
+ }'
+```
+
+### Performance Benchmark Test
+
+```bash
+cd test
+python benchmark_client.py \
+--num_clients 100 \
+--input_num 2000 \
+--tokenizer_path /path/DeepSeek-R1/ \
+--url http://127.0.0.1:8088/generate_stream
+```
+
+## Important Notes
+
+1. Please modify the model path in scripts before use
+2. Adjust parameters according to actual hardware configuration
+3. Ensure network environment meets multi-node deployment requirements
+4. Recommend thorough testing before production deployment
+5. Regularly monitor service status and performance metrics
\ No newline at end of file
diff --git a/test/start_scripts/multi_node_ep_node0.sh b/test/start_scripts/multi_node_ep_node0.sh
new file mode 100644
index 000000000..3a139968a
--- /dev/null
+++ b/test/start_scripts/multi_node_ep_node0.sh
@@ -0,0 +1,16 @@
+# H200 multi node deepseek R1 ep mode node 0
+# nccl_host: the ip of the nccl host
+# sh multi_node_ep_node0.sh <nccl_host>
+export nccl_host=$1
+MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \
+--model_dir /path/DeepSeek-R1 \
+--tp 16 \
+--dp 16 \
+--enable_fa3 \
+--nnodes 2 \
+--node_rank 0 \
+--nccl_host $nccl_host \
+--nccl_port 2732
+# if you want to enable microbatch overlap, you can uncomment the following lines
+#--enable_prefill_microbatch_overlap
+#--enable_decode_microbatch_overlap
\ No newline at end of file
diff --git a/test/start_scripts/multi_node_ep_node1.sh b/test/start_scripts/multi_node_ep_node1.sh
new file mode 100644
index 000000000..b24a59868
--- /dev/null
+++ b/test/start_scripts/multi_node_ep_node1.sh
@@ -0,0 +1,16 @@
+# H200 multi node deepseek R1 ep mode node 1
+# nccl_host: the ip of the nccl host
+# sh multi_node_ep_node1.sh <nccl_host>
+export nccl_host=$1
+MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \
+--model_dir /path/DeepSeek-R1 \
+--tp 16 \
+--dp 16 \
+--enable_fa3 \
+--nnodes 2 \
+--node_rank 1 \
+--nccl_host $nccl_host \
+--nccl_port 2732
+# if you want to enable microbatch overlap, you can uncomment the following lines
+#--enable_prefill_microbatch_overlap
+#--enable_decode_microbatch_overlap
\ No newline at end of file
diff --git a/test/start_scripts/multi_node_tp_node0.sh b/test/start_scripts/multi_node_tp_node0.sh
new file mode 100644
index 000000000..b86bdeb35
--- /dev/null
+++ b/test/start_scripts/multi_node_tp_node0.sh
@@ -0,0 +1,12 @@
+# H200/H100 multi node deepseek R1 tp mode node 0
+# nccl_host: the ip of the nccl host
+# sh multi_node_tp_node0.sh <nccl_host>
+export nccl_host=$1
+LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \
+--model_dir /path/DeepSeek-R1 \
+--tp 16 \
+--enable_fa3 \
+--nnodes 2 \
+--node_rank 0 \
+--nccl_host $nccl_host \
+--nccl_port 2732
\ No newline at end of file
diff --git a/test/start_scripts/multi_node_tp_node1.sh b/test/start_scripts/multi_node_tp_node1.sh
new file mode 100644
index 000000000..378977ab2
--- /dev/null
+++ b/test/start_scripts/multi_node_tp_node1.sh
@@ -0,0 +1,12 @@
+# H200/H100 multi node deepseek R1 tp mode node 1
+# nccl_host: the ip of the nccl host
+# sh multi_node_tp_node1.sh <nccl_host>
+export nccl_host=$1
+LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \
+--model_dir /path/DeepSeek-R1 \
+--tp 16 \
+--enable_fa3 \
+--nnodes 2 \
+--node_rank 1 \
+--nccl_host $nccl_host \
+--nccl_port 2732
\ No newline at end of file
diff --git a/test/start_scripts/multi_pd_master.sh b/test/start_scripts/multi_pd_master.sh
new file mode 100644
index 000000000..c4e8c21fb
--- /dev/null
+++ b/test/start_scripts/multi_pd_master.sh
@@ -0,0 +1,34 @@
+# Example deployment with multiple pd_master nodes
+python -m lightllm.server.api_server --run_mode "config_server" --config_server_host 10.120.114.74 --config_server_port 60088
+
+python -m lightllm.server.api_server --model_dir /mtc/models/DeepSeek-V2-Lite-Chat --run_mode "pd_master" --host 10.120.114.74 --port 60011 --config_server_host 10.120.114.74 --config_server_port 60088
+
+python -m lightllm.server.api_server --model_dir /mtc/models/DeepSeek-V2-Lite-Chat --run_mode "pd_master" --host 10.120.114.74 --port 60012 --config_server_host 10.120.114.74 --config_server_port 60088
+
+nvidia-cuda-mps-control -d
+CUDA_VISIBLE_DEVICES=0 KV_TRANS_USE_P2P=1 LOADWORKER=1 python -m lightllm.server.api_server --model_dir /mtc/models/DeepSeek-V2-Lite-Chat \
+--run_mode "prefill" \
+--host 10.120.178.74 \
+--port 8019 \
+--tp 1 \
+--nccl_port 2732 \
+--max_total_token_num 40000 \
+--tokenizer_mode fast \
+--max_req_total_len 16000 \
+--running_max_req_size 128 \
+--disable_cudagraph \
+--config_server_host 10.120.114.74 \
+--config_server_port 60088
+
+CUDA_VISIBLE_DEVICES=1 KV_TRANS_USE_P2P=1 LOADWORKER=10 python -m lightllm.server.api_server --model_dir /mtc/models/DeepSeek-V2-Lite-Chat \
+--run_mode "decode" \
+--host 10.120.178.74 \
+--port 8121 \
+--nccl_port 12322 \
+--tp 1 \
+--max_total_token_num 40000 \
+--graph_max_len_in_batch 2048 \
+--graph_max_batch_size 16 \
+--tokenizer_mode fast \
+--config_server_host 10.120.114.74 \
+--config_server_port 60088
\ No newline at end of file
diff --git a/test/start_scripts/multi_pd_master/config_server.sh b/test/start_scripts/multi_pd_master/config_server.sh
new file mode 100644
index 000000000..3771cd1cd
--- /dev/null
+++ b/test/start_scripts/multi_pd_master/config_server.sh
@@ -0,0 +1,5 @@
+# config_server
+# config_server_host: the host of the config server
+# sh config_server.sh <config_server_host>
+export config_server_host=$1
+python -m lightllm.server.api_server --run_mode "config_server" --config_server_host $config_server_host --config_server_port 60088
diff --git a/test/start_scripts/multi_pd_master/pd_decode.sh b/test/start_scripts/multi_pd_master/pd_decode.sh
new file mode 100644
index 000000000..4cefef6fb
--- /dev/null
+++ b/test/start_scripts/multi_pd_master/pd_decode.sh
@@ -0,0 +1,20 @@
+# decode
+# host: the host of the decode server
+# config_server_host: the host of the config server
+# sh pd_decode.sh <host> <config_server_host>
+export host=$1
+export config_server_host=$2
+nvidia-cuda-mps-control -d
+MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server \
+--model_dir /path/DeepSeek-R1 \
+--run_mode "decode" \
+--host $host \
+--port 8121 \
+--nccl_port 12322 \
+--tp 8 \
+--dp 8 \
+--enable_fa3 \
+--config_server_host $config_server_host \
+--config_server_port 60088
+# if you want to enable microbatch overlap, you can uncomment the following lines
+#--enable_decode_microbatch_overlap
\ No newline at end of file
diff --git a/test/start_scripts/multi_pd_master/pd_master_1.sh b/test/start_scripts/multi_pd_master/pd_master_1.sh
new file mode 100644
index 000000000..b71024525
--- /dev/null
+++ b/test/start_scripts/multi_pd_master/pd_master_1.sh
@@ -0,0 +1,7 @@
+# pd_master 1
+# host: the host of the pd master
+# config_server_host: the host of the config server
+# sh pd_master_1.sh <host> <config_server_host>
+export host=$1
+export config_server_host=$2
+python -m lightllm.server.api_server --model_dir /path/DeepSeek-R1 --run_mode "pd_master" --host $host --port 60011 --config_server_host $config_server_host --config_server_port 60088
diff --git a/test/start_scripts/multi_pd_master/pd_master_2.sh b/test/start_scripts/multi_pd_master/pd_master_2.sh
new file mode 100644
index 000000000..f3a474d95
--- /dev/null
+++ b/test/start_scripts/multi_pd_master/pd_master_2.sh
@@ -0,0 +1,7 @@
+# pd_master 2
+# host: the host of the pd master
+# config_server_host: the host of the config server
+# sh pd_master_2.sh <host> <config_server_host>
+export host=$1
+export config_server_host=$2
+python -m lightllm.server.api_server --model_dir /path/DeepSeek-R1 --run_mode "pd_master" --host $host --port 60012 --config_server_host $config_server_host --config_server_port 60088
diff --git a/test/start_scripts/multi_pd_master/pd_prefill.sh b/test/start_scripts/multi_pd_master/pd_prefill.sh
new file mode 100644
index 000000000..b845da435
--- /dev/null
+++ b/test/start_scripts/multi_pd_master/pd_prefill.sh
@@ -0,0 +1,21 @@
+# prefill
+# host: the host of the prefill server
+# config_server_host: the host of the config server
+# sh pd_prefill.sh <host> <config_server_host>
+export host=$1
+export config_server_host=$2
+nvidia-cuda-mps-control -d
+MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server \
+--model_dir /path/DeepSeek-R1 \
+--run_mode "prefill" \
+--host $host \
+--port 8019 \
+--tp 8 \
+--dp 8 \
+--nccl_port 2732 \
+--enable_fa3 \
+--disable_cudagraph \
+--config_server_host $config_server_host \
+--config_server_port 60088
+# if you want to enable microbatch overlap, you can uncomment the following lines
+#--enable_prefill_microbatch_overlap
\ No newline at end of file
diff --git a/test/start_scripts/single_node_ep.sh b/test/start_scripts/single_node_ep.sh
new file mode 100644
index 000000000..cad172d51
--- /dev/null
+++ b/test/start_scripts/single_node_ep.sh
@@ -0,0 +1,9 @@
+# H200 single node deepseek R1 dp + ep mode
+MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \
+--model_dir /path/DeepSeek-R1 \
+--tp 8 \
+--dp 8 \
+--enable_fa3
+# if you want to enable microbatch overlap, you can uncomment the following lines
+#--enable_prefill_microbatch_overlap \
+#--enable_decode_microbatch_overlap \
diff --git a/test/start_scripts/single_node_tp.sh b/test/start_scripts/single_node_tp.sh
new file mode 100644
index 000000000..1fb461bb1
--- /dev/null
+++ b/test/start_scripts/single_node_tp.sh
@@ -0,0 +1,8 @@
+# H200 single node deepseek R1 tp mode
+LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \
+--model_dir /path/DeepSeek-R1 \
+--tp 8 \
+--enable_fa3
+# if you want to enable microbatch overlap, you can uncomment the following lines
+#--enable_prefill_microbatch_overlap \
+#--enable_decode_microbatch_overlap \
diff --git a/test/start_scripts/single_pd_master/pd_decode.sh b/test/start_scripts/single_pd_master/pd_decode.sh
new file mode 100644
index 000000000..3bef53875
--- /dev/null
+++ b/test/start_scripts/single_pd_master/pd_decode.sh
@@ -0,0 +1,21 @@
+# PD decode mode for deepseek R1 (DP+EP) on H200
+# host: the host of the current node
+# pd_master_ip: the ip of the pd master
+# sh pd_decode.sh <host> <pd_master_ip>
+export host=$1
+export pd_master_ip=$2
+nvidia-cuda-mps-control -d
+MOE_MODE=EP KV_TRANS_USE_P2P=1 LOADWORKER=18 python -m lightllm.server.api_server \
+--model_dir /path/DeepSeek-R1 \
+--run_mode "decode" \
+--tp 8 \
+--dp 8 \
+--host $host \
+--port 8121 \
+--nccl_port 12322 \
+--enable_fa3 \
+--disable_cudagraph \
+--pd_master_ip $pd_master_ip \
+--pd_master_port 60011
+# if you want to enable microbatch overlap, you can uncomment the following lines
+#--enable_decode_microbatch_overlap
\ No newline at end of file
diff --git a/test/start_scripts/single_pd_master/pd_master.sh b/test/start_scripts/single_pd_master/pd_master.sh
new file mode 100644
index 000000000..600ef90b7
--- /dev/null
+++ b/test/start_scripts/single_pd_master/pd_master.sh
@@ -0,0 +1,5 @@
+# pd_master for deepseek R1
+# pd_master_ip: the ip of the pd master
+# sh pd_master.sh <pd_master_ip>
+export pd_master_ip=$1
+python -m lightllm.server.api_server --model_dir /path/DeepSeek-R1 --run_mode "pd_master" --host $pd_master_ip --port 60011
\ No newline at end of file
diff --git a/test/start_scripts/single_pd_master/pd_prefill.sh b/test/start_scripts/single_pd_master/pd_prefill.sh
new file mode 100644
index 000000000..b15e4ef70
--- /dev/null
+++ b/test/start_scripts/single_pd_master/pd_prefill.sh
@@ -0,0 +1,21 @@
+# PD prefill mode for deepseek R1 (DP+EP) on H200
+# host: the host of the current node
+# pd_master_ip: the ip of the pd master
+# sh pd_prefill.sh <host> <pd_master_ip>
+export host=$1
+export pd_master_ip=$2
+nvidia-cuda-mps-control -d
+MOE_MODE=EP KV_TRANS_USE_P2P=1 LOADWORKER=18 python -m lightllm.server.api_server \
+--model_dir /path/DeepSeek-R1 \
+--run_mode "prefill" \
+--tp 8 \
+--dp 8 \
+--host $host \
+--port 8019 \
+--nccl_port 2732 \
+--enable_fa3 \
+--disable_cudagraph \
+--pd_master_ip $pd_master_ip \
+--pd_master_port 60011
+# if you want to enable microbatch overlap, you can uncomment the following lines
+#--enable_prefill_microbatch_overlap
\ No newline at end of file
diff --git a/test/test.sh b/test/test.sh
deleted file mode 100644
index 8f3882386..000000000
--- a/test/test.sh
+++ /dev/null
@@ -1,107 +0,0 @@
-# pd start
-python -m lightllm.server.api_server --model_dir /dev/shm/llama2-7b-chat --run_mode "pd_master" --host `hostname -i` --port 60011
-
-nvidia-cuda-mps-control -d
-CUDA_VISIBLE_DEVICES=0,1,2,3 KV_TRANS_USE_P2P=1 LOADWORKER=1 python -m lightllm.server.api_server --model_dir /dev/shm/llama2-7b-chat \
---run_mode "prefill" \
---host `hostname -i` \
---port 8019 \
---tp 4 \
---nccl_port 2732 \
---max_total_token_num 400000 \
---tokenizer_mode fast \
---pd_master_ip `hostname -i` \
---pd_master_port 60011 \
---max_req_total_len 16000 \
---running_max_req_size 128 \
---disable_cudagraph
-
-nvidia-cuda-mps-control -d
-CUDA_VISIBLE_DEVICES=4,5,6,7 KV_TRANS_USE_P2P=1 LOADWORKER=10 python -m lightllm.server.api_server --model_dir /dev/shm/llama2-7b-chat \
---run_mode "decode" \
---host `hostname -i` \
---port 8121 \
---nccl_port 12322 \
---tp 4 \
---max_total_token_num 400000 \
---graph_max_len_in_batch 2048 \
---graph_max_batch_size 16 \
---tokenizer_mode fast \
---pd_master_ip `hostname -i` \
---pd_master_port 60011
-
-# pd start1
-python -m lightllm.server.api_server --model_dir /dev/shm/llama2-7b-chat --run_mode "pd_master" --host `hostname -i` --port 60011
-
-nvidia-cuda-mps-control -d
-CUDA_VISIBLE_DEVICES=0 KV_TRANS_USE_P2P=1 LOADWORKER=1 python -m lightllm.server.api_server --model_dir /dev/shm/llama2-7b-chat \
---run_mode "prefill" \
---host `hostname -i` \
---port 8019 \
---tp 1 \
---nccl_port 2732 \
---max_total_token_num 40000 \
---tokenizer_mode fast \
---pd_master_ip `hostname -i` \
---pd_master_port 60011 \
---max_req_total_len 16000 \
---running_max_req_size 128 \
---disable_cudagraph
-
-nvidia-cuda-mps-control -d
-CUDA_VISIBLE_DEVICES=1 KV_TRANS_USE_P2P=1 LOADWORKER=10 python -m lightllm.server.api_server --model_dir /dev/shm/llama2-7b-chat \
---run_mode "decode" \
---host `hostname -i` \
---port 8121 \
---nccl_port 12322 \
---tp 1 \
---max_total_token_num 40000 \
---graph_max_len_in_batch 2048 \
---graph_max_batch_size 16 \
---tokenizer_mode fast \
---pd_master_ip `hostname -i` \
---pd_master_port 60011
-
-
-# normal start
-LOADWORKER=8 python -m lightllm.server.api_server --port 8018 --model_dir /dev/shm/llama2-7b-chat --tp 2 --graph_max_batch_size 16
-
-
-# 多 pd_master 节点部署实列
-python -m lightllm.server.api_server --run_mode "config_server" --config_server_host 10.120.114.74 --config_server_port 60088
-
-python -m lightllm.server.api_server --model_dir /mtc/models/DeepSeek-V2-Lite-Chat --run_mode "pd_master" --host 10.120.114.74 --port 60011 --config_server_host 10.120.114.74 --config_server_port 60088
-
-python -m lightllm.server.api_server --model_dir /mtc/models/DeepSeek-V2-Lite-Chat --run_mode "pd_master" --host 10.120.114.74 --port 60012 --config_server_host 10.120.114.74 --config_server_port 60088
-
-
-nvidia-cuda-mps-control -d
-CUDA_VISIBLE_DEVICES=0 KV_TRANS_USE_P2P=1 LOADWORKER=1 python -m lightllm.server.api_server --model_dir /mtc/models/DeepSeek-V2-Lite-Chat \
---run_mode "prefill" \
---host 10.120.178.74 \
---port 8019 \
---tp 1 \
---nccl_port 2732 \
---max_total_token_num 40000 \
---tokenizer_mode fast \
---max_req_total_len 16000 \
---running_max_req_size 128 \
---disable_cudagraph \
---config_server_host 10.120.114.74 \
---config_server_port 60088
-
-CUDA_VISIBLE_DEVICES=1 KV_TRANS_USE_P2P=1 LOADWORKER=10 python -m lightllm.server.api_server --model_dir /mtc/models/DeepSeek-V2-Lite-Chat \
---run_mode "decode" \
---host 10.120.178.74 \
---port 8121 \
---nccl_port 12322 \
---tp 1 \
---max_total_token_num 40000 \
---graph_max_len_in_batch 2048 \
---graph_max_batch_size 16 \
---tokenizer_mode fast \
---config_server_host 10.120.114.74 \
---config_server_port 60088
-
-
-
diff --git a/test/test_accuracy.py b/test/test_accuracy.py
deleted file mode 100644
index 5ea825356..000000000
--- a/test/test_accuracy.py
+++ /dev/null
@@ -1,114 +0,0 @@
-import argparse
-import subprocess
-import time
-import os
-import requests
-import sys
-import json
-
-
-def parse_args():
- parser = argparse.ArgumentParser()
- parser.add_argument("--tp", type=int, required=True, help="Number of GPUs to use.")
- parser.add_argument("--model_dir", type=str, required=True, help="Directory of the model.")
- return parser.parse_args()
-
-
-def start_server(tp, model_dir):
- cmd = [
- "python",
- "-m",
- "lightllm.server.api_server",
- "--tp",
- str(tp),
- "--model_dir",
- model_dir,
- "--data_type",
- "fp16",
- "--mode",
- "triton_gqa_flashdecoding",
- "--trust_remote_code",
- "--tokenizer_mode",
- "fast",
- "--host",
- "0.0.0.0",
- "--port",
- "8080",
- ]
- process = subprocess.Popen(cmd, stdout=sys.stdout, stderr=sys.stderr)
- return process
-
-
-def check_health():
- health_url = "http://localhost:8080/health"
- try:
- r = requests.get(health_url, timeout=2)
- return r.status_code == 200
- except Exception:
- return False
-
-
-def send_prompts(prompts, output_file):
- for prompt in prompts:
- while not check_health():
- time.sleep(1)
-
- request_data = {
- "inputs": prompt,
- "parameters": {"max_new_tokens": 1024, "frequency_penalty": 1, "do_sample": False},
- "multimodal_params": {},
- }
-
- try:
- r = requests.post("http://localhost:8080/generate", json=request_data, timeout=10)
- response_json = json.loads(r.text)
- generated_text = (
- response_json["generated_text"][0] if "generated_text" in response_json else "No generated_text."
- )
- except Exception as e:
- generated_text = f"ERROR: {str(e)}"
-
- with open(output_file, "a", encoding="utf-8") as f:
- f.write(f"===== prompt: {prompt} =====\n")
- f.write(f"{generated_text}\n\n")
-
- print(f"===================Ouput saved in {output_file}===========================")
-
-
-def main():
- # args
- args = parse_args()
- tp = args.tp
- model_dir = args.model_dir
-
- # output_file
- output_file = "test_results.txt"
-
- if os.path.exists(output_file):
- os.remove(output_file)
-
- # start server
- process = start_server(tp, model_dir)
-
- # prompts
- prompts = [
- "What is the machine learning?",
- "1+1等于几",
- "What role does attention play in transformer architectures?",
- "西红柿炒鸡蛋怎么做?",
- "Describe the concept of overfitting and underfitting.",
- "CPU和GPU的区别是什么?",
- "What is the role of a loss function in machine learning?",
- ]
-
- send_prompts(prompts, output_file)
-
- # shutdown server
- process.terminate()
- process.wait()
-
-
-if __name__ == "__main__":
- main()
-
-# python test_accuracy.py --tp 2 --model_dir /xx/xx
diff --git a/test/test.jpg b/test/test_api/test.jpg
similarity index 100%
rename from test/test.jpg
rename to test/test_api/test.jpg
diff --git a/test/test_server.py b/test/test_api/test_generate_api.py
similarity index 100%
rename from test/test_server.py
rename to test/test_api/test_generate_api.py
diff --git a/test/test_multimodal_server.py b/test/test_api/test_multimodal_api.py
similarity index 100%
rename from test/test_multimodal_server.py
rename to test/test_api/test_multimodal_api.py
diff --git a/test/test_api/test_openai_api.py b/test/test_api/test_openai_api.py
new file mode 100644
index 000000000..6d98dadbe
--- /dev/null
+++ b/test/test_api/test_openai_api.py
@@ -0,0 +1,277 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+LightLLM OpenAI API test cases
+
+python test_openai_api.py
+"""
+
+import requests
+import json
+import time
+from typing import Dict, List, Any, Optional
+
+
+class LightLLMClient:
+    """Simple OpenAI-compatible API client for LightLLM"""
+
+ def __init__(self, base_url: str = "http://localhost:8000", model_name: str = "your_model_name"):
+ self.base_url = base_url
+ self.model_name = model_name
+ self.headers = {"Content-Type": "application/json"}
+ self.conversation_history = []
+
+ def simple_chat(self, message: str, **kwargs) -> Dict[str, Any]:
+        """Simple (non-streaming) chat completion"""
+ data = {
+ "model": self.model_name,
+ "messages": [{"role": "user", "content": message}],
+ "temperature": kwargs.get("temperature", 0.7),
+ "max_tokens": kwargs.get("max_tokens", 1000),
+ **kwargs,
+ }
+
+ response = requests.post(f"{self.base_url}/v1/chat/completions", headers=self.headers, json=data)
+
+ if response.status_code == 200:
+ return response.json()
+ else:
+ raise Exception(f"API调用失败: {response.status_code} - {response.text}")
+
+ def stream_chat(self, message: str, **kwargs):
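+        """Streaming chat completion; yields generated text chunks as they arrive"""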
+ data = {
+ "model": self.model_name,
+ "messages": [{"role": "user", "content": message}],
+ "stream": True,
+ "temperature": kwargs.get("temperature", 0.7),
+ "max_tokens": kwargs.get("max_tokens", 1000),
+ **kwargs,
+ }
+
+ response = requests.post(f"{self.base_url}/v1/chat/completions", headers=self.headers, json=data, stream=True)
+
+ if response.status_code == 200:
+ for line in response.iter_lines():
+ if line:
+ line = line.decode("utf-8")
+ if line.startswith("data: "):
+ data_str = line[6:]
+ if data_str == "[DONE]":
+ break
+ try:
+ chunk = json.loads(data_str)
+ if chunk["choices"][0]["delta"].get("content"):
+ yield chunk["choices"][0]["delta"]["content"]
+ except json.JSONDecodeError:
+ continue
+ else:
+ raise Exception(f"API调用失败: {response.status_code} - {response.text}")
+
+ def function_call(self, message: str, tools: List[Dict], tool_choice: str = "auto", **kwargs) -> Dict[str, Any]:
+ """Function calling"""
+ data = {
+ "model": self.model_name,
+ "messages": [{"role": "user", "content": message}],
+ "tools": tools,
+ "tool_choice": tool_choice,
+ "temperature": kwargs.get("temperature", 0.7),
+ "max_tokens": kwargs.get("max_tokens", 1000),
+ **kwargs,
+ }
+
+ response = requests.post(f"{self.base_url}/v1/chat/completions", headers=self.headers, json=data)
+
+ if response.status_code == 200:
+ return response.json()
+ else:
+ raise Exception(f"API调用失败: {response.status_code} - {response.text}")
+
+ def stream_function_call(self, message: str, tools: List[Dict], tool_choice: str = "auto", **kwargs):
+        """Streaming function calling"""
+ data = {
+ "model": self.model_name,
+ "messages": [{"role": "user", "content": message}],
+ "tools": tools,
+ "tool_choice": tool_choice,
+ "stream": True,
+ "temperature": kwargs.get("temperature", 0.7),
+ "max_tokens": kwargs.get("max_tokens", 1000),
+ **kwargs,
+ }
+
+ response = requests.post(f"{self.base_url}/v1/chat/completions", headers=self.headers, json=data, stream=True)
+
+ if response.status_code == 200:
+ content_buffer = ""
+ tool_calls_buffer = []
+
+ for line in response.iter_lines():
+ if line:
+ line = line.decode("utf-8")
+ if line.startswith("data: "):
+ data_str = line[6:]
+ if data_str == "[DONE]":
+ break
+ try:
+ chunk = json.loads(data_str)
+ delta = chunk["choices"][0]["delta"]
+
+                            # handle streamed content
+ if delta.get("content"):
+ content_buffer += delta["content"]
+ yield {"type": "content", "data": delta["content"]}
+
+                            # handle streamed tool calls
+ if delta.get("tool_calls"):
+ for tool_call in delta["tool_calls"]:
+ tool_calls_buffer.append(tool_call)
+ yield {"type": "tool_call", "data": tool_call}
+
+ except json.JSONDecodeError:
+ continue
+ else:
+ raise Exception(f"API调用失败: {response.status_code} - {response.text}")
+
+
+def test_simple_chat():
+ client = LightLLMClient()
+
+ try:
+ result = client.simple_chat("你好,请介绍一下你自己")
+ print("用户: 你好,请介绍一下你自己")
+ print("助手:", result["choices"][0]["message"]["content"])
+ print()
+ except Exception as e:
+ print(f"错误: {e}")
+ print("请确保 LightLLM 服务已启动,并检查配置")
+
+
+def test_stream_chat():
+ client = LightLLMClient()
+
+ try:
+ print("用户: 请写一个关于人工智能的短文")
+ print("助手: ", end="", flush=True)
+
+ for chunk in client.stream_chat("请写一个关于人工智能的短文"):
+ print(chunk, end="", flush=True)
+ print("\n")
+ except Exception as e:
+ print(f"错误: {e}")
+
+
+def test_function_call():
+ client = LightLLMClient()
+
+    # define the tools available for function calling
+ tools = [
+ {
+ "type": "function",
+ "function": {
+ "name": "get_weather",
+ "description": "获取指定城市的天气信息",
+ "parameters": {
+ "type": "object",
+ "properties": {
+ "city": {"type": "string", "description": "城市名称,例如:北京、上海"},
+ "unit": {"type": "string", "enum": ["celsius", "fahrenheit"], "description": "温度单位"},
+ },
+ "required": ["city"],
+ },
+ },
+ },
+ {
+ "type": "function",
+ "function": {
+ "name": "calculate",
+ "description": "执行数学计算",
+ "parameters": {
+ "type": "object",
+ "properties": {"expression": {"type": "string", "description": "数学表达式,例如:2+3*4"}},
+ "required": ["expression"],
+ },
+ },
+ },
+ ]
+
+ try:
+        # test a weather query
+ print("用户: 北京今天天气怎么样?")
+ result = client.function_call("北京今天天气怎么样?", tools)
+ message = result["choices"][0]["message"]
+
+ if message.get("tool_calls"):
+ print("助手决定调用函数:")
+ for tool_call in message["tool_calls"]:
+ print(f" 函数名: {tool_call['function']['name']}")
+ print(f" 参数: {tool_call['function']['arguments']}")
+ else:
+ print("助手:", message["content"])
+ print()
+
+        # Test: math calculation
+        print("User: Please compute 25 * 4 + 10")
+        result = client.function_call("Please compute 25 * 4 + 10", tools)
+        message = result["choices"][0]["message"]
+
+        if message.get("tool_calls"):
+            print("The assistant decided to call a function:")
+            for tool_call in message["tool_calls"]:
+                print(f"  Function: {tool_call['function']['name']}")
+                print(f"  Arguments: {tool_call['function']['arguments']}")
+        else:
+            print("Assistant:", message["content"])
+ print()
+
+ except Exception as e:
+ print(f"错误: {e}")
+
+
+def test_stream_function_call():
+    client = LightLLMClient()
+
+ tools = [
+ {
+ "type": "function",
+ "function": {
+ "name": "get_weather",
+ "description": "获取指定城市的天气信息",
+ "parameters": {
+ "type": "object",
+ "properties": {
+ "city": {"type": "string", "description": "城市名称"},
+ "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
+ },
+ "required": ["city"],
+ },
+ },
+ }
+ ]
+
+ try:
+ print("用户: 上海今天天气怎么样?")
+ print("助手: ", end="", flush=True)
+
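+        # Tool-call deltas may arrive split across several chunks; the name and
+        # arguments are printed as each piece comes in.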
+        for chunk in client.stream_function_call("What's the weather like in Shanghai today?", tools):
+ if chunk["type"] == "content":
+ print(chunk["data"], end="", flush=True)
+ elif chunk["type"] == "tool_call":
+ print(f"\n[函数调用: {chunk['data']['function']['name']}]")
+ if chunk["data"]["function"].get("arguments"):
+ print(f"参数: {chunk['data']['function']['arguments']}")
+ print("\n")
+
+ except Exception as e:
+ print(f"错误: {e}")
+
+
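+# Run all examples in sequence. The LightLLM server must be reachable at the
+# client's base_url before this script is executed.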
+def main():
+ test_simple_chat()
+ test_stream_chat()
+ test_function_call()
+ test_stream_function_call()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/test/test_constraint_server.py b/test/test_constraint_server.py
deleted file mode 100644
index 46802239f..000000000
--- a/test/test_constraint_server.py
+++ /dev/null
@@ -1,66 +0,0 @@
-import time
-import requests
-import json
-import threading
-
-"""
-python -m lightllm.server.api_server --model_dir /Meta-Llama-3-8B-Instruct \
- --host 0.0.0.0 \
- --port 8017 \
- --tp 1 \
- --max_total_token_num 100000 \
- --simple_constraint_mode
-"""
-
-
-class RequestThread(threading.Thread):
- def __init__(self, url, headers, data):
- threading.Thread.__init__(self)
- self.url = url
- self.headers = headers
- self.data = data
-
- def run(self):
- response = requests.post(self.url, headers=self.headers, data=json.dumps(self.data))
- if response.status_code == 200:
- print(response.json())
- else:
- print("Error:", response.status_code, response.text)
-
-
-url = "http://localhost:8017/generate"
-headers = {"Content-Type": "application/json"}
-
-for i in range(1):
- data = {
- "inputs": "(100+1+3)*2=",
- # 'temperature': 0.1,
- "parameters": {"do_sample": False, "regular_constraint": r"-?\d+"},
- }
- thread = RequestThread(url, headers, data)
- thread.start()
-
-time.sleep(2)
-
-for i in range(20):
- data = {
- "inputs": "Are dog a man? ",
- "parameters": {
- "do_sample": False,
- "ignore_eos": True,
- "max_new_tokens": 200,
- "regular_constraint": r"(Yes|No) Reason is [a-zA-Z\s]+",
- },
- }
- thread = RequestThread(url, headers, data)
- thread.start()
-
-time.sleep(10)
-
-for i in range(20):
- data = {
- "inputs": "Are dog a man? ",
- "parameters": {"do_sample": False, "ignore_eos": True, "max_new_tokens": 200, "allowed_token_ids": [2, 3]},
- }
- thread = RequestThread(url, headers, data)
- thread.start()
diff --git a/test/test_function_call_api.py b/test/test_function_call_api.py
deleted file mode 100644
index 584b41d84..000000000
--- a/test/test_function_call_api.py
+++ /dev/null
@@ -1,72 +0,0 @@
-import time
-import requests
-import json
-import threading
-
-
-class RequestThread(threading.Thread):
- def __init__(self, url, headers, data):
- threading.Thread.__init__(self)
- self.url = url
- self.headers = headers
- self.data = data
-
- def run(self):
- response = requests.post(self.url, headers=self.headers, data=json.dumps(self.data))
- if response.status_code == 200:
- print(response.json())
- else:
- print("Error:", response.status_code, response.text)
-
-
-openai_url = "http://localhost:8888/v1/chat/completions"
-headers = {"Content-Type": "application/json"}
-
-# Test OpenAI Tool Call API
-messages = [
- {
- "role": "user",
- "content": "What's the weather like in Boston today? "
- "Output a reasoning before act, then use the tools to help you.",
- }
-]
-tools = [
- {
- "type": "function",
- "function": {
- "name": "get_current_weather",
- "description": "Get the current weather in a given location",
- "parameters": {
- "type": "object",
- "properties": {
- "city": {
- "type": "string",
- "description": "The city to find the weather for, e.g. 'San Francisco'",
- },
- "state": {
- "type": "string",
- "description": "the two-letter abbreviation for the state that the city is"
- " in, e.g. 'CA' which would mean 'California'",
- },
- "unit": {
- "type": "string",
- "description": "The unit to fetch the temperature in",
- "enum": ["celsius", "fahrenheit"],
- },
- },
- "required": ["city", "state", "unit"],
- },
- },
- }
-]
-
-for i in range(1):
- data = {
- "model": "qwen25",
- "messages": messages,
- "tools": tools,
- "do_sample": False,
- "max_tokens": 1024,
- }
- thread = RequestThread(openai_url, headers, data)
- thread.start()