add modelscope and lora testcase (#1506)
* update

* update

* update

* update

* update
zhulinJulia24 committed Apr 30, 2024
1 parent ba0e6b3 commit 7c4e75b
Showing 24 changed files with 714 additions and 415 deletions.
63 changes: 36 additions & 27 deletions .github/workflows/benchmark.yml
@@ -32,12 +32,22 @@ on:
required: true
description: 'Dependency packages, you can also set a specific version'
type: string
default: 'packaging transformers_stream_generator transformers datasets matplotlib'
default: 'packaging transformers_stream_generator transformers datasets matplotlib jmespath'
default_tp:
required: true
description: 'Default tp value'
type: string
default: '--tp 1'
log_level:
required: true
description: 'Default ERROR, can also set INFO'
type: string
default: 'ERROR'
kvint_quantization:
required: true
description: 'Default kvint4, kvint8'
type: string
default: "['kvint4','kvint8']"
models:
required: true
description: 'Set models run benchmark'
@@ -52,6 +62,7 @@ env:
DATASET_FILE: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json
TP_INFO: --tp 1
LOOP_NUM: 3
TRITON_PTXAS_PATH: /usr/local/cuda/bin/ptxas


jobs:
@@ -93,15 +104,15 @@ jobs:
generation_benchmark:
needs: linux-build
if: ${{github.event_name == 'schedule' || (!cancelled() && contains(fromJSON(github.event.inputs.benchmark_type), 'generation'))}}
runs-on: [self-hosted, linux-a100-2]
runs-on: [self-hosted, linux-a100]
strategy:
fail-fast: false
matrix:
model: ${{fromJSON(github.event.inputs.models)}}
timeout-minutes: 120
env:
MODEL_PATH: /nvme/qa_test_models/${{matrix.model}}
CUDA_VISIBLE_DEVICES: 4,5
CUDA_VISIBLE_DEVICES: 6,7
container:
image: nvcr.io/nvidia/tritonserver:22.12-py3
options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip"
@@ -119,7 +130,7 @@ jobs:
ref: ${{github.event.inputs.repo_ref || 'main'}}
- name: Copy repository - offline
if: ${{inputs.offline_mode}}
run: cp -r /nvme/qa_test_models/offline_pkg/lmdeploy/* .
run: cp -r /nvme/qa_test_models/offline_pkg/lmdeploy/. .
- name: Download Artifacts
if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
uses: actions/download-artifact@v4
@@ -133,7 +144,7 @@
run: |
python3 -m pip install ${{inputs.dependency_pkgs}}
# manually install flash attn
# the install packeage from. https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.6/flash_attn-2.3.6+cu118torch2.1cxx11abiFALSE-cp38-cp38-linux_x86_64.whl
# the install packeage from. https://github.com/Dao-AILab/flash-attention/releases
python3 -m pip install /root/packages/flash_attn-2.3.6+cu118torch2.1cxx11abiFALSE-cp38-cp38-linux_x86_64.whl
- name: Install lmdeploy
if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
@@ -216,7 +227,7 @@ jobs:
ref: ${{github.event.inputs.repo_ref || 'main'}}
- name: Copy repository - offline
if: ${{inputs.offline_mode}}
run: cp -r /nvme/qa_test_models/offline_pkg/lmdeploy/* .
run: cp -r /nvme/qa_test_models/offline_pkg/lmdeploy/. .
- name: Download Artifacts
if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
uses: actions/download-artifact@v4
@@ -230,7 +241,7 @@
run: |
python3 -m pip install ${{inputs.dependency_pkgs}}
# manually install flash attn
# the install packeage from. https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.6/flash_attn-2.3.6+cu118torch2.1cxx11abiFALSE-cp38-cp38-linux_x86_64.whl
# the install packeage from. https://github.com/Dao-AILab/flash-attention/releases
python3 -m pip install /root/packages/flash_attn-2.3.6+cu118torch2.1cxx11abiFALSE-cp38-cp38-linux_x86_64.whl
- name: Install lmdeploy
if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
@@ -266,7 +277,7 @@
done
done
- name: Run throughput benchmark - kvint4
if: contains(fromJSON(github.event.inputs.backend), 'turbomind')
if: contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.kvint_quantization), 'kvint4')
env:
result_dir: benchmark-throughput-turbomind-kvint4
run: |
@@ -281,7 +292,7 @@
done
done
- name: Run throughput benchmark - kvint8
if: contains(fromJSON(github.event.inputs.backend), 'turbomind')
if: contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.kvint_quantization), 'kvint8')
env:
result_dir: benchmark-throughput-turbomind-kvint8
run: |
@@ -357,7 +368,7 @@ jobs:
ref: ${{github.event.inputs.repo_ref || 'main'}}
- name: Copy repository - offline
if: ${{inputs.offline_mode}}
run: cp -r /nvme/qa_test_models/offline_pkg/lmdeploy/* .
run: cp -r /nvme/qa_test_models/offline_pkg/lmdeploy/. .
- name: Download Artifacts
if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
uses: actions/download-artifact@v4
@@ -371,7 +382,7 @@
run: |
python3 -m pip install ${{inputs.dependency_pkgs}}
# manually install flash attn
# the install packeage from. https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.6/flash_attn-2.3.6+cu118torch2.1cxx11abiFALSE-cp38-cp38-linux_x86_64.whl
# the install packeage from. https://github.com/Dao-AILab/flash-attention/releases
python3 -m pip install /root/packages/flash_attn-2.3.6+cu118torch2.1cxx11abiFALSE-cp38-cp38-linux_x86_64.whl
- name: Install lmdeploy
if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
@@ -394,7 +405,7 @@
- name: Start restful api turbomind
if: contains(fromJSON(github.event.inputs.backend), 'turbomind')
run: |
lmdeploy serve api_server $MODEL_PATH $MAX_ENTRY_COUNT $MODEL_FORMAT $TP_INFO --log-level INFO > turbomind_run.log 2>&1 &
lmdeploy serve api_server $MODEL_PATH $MAX_ENTRY_COUNT $MODEL_FORMAT $TP_INFO --log-level ${{inputs.log_level}} > turbomind_run.log 2>&1 &
echo "restful_pid=$!" >> "$GITHUB_ENV"
sleep 180s
- name: Run restful benchmark
@@ -414,17 +425,17 @@
done
- name: Kill restful api turbomind
continue-on-error: true
if: always()
if: contains(fromJSON(github.event.inputs.backend), 'turbomind')
run: |
kill -15 "$restful_pid"
- name: Start restful api turbomind - kvint4
if: contains(fromJSON(github.event.inputs.backend), 'turbomind')
if: contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.kvint_quantization), 'kvint4')
run: |
lmdeploy serve api_server $MODEL_PATH $MAX_ENTRY_COUNT $MODEL_FORMAT $TP_INFO --quant-policy 4 --log-level INFO > turbomind_kvint4_run.log 2>&1 &
lmdeploy serve api_server $MODEL_PATH $MAX_ENTRY_COUNT $MODEL_FORMAT $TP_INFO --quant-policy 4 --log-level ${{inputs.log_level}} > turbomind_kvint4_run.log 2>&1 &
echo "restful_pid=$!" >> "$GITHUB_ENV"
sleep 180s
- name: Run restful benchmark -kvint4
if: contains(fromJSON(github.event.inputs.backend), 'turbomind')
if: contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.kvint_quantization), 'kvint4')
env:
result_dir: benchmark-restful-turbomind-kvint4
run: |
@@ -439,18 +450,17 @@
done
done
- name: Kill restful api turbomind - kvint4
continue-on-error: true
if: always()
if: contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.kvint_quantization), 'kvint4')
run: |
kill -15 "$restful_pid"
- name: Start restful api turbomind - kvint8
if: contains(fromJSON(github.event.inputs.backend), 'turbomind')
if: contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.kvint_quantization), 'kvint8')
run: |
lmdeploy serve api_server $MODEL_PATH $MAX_ENTRY_COUNT $MODEL_FORMAT $TP_INFO --quant-policy 8 --log-level INFO > turbomind_kvint8_run.log 2>&1 &
lmdeploy serve api_server $MODEL_PATH $MAX_ENTRY_COUNT $MODEL_FORMAT $TP_INFO --quant-policy 8 --log-level ${{inputs.log_level}} > turbomind_kvint8_run.log 2>&1 &
echo "restful_pid=$!" >> "$GITHUB_ENV"
sleep 180s
- name: Run restful benchmark -kvint8
if: contains(fromJSON(github.event.inputs.backend), 'turbomind')
if: contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.kvint_quantization), 'kvint8')
env:
result_dir: benchmark-restful-turbomind-kvint8
run: |
@@ -465,14 +475,13 @@
done
done
- name: Kill restful api turbomind - kvint8
continue-on-error: true
if: always()
if: contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.kvint_quantization), 'kvint8')
run: |
kill -15 "$restful_pid"
- name: Start restful api pytorch
if: (!contains(env.MODEL_FORMAT, 'awq') && contains(fromJSON(github.event.inputs.backend), 'pytorch'))
run: |
lmdeploy serve api_server $MODEL_PATH $MAX_ENTRY_COUNT $MODEL_FORMAT $TP_INFO --backend pytorch --log-level INFO > pytorch_run.log 2>&1 &
lmdeploy serve api_server $MODEL_PATH $MAX_ENTRY_COUNT $MODEL_FORMAT $TP_INFO --backend pytorch --log-level ${{inputs.log_level}} > pytorch_run.log 2>&1 &
echo "restful_pid=$!" >> "$GITHUB_ENV"
sleep 120s
- name: Run restful benchmark - pytorch
@@ -491,7 +500,7 @@
done
done
- name: Kill restful api pytorch
if: always()
if: (!contains(env.MODEL_FORMAT, 'awq') && contains(fromJSON(github.event.inputs.backend), 'pytorch'))
run: |
kill -15 "$restful_pid"
- name: Save reports
@@ -521,7 +530,7 @@ jobs:
WORKDIR: /nvme/qa_test_models/triton_workspace
OFFLINE_PKGS: /nvme/qa_test_models/offline_pkg
MODEL_PATH: /nvme/qa_test_models/autotest_model/workspace_${{matrix.model}}
DEVICE: device=7
DEVICE: device=4
GRPC_PORT: 33337
strategy:
fail-fast: false
@@ -537,7 +546,7 @@
- name: Set params
if: (contains( matrix.model, 'internlm2-chat-20b'))
run: |
echo 'DEVICE="device=6,7"' >> "$GITHUB_ENV"
echo 'DEVICE="device=4,5"' >> "$GITHUB_ENV"
- name: Create test container
run: |
export date_today="$(date +'%H%M%S')"
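The benchmark workflow above gains two workflow_dispatch inputs: kvint_quantization, a JSON-style list that gates the kvint4/kvint8 benchmark and api_server steps, and log_level, which replaces the hard-coded `--log-level INFO` on each `lmdeploy serve api_server` launch. Below is a minimal, hypothetical workflow (not part of this commit) sketching that gating pattern in isolation:

# Minimal sketch of the input-gating pattern used in benchmark.yml above.
# Hypothetical standalone workflow; the defaults mirror the diff. A step-level
# `if:` parses the JSON-style input with fromJSON() and checks list membership.
on:
  workflow_dispatch:
    inputs:
      kvint_quantization:
        required: true
        description: 'Default kvint4, kvint8'
        type: string
        default: "['kvint4','kvint8']"
      log_level:
        required: true
        description: 'Default ERROR, can also set INFO'
        type: string
        default: 'ERROR'

jobs:
  demo:
    runs-on: ubuntu-latest
    steps:
      - name: Run the kvint4 variant only when requested
        if: contains(fromJSON(github.event.inputs.kvint_quantization), 'kvint4')
        run: echo "api_server would start here with --quant-policy 4 --log-level ${{inputs.log_level}}"

The same conditions also replace the earlier `if: always()` on the matching kill steps, so a server that was never started no longer leaves a dangling `kill -15` step.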
33 changes: 20 additions & 13 deletions .github/workflows/daily_ete_test.yml
@@ -32,7 +32,7 @@ on:
required: true
description: 'Dependency packages, you can also set a specific version'
type: string
default: 'packaging transformers_stream_generator transformers datasets matplotlib openai attrdict timm'
default: 'packaging transformers_stream_generator transformers datasets matplotlib openai attrdict timm modelscope jmespath'
tools_regression:
required: true
description: 'Whether start a tool regression'
@@ -49,12 +49,13 @@ on:
type: boolean
default: true
schedule:
- cron: '00 21 * * *'
- cron: '00 20 * * 1-5'

env:
HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache
HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai
OUTPUT_FOLDER: cuda11.8_dist_${{ github.run_id }}
TRITON_PTXAS_PATH: /usr/local/cuda/bin/ptxas


jobs:
@@ -101,12 +102,17 @@
env:
REPORT_DIR: /nvme/qa_test_models/test-reports
PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA
MODELSCOPE_CACHE: /root/modelscope_hub
MODELSCOPE_MODULES_CACHE: /root/modelscope_modules
container:
image: nvcr.io/nvidia/tritonserver:22.12-py3
options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip"
volumes:
- /nvme/github-actions/pip-cache:/root/.cache/pip
- /nvme/github-actions/packages:/root/packages
- /nvme/github-actions/modelscope_hub:/root/modelscope_hub
- /nvme/github-actions/modelscope_modules:/root/modelscope_modules
- /nvme/github-actions/resources/lora:/root/lora
- /nvme/qa_test_models:/nvme/qa_test_models
- /nvme/qa_test_models/lmdeploy/autotest:/local_case
- /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
@@ -119,7 +125,7 @@ jobs:
ref: ${{github.event.inputs.repo_ref || 'main'}}
- name: Copy repository - offline
if: ${{inputs.offline_mode}}
run: cp -r /nvme/qa_test_models/offline_pkg/lmdeploy/* .
run: cp -r /nvme/qa_test_models/offline_pkg/lmdeploy/. .
- name: Download Artifacts
if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
uses: actions/download-artifact@v4
@@ -128,13 +134,13 @@
- name: Install pytorch
run: |
python3 -m pip cache dir
python3 -m pip install torch==2.1.0 torchvision==0.16.0 --index-url https://download.pytorch.org/whl/cu118
python3 -m pip install torch==2.2.1 torchvision==0.17.1 --index-url https://download.pytorch.org/whl/cu118
- name: Install lmdeploy - dependency
run: |
python3 -m pip install ${{inputs.dependency_pkgs || 'packaging transformers_stream_generator transformers datasets matplotlib openai'}}
python3 -m pip install ${{inputs.dependency_pkgs || 'packaging transformers_stream_generator transformers datasets matplotlib openai attrdict timm modelscope jmespath'}}
# manually install flash attn
# the install packeage from. https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.6/flash_attn-2.3.6+cu118torch2.1cxx11abiFALSE-cp38-cp38-linux_x86_64.whl
python3 -m pip install /root/packages/flash_attn-2.3.6+cu118torch2.1cxx11abiFALSE-cp38-cp38-linux_x86_64.whl
# the install packeage from. https://github.com/Dao-AILab/flash-attention/releases
python3 -m pip install /root/packages/flash_attn-2.5.7+cu118torch2.2cxx11abiFALSE-cp38-cp38-linux_x86_64.whl
- name: Install lmdeploy
if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
run: |
@@ -151,6 +157,7 @@
run: |
python3 -m pip list
lmdeploy check_env
cp -r /root/lora .
rm -rf allure-results
# remove tmp log in testcase
rm -rf /nvme/qa_test_models/autotest_model/log/*
@@ -233,7 +240,7 @@ jobs:
continue-on-error: true
if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'interface-pipeline'))
run: |
pytest autotest/interface/pipeline -m 'not pr_test' -s -vv --alluredir=allure-results
pytest autotest/interface/pipeline -m 'not pr_test' --alluredir=allure-results
- name: Test lmdeploy - local testcase
if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.model), 'local_case')
run: |
@@ -439,7 +446,7 @@ jobs:
ref: ${{github.event.inputs.repo_ref || 'main'}}
- name: Copy repository - offline
if: ${{inputs.offline_mode}}
run: cp -r /nvme/qa_test_models/offline_pkg/lmdeploy/* .
run: cp -r /nvme/qa_test_models/offline_pkg/lmdeploy/. .
- name: Download Artifacts
if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
uses: actions/download-artifact@v4
@@ -448,13 +455,13 @@
- name: Install pytorch
run: |
python3 -m pip cache dir
python3 -m pip install torch==2.1.0 torchvision==0.16.0 --index-url https://download.pytorch.org/whl/cu118
python3 -m pip install torch==2.2.1 torchvision==0.17.1 --index-url https://download.pytorch.org/whl/cu118
- name: Install lmdeploy - dependency
run: |
python3 -m pip install ${{inputs.dependency_pkgs || 'packaging transformers_stream_generator transformers datasets matplotlib openai'}}
python3 -m pip install ${{inputs.dependency_pkgs || 'packaging transformers_stream_generator transformers datasets matplotlib openai attrdict timm modelscope jmespath'}}
# manually install flash attn
# the install packeage from. https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.6/flash_attn-2.3.6+cu118torch2.1cxx11abiFALSE-cp38-cp38-linux_x86_64.whl
python3 -m pip install /root/packages/flash_attn-2.3.6+cu118torch2.1cxx11abiFALSE-cp38-cp38-linux_x86_64.whl
# the install packeage from. https://github.com/Dao-AILab/flash-attention/releases
python3 -m pip install /root/packages/flash_attn-2.5.7+cu118torch2.2cxx11abiFALSE-cp38-cp38-linux_x86_64.whl
- name: Install lmdeploy
if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
run: |
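In the daily end-to-end workflow, the ModelScope and LoRA test assets come from host directories bind-mounted into the test container, with the MODELSCOPE_* cache variables pointed at those mounts; the torch pin also moves to 2.2.1 with a matching flash-attn 2.5.7 wheel. A trimmed, hypothetical job showing just that wiring:

# Trimmed sketch (hypothetical job) of the ModelScope/LoRA wiring added in
# daily_ete_test.yml: host caches are mounted into the container, the
# MODELSCOPE_* variables point the SDK at those mounts, and the LoRA adapters
# are copied into the workspace before the tests run.
jobs:
  test_tools:
    runs-on: [self-hosted, linux-a100]
    env:
      MODELSCOPE_CACHE: /root/modelscope_hub
      MODELSCOPE_MODULES_CACHE: /root/modelscope_modules
    container:
      image: nvcr.io/nvidia/tritonserver:22.12-py3
      options: "--gpus=all --ipc=host --user root"
      volumes:
        - /nvme/github-actions/modelscope_hub:/root/modelscope_hub
        - /nvme/github-actions/modelscope_modules:/root/modelscope_modules
        - /nvme/github-actions/resources/lora:/root/lora
    steps:
      - name: Stage LoRA adapters for the test cases
        run: cp -r /root/lora .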
6 changes: 6 additions & 0 deletions autotest/config.yaml
@@ -14,6 +14,7 @@ tp_config:

turbomind_chat_model:
- meta-llama/Llama-2-7b-chat-hf
- meta-llama/Meta-Llama-3-8B-Instruct
- internlm/internlm2-chat-1_8b
- internlm/internlm-chat-7b
- internlm/internlm-chat-20b
@@ -39,6 +40,7 @@ turbomind_chat_model:

pytorch_chat_model:
- meta-llama/Llama-2-7b-chat-hf
- meta-llama/Meta-Llama-3-8B-Instruct
- internlm/internlm-chat-7b
- internlm/internlm-chat-20b
- internlm/internlm2-chat-7b
@@ -81,8 +83,10 @@ quatization_case_config:
- internlm/internlm2-chat-20b
- baichuan-inc/Baichuan2-7B-Chat
- internlm/internlm2-20b
- Qwen/Qwen1.5-7B-Chat
kvint:
- meta-llama/Llama-2-7b-chat-hf
- meta-llama/Meta-Llama-3-8B-Instruct
- internlm/internlm2-chat-1_8b
- internlm/internlm-chat-7b
- internlm/internlm-chat-20b
@@ -101,6 +105,8 @@ quatization_case_config:
- codellama/CodeLlama-7b-Instruct-hf
w8a8:
- meta-llama/Llama-2-7b-chat-hf
- meta-llama/Meta-Llama-3-8B-Instruct
- internlm/internlm-chat-7b
- internlm/internlm-chat-20b
- internlm/internlm2-chat-20b
- internlm/internlm2-chat-7b
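For context, a hypothetical post-commit excerpt of the touched autotest/config.yaml sections (abridged; the surrounding entries are exactly as shown in the diff). The new meta-llama/Meta-Llama-3-8B-Instruct entry is listed per backend and per quantization scheme, presumably so the parametrized test cases pick it up without further changes:

# Abridged, hypothetical excerpt of autotest/config.yaml after this commit.
turbomind_chat_model:
  - meta-llama/Llama-2-7b-chat-hf
  - meta-llama/Meta-Llama-3-8B-Instruct
pytorch_chat_model:
  - meta-llama/Llama-2-7b-chat-hf
  - meta-llama/Meta-Llama-3-8B-Instruct
quatization_case_config:   # key spelling as it appears in the repository
  kvint:
    - meta-llama/Llama-2-7b-chat-hf
    - meta-llama/Meta-Llama-3-8B-Instruct
  w8a8:
    - meta-llama/Llama-2-7b-chat-hf
    - meta-llama/Meta-Llama-3-8B-Instruct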
5 changes: 2 additions & 3 deletions autotest/interface/pipeline/test_pipeline_turbomind_func.py
@@ -192,7 +192,7 @@ def test_pipeline_stream_infer(self, config, model):
assert outputs.generate_token_len == i, str(outputs)
else:
with assume:
assert outputs.generate_token_len == i - 1, str(outputs)
assert outputs.generate_token_len >= i - 1, str(outputs)
with assume:
assert outputs.input_token_len > 50, str(outputs)
with assume:
@@ -250,7 +250,7 @@ def test_pipeline_stream_infer2(self, config, model):
assert outputs.generate_token_len == i, str(outputs)
else:
with assume:
assert outputs.generate_token_len == i - 1, str(outputs)
assert outputs.generate_token_len >= i - 1, str(outputs)
with assume:
assert outputs.input_token_len > 50, str(outputs)
with assume:
@@ -261,7 +261,6 @@ def test_pipeline_stream_infer2(self, config, model):
outputs_list.append(outputs)
continue

print(final_response)
for output in outputs_list[0:-1]:
with assume:
assert output.finish_reason is None, str(output)