Checklist
Describe the bug
我尝试使用服务器上的2张A6000 GPU,通过tritonserver并设置TP=1,以实现在每张GPU卡上分别运行一个llama13b模型实例。模型可以正常启动,通过chatbot客户端调用时,单并发请求可以正常运行,但多并发请求时服务崩溃,崩溃信息如下:
[TM][INFO] [initGen] slot sequence_id context_len seq_limit_len finished
[TM][INFO] [initGen] 0 9528120 518 618 0
[TM][INFO] ------------------------- step = 520 -------------------------
[TM][INFO] ------------------------- step = 530 -------------------------
0# 0x000056112E9F2459 in tritonserver
1# 0x00007FB96EE2E090 in /usr/lib/x86_64-linux-gnu/libc.so.6
2# gsignal in /usr/lib/x86_64-linux-gnu/libc.so.6
3# abort in /usr/lib/x86_64-linux-gnu/libc.so.6
4# 0x00007FB96F1E7911 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
5# 0x00007FB96F1F338C in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
6# 0x00007FB96F1F33F7 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
7# 0x00007FB96F1F36A9 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
8# 0x00007FB96F1EA550 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
9# void triton_stream_callback<__half>(std::unordered_map<std::__cxx11::basic_string<char, std::char_traits, std::allocator >, turbomind::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits, std::allocator > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits, std::allocator > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits, std::allocator > const, turbomind::Tensor> > >, void) in /opt/tritonserver/backends/turbomind/libtransformer-shared.so
10# turbomind::LlamaBatch<__half>::finish() in /opt/tritonserver/backends/turbomind/libtransformer-shared.so
11# turbomind::LlamaV2<__half>::internalThreadEntry(int) in /opt/tritonserver/backends/turbomind/libtransformer-shared.so
12# 0x00007FB96F21FDE4 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
13# 0x00007FB970597609 in /usr/lib/x86_64-linux-gnu/libpthread.so.0
14# clone in /usr/lib/x86_64-linux-gnu/libc.so.6
Signal (11) received.
[TM][INFO] ------------------------- step = 540 -------------------------
[TM][INFO] ------------------------- step = 550 -------------------------
[TM][INFO] ------------------------- step = 560 -------------------------
0# 0x000056112E9F2459 in tritonserver
1# 0x00007FB96EE2E090 in /usr/lib/x86_64-linux-gnu/libc.so.6
2# abort in /usr/lib/x86_64-linux-gnu/libc.so.6
3# 0x00007FB96F1E7911 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
4# 0x00007FB96F1F338C in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
5# 0x00007FB96F1F33F7 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
6# 0x00007FB96F1F36A9 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
7# 0x00007FB96F1EA550 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
8# void triton_stream_callback<__half>(std::unordered_map<std::__cxx11::basic_string<char, std::char_traits, std::allocator >, turbomind::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits, std::allocator > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits, std::allocator > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits, std::allocator > const, turbomind::Tensor> > >, void) in /opt/tritonserver/backends/turbomind/libtransformer-shared.so
9# turbomind::LlamaBatch<__half>::finish() in /opt/tritonserver/backends/turbomind/libtransformer-shared.so
10# turbomind::LlamaV2<__half>::internalThreadEntry(int) in /opt/tritonserver/backends/turbomind/libtransformer-shared.so
11# 0x00007FB96F21FDE4 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
12# 0x00007FB970597609 in /usr/lib/x86_64-linux-gnu/libpthread.so.0
13# clone in /usr/lib/x86_64-linux-gnu/libc.so.6
Reproduction
config.pbtxt采用了系统生成的值,部署参数如下:
instance_group [
  {
    # max concurrent instances
    count: 48
    kind: KIND_CPU
  }
]
parameters {
  key: "pipeline_para_size"
  value: { string_value: "1" }
}
parameters {
  key: "tensor_para_size"
  value: { string_value: "1" }
}
启动参数:
docker run --gpus "device=0,1" --rm \
    -v $model_workspace:/workspace/models \
    --shm-size 16g -p $triton_port:$triton_port -p $metrics_port:$metrics_port \
    --cap-add=SYS_PTRACE --cap-add=SYS_ADMIN --security-opt seccomp=unconfined \
    --name lmdeploy-20 -it --env NCCL_LAUNCH_MODE=GROUP openmmlab/lmdeploy:v0.0.10 \
    tritonserver --model-repository=/workspace/models/model_repository \
    --allow-http=0 --allow-grpc=1 --grpc-port=$triton_port --metrics-port=$metrics_port \
    --log-verbose=0 --allow-metrics=1
请问这是上述配置及启动方式的问题,还是代码的问题?
Checklist
Describe the bug
我尝试使用服务器上的2张A6000 GPU,通过tritonserver并设置TP=1,以实现在每张GPU卡上分别运行一个llama13b模型实例。模型可以正常启动,通过chatbot客户端调用时,单并发请求可以正常运行,但多并发请求时服务崩溃,崩溃信息如下:
[TM][INFO] [initGen] slot sequence_id context_len seq_limit_len finished
[TM][INFO] [initGen] 0 9528120 518 618 0
[TM][INFO] ------------------------- step = 520 -------------------------
[TM][INFO] ------------------------- step = 530 -------------------------
0# 0x000056112E9F2459 in tritonserver
1# 0x00007FB96EE2E090 in /usr/lib/x86_64-linux-gnu/libc.so.6
2# gsignal in /usr/lib/x86_64-linux-gnu/libc.so.6
3# abort in /usr/lib/x86_64-linux-gnu/libc.so.6
4# 0x00007FB96F1E7911 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
5# 0x00007FB96F1F338C in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
6# 0x00007FB96F1F33F7 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
7# 0x00007FB96F1F36A9 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
8# 0x00007FB96F1EA550 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
9# void triton_stream_callback<__half>(std::unordered_map<std::__cxx11::basic_string<char, std::char_traits, std::allocator >, turbomind::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits, std::allocator > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits, std::allocator > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits, std::allocator > const, turbomind::Tensor> > >, void) in /opt/tritonserver/backends/turbomind/libtransformer-shared.so
10# turbomind::LlamaBatch<__half>::finish() in /opt/tritonserver/backends/turbomind/libtransformer-shared.so
11# turbomind::LlamaV2<__half>::internalThreadEntry(int) in /opt/tritonserver/backends/turbomind/libtransformer-shared.so
12# 0x00007FB96F21FDE4 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
13# 0x00007FB970597609 in /usr/lib/x86_64-linux-gnu/libpthread.so.0
14# clone in /usr/lib/x86_64-linux-gnu/libc.so.6
Signal (11) received.
[TM][INFO] ------------------------- step = 540 -------------------------
[TM][INFO] ------------------------- step = 550 -------------------------
[TM][INFO] ------------------------- step = 560 -------------------------
0# 0x000056112E9F2459 in tritonserver
1# 0x00007FB96EE2E090 in /usr/lib/x86_64-linux-gnu/libc.so.6
2# abort in /usr/lib/x86_64-linux-gnu/libc.so.6
3# 0x00007FB96F1E7911 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
4# 0x00007FB96F1F338C in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
5# 0x00007FB96F1F33F7 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
6# 0x00007FB96F1F36A9 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
7# 0x00007FB96F1EA550 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
8# void triton_stream_callback<__half>(std::unordered_map<std::__cxx11::basic_string<char, std::char_traits, std::allocator >, turbomind::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits, std::allocator > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits, std::allocator > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits, std::allocator > const, turbomind::Tensor> > >, void) in /opt/tritonserver/backends/turbomind/libtransformer-shared.so
9# turbomind::LlamaBatch<__half>::finish() in /opt/tritonserver/backends/turbomind/libtransformer-shared.so
10# turbomind::LlamaV2<__half>::internalThreadEntry(int) in /opt/tritonserver/backends/turbomind/libtransformer-shared.so
11# 0x00007FB96F21FDE4 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
12# 0x00007FB970597609 in /usr/lib/x86_64-linux-gnu/libpthread.so.0
13# clone in /usr/lib/x86_64-linux-gnu/libc.so.6
Reproduction
config.pbtxt采用了系统生成的值,部署参数如下:
instance_group [
  {
    # max concurrent instances
    count: 48
    kind: KIND_CPU
  }
]
parameters {
  key: "pipeline_para_size"
  value: { string_value: "1" }
}
parameters {
  key: "tensor_para_size"
  value: { string_value: "1" }
}
启动参数:
docker run --gpus "device=0,1" --rm \
    -v $model_workspace:/workspace/models \
    --shm-size 16g -p $triton_port:$triton_port -p $metrics_port:$metrics_port \
    --cap-add=SYS_PTRACE --cap-add=SYS_ADMIN --security-opt seccomp=unconfined \
    --name lmdeploy-20 -it --env NCCL_LAUNCH_MODE=GROUP openmmlab/lmdeploy:v0.0.10 \
    tritonserver --model-repository=/workspace/models/model_repository \
    --allow-http=0 --allow-grpc=1 --grpc-port=$triton_port --metrics-port=$metrics_port \
    --log-verbose=0 --allow-metrics=1
请问这是上述配置及启动方式的问题,还是代码的问题?