Hello, when I run the inference engine in parallel using multiple streams, the speed is doubled. This is my code. Thank you very much for your help.
const int nStreams =3;
std::cout << "Creating " << nStreams << " CUDA streams." << std::endl;
cudaStream_t stream[nStreams];;
for (int i = 0; i < nStreams; i++)
cudaStreamCreate(&stream[i]);
//cudaStream_t stream;
//cudaStreamCreate(&stream);
//复制图片数据到GPU
cudaEvent_t start, stop;
float elapsedTime;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
for (int i = 0; i < nStreams; i++) {
cudaMemcpyAsync(buffers[inputIndex], data, m_nImgNum * m_nChannelNum * m_nImgSize * m_nImgSize * sizeof(float), cudaMemcpyHostToDevice, stream[i]);
//执行推理
context->enqueueV2(buffers, stream[i], nullptr);
//将GPU数据拷贝回CPU
cudaMemcpyAsync(outdata, buffers[outputIndex], m_nImgNum * m_nImgSize * m_nImgSize * sizeof(int), cudaMemcpyDeviceToHost, stream[i]);
//cudaStreamSynchronize(stream[i]);
}
for (int i = 0; i < nStreams; ++i)
cudaStreamSynchronize(stream[i]);
//销毁流
CUDA_CHECK();
CUDA_CALL(cudaDeviceSynchronize());
CUDA_CALL(cudaEventRecord(stop, 0));
CUDA_CALL(cudaEventSynchronize(stop));
CUDA_CALL(cudaEventElapsedTime(&elapsedTime, start, stop));
std::cout << "Whole process took " << elapsedTime << "ms." << std::endl;
for (int i = 0; i < nStreams; ++i)
cudaStreamDestroy(stream[i]);
Hello, when I run the inference engine in parallel using multiple streams, the speed is doubled. This is my code. Thank you very much for your help.