thrust::universal_vector push_back is very slow #809

Open
pca006132 opened this issue May 14, 2022 · 0 comments

I was trying to use a single universal_vector to replace a pair of host_vector and device_vector, hoping to reduce memory usage and support computations on buffers larger than GPU memory. However, universal_vector seems to be very slow for push_back operations, regardless of whether the operation requires a reallocation.

Simple benchmark:

#include <chrono>
#include <iostream>
#include <thrust/execution_policy.h>
#include <thrust/universal_vector.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>

constexpr int repeat = 1000000;

// Time `repeat` push_back calls on a universal_vector, optionally reserving capacity first.
int universal(bool reserve) {
  thrust::universal_vector<int> test_vector;
  if (reserve) {
    test_vector.reserve(repeat);
    /* cudaMemPrefetchAsync(test_vector.data().get(), repeat * sizeof(int), 0, 0); */
  }
  std::chrono::time_point<std::chrono::high_resolution_clock> t0 = std::chrono::high_resolution_clock::now();
  {
    for (int i = 0; i < repeat; i++) {
      test_vector.push_back(i);
    }
  }
  std::chrono::time_point<std::chrono::high_resolution_clock> t1 = std::chrono::high_resolution_clock::now();

  return std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count();
}

// Time assigning `repeat` elements of a pre-sized universal_vector through operator[].
int universal_set() {
  thrust::universal_vector<int> test_vector(repeat);
  std::chrono::time_point<std::chrono::high_resolution_clock> t0 = std::chrono::high_resolution_clock::now();
  {
    for (int i = 0; i < repeat; i++) {
      test_vector[i] = i;
    }
  }
  std::chrono::time_point<std::chrono::high_resolution_clock> t1 = std::chrono::high_resolution_clock::now();

  return std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count();
}

// Time building a host_vector with push_back, then copying it into a device_vector.
int host_device() {
  thrust::host_vector<int> test_vector_h;
  std::chrono::time_point<std::chrono::high_resolution_clock> t0 = std::chrono::high_resolution_clock::now();
  for (int i = 0; i < repeat; i++) {
    test_vector_h.push_back(i);
  }
  thrust::device_vector<int> test_vector_d(test_vector_h);
  std::chrono::time_point<std::chrono::high_resolution_clock> t1 = std::chrono::high_resolution_clock::now();
  return std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count();
}

int main() {
  // warm up
  for (int i = 0; i < 5; i++)
    host_device();
  std::cout << "host device (no reserve):" << host_device()   << "ms" << std::endl;
  std::cout << "universal (no reserve):" <<  universal(false) << "ms" << std::endl;
  std::cout << "universal (with reserve):" << universal(true) << "ms" << std::endl;
  std::cout << "universal (set elements):" << universal_set() << "ms" << std::endl;
  std::cout << std::endl;
}

Output:

host device (no reserve):6ms
universal (no reserve):5578ms
universal (with reserve):5584ms
universal (set elements):1ms

I tried running nvprof, and got the following result:

==119609== NVPROF is profiling process 119609, command: ./a.out
universal (with reserve):10528ms

==119609== Profiling application: ./a.out
==119609== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  1.66960s   1000000  1.6690us  1.5990us  167.81us  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__uninitialized_fill::functor<thrust::pointer<int, thrust::cuda_cub::tag, int&, thrust::use_default>, int>, unsigned long>, thrust::cuda_cub::__uninitialized_fill::functor<thrust::pointer<int, thrust::cuda_cub::tag, int&, thrust::use_default>, int>, unsigned long>(thrust::cuda_cub::tag, int&)
      API calls:   46.65%  4.16090s   2000000  2.0800us     573ns  269.30us  cudaStreamSynchronize
                   30.47%  2.71802s   1000000  2.7180us  2.2110us  6.2884ms  cudaLaunchKernel
                   10.97%  978.84ms  10000003      97ns      86ns  279.10us  cudaGetLastError
                    5.09%  454.43ms   2000001     227ns     197ns  243.55us  cudaGetDevice
                    2.55%  227.76ms   1000000     227ns     196ns  277.03us  cudaDeviceGetAttribute
                    2.44%  217.78ms   2000000     108ns      87ns  29.846us  cudaPeekAtLastError
                    1.81%  161.54ms         1  161.54ms  161.54ms  161.54ms  cudaMallocManaged
                    0.00%  194.83us       101  1.9290us      99ns  139.97us  cuDeviceGetAttribute
                    0.00%  156.64us         1  156.64us  156.64us  156.64us  cudaFree
                    0.00%  17.573us         1  17.573us  17.573us  17.573us  cuDeviceGetName
                    0.00%  10.741us         1  10.741us  10.741us  10.741us  cudaFuncGetAttributes
                    0.00%  7.0360us         1  7.0360us  7.0360us  7.0360us  cuDeviceGetPCIBusId
                    0.00%  1.2080us         3     402ns     150ns     887ns  cuDeviceGetCount
                    0.00%     602ns         2     301ns      96ns     506ns  cuDeviceGet
                    0.00%     264ns         1     264ns     264ns     264ns  cuDeviceTotalMem
                    0.00%     245ns         1     245ns     245ns     245ns  cudaGetDeviceCount
                    0.00%     222ns         1     222ns     222ns     222ns  cuDeviceGetUuid

==119609== Unified Memory profiling result:
Device "NVIDIA GeForce GTX 1050 (0)"
   Count  Avg Size  Min Size  Max Size  Total Size  Total Time  Name
      12         -         -         -           -  998.7450us  Gpu page fault groups

So it seems to me that each push_back requires a cudaStreamSynchronize? I guess this might be the cause of the problem, but I'm not familiar with CUDA, so I might be wrong. I'm using a GeForce GTX 1050; I'm not sure whether this is related to demand paging.
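
For what it's worth, a workaround consistent with these numbers is to do the per-element work on the host and only touch the universal_vector in bulk, so it performs a single allocation and copy instead of one kernel launch plus synchronization per element. A minimal sketch (the helper fill_universal is my own, not part of Thrust):

#include <thrust/universal_vector.h>
#include <thrust/host_vector.h>

// Hypothetical helper: build the data with cheap host-side push_back,
// then move it into managed memory with one bulk copy.
thrust::universal_vector<int> fill_universal(int n) {
  thrust::host_vector<int> staging;
  staging.reserve(n);
  for (int i = 0; i < n; i++)
    staging.push_back(i);  // plain host push_back, no kernel launches
  // Range constructor performs a single bulk copy into the universal_vector.
  return thrust::universal_vector<int>(staging.begin(), staging.end());
}

Alternatively, since managed memory is host-accessible, pre-sizing the vector and writing elements directly (as in universal_set() above) also avoids the per-element device work, which matches the 1 ms result for that case.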

@jrhemstad jrhemstad added the thrust For all items related to Thrust. label Feb 22, 2023
@jarmak-nv jarmak-nv transferred this issue from NVIDIA/thrust Nov 8, 2023