cuSparse: cusparseScsrgemm2 much slower than SpGEMM #39

Closed · pspacehard opened this issue Jun 16, 2021 · 1 comment
Labels: cuSPARSE, question (Further information is requested)

Comments

@pspacehard

According to this comment, the current SpGEMM implementation may issue CUSPARSE_STATUS_INSUFFICIENT_RESOURCES for some specific inputs, so I tried cusparseScsrgemm2 instead. However, I found that cusparseScsrgemm2 is quite slow. For example, for two 600,000 x 600,000 matrices A and B, where A contains 40,000,000 entries and B is a diagonal matrix, cusparseScsrgemm2 took several seconds to compute the product A*B, while SpGEMM took only tens of milliseconds. I used CUDA 11.3 and a Tesla V100. The input matrices can be downloaded here. The program is as follows.

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>   // cudaMalloc / cudaMemcpy / cudaFree
#include <cusparse.h>

void mul(const int n,
         const float* const A_val, const int* const A_colind, const int* const A_rowptr, const int A_nnz,
         const float* const B_val, const int* const B_colind, const int* const B_rowptr, const int B_nnz,
         float** const C_val, int** const C_colind, int** const C_rowptr, int* const C_nnz) {

  float alpha = 1.0;

  cusparseHandle_t handle;
  cusparseCreate(&handle);

  cusparseMatDescr_t desc;
  cusparseCreateMatDescr(&desc);
  cusparseSetMatType(desc, CUSPARSE_MATRIX_TYPE_GENERAL);
  cusparseSetMatIndexBase(desc, CUSPARSE_INDEX_BASE_ZERO);

  csrgemm2Info_t info = NULL;
  cusparseCreateCsrgemm2Info(&info);

  // Query the workspace size for C = alpha * A * B. beta is passed as NULL,
  // so the D-matrix arguments below are placeholders (B's arrays are reused)
  // and D does not contribute to the result.
  size_t buffer_size;
  cusparseScsrgemm2_bufferSizeExt(handle, n, n, n, &alpha,
                                   desc, A_nnz, A_rowptr, A_colind,
                                   desc, B_nnz, B_rowptr, B_colind,
                                   NULL,
                                   desc, B_nnz, B_rowptr, B_colind,
                                   info, &buffer_size);
  void* buffer = NULL;
  cudaMalloc(&buffer, buffer_size);

  // Compute C's row pointer and its number of non-zeros.
  cudaMalloc(C_rowptr, sizeof(int) * (n + 1));
  cusparseXcsrgemm2Nnz(handle, n, n, n,
                                      desc, A_nnz, A_rowptr, A_colind,
                                      desc, B_nnz, B_rowptr, B_colind,
                                      desc, B_nnz, B_rowptr, B_colind,
                                      desc, *C_rowptr, C_nnz,
                                      info, buffer);

  // Allocate C's column indices and values, then compute them.
  cudaMalloc(C_colind, sizeof(int) * *C_nnz);
  cudaMalloc(C_val, sizeof(float) * *C_nnz);

  cusparseScsrgemm2(handle, n, n, n, &alpha,
                               desc, A_nnz, A_val, A_rowptr, A_colind,
                               desc, B_nnz, B_val, B_rowptr, B_colind,
                               NULL,
                               desc, B_nnz, B_val, B_rowptr, B_colind,
                               desc, *C_val, *C_rowptr, *C_colind,
                               info, buffer);

  cusparseDestroyCsrgemm2Info(info);
  cusparseDestroyMatDescr(desc);
  cusparseDestroy(handle);
  cudaFree(buffer);
}

int main() {
  // Read the sizes and the CSR arrays (0-based) of A and B from AS.bin.
  FILE* file = fopen("AS.bin", "rb");
  int A_nnz, B_nnz, n;
  fread(&A_nnz, sizeof(int), 1, file);
  fread(&B_nnz, sizeof(int), 1, file);
  fread(&n, sizeof(int), 1, file);
  printf("%d %d %d\n", A_nnz, B_nnz, n);
  float* h_A_val = new float[A_nnz];
  int* h_A_colind = new int[A_nnz];
  int* h_A_rowptr = new int[n + 1];
  float* h_B_val = new float[B_nnz];
  int* h_B_colind = new int[B_nnz];
  int* h_B_rowptr = new int[n + 1];
  fread(h_A_val, sizeof(float), A_nnz, file);
  fread(h_A_colind, sizeof(int), A_nnz, file);
  fread(h_A_rowptr, sizeof(int), n + 1, file);
  fread(h_B_val, sizeof(float), B_nnz, file);
  fread(h_B_colind, sizeof(int), B_nnz, file);
  fread(h_B_rowptr, sizeof(int), n + 1, file);
  fclose(file);

  // Copy A and B to the device.
  float* d_A_val;
  cudaMalloc(&d_A_val, sizeof(float) * A_nnz);
  cudaMemcpy(d_A_val, h_A_val, sizeof(float) * A_nnz, cudaMemcpyHostToDevice);
  int* d_A_colind;
  cudaMalloc(&d_A_colind, sizeof(int) * A_nnz);
  cudaMemcpy(d_A_colind, h_A_colind, sizeof(int) * A_nnz, cudaMemcpyHostToDevice);
  int* d_A_rowptr;
  cudaMalloc(&d_A_rowptr, sizeof(int) * (n + 1));
  cudaMemcpy(d_A_rowptr, h_A_rowptr, sizeof(int) * (n + 1), cudaMemcpyHostToDevice);
  int d_A_nnz = A_nnz;

  float* d_B_val;
  cudaMalloc(&d_B_val, sizeof(float) * B_nnz);
  cudaMemcpy(d_B_val, h_B_val, sizeof(float) * B_nnz, cudaMemcpyHostToDevice);
  int* d_B_colind;
  cudaMalloc(&d_B_colind, sizeof(int) * B_nnz);
  cudaMemcpy(d_B_colind, h_B_colind, sizeof(int) * B_nnz, cudaMemcpyHostToDevice);
  int* d_B_rowptr;
  cudaMalloc(&d_B_rowptr, sizeof(int) * (n + 1));
  cudaMemcpy(d_B_rowptr, h_B_rowptr, sizeof(int) * (n + 1), cudaMemcpyHostToDevice);
  int d_B_nnz = B_nnz;

  float* d_C_val = NULL;
  int* d_C_colind = NULL;
  int* d_C_rowptr = NULL;
  int d_C_nnz;

  // C = A * B; C is allocated on the device inside mul().
  mul(n, d_A_val, d_A_colind, d_A_rowptr, d_A_nnz,
         d_B_val, d_B_colind, d_B_rowptr, d_B_nnz,
         &d_C_val, &d_C_colind, &d_C_rowptr, &d_C_nnz);

  printf("%d\n", d_C_nnz);

  return 0;
}

I have the following questions.

  1. Is the low efficiency of cusparseScsrgemm2 caused by it being unable to exploit the V100 architecture?
  2. Are there other alternatives to cusparseScsrgemm2 and SpGEMM?

Thanks.

@fbusato added the cuSPARSE and question labels Jun 24, 2021
@fbusato (Collaborator) commented Jun 24, 2021

I'll try to answer:

  1. No, it is not related to the specific GPU architecture. The low performance of cusparseScsrgemm2 is due to the sparsity pattern of the input matrices.
  2. There are no other alternatives, but you may try cusparseSpGEMMreuse (see the sketch below). This routine makes sense when the cost of the preprocessing steps can be amortized over multiple runs, since cusparseSpGEMMreuse_compute itself is very fast.
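
For reference, a minimal sketch of the cusparseSpGEMMreuse workflow, assuming device CSR arrays like the ones in the program above. The function name mul_reuse is made up for illustration; the call sequence (workEstimation, nnz, copy, compute, each buffer queried first with a NULL pointer) follows the generic SpGEMM-reuse pattern in the cuSPARSE documentation rather than anything posted in this thread, so treat it as an outline, not a drop-in replacement. Error checking is omitted.

#include <cstdio>
#include <cuda_runtime.h>
#include <cusparse.h>

// Sketch: C = A * B with the generic SpGEMMreuse API (CUDA >= 11.3).
// The structure phase (workEstimation / nnz / copy) runs once; only
// cusparseSpGEMMreuse_compute is repeated when the values of A or B change.
void mul_reuse(int n,
               float* dA_val, int* dA_colind, int* dA_rowptr, int A_nnz,
               float* dB_val, int* dB_colind, int* dB_rowptr, int B_nnz) {
  cusparseHandle_t handle;
  cusparseCreate(&handle);

  cusparseSpMatDescr_t matA, matB, matC;
  cusparseCreateCsr(&matA, n, n, A_nnz, dA_rowptr, dA_colind, dA_val,
                    CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                    CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
  cusparseCreateCsr(&matB, n, n, B_nnz, dB_rowptr, dB_colind, dB_val,
                    CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                    CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
  cusparseCreateCsr(&matC, n, n, 0, NULL, NULL, NULL,
                    CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                    CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);

  cusparseSpGEMMDescr_t spgemmDesc;
  cusparseSpGEMM_createDescr(&spgemmDesc);
  cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE;

  // 1) Work estimation: query the buffer size, then run again with the buffer.
  size_t bufSize1 = 0;
  void* dBuf1 = NULL;
  cusparseSpGEMMreuse_workEstimation(handle, op, op, matA, matB, matC,
                                     CUSPARSE_SPGEMM_DEFAULT, spgemmDesc,
                                     &bufSize1, NULL);
  cudaMalloc(&dBuf1, bufSize1);
  cusparseSpGEMMreuse_workEstimation(handle, op, op, matA, matB, matC,
                                     CUSPARSE_SPGEMM_DEFAULT, spgemmDesc,
                                     &bufSize1, dBuf1);

  // 2) Symbolic phase: compute the sparsity structure of C.
  size_t bufSize2 = 0, bufSize3 = 0, bufSize4 = 0;
  void *dBuf2 = NULL, *dBuf3 = NULL, *dBuf4 = NULL;
  cusparseSpGEMMreuse_nnz(handle, op, op, matA, matB, matC,
                          CUSPARSE_SPGEMM_DEFAULT, spgemmDesc,
                          &bufSize2, NULL, &bufSize3, NULL, &bufSize4, NULL);
  cudaMalloc(&dBuf2, bufSize2);
  cudaMalloc(&dBuf3, bufSize3);
  cudaMalloc(&dBuf4, bufSize4);
  cusparseSpGEMMreuse_nnz(handle, op, op, matA, matB, matC,
                          CUSPARSE_SPGEMM_DEFAULT, spgemmDesc,
                          &bufSize2, dBuf2, &bufSize3, dBuf3, &bufSize4, dBuf4);
  cudaFree(dBuf1);
  cudaFree(dBuf2);

  // 3) Allocate C and attach its arrays to the descriptor.
  int64_t C_rows, C_cols, C_nnz;
  cusparseSpMatGetSize(matC, &C_rows, &C_cols, &C_nnz);
  int *dC_rowptr, *dC_colind;
  float* dC_val;
  cudaMalloc(&dC_rowptr, sizeof(int) * (n + 1));
  cudaMalloc(&dC_colind, sizeof(int) * C_nnz);
  cudaMalloc(&dC_val, sizeof(float) * C_nnz);
  cusparseCsrSetPointers(matC, dC_rowptr, dC_colind, dC_val);

  // 4) Copy the structure into C. dBuf4 and dBuf5 must stay allocated
  //    for later compute calls.
  size_t bufSize5 = 0;
  void* dBuf5 = NULL;
  cusparseSpGEMMreuse_copy(handle, op, op, matA, matB, matC,
                           CUSPARSE_SPGEMM_DEFAULT, spgemmDesc,
                           &bufSize5, NULL);
  cudaMalloc(&dBuf5, bufSize5);
  cusparseSpGEMMreuse_copy(handle, op, op, matA, matB, matC,
                           CUSPARSE_SPGEMM_DEFAULT, spgemmDesc,
                           &bufSize5, dBuf5);
  cudaFree(dBuf3);

  // 5) Numeric phase: repeat this call whenever the values of A or B change
  //    while the sparsity patterns stay the same.
  float alpha = 1.0f, beta = 0.0f;
  cusparseSpGEMMreuse_compute(handle, op, op, &alpha, matA, matB, &beta, matC,
                              CUDA_R_32F, CUSPARSE_SPGEMM_DEFAULT, spgemmDesc);

  printf("C nnz: %lld\n", (long long)C_nnz);

  cusparseSpGEMM_destroyDescr(spgemmDesc);
  cusparseDestroySpMat(matA);
  cusparseDestroySpMat(matB);
  cusparseDestroySpMat(matC);
  cusparseDestroy(handle);
}

Only the final cusparseSpGEMMreuse_compute call has to be re-issued when the values of A or B change with a fixed sparsity pattern, which is where the cost of the preprocessing steps gets amortized.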

@fbusato closed this as completed Jun 24, 2021