Merge branch 'update-cusolvermp-samples-for-23.1' into 'master'
update cusolverMp samples

See merge request cuda-hpc-libraries/cudalibrarysamples-mirror!89
almogsegal committed Jan 26, 2023
2 parents a9899a2 + 80d160d commit ce619bb
Showing 8 changed files with 3,332 additions and 1,092 deletions.
35 changes: 15 additions & 20 deletions cuSOLVERMp/Makefile
@@ -1,5 +1,5 @@
#
# Copyright 2021 NVIDIA Corporation. All rights reserved.
# Copyright 2023 NVIDIA Corporation. All rights reserved.
#
# NOTICE TO LICENSEE:
#
@@ -51,39 +51,25 @@
# Defines, includes and libraries
# --------------------------------------------

# HPC SDK version and installation path
HPCSDKVER?=21.11
CUDAVER?=11.5
HPCSDKARCH?=Linux_x86_64
HPCSDKPATH?=/opt/nvidia/hpc_sdk
HPCSDKROOT?=$(HPCSDKPATH)/$(HPCSDKARCH)/$(HPCSDKVER)

# cusolverMp depends on HPC-X inside the HPC SDK distribution, so HPC-X must be
# initialized using one of the provided initialization scripts
# (e.g. $HPCSDKROOT/comm_libs/hpcx/latest/hpcx-init-ompi.sh)
# and its environment loaded with the `hpcx_load` function.
# This makes the `mpicc` command available, which supplies the required MPI
# compile and link flags for the compiler.
COMPILER?=g++ -std=c++11

# HPC SDK provides CUDA components.
CUDAMATHLIBSPATH?=$(HPCSDKROOT)/math_libs/$(CUDAVER)
CUDAPATH?=$(HPCSDKROOT)/cuda/$(CUDAVER)
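# -cuda enables CUDA C++ compilation; -gpu cuda11.8 targets the CUDA 11.8
# toolchain shipped with the HPC SDK.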
COMPILER?=nvc++ -cuda -gpu cuda11.8

# Includes and linker flags
INCS = -I./ -I$(CUDAMATHLIBSPATH)/include -I$(CUDAPATH)/include
COMMLIBS = -L$(HPCSDKROOT)/comm_libs/$(CUDAVER)/nccl/lib -lnccl -Wl,-rpath=$(HPCSDKROOT)/comm_libs/$(CUDAVER)/nccl/lib -lcal
EXTRALIBS = -L$(CUDAPATH)/lib64/stubs -lcuda -lnvToolsExt -ldl -lrt
LIBS = $(COMMLIBS) -L$(CUDAMATHLIBSPATH)/lib64 -Wl,-rpath=$(CUDAMATHLIBSPATH)/lib64 -lcusolverMp -L$(CUDAPATH)/lib64 -lcudart_static -Wl,-rpath=$(CUDAPATH)/lib64 $(EXTRALIBS)
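# -cudalib resolves the listed NVIDIA libraries (cuBLAS, cuSOLVER, NCCL,
# cusolverMp) from the HPC SDK, replacing the explicit -L/-l flags above.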
LIBS = -cudalib=cublas,cusolver,nccl,cusolvermp

# --------------------------------------------
# Define targets
# --------------------------------------------

clean:
rm mp_getrf_getrs mp_potrf_potrs
all: mp_getrf_getrs mp_potrf_potrs mp_geqrf mp_syevd mp_gels

all: mp_getrf_getrs mp_potrf_potrs
clean:
rm -f mp_getrf_getrs mp_potrf_potrs mp_geqrf mp_syevd mp_gels

# --------------------------------------------
# Build commands
@@ -94,4 +80,13 @@

mp_getrf_getrs: mp_getrf_getrs.cpp
$(COMPILER) $(INCS) `mpicc --showme:compile` $+ -o $@ $(LIBS) `mpicc --showme:link`
mp_potrf_potrs: mp_potrf_potrs.cpp
$(COMPILER) $(INCS) `mpicc --showme:compile` $+ -o $@ $(LIBS) `mpicc --showme:link`

mp_geqrf: mp_geqrf.cpp
$(COMPILER) $(INCS) `mpicc --showme:compile` $+ -o $@ $(LIBS) `mpicc --showme:link`

mp_syevd: mp_syevd.cpp
$(COMPILER) $(INCS) `mpicc --showme:compile` $+ -o $@ $(LIBS) `mpicc --showme:link`

mp_gels: mp_gels.cpp
$(COMPILER) $(INCS) `mpicc --showme:compile` $+ -o $@ $(LIBS) `mpicc --showme:link`

.PHONY: all clean
18 changes: 16 additions & 2 deletions cuSOLVERMp/README.md
@@ -14,6 +14,12 @@

Distributed decompositions and linear system solutions

[Dense matrix Cholesky factorization and linear system solve](mp_potrf_potrs.cpp)

[Dense matrix Symmetric Eigensolver](mp_syevd.cpp)

[Dense matrix QR factorization](mp_geqrf.cpp)

[Dense matrix QR factorization and linear system solve](mp_gels.cpp)

The examples are bootstrapped by MPI, which is used to set up the distributed data. They are intended to show how the API is used, not to serve as performance benchmarks. For the same reason, the process grid is hardcoded to `2x1` in the examples; you can change it to other values in the following lines:
```
/* Define grid of processors */
...
```

@@ -40,6 +46,8 @@

x86_64

[SM 8.0 ](https://developer.nvidia.com/cuda-gpus)

[SM 9.0 ](https://developer.nvidia.com/cuda-gpus)

### Documentation

[cuSOLVERMp documentation](https://docs.nvidia.com/hpc-sdk/index.html)
@@ -52,7 +60,7 @@

Samples require a C++11-compatible compiler.
cusolverMp is distributed as part of the [HPC SDK](https://developer.nvidia.com/hpc-sdk) starting with version 21.11 and requires the HPC SDK to be installed on the system. You also need to set up the `HPCX` environment, which is part of the `HPC SDK`, using one of the provided scripts before building and running the examples, e.g.:
```
HPCSDKVER=21.11
HPCSDKVER=23.1
HPCSDKARCH=Linux_x86_64
HPCSDKPATH=/opt/nvidia/hpc_sdk
HPCSDKROOT=$HPCSDKPATH/$HPCSDKARCH/$HPCSDKVER
...
hpcx_load
```

@@ -64,7 +72,7 @@

Build examples using `make` command:

`make HPCSDKVER=21.11 CUDAVER=11.5 all`
`make`

### Running

@@ -73,3 +81,9 @@

Run the examples with `mpirun`, using a number of processes that matches the process grid:
`mpirun -n 2 ./mp_getrf_getrs`

`mpirun -n 2 ./mp_potrf_potrs`

`mpirun -n 2 ./mp_syevd`

`mpirun -n 2 ./mp_geqrf`

`mpirun -n 2 ./mp_gels`
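The `-p`/`-q` options parsed in `helpers.h` select the process grid dimensions, so the grid can also be chosen at run time; a hypothetical `2x2` grid on four processes (note that `-n` must equal `p*q`):

`mpirun -n 4 ./mp_potrf_potrs -p 2 -q 2`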
232 changes: 230 additions & 2 deletions cuSOLVERMp/helpers.h
@@ -1,5 +1,5 @@
/*
* Copyright 2021 NVIDIA Corporation. All rights reserved.
* Copyright 2023 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
@@ -50,6 +50,204 @@
#pragma once

#include <mpi.h>
#include <cal.h>

struct Options
{
// problem properties
int m;
int n;
int nrhs;
int mbA;
int nbA;
int mbB;
int nbB;
int mbQ;
int nbQ;
int ia;
int ja;
int ib;
int jb;
int iq;
int jq;

// grid
int p;
int q;

// others
bool verbose;

void printHelp() const
{
printf("Available options:\n"
" -m\n"
" -n\n"
" -nrhs\n"
" -mbA\n"
" -nbA\n"
" -mbB\n"
" -nbB\n"
" -mbQ\n"
" -nbQ\n"
" -ia\n"
" -ja\n"
" -ib\n"
" -jb\n"
" -iq\n"
" -jq\n"
" -p\n"
" -q\n"
);
}

void print() const
{
printf("Parameters: m=%d n=%d nrhs=%d mbA=%d nbA=%d mbB=%d nbB=%d mbQ=%d nbQ=%d ia=%d ja=%d ib=%d jb=%d iq=%d jq=%d p=%d q=%d\n",
m,
n,
nrhs,
mbA,
nbA,
mbB,
nbB,
mbQ,
nbQ,
ia,
ja,
ib,
jb,
iq,
jq,
p,
q);
}

void parse(int argc, char** argv)
{
for (int i = 1; i < argc; i++)
{
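// Each option expects its value in the following argv entry (read via argv[++i]).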
if (strcmp(argv[i], "-m") == 0)
{
m = atoi(argv[++i]);
}
else if (strcmp(argv[i], "-n") == 0)
{
n = atoi(argv[++i]);
}
else if (strcmp(argv[i], "-nrhs") == 0)
{
nrhs = atoi(argv[++i]);
}
else if (strcmp(argv[i], "-mbA") == 0)
{
mbA = atoi(argv[++i]);
}
else if (strcmp(argv[i], "-nbA") == 0)
{
nbA = atoi(argv[++i]);
}
else if (strcmp(argv[i], "-mbB") == 0)
{
mbB = atoi(argv[++i]);
}
else if (strcmp(argv[i], "-nbB") == 0)
{
nbB = atoi(argv[++i]);
}
else if (strcmp(argv[i], "-mbQ") == 0)
{
mbQ = atoi(argv[++i]);
}
else if (strcmp(argv[i], "-nbQ") == 0)
{
nbQ = atoi(argv[++i]);
}
else if (strcmp(argv[i], "-ia") == 0)
{
ia = atoi(argv[++i]);
}
else if (strcmp(argv[i], "-ja") == 0)
{
ja = atoi(argv[++i]);
}
else if (strcmp(argv[i], "-ib") == 0)
{
ib = atoi(argv[++i]);
}
else if (strcmp(argv[i], "-jb") == 0)
{
jb = atoi(argv[++i]);
}
else if (strcmp(argv[i], "-iq") == 0)
{
iq = atoi(argv[++i]);
}
else if (strcmp(argv[i], "-jq") == 0)
{
jq = atoi(argv[++i]);
}
else if (strcmp(argv[i], "-p") == 0)
{
p = atoi(argv[++i]);
}
else if (strcmp(argv[i], "-q") == 0)
{
q = atoi(argv[++i]);
}
else if (strcmp(argv[i], "-help") == 0)
{
printHelp();
exit(0);
}
else
{
printf("unknown option: %s\n", argv[i]);
printHelp();
exit(1);
}
}
}

void validate()
{
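// ScaLAPACK-style 1-based offsets: each submatrix must start on a
// distribution-block boundary, i.e. (index - 1) divisible by the block size.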
if (ia && mbA && (ia - 1) % mbA != 0)
{
fprintf(stderr, "Error: IA must be a multiple of mbA\n");
exit(1);
}

if (ja && nbA && (ja - 1) % nbA != 0)
{
fprintf(stderr, "Error: JA must be a multiple of nbA\n");
exit(1);
}

if (ib && mbB && (ib - 1) % mbB != 0)
{
fprintf(stderr, "Error: IB must be a multiple of mbB\n");
exit(1);
}

if (jb && nbB && (jb - 1) % nbB != 0)
{
fprintf(stderr, "Error: JB must be a multiple of nbB\n");
exit(1);
}

if (iq && mbQ && (iq - 1) % mbQ != 0)
{
fprintf(stderr, "Error: IQ must be a multiple of mbQ\n");
exit(1);
}

if (jq && nbQ && (jq - 1) % nbQ != 0)
{
fprintf(stderr, "Error: JQ must be a multiple of nbQ\n");
exit(1);
}
}
};
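
For orientation, a minimal sketch of how a sample's `main` could drive this struct; the default values below are illustrative, not taken from the samples:

```
#include <mpi.h>
#include "helpers.h" // provides Options (relies on <cstdio>/<cstdlib>/<cstring> being visible)

int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);

    // Illustrative defaults: 64x64 problem, one right-hand side, 32x32 blocks,
    // 2x1 grid, ScaLAPACK-style 1-based offsets aligned to block boundaries.
    Options opts{64, 64, 1,        // m, n, nrhs
                 32, 32, 32, 32,   // mbA, nbA, mbB, nbB
                 32, 32,           // mbQ, nbQ
                 1, 1, 1, 1, 1, 1, // ia, ja, ib, jb, iq, jq
                 2, 1,             // p, q
                 false};           // verbose

    opts.parse(argc, argv); // command-line flags override the defaults
    opts.validate();        // exits unless offsets sit on block boundaries
    opts.print();

    MPI_Finalize();
    return 0;
}
```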

static inline int getLocalRank()
{
@@ -58,6 +256,36 @@

MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &localComm);
MPI_Comm_rank(localComm, &localRank);
MPI_Comm_free(&localComm);

return localRank;
}
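
Callers typically use this helper to pin one GPU per rank before any CUDA or cusolverMp call; a small sketch assuming the CUDA runtime API:

```
#include <cuda_runtime.h>

// Bind the calling MPI rank to one of its node's GPUs; call after MPI_Init.
static inline int bindLocalDevice()
{
    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);

    // Ranks sharing a node get distinct devices, wrapping if oversubscribed.
    const int localDevice = getLocalRank() % deviceCount;
    cudaSetDevice(localDevice);
    return localDevice;
}
```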

static calError_t allgather(void *src_buf, void *recv_buf, size_t size, void *data, void **request)
{
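// Gathers `size` bytes from every rank into recv_buf; CAL passes the MPI
// communicator back through the opaque `data` pointer.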
MPI_Request req;
int err = MPI_Iallgather(src_buf, size, MPI_BYTE, recv_buf, size, MPI_BYTE, reinterpret_cast<MPI_Comm>(data), &req);
if (err != MPI_SUCCESS)
{
return CAL_ERROR;
}
*request = reinterpret_cast<void*>(req);
return CAL_OK;
}

static calError_t request_test(void *request)
{
MPI_Request req = reinterpret_cast<MPI_Request>(request);
int completed;
int err = MPI_Test(&req, &completed, MPI_STATUS_IGNORE);
if (err != MPI_SUCCESS)
{
return CAL_ERROR;
}
return completed ? CAL_OK : CAL_ERROR_INPROGRESS;
}

static calError_t request_free(void *request)
{
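// Intentionally a no-op: MPI_Test in request_test releases the MPI_Request
// once the operation completes, so there is nothing left to free.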
return CAL_OK;
}
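
These callbacks exist so that CAL, which does not itself depend on MPI, can run its bootstrap collectives over the samples' MPI communicator. A condensed sketch of the wiring; the `cal_comm_create_params_t` field names and the `cal_comm_create` signature are assumptions based on the samples of this era, so check `cal.h`:

```
// Assumed API: create a CAL communicator that bootstraps over MPI_COMM_WORLD.
static calError_t createCalComm(int rank, int nranks, int localDevice, cal_comm_t* calComm)
{
    cal_comm_create_params_t params;
    params.allgather    = allgather;    // non-blocking allgather defined above
    params.req_test     = request_test; // polls the stashed MPI_Request
    params.req_free     = request_free; // no-op; MPI_Test already released it
    params.data         = reinterpret_cast<void*>(MPI_COMM_WORLD);
    params.rank         = rank;
    params.nranks       = nranks;
    params.local_device = localDevice;

    return cal_comm_create(params, calComm);
}
```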
