Merge branch 'update-cusolvermp-samples-for-23.1' into 'master'
update cusolverMp samples

See merge request cuda-hpc-libraries/cudalibrarysamples-mirror!89
almogsegal committed Jan 26, 2023
2 parents a9899a2 + 80d160d commit ce619bb
Showing 8 changed files with 3,332 additions and 1,092 deletions.
35 changes: 15 additions & 20 deletions cuSOLVERMp/Makefile
@@ -1,5 +1,5 @@
#
# Copyright 2021 NVIDIA Corporation. All rights reserved.
# Copyright 2023 NVIDIA Corporation. All rights reserved.
#
# NOTICE TO LICENSEE:
#
@@ -51,39 +51,25 @@
# Defines, includes and libraries
# --------------------------------------------

# HPC SDK version and installation path
HPCSDKVER?=21.11
CUDAVER?=11.5
HPCSDKARCH?=Linux_x86_64
HPCSDKPATH?=/opt/nvidia/hpc_sdk
HPCSDKROOT?=$(HPCSDKPATH)/$(HPCSDKARCH)/$(HPCSDKVER)

# cusolverMp depends on HPC-X inside the HPC SDK distribution, so HPC-X must be
# initialized using one of the provided initialization scripts
# (e.g. $HPCSDKROOT/comm_libs/hpcx/latest/hpcx-init-ompi.sh)
# and its environment loaded with the `hpcx_load` function.
# This makes the `mpicc` command available, which supplies the required MPI
# compile and link flags for the compiler.
COMPILER?=g++ -std=c++11

# HPC SDK provides CUDA components.
CUDAMATHLIBSPATH?=$(HPCSDKROOT)/math_libs/$(CUDAVER)
CUDAPATH?=$(HPCSDKROOT)/cuda/$(CUDAVER)
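# -cuda enables CUDA C++ compilation; -gpu cuda11.8 targets the CUDA 11.8
# toolchain shipped with the HPC SDK.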
COMPILER?=nvc++ -cuda -gpu cuda11.8

# Includes and linker flags
INCS = -I./ -I$(CUDAMATHLIBSPATH)/include -I$(CUDAPATH)/include
COMMLIBS = -L$(HPCSDKROOT)/comm_libs/$(CUDAVER)/nccl/lib -lnccl -Wl,-rpath=$(HPCSDKROOT)/comm_libs/$(CUDAVER)/nccl/lib -lcal
EXTRALIBS = -L$(CUDAPATH)/lib64/stubs -lcuda -lnvToolsExt -ldl -lrt
LIBS = $(COMMLIBS) -L$(CUDAMATHLIBSPATH)/lib64 -Wl,-rpath=$(CUDAMATHLIBSPATH)/lib64 -lcusolverMp -L$(CUDAPATH)/lib64 -lcudart_static -Wl,-rpath=$(CUDAPATH)/lib64 $(EXTRALIBS)
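# -cudalib resolves the listed NVIDIA libraries (cuBLAS, cuSOLVER, NCCL,
# cusolverMp) from the HPC SDK, replacing the explicit -L/-l flags above.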
LIBS = -cudalib=cublas,cusolver,nccl,cusolvermp

# --------------------------------------------
# Define targets
# --------------------------------------------

clean:
rm mp_getrf_getrs mp_potrf_potrs
all: mp_getrf_getrs mp_potrf_potrs mp_geqrf mp_syevd mp_gels

all: mp_getrf_getrs mp_potrf_potrs
clean:
rm -f mp_getrf_getrs mp_potrf_potrs mp_geqrf mp_syevd mp_gels

# --------------------------------------------
# Build commands
@@ -94,4 +80,13 @@

mp_getrf_getrs: mp_getrf_getrs.cpp
$(COMPILER) $(INCS) `mpicc --showme:compile` $+ -o $@ $(LIBS) `mpicc --showme:link`
mp_potrf_potrs: mp_potrf_potrs.cpp
$(COMPILER) $(INCS) `mpicc --showme:compile` $+ -o $@ $(LIBS) `mpicc --showme:link`

mp_geqrf: mp_geqrf.cpp
$(COMPILER) $(INCS) `mpicc --showme:compile` $+ -o $@ $(LIBS) `mpicc --showme:link`

mp_syevd: mp_syevd.cpp
$(COMPILER) $(INCS) `mpicc --showme:compile` $+ -o $@ $(LIBS) `mpicc --showme:link`

mp_gels: mp_gels.cpp
$(COMPILER) $(INCS) `mpicc --showme:compile` $+ -o $@ $(LIBS) `mpicc --showme:link`

.PHONY: all clean
18 changes: 16 additions & 2 deletions cuSOLVERMp/README.md
@@ -14,6 +14,12 @@

Distributed decompositions and linear system solutions

[Dense matrix Cholesky factorization and linear system solve](mp_potrf_potrs.cpp)

[Dense matrix Symmetric Eigensolver](mp_syevd.cpp)

[Dense matrix QR factorization](mp_geqrf.cpp)

[Dense matrix QR factorization and linear system solve](mp_gels.cpp)

The examples are bootstrapped by MPI, which is used to set up the distributed data. They are intended to show how the API is used, not to serve as performance benchmarks. For the same reason, the process grid is hardcoded to `2x1` in the examples; you can change it to other values in the following lines:
```
/* Define grid of processors */
...
```

@@ -40,6 +46,8 @@

x86_64

[SM 8.0 ](https://developer.nvidia.com/cuda-gpus)

[SM 9.0 ](https://developer.nvidia.com/cuda-gpus)

### Documentation

[cuSOLVERMp documentation](https://docs.nvidia.com/hpc-sdk/index.html)
@@ -52,7 +60,7 @@

Samples require a C++11-compatible compiler.
cusolverMp is distributed as part of the [HPC SDK](https://developer.nvidia.com/hpc-sdk) starting with version 21.11 and requires the HPC SDK to be installed on the system. You also need to set up the `HPCX` environment, which is part of the `HPC SDK`, using one of the provided scripts before building and running the examples, e.g.:
```
HPCSDKVER=21.11
HPCSDKVER=23.1
HPCSDKARCH=Linux_x86_64
HPCSDKPATH=/opt/nvidia/hpc_sdk
HPCSDKROOT=$HPCSDKPATH/$HPCSDKARCH/$HPCSDKVER
...
hpcx_load
```

@@ -64,7 +72,7 @@

Build examples using `make` command:

`make HPCSDKVER=21.11 CUDAVER=11.5 all`
`make`

### Running

@@ -73,3 +81,9 @@

Run the examples with `mpirun`, using a number of processes that matches the process grid:
`mpirun -n 2 ./mp_getrf_getrs`

`mpirun -n 2 ./mp_potrf_potrs`

`mpirun -n 2 ./mp_syevd`

`mpirun -n 2 ./mp_geqrf`

`mpirun -n 2 ./mp_gels`
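The `-p`/`-q` options parsed in `helpers.h` select the process grid dimensions, so the grid can also be chosen at run time; a hypothetical `2x2` grid on four processes (note that `-n` must equal `p*q`):

`mpirun -n 4 ./mp_potrf_potrs -p 2 -q 2`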
232 changes: 230 additions & 2 deletions cuSOLVERMp/helpers.h
@@ -1,5 +1,5 @@
/*
* Copyright 2021 NVIDIA Corporation. All rights reserved.
* Copyright 2023 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
@@ -50,6 +50,204 @@
#pragma once

#include <mpi.h>
#include <cal.h>

struct Options
{
// problem properties
int m;
int n;
int nrhs;
int mbA;
int nbA;
int mbB;
int nbB;
int mbQ;
int nbQ;
int ia;
int ja;
int ib;
int jb;
int iq;
int jq;

// grid
int p;
int q;

// others
bool verbose;

void printHelp() const
{
printf("Available options:\n"
" -m\n"
" -n\n"
" -nrhs\n"
" -mbA\n"
" -nbA\n"
" -mbB\n"
" -nbB\n"
" -mbQ\n"
" -nbQ\n"
" -ia\n"
" -ja\n"
" -ib\n"
" -jb\n"
" -iq\n"
" -jq\n"
" -p\n"
" -q\n"
);
}

void print() const
{
printf("Parameters: m=%d n=%d nrhs=%d mbA=%d nbA=%d mbB=%d nbB=%d mbQ=%d nbQ=%d ia=%d ja=%d ib=%d jb=%d iq=%d jq=%d p=%d q=%d\n",
m,
n,
nrhs,
mbA,
nbA,
mbB,
nbB,
mbQ,
nbQ,
ia,
ja,
ib,
jb,
iq,
jq,
p,
q);
}

void parse(int argc, char** argv)
{
for (int i = 1; i < argc; i++)
{
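// Each option expects its value in the following argv entry (read via argv[++i]).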
if (strcmp(argv[i], "-m") == 0)
{
m = atoi(argv[++i]);
}
else if (strcmp(argv[i], "-n") == 0)
{
n = atoi(argv[++i]);
}
else if (strcmp(argv[i], "-nrhs") == 0)
{
nrhs = atoi(argv[++i]);
}
else if (strcmp(argv[i], "-mbA") == 0)
{
mbA = atoi(argv[++i]);
}
else if (strcmp(argv[i], "-nbA") == 0)
{
nbA = atoi(argv[++i]);
}
else if (strcmp(argv[i], "-mbB") == 0)
{
mbB = atoi(argv[++i]);
}
else if (strcmp(argv[i], "-nbB") == 0)
{
nbB = atoi(argv[++i]);
}
else if (strcmp(argv[i], "-mbQ") == 0)
{
mbQ = atoi(argv[++i]);
}
else if (strcmp(argv[i], "-nbQ") == 0)
{
nbQ = atoi(argv[++i]);
}
else if (strcmp(argv[i], "-ia") == 0)
{
ia = atoi(argv[++i]);
}
else if (strcmp(argv[i], "-ja") == 0)
{
ja = atoi(argv[++i]);
}
else if (strcmp(argv[i], "-ib") == 0)
{
ib = atoi(argv[++i]);
}
else if (strcmp(argv[i], "-jb") == 0)
{
jb = atoi(argv[++i]);
}
else if (strcmp(argv[i], "-iq") == 0)
{
iq = atoi(argv[++i]);
}
else if (strcmp(argv[i], "-jq") == 0)
{
jq = atoi(argv[++i]);
}
else if (strcmp(argv[i], "-p") == 0)
{
p = atoi(argv[++i]);
}
else if (strcmp(argv[i], "-q") == 0)
{
q = atoi(argv[++i]);
}
else if (strcmp(argv[i], "-help") == 0)
{
printHelp();
exit(0);
}
else
{
printf("unknown option: %s\n", argv[i]);
printHelp();
exit(1);
}
}
}

void validate()
{
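// ScaLAPACK-style 1-based offsets: each submatrix must start on a
// distribution-block boundary, i.e. (index - 1) divisible by the block size.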
if (ia && mbA && (ia - 1) % mbA != 0)
{
fprintf(stderr, "Error: IA must be a multiple of mbA\n");
exit(1);
}

if (ja && nbA && (ja - 1) % nbA != 0)
{
fprintf(stderr, "Error: JA must be a multiple of nbA\n");
exit(1);
}

if (ib && mbB && (ib - 1) % mbB != 0)
{
fprintf(stderr, "Error: IB must be a multiple of mbB\n");
exit(1);
}

if (jb && nbB && (jb - 1) % nbB != 0)
{
fprintf(stderr, "Error: JB must be a multiple of nbB\n");
exit(1);
}

if (iq && mbQ && (iq - 1) % mbQ != 0)
{
fprintf(stderr, "Error: IQ must be a multiple of mbQ\n");
exit(1);
}

if (jq && nbQ && (jq - 1) % nbQ != 0)
{
fprintf(stderr, "Error: JQ must be a multiple of nbQ\n");
exit(1);
}
}
};
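
For orientation, a minimal sketch of how a sample's `main` could drive this struct; the default values below are illustrative, not taken from the samples:

```
#include <mpi.h>
#include "helpers.h" // provides Options (relies on <cstdio>/<cstdlib>/<cstring> being visible)

int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);

    // Illustrative defaults: 64x64 problem, one right-hand side, 32x32 blocks,
    // 2x1 grid, ScaLAPACK-style 1-based offsets aligned to block boundaries.
    Options opts{64, 64, 1,        // m, n, nrhs
                 32, 32, 32, 32,   // mbA, nbA, mbB, nbB
                 32, 32,           // mbQ, nbQ
                 1, 1, 1, 1, 1, 1, // ia, ja, ib, jb, iq, jq
                 2, 1,             // p, q
                 false};           // verbose

    opts.parse(argc, argv); // command-line flags override the defaults
    opts.validate();        // exits unless offsets sit on block boundaries
    opts.print();

    MPI_Finalize();
    return 0;
}
```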

static inline int getLocalRank()
{
@@ -58,6 +256,36 @@

MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &localComm);
MPI_Comm_rank(localComm, &localRank);
MPI_Comm_free(&localComm);

return localRank;
}
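
Callers typically use this helper to pin one GPU per rank before any CUDA or cusolverMp call; a small sketch assuming the CUDA runtime API:

```
#include <cuda_runtime.h>

// Bind the calling MPI rank to one of its node's GPUs; call after MPI_Init.
static inline int bindLocalDevice()
{
    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);

    // Ranks sharing a node get distinct devices, wrapping if oversubscribed.
    const int localDevice = getLocalRank() % deviceCount;
    cudaSetDevice(localDevice);
    return localDevice;
}
```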

static calError_t allgather(void *src_buf, void *recv_buf, size_t size, void *data, void **request)
{
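// Gathers `size` bytes from every rank into recv_buf; CAL passes the MPI
// communicator back through the opaque `data` pointer.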
MPI_Request req;
int err = MPI_Iallgather(src_buf, size, MPI_BYTE, recv_buf, size, MPI_BYTE, reinterpret_cast<MPI_Comm>(data), &req);
if (err != MPI_SUCCESS)
{
return CAL_ERROR;
}
*request = reinterpret_cast<void*>(req);
return CAL_OK;
}

static calError_t request_test(void *request)
{
MPI_Request req = reinterpret_cast<MPI_Request>(request);
int completed;
int err = MPI_Test(&req, &completed, MPI_STATUS_IGNORE);
if (err != MPI_SUCCESS)
{
return CAL_ERROR;
}
return completed ? CAL_OK : CAL_ERROR_INPROGRESS;
}

static calError_t request_free(void *request)
{
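// Intentionally a no-op: MPI_Test in request_test releases the MPI_Request
// once the operation completes, so there is nothing left to free.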
return CAL_OK;
}
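
These callbacks exist so that CAL, which does not itself depend on MPI, can run its bootstrap collectives over the samples' MPI communicator. A condensed sketch of the wiring; the `cal_comm_create_params_t` field names and the `cal_comm_create` signature are assumptions based on the samples of this era, so check `cal.h`:

```
// Assumed API: create a CAL communicator that bootstraps over MPI_COMM_WORLD.
static calError_t createCalComm(int rank, int nranks, int localDevice, cal_comm_t* calComm)
{
    cal_comm_create_params_t params;
    params.allgather    = allgather;    // non-blocking allgather defined above
    params.req_test     = request_test; // polls the stashed MPI_Request
    params.req_free     = request_free; // no-op; MPI_Test already released it
    params.data         = reinterpret_cast<void*>(MPI_COMM_WORLD);
    params.rank         = rank;
    params.nranks       = nranks;
    params.local_device = localDevice;

    return cal_comm_create(params, calComm);
}
```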
