From 0a5d53147af393d637b8173413007bd3b094904b Mon Sep 17 00:00:00 2001 From: benvanwerkhoven Date: Wed, 1 May 2013 14:01:59 +0200 Subject: [PATCH 1/6] added build files --- build/compile.mk | 1 + build/linuxDAS4.gnu | 34 ++++++++++++++++++++-------------- build/linuxDAS4gnu.gnu | 22 ++++++++++++++-------- 3 files changed, 35 insertions(+), 22 deletions(-) diff --git a/build/compile.mk b/build/compile.mk index ae5900e..d4eceb1 100644 --- a/build/compile.mk +++ b/build/compile.mk @@ -150,4 +150,5 @@ include $(DEPENDS) %.o: %.cu @echo $(POPARCH) Compiling with implicit rule $(CUFLAGS) $< + @cd $(POPEXEDIR)/compile && $(NVCC) $(CUFLAGS) -ptx $< @cd $(POPEXEDIR)/compile && $(NVCC) $(CUFLAGS) -c $(notdir $<) diff --git a/build/linuxDAS4.gnu b/build/linuxDAS4.gnu index b59469c..0e31f7d 100644 --- a/build/linuxDAS4.gnu +++ b/build/linuxDAS4.gnu @@ -11,17 +11,17 @@ MPILIB = -L/cm/shared/apps/openmpi/intel/64/1.4.4/lib64/ #CUDALIB = -L/cm/shared/apps/cuda40/toolkit/4.0.17/lib64/ CUDALIB = -L/cm/shared/apps/cuda50/toolkit/current/lib64/ -F77 = /cm/shared/apps/openmpi/intel/64/1.4.4/bin/mpif90 -F90 = /cm/shared/apps/openmpi/intel/64/1.4.4/bin/mpif90 -LD = /cm/shared/apps/openmpi/intel/64/1.4.4/bin/mpif90 -lcurl $(CUDALIB) -lcudart -lstdc++ -shared-intel -i-dynamic -CC = /cm/shared/apps/openmpi/intel/64/1.4.4/bin/mpicc +F77 = /cm/shared/apps/openmpi/intel/64/1.4.4/bin/mpif90 -r8 -O3 +F90 = /cm/shared/apps/openmpi/intel/64/1.4.4/bin/mpif90 -r8 -O3 +LD = /cm/shared/apps/openmpi/intel/64/1.4.4/bin/mpif90 -r8 -O3 -lcurl $(CUDALIB) -lcudart -lstdc++ -shared-intel -i-dynamic +CC = /cm/shared/apps/openmpi/intel/64/1.4.4/bin/mpicc -O3 Cp = /bin/cp Cpp = cpp -P AWK = /usr/bin/gawk ABI = COMMDIR = mpi -NVCC = nvcc +NVCC = nvcc -O3 @@ -78,9 +78,9 @@ CFLAGS = $(ABI) ifeq ($(OPTIMIZE),yes) # CFLAGS := $(CFLAGS) -O - CFLAGS := $(CFLAGS) -O3 + CFLAGS := $(CFLAGS) else - CFLAGS := $(CFLAGS) -g -check all -ftrapuv + CFLAGS := $(CFLAGS) -O3 -check all -ftrapuv endif CFLAGS := $(CFLAGS) @@ -100,17 +100,17 @@ ifeq ($(TRAP_FPE),yes) endif ifeq ($(OPTIMIZE),yes) - FFLAGS = $(FBASE) -O3 -# FFLAGS = $(FBASE) -g + FFLAGS = $(FBASE) +# FFLAGS = $(FBASE) -O3 else - FFLAGS = $(FBASE) -g -check bounds + FFLAGS = $(FBASE) -O3 -check bounds endif #DAS4 specific FFLAGS := $(FFLAGS) -convert big_endian FFLAGS := $(FFLAGS) -mcmodel=medium -shared-intel -i-dynamic #-i-dynamic -#FFLAGS := $(FFLAGS) + #---------------------------------------------------------------------------- @@ -119,13 +119,19 @@ FFLAGS := $(FFLAGS) -mcmodel=medium -shared-intel -i-dynamic # #---------------------------------------------------------------------------- -CUFLAGS = -Xptxas=-v -arch=compute_20 -code=sm_20 +CUFLAGS = -gencode arch=compute_35,code=sm_35 -Xptxas=-v -maxrregcount=64 + +#CUFLAGS = -gencode arch=compute_20,code=sm_20 -Xptxas=-v + #-prec-sqrt=true -fmad=false ifeq ($(OPTIMIZE),yes) - CUFLAGS := $(CUFLAGS) -O3 + CUFLAGS := $(CUFLAGS) endif - + +CUFLAGS := $(CUFLAGS) + + #---------------------------------------------------------------------------- # # Loader Flags and Libraries diff --git a/build/linuxDAS4gnu.gnu b/build/linuxDAS4gnu.gnu index 78339b1..97716b0 100644 --- a/build/linuxDAS4gnu.gnu +++ b/build/linuxDAS4gnu.gnu @@ -11,17 +11,17 @@ #CUDALIB = -L/cm/shared/apps/cuda40/toolkit/4.0.17/lib64/ CUDALIB = -L/cm/shared/apps/cuda50/toolkit/current/lib64/ -F77 = mpif90 -p -O3 -F90 = mpif90 -p -O3 -LD = mpif90 -p -O3 -lcurl $(CUDALIB) -lcudart -lstdc++ -CC = gcc -p -O3 +F77 = mpif90 -O0 -g +F90 = mpif90 -O0 -g +LD = mpif90 -O0 -g -lcurl $(CUDALIB) -lcudart -lstdc++ +CC = gcc -O0 -g Cp = /bin/cp Cpp = cpp -P AWK = /usr/bin/gawk ABI = COMMDIR = mpi -NVCC = nvcc +NVCC = nvcc -O0 -g # Enable MPI library for parallel code, yes/no. @@ -73,6 +73,7 @@ CFLAGS = $(ABI) ifeq ($(OPTIMIZE),yes) # CFLAGS := $(CFLAGS) -O + CFLAGS := $(CFLAGS) else CFLAGS := $(CFLAGS) -check all -ftrapuv endif @@ -94,7 +95,7 @@ ifeq ($(TRAP_FPE),yes) endif ifeq ($(OPTIMIZE),yes) -# FFLAGS = $(FBASE) -O3 +# FFLAGS = $(FBASE) FFLAGS = $(FBASE) else FFLAGS = $(FBASE) -check bounds @@ -112,11 +113,16 @@ FFLAGS := $(FFLAGS) # #---------------------------------------------------------------------------- -CUFLAGS = -Xptxas=-v -arch=compute_20 -code=sm_20 +#CUFLAGS = -Xptxas=-v -arch=compute_20 -code=sm_20 + +#CUFLAGS = -gencode arch=compute_35,code=sm_35 -Xptxas=-v -maxrregcount=64 + +CUFLAGS = -gencode arch=compute_20,code=sm_20 -Xptxas=-v + #-prec-sqrt=true -fmad=false ifeq ($(OPTIMIZE),yes) - CUFLAGS := $(CUFLAGS) -O3 + CUFLAGS := $(CUFLAGS) endif #---------------------------------------------------------------------------- From f5989bb1a052183d2d4877eb5a5f76f703f7a514 Mon Sep 17 00:00:00 2001 From: benvanwerkhoven Date: Wed, 1 May 2013 14:03:04 +0200 Subject: [PATCH 2/6] removed some whitespace --- source/prognostic.F90 | 1 - 1 file changed, 1 deletion(-) diff --git a/source/prognostic.F90 b/source/prognostic.F90 index 4260cd3..5cbc566 100644 --- a/source/prognostic.F90 +++ b/source/prognostic.F90 @@ -65,7 +65,6 @@ module prognostic real (r8), dimension(:,:,:), pointer :: & RHOREF ! 3d density fields used to check for correctness of GPU routines - real (r8), dimension(nx_block,ny_block,3,max_blocks_clinic), & target :: & PSURF, &! surface pressure for all blocks at 3 time levels From d49bdcd38229e5e97cc0307d2af8fbe7aced03f1 Mon Sep 17 00:00:00 2001 From: benvanwerkhoven Date: Wed, 1 May 2013 14:18:10 +0200 Subject: [PATCH 3/6] eclipse is a bit confused --- source/prognostic.F90 | 1 + 1 file changed, 1 insertion(+) diff --git a/source/prognostic.F90 b/source/prognostic.F90 index 5cbc566..4260cd3 100644 --- a/source/prognostic.F90 +++ b/source/prognostic.F90 @@ -65,6 +65,7 @@ module prognostic real (r8), dimension(:,:,:), pointer :: & RHOREF ! 3d density fields used to check for correctness of GPU routines + real (r8), dimension(nx_block,ny_block,3,max_blocks_clinic), & target :: & PSURF, &! surface pressure for all blocks at 3 time levels From c4b55f3db8062a5552f4db1ce881ec70d4ca50b2 Mon Sep 17 00:00:00 2001 From: benvanwerkhoven Date: Wed, 1 May 2013 14:30:49 +0200 Subject: [PATCH 4/6] replaced the use of max_blocks_clinic with nblocks_clinic in cudahostmalloc routines to save memory when the number of clinic blocks is less then max_blocks_clinic --- source/gpu_mod.F90 | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/source/gpu_mod.F90 b/source/gpu_mod.F90 index 5d761fc..c880a97 100644 --- a/source/gpu_mod.F90 +++ b/source/gpu_mod.F90 @@ -187,14 +187,14 @@ subroutine init_gpu_mod ! !----------------------------------------------------------------------- - call my_cudaMallocHost(cptr, (nx_block*ny_block*km*nt*3*max_blocks_clinic)) - call c_f_pointer(cptr, TRACER, (/ nx_block,ny_block,km,nt,3,max_blocks_clinic /)) + call my_cudaMallocHost(cptr, (nx_block*ny_block*km*nt*3*nblocks_clinic)) + call c_f_pointer(cptr, TRACER, (/ nx_block,ny_block,km,nt,3,nblocks_clinic /)) - call my_cudaMallocHost(cptr, (nx_block*ny_block*km*3*max_blocks_clinic)) - call c_f_pointer(cptr, RHO, (/ nx_block,ny_block,km,3,max_blocks_clinic /)) + call my_cudaMallocHost(cptr, (nx_block*ny_block*km*3*nblocks_clinic)) + call c_f_pointer(cptr, RHO, (/ nx_block,ny_block,km,3,nblocks_clinic /)) - call my_cudaMallocHost(cptr, (nx_block*ny_block*km*max_blocks_clinic)) - call c_f_pointer(cptr, RHOP, (/ nx_block,ny_block,km,max_blocks_clinic /)) + call my_cudaMallocHost(cptr, (nx_block*ny_block*km*nblocks_clinic)) + call c_f_pointer(cptr, RHOP, (/ nx_block,ny_block,km,nblocks_clinic /)) ! real (r8), dimension(nx_block,ny_block,km) :: & ! DBLOC, &! buoyancy difference between adjacent levels @@ -208,8 +208,8 @@ subroutine init_gpu_mod !allocate (VDC(nx_block,ny_block,0:km+1,2,nblocks_clinic), & ! VVC(nx_block,ny_block,km, nblocks_clinic)) - call my_cudaMallocHost(cptr, (nx_block*ny_block*(km+2)*2*max_blocks_clinic)) - call c_f_pointer(cptr, VDC, (/ nx_block,ny_block,(km+2),2,max_blocks_clinic /)) + call my_cudaMallocHost(cptr, (nx_block*ny_block*(km+2)*2*nblocks_clinic)) + call c_f_pointer(cptr, VDC, (/ nx_block,ny_block,(km+2),2,nblocks_clinic /)) !VDC = RESHAPE(VDC, (/ nx_block,ny_block,0:km+1,2,max_blocks_clinic /)) @@ -219,8 +219,8 @@ subroutine init_gpu_mod !apparantly c_f_pointer doesnt like the ':' in the array shape statement !call c_f_pointer(cptr, VDC, (/ nx_block,ny_block,0:km+1,2,max_blocks_clinic /)) - call my_cudaMallocHost(cptr, (nx_block*ny_block*km*max_blocks_clinic)) - call c_f_pointer(cptr, VVC, (/ nx_block,ny_block,km,max_blocks_clinic /)) + call my_cudaMallocHost(cptr, (nx_block*ny_block*km*nblocks_clinic)) + call c_f_pointer(cptr, VVC, (/ nx_block,ny_block,km,nblocks_clinic /)) ! arrays used for correctness checks From b73f536a29044f3a0c46be9c1dba9ea54654e5a5 Mon Sep 17 00:00:00 2001 From: benvanwerkhoven Date: Wed, 1 May 2013 14:56:48 +0200 Subject: [PATCH 5/6] removed the check from gpu_compare that verifies whether the number of values that are close to zero is the same in both arrays. --- source/gpu_cmod.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/source/gpu_cmod.cu b/source/gpu_cmod.cu index 5524031..476fb40 100644 --- a/source/gpu_cmod.cu +++ b/source/gpu_cmod.cu @@ -540,9 +540,9 @@ void gpu_compare (double *a1, double *a2, int *pN, int *pName) { //if (zero_one > 95*(N/100)) { fprintf(stderr, "Node %d: Error: array1 contains %d zeros\n",my_task, zero_one); } //if (zero_two > 95*(N/100)) { fprintf(stderr, "Node %d: Error: array2 contains %d zeros\n",my_task, zero_two); } - if (zero_one != zero_two) { - fprintf(stderr, "Node %d: %s Error: number of zeros in arrays dont correspond zero1=%d, zero2=%d\n", my_task, var_names[vName], zero_one, zero_two); - } + //if (zero_one != zero_two) { + // fprintf(stderr, "Node %d: %s Error: number of zeros in arrays dont correspond zero1=%d, zero2=%d\n", my_task, var_names[vName], zero_one, zero_two); + //} if (res > 0) { if (vName == 0) { From 60cd1414e16f13c6bd395ffd0304ecf99edc6e9d Mon Sep 17 00:00:00 2001 From: jmaassen Date: Wed, 1 May 2013 16:14:53 +0200 Subject: [PATCH 6/6] Added gcc+mpi+gpu build file. Changed gpu_domain.h to use 64x64 blocks. --- build/linuxg95_mpi_gpu.gnu | 135 +++++++++++++++++++++++++++++++++++++ source/gpu_domain.h | 4 +- 2 files changed, 137 insertions(+), 2 deletions(-) create mode 100644 build/linuxg95_mpi_gpu.gnu diff --git a/build/linuxg95_mpi_gpu.gnu b/build/linuxg95_mpi_gpu.gnu new file mode 100644 index 0000000..3e5a06a --- /dev/null +++ b/build/linuxg95_mpi_gpu.gnu @@ -0,0 +1,135 @@ + +#----------------------------------------------------------------------- +# +# File: sgialtix_mpi.gnu +# +# Contains compiler and loader options for the SGI Altix using the +# intel compiler and specifies the mpi directory for communications +# modules. +# +#----------------------------------------------------------------------- +F77 = mpif77 +F90 = mpif90 +LD = mpif90 +CC = cc + +Cp = /bin/cp +Cpp = cpp -P +AWK = /usr/bin/gawk +ABI = +COMMDIR = mpi +NVCC = nvcc + +# Enable MPI library for parallel code, yes/no. + +MPI = yes + +# Adjust these to point to where netcdf is installed + +# These have been loaded as a module so no values necessary +NETCDFINC = -I/cm/shared/apps/netcdf/gcc/64/4.1.1/include +NETCDFLIB = -L/cm/shared/apps/netcdf/gcc/64/4.1.1/lib + +# Enable trapping and traceback of floating point exceptions, yes/no. +# Note - Requires 'setenv TRAP_FPE "ALL=ABORT,TRACE"' for traceback. + +TRAP_FPE = no + +#------------------------------------------------------------------ +# precompiler options +#------------------------------------------------------------------ + +#DCOUPL = -Dcoupled +DHIRES = -D_HIRES +#PRINT = -DJASON_PRINT +#PRINT_HALO = -DJASON_PRINT_HALO +#PRINT_REDIST = -DJASON_PRINT_REDIST +#PRINT_LOOP = -DJASON_PRINT_LOOP +#TIMER = -DJASON_TIMER +#FIX_DATA = -DJASON_FIX_DATA +#LOG_FILE = -DJASON_SIMPLE_LOG_FILENAME +FLOW = -D_USE_FLOW_CONTROL +#SEND = -DJASON_PRINT_SEND +FLUSH = -DJASON_FLUSH +GPU = -DBEN_GPU + +Cpp_opts = \ + $(DCOUPL) $(DHIRES) $(TIMER) $(PRINT) $(PRINT_LOOP) $(LOG_FILE) $(FLOW) $(FIX_DATA) $(SEND) $(FLUSH) $(PRINT_REDIST) $(GPU) + +Cpp_opts := $(Cpp_opts) -DPOSIX + +#---------------------------------------------------------------------------- +# +# C Flags +# +#---------------------------------------------------------------------------- + +CFLAGS = $(ABI) + +ifeq ($(OPTIMIZE),yes) + CFLAGS := $(CFLAGS) -O3 +# -mcmodel=medium +else + CFLAGS := $(CFLAGS) -g -check all -ftrapuv +endif + +#---------------------------------------------------------------------------- +# +# FORTRAN Flags +# +#---------------------------------------------------------------------------- + +FBASE = $(ABI) $(NETCDFINC) $(MPI_COMPILE_FLAGS) -I$(DepDir) +MODSUF = mod + +ifeq ($(TRAP_FPE),yes) + FBASE := $(FBASE) +endif + +ifeq ($(OPTIMIZE),yes) + FFLAGS = $(FBASE) -O3 -fconvert=swap +#-fmax-stack-var-size=536870912 +#-mcmodel=medium +else + FFLAGS = $(FBASE) -g -check bounds -fconvert=swap +endif + +#---------------------------------------------------------------------------- +# +# CUDA Flags +# +#---------------------------------------------------------------------------- + +CUFLAGS = -gencode arch=compute_35,code=sm_35 -Xptxas=-v -maxrregcount=64 + +#CUFLAGS = -gencode arch=compute_20,code=sm_20 -Xptxas=-v + +#-prec-sqrt=true -fmad=false + +ifeq ($(OPTIMIZE),yes) + CUFLAGS := $(CUFLAGS) +endif + +CUFLAGS := $(CUFLAGS) + +#---------------------------------------------------------------------------- +# +# Loader Flags and Libraries +# +#---------------------------------------------------------------------------- + +LDFLAGS = $(ABI) + +LIBS = $(NETCDFLIB) -L/cm/shared/apps/cuda50/toolkit/current/lib64/ -lnetcdf -lcurl -lcudart -lstdc++ + +ifeq ($(MPI),yes) + LIBS := $(LIBS) $(MPI_LD_FLAGS) -lmpi +endif + +ifeq ($(TRAP_FPE),yes) + LIBS := $(LIBS) +endif + +LDLIBS = $(LIBS) + +#---------------------------------------------------------------------------- diff --git a/source/gpu_domain.h b/source/gpu_domain.h index 4702be1..b9cf3c7 100644 --- a/source/gpu_domain.h +++ b/source/gpu_domain.h @@ -1,5 +1,5 @@ #define KM 42 -#define NX_BLOCK 904 -#define NY_BLOCK 604 +#define NX_BLOCK 64 +#define NY_BLOCK 64 #define NSTREAMS 42