# Patch summary (explanatory preamble; text before the first "diff --git"
# header is not part of any hunk).
#
# build/compile.mk
#   Adds one recipe line to the `%.o: %.cu` pattern rule: before the existing
#   `$(NVCC) $(CUFLAGS) -c $(notdir $<)` step, it changes into
#   $(POPEXEDIR)/compile and runs `$(NVCC) $(CUFLAGS) -ptx $<`.
#   NOTE(review): the new line passes `$<` after the `cd`, whereas the existing
#   compile line passes `$(notdir $<)` after the same `cd` -- confirm that `$<`
#   still resolves from $(POPEXEDIR)/compile.
#   NOTE(review): the file written by the -ptx step is not the rule's target,
#   so make does not track it -- confirm that is intended.
#
# build/linuxg95_mpi_gpu.gnu
#   - Optimized CFLAGS: `-march=corei7` is dropped, leaving `-O3`.
#   - Optimized FFLAGS: `-march=corei7` is dropped, leaving `-O3 -fconvert=swap`.
#   - CUFLAGS: the second `-gencode arch=compute_20,code=sm_20` is removed,
#     leaving only `-gencode arch=compute_35,code=sm_35 -Xptxas=-v
#     -maxrregcount=64`.
#     NOTE(review): presumably required by the new -ptx step, since nvcc is
#     documented to reject -ptx with multiple target architectures -- confirm.
#
# source/gpu_cmod.cu (hunk inside gpu_compare)
#   Comments out the `if (zero_one != zero_two)` check and the fprintf to
#   stderr that reported differing zero counts for the two arrays.
#
# source/gpu_mod.F90 (hunks inside init_gpu_mod)
#   For TRACER, RHO, RHOP, VDC and VVC, the element count passed to
#   my_cudaMallocHost and the shape passed to c_f_pointer now use
#   nblocks_clinic as the last extent instead of max_blocks_clinic.
#   NOTE(review): assumes nblocks_clinic is already set when init_gpu_mod
#   runs and that no caller indexes these arrays beyond nblocks_clinic --
#   TODO confirm against the callers.
#
diff --git a/build/compile.mk b/build/compile.mk index 72069e0..70421e5 100644 --- a/build/compile.mk +++ b/build/compile.mk @@ -154,4 +154,5 @@ include $(DEPENDS) %.o: %.cu @echo $(POPARCH) Compiling with implicit rule $(CUFLAGS) $< + @cd $(POPEXEDIR)/compile && $(NVCC) $(CUFLAGS) -ptx $< @cd $(POPEXEDIR)/compile && $(NVCC) $(CUFLAGS) -c $(notdir $<) diff --git a/build/linuxg95_mpi_gpu.gnu b/build/linuxg95_mpi_gpu.gnu index 3bd1a49..8db373c 100644 --- a/build/linuxg95_mpi_gpu.gnu +++ b/build/linuxg95_mpi_gpu.gnu @@ -67,7 +67,7 @@ Cpp_opts := $(Cpp_opts) -DPOSIX CFLAGS = $(ABI) ifeq ($(OPTIMIZE),yes) - CFLAGS := $(CFLAGS) -O3 -march=corei7 + CFLAGS := $(CFLAGS) -O3 # -mcmodel=medium else CFLAGS := $(CFLAGS) -g -check all -ftrapuv @@ -87,7 +87,7 @@ ifeq ($(TRAP_FPE),yes) endif ifeq ($(OPTIMIZE),yes) - FFLAGS = $(FBASE) -O3 -march=corei7 -fconvert=swap + FFLAGS = $(FBASE) -O3 -fconvert=swap #-fmax-stack-var-size=536870912 #-mcmodel=medium else @@ -100,7 +100,7 @@ endif # #---------------------------------------------------------------------------- -CUFLAGS = -gencode arch=compute_35,code=sm_35 -Xptxas=-v -maxrregcount=64 -gencode arch=compute_20,code=sm_20 +CUFLAGS = -gencode arch=compute_35,code=sm_35 -Xptxas=-v -maxrregcount=64 #CUFLAGS = -gencode arch=compute_20,code=sm_20 -Xptxas=-v #-prec-sqrt=true -fmad=false diff --git a/source/gpu_cmod.cu b/source/gpu_cmod.cu index 5524031..476fb40 100644 --- a/source/gpu_cmod.cu +++ b/source/gpu_cmod.cu @@ -540,9 +540,9 @@ void gpu_compare (double *a1, double *a2, int *pN, int *pName) { //if (zero_one > 95*(N/100)) { fprintf(stderr, "Node %d: Error: array1 contains %d zeros\n",my_task, zero_one); } //if (zero_two > 95*(N/100)) { fprintf(stderr, "Node %d: Error: array2 contains %d zeros\n",my_task, zero_two); } - if (zero_one != zero_two) { - fprintf(stderr, "Node %d: %s Error: number of zeros in arrays dont correspond zero1=%d, zero2=%d\n", my_task, var_names[vName], zero_one, zero_two); - } + //if (zero_one != zero_two) { + 
// fprintf(stderr, "Node %d: %s Error: number of zeros in arrays dont correspond zero1=%d, zero2=%d\n", my_task, var_names[vName], zero_one, zero_two); + //} if (res > 0) { if (vName == 0) { diff --git a/source/gpu_mod.F90 b/source/gpu_mod.F90 index 5d761fc..c880a97 100644 --- a/source/gpu_mod.F90 +++ b/source/gpu_mod.F90 @@ -187,14 +187,14 @@ subroutine init_gpu_mod ! !----------------------------------------------------------------------- - call my_cudaMallocHost(cptr, (nx_block*ny_block*km*nt*3*max_blocks_clinic)) - call c_f_pointer(cptr, TRACER, (/ nx_block,ny_block,km,nt,3,max_blocks_clinic /)) + call my_cudaMallocHost(cptr, (nx_block*ny_block*km*nt*3*nblocks_clinic)) + call c_f_pointer(cptr, TRACER, (/ nx_block,ny_block,km,nt,3,nblocks_clinic /)) - call my_cudaMallocHost(cptr, (nx_block*ny_block*km*3*max_blocks_clinic)) - call c_f_pointer(cptr, RHO, (/ nx_block,ny_block,km,3,max_blocks_clinic /)) + call my_cudaMallocHost(cptr, (nx_block*ny_block*km*3*nblocks_clinic)) + call c_f_pointer(cptr, RHO, (/ nx_block,ny_block,km,3,nblocks_clinic /)) - call my_cudaMallocHost(cptr, (nx_block*ny_block*km*max_blocks_clinic)) - call c_f_pointer(cptr, RHOP, (/ nx_block,ny_block,km,max_blocks_clinic /)) + call my_cudaMallocHost(cptr, (nx_block*ny_block*km*nblocks_clinic)) + call c_f_pointer(cptr, RHOP, (/ nx_block,ny_block,km,nblocks_clinic /)) ! real (r8), dimension(nx_block,ny_block,km) :: & ! DBLOC, &! buoyancy difference between adjacent levels @@ -208,8 +208,8 @@ subroutine init_gpu_mod !allocate (VDC(nx_block,ny_block,0:km+1,2,nblocks_clinic), & ! 
VVC(nx_block,ny_block,km, nblocks_clinic)) - call my_cudaMallocHost(cptr, (nx_block*ny_block*(km+2)*2*max_blocks_clinic)) - call c_f_pointer(cptr, VDC, (/ nx_block,ny_block,(km+2),2,max_blocks_clinic /)) + call my_cudaMallocHost(cptr, (nx_block*ny_block*(km+2)*2*nblocks_clinic)) + call c_f_pointer(cptr, VDC, (/ nx_block,ny_block,(km+2),2,nblocks_clinic /)) !VDC = RESHAPE(VDC, (/ nx_block,ny_block,0:km+1,2,max_blocks_clinic /)) @@ -219,8 +219,8 @@ subroutine init_gpu_mod !apparantly c_f_pointer doesnt like the ':' in the array shape statement !call c_f_pointer(cptr, VDC, (/ nx_block,ny_block,0:km+1,2,max_blocks_clinic /)) - call my_cudaMallocHost(cptr, (nx_block*ny_block*km*max_blocks_clinic)) - call c_f_pointer(cptr, VVC, (/ nx_block,ny_block,km,max_blocks_clinic /)) + call my_cudaMallocHost(cptr, (nx_block*ny_block*km*nblocks_clinic)) + call c_f_pointer(cptr, VVC, (/ nx_block,ny_block,km,nblocks_clinic /)) ! arrays used for correctness checks