Merge branch 'ben' into develop

NLeSC · May 1, 2013 · e2bebec · e2bebec
2 parents 92cd351 + b933fbe
commit e2bebec
Show file tree

Hide file tree

Showing 4 changed files with 150 additions and 15 deletions.
diff --git a/build/linuxg95_mpi_gpu.gnu b/build/linuxg95_mpi_gpu.gnu
@@ -0,0 +1,135 @@
+
+#-----------------------------------------------------------------------
+#
+# File:  linuxg95_mpi_gpu.gnu
+#
+#  Contains compiler and loader options for a Linux system using the
+#  MPI compiler wrappers (mpif77/mpif90) and nvcc for the CUDA sources,
+#  and specifies the mpi directory for communications modules.
+#
+#-----------------------------------------------------------------------
+F77 = mpif77
+F90 = mpif90
+LD = mpif90 
+CC = cc
+
+Cp = /bin/cp
+Cpp = cpp -P
+AWK = /usr/bin/gawk
+ABI = 
+COMMDIR = mpi
+NVCC = nvcc 
+
+#  Enable MPI library for parallel code, yes/no.
+
+MPI = yes
+
+# Adjust these to point to where netcdf is installed
+
+# NOTE: explicit paths are set here; if netcdf is instead loaded as a module these values may be unnecessary (confirm for your site)
+NETCDFINC = -I/cm/shared/apps/netcdf/gcc/64/4.1.1/include
+NETCDFLIB = -L/cm/shared/apps/netcdf/gcc/64/4.1.1/lib
+
+#  Enable trapping and traceback of floating point exceptions, yes/no.
+#  Note - Requires 'setenv TRAP_FPE "ALL=ABORT,TRACE"' for traceback.
+
+TRAP_FPE = no
+
+#------------------------------------------------------------------
+#  precompiler options
+#------------------------------------------------------------------
+
+#DCOUPL              = -Dcoupled
+DHIRES               = -D_HIRES
+#PRINT                = -DJASON_PRINT 
+#PRINT_HALO           = -DJASON_PRINT_HALO
+#PRINT_REDIST         = -DJASON_PRINT_REDIST
+#PRINT_LOOP           = -DJASON_PRINT_LOOP
+#TIMER                = -DJASON_TIMER 
+#FIX_DATA             = -DJASON_FIX_DATA
+#LOG_FILE             = -DJASON_SIMPLE_LOG_FILENAME
+FLOW                 = -D_USE_FLOW_CONTROL
+#SEND                 = -DJASON_PRINT_SEND  
+FLUSH                = -DJASON_FLUSH
+GPU                  = -DBEN_GPU
+
+Cpp_opts =   \
+      $(DCOUPL) $(DHIRES) $(TIMER) $(PRINT) $(PRINT_LOOP) $(LOG_FILE) $(FLOW) $(FIX_DATA) $(SEND) $(FLUSH) $(PRINT_REDIST) $(GPU)
+
+Cpp_opts := $(Cpp_opts) -DPOSIX 
+
+#----------------------------------------------------------------------------
+#
+#                           C Flags
+#
+#----------------------------------------------------------------------------
+
+CFLAGS = $(ABI) 
+
+ifeq ($(OPTIMIZE),yes)
+  CFLAGS := $(CFLAGS) -O3 
+# -mcmodel=medium
+else
+  CFLAGS := $(CFLAGS) -g -check all -ftrapuv
+endif
+
+#----------------------------------------------------------------------------
+#
+#                           FORTRAN Flags
+#
+#----------------------------------------------------------------------------
+
+FBASE = $(ABI) $(NETCDFINC) $(MPI_COMPILE_FLAGS) -I$(DepDir) 
+MODSUF = mod
+
+ifeq ($(TRAP_FPE),yes)
+  FBASE := $(FBASE) 
+endif
+
+ifeq ($(OPTIMIZE),yes)
+  FFLAGS = $(FBASE) -O3 -fconvert=swap 
+#-fmax-stack-var-size=536870912
+#-mcmodel=medium
+else
+  FFLAGS = $(FBASE) -g -check bounds -fconvert=swap
+endif
+
+#----------------------------------------------------------------------------
+#
+#                           CUDA Flags
+#
+#----------------------------------------------------------------------------
+
+CUFLAGS = -gencode arch=compute_35,code=sm_35 -Xptxas=-v -maxrregcount=64
+
+#CUFLAGS = -gencode arch=compute_20,code=sm_20 -Xptxas=-v
+
+#-prec-sqrt=true -fmad=false
+
+ifeq ($(OPTIMIZE),yes)
+  CUFLAGS := $(CUFLAGS)
+endif
+
+CUFLAGS := $(CUFLAGS)
+
+#----------------------------------------------------------------------------
+#
+#                           Loader Flags and Libraries
+#
+#----------------------------------------------------------------------------
+
+LDFLAGS = $(ABI) 
+
+LIBS = $(NETCDFLIB) -L/cm/shared/apps/cuda50/toolkit/current/lib64/ -lnetcdf -lcurl -lcudart -lstdc++ 
+
+ifeq ($(MPI),yes)
+  LIBS := $(LIBS) $(MPI_LD_FLAGS) -lmpi 
+endif
+
+ifeq ($(TRAP_FPE),yes)
+  LIBS := $(LIBS) 
+endif
+
+LDLIBS = $(LIBS)
+
+#----------------------------------------------------------------------------
diff --git a/source/gpu_cmod.cu b/source/gpu_cmod.cu
@@ -540,9 +540,9 @@ void gpu_compare (double *a1, double *a2, int *pN, int *pName) {
   //if (zero_one > 95*(N/100)) { fprintf(stderr, "Node %d: Error: array1 contains %d zeros\n",my_task, zero_one); }
   //if (zero_two > 95*(N/100)) { fprintf(stderr, "Node %d: Error: array2 contains %d zeros\n",my_task, zero_two); }
 
-  if (zero_one != zero_two) {
-    fprintf(stderr, "Node %d: %s Error: number of zeros in arrays dont correspond zero1=%d, zero2=%d\n", my_task, var_names[vName], zero_one, zero_two);
-  }
+  //if (zero_one != zero_two) {
+  //  fprintf(stderr, "Node %d: %s Error: number of zeros in arrays dont correspond zero1=%d, zero2=%d\n", my_task, var_names[vName], zero_one, zero_two);
+  //}
 
   if (res > 0) {
     if (vName == 0) {

diff --git a/source/gpu_domain.h b/source/gpu_domain.h
@@ -1,5 +1,5 @@
 
 #define KM 42
-#define NX_BLOCK 904
-#define NY_BLOCK 604
+#define NX_BLOCK 64
+#define NY_BLOCK 64
 #define NSTREAMS 42
diff --git a/source/gpu_mod.F90 b/source/gpu_mod.F90
@@ -187,14 +187,14 @@ subroutine init_gpu_mod
   !
   !-----------------------------------------------------------------------
 
-    call my_cudaMallocHost(cptr, (nx_block*ny_block*km*nt*3*max_blocks_clinic))
-    call c_f_pointer(cptr, TRACER, (/ nx_block,ny_block,km,nt,3,max_blocks_clinic /))
+    call my_cudaMallocHost(cptr, (nx_block*ny_block*km*nt*3*nblocks_clinic))
+    call c_f_pointer(cptr, TRACER, (/ nx_block,ny_block,km,nt,3,nblocks_clinic /))
 
-    call my_cudaMallocHost(cptr, (nx_block*ny_block*km*3*max_blocks_clinic))
-    call c_f_pointer(cptr, RHO, (/ nx_block,ny_block,km,3,max_blocks_clinic /))
+    call my_cudaMallocHost(cptr, (nx_block*ny_block*km*3*nblocks_clinic))
+    call c_f_pointer(cptr, RHO, (/ nx_block,ny_block,km,3,nblocks_clinic /))
 
-    call my_cudaMallocHost(cptr, (nx_block*ny_block*km*max_blocks_clinic))
-    call c_f_pointer(cptr, RHOP, (/ nx_block,ny_block,km,max_blocks_clinic /))
+    call my_cudaMallocHost(cptr, (nx_block*ny_block*km*nblocks_clinic))
+    call c_f_pointer(cptr, RHOP, (/ nx_block,ny_block,km,nblocks_clinic /))
 
 !       real (r8), dimension(nx_block,ny_block,km) :: &
 !      DBLOC,      &! buoyancy difference between adjacent levels
@@ -208,8 +208,8 @@ subroutine init_gpu_mod
 
       !allocate (VDC(nx_block,ny_block,0:km+1,2,nblocks_clinic), &
       !          VVC(nx_block,ny_block,km,      nblocks_clinic))
-    call my_cudaMallocHost(cptr, (nx_block*ny_block*(km+2)*2*max_blocks_clinic))
-    call c_f_pointer(cptr, VDC, (/ nx_block,ny_block,(km+2),2,max_blocks_clinic /))
+    call my_cudaMallocHost(cptr, (nx_block*ny_block*(km+2)*2*nblocks_clinic))
+    call c_f_pointer(cptr, VDC, (/ nx_block,ny_block,(km+2),2,nblocks_clinic /))
 
 
     !VDC = RESHAPE(VDC, (/ nx_block,ny_block,0:km+1,2,max_blocks_clinic /))
@@ -219,8 +219,8 @@ subroutine init_gpu_mod
     !apparently c_f_pointer doesn't like the ':' in the array shape statement
     !call c_f_pointer(cptr, VDC, (/ nx_block,ny_block,0:km+1,2,max_blocks_clinic /))
 
-    call my_cudaMallocHost(cptr, (nx_block*ny_block*km*max_blocks_clinic))
-    call c_f_pointer(cptr, VVC, (/ nx_block,ny_block,km,max_blocks_clinic /))
+    call my_cudaMallocHost(cptr, (nx_block*ny_block*km*nblocks_clinic))
+    call c_f_pointer(cptr, VVC, (/ nx_block,ny_block,km,nblocks_clinic /))
 
 
     ! arrays used for correctness checks