From 0a5d53147af393d637b8173413007bd3b094904b Mon Sep 17 00:00:00 2001
From: benvanwerkhoven <ben@cs.vu.nl>
Date: Wed, 1 May 2013 14:01:59 +0200
Subject: [PATCH 1/6] added build files

---
 build/compile.mk       |  1 +
 build/linuxDAS4.gnu    | 34 ++++++++++++++++++++--------------
 build/linuxDAS4gnu.gnu | 22 ++++++++++++++--------
 3 files changed, 35 insertions(+), 22 deletions(-)

diff --git a/build/compile.mk b/build/compile.mk
index ae5900e..d4eceb1 100644
--- a/build/compile.mk
+++ b/build/compile.mk
@@ -150,4 +150,5 @@ include $(DEPENDS)
 
 %.o: %.cu
 	@echo $(POPARCH) Compiling with implicit rule $(CUFLAGS) $<
+	@cd $(POPEXEDIR)/compile && $(NVCC) $(CUFLAGS) -ptx $<
 	@cd $(POPEXEDIR)/compile && $(NVCC) $(CUFLAGS) -c $(notdir $<)
diff --git a/build/linuxDAS4.gnu b/build/linuxDAS4.gnu
index b59469c..0e31f7d 100644
--- a/build/linuxDAS4.gnu
+++ b/build/linuxDAS4.gnu
@@ -11,17 +11,17 @@ MPILIB = -L/cm/shared/apps/openmpi/intel/64/1.4.4/lib64/
 #CUDALIB = -L/cm/shared/apps/cuda40/toolkit/4.0.17/lib64/
 CUDALIB = -L/cm/shared/apps/cuda50/toolkit/current/lib64/
 
-F77 = /cm/shared/apps/openmpi/intel/64/1.4.4/bin/mpif90
-F90 = /cm/shared/apps/openmpi/intel/64/1.4.4/bin/mpif90
-LD = /cm/shared/apps/openmpi/intel/64/1.4.4/bin/mpif90 -lcurl $(CUDALIB) -lcudart -lstdc++   -shared-intel -i-dynamic
-CC = /cm/shared/apps/openmpi/intel/64/1.4.4/bin/mpicc
+F77 = /cm/shared/apps/openmpi/intel/64/1.4.4/bin/mpif90 -r8 -O3 
+F90 = /cm/shared/apps/openmpi/intel/64/1.4.4/bin/mpif90 -r8 -O3 
+LD = /cm/shared/apps/openmpi/intel/64/1.4.4/bin/mpif90 -r8 -O3  -lcurl $(CUDALIB) -lcudart -lstdc++   -shared-intel -i-dynamic
+CC = /cm/shared/apps/openmpi/intel/64/1.4.4/bin/mpicc -O3 
 Cp = /bin/cp
 Cpp = cpp -P
 AWK = /usr/bin/gawk
 ABI = 
 COMMDIR = mpi
 
-NVCC = nvcc 
+NVCC = nvcc -O3 
 
 
 
@@ -78,9 +78,9 @@ CFLAGS = $(ABI)
 
 ifeq ($(OPTIMIZE),yes)
 #  CFLAGS := $(CFLAGS) -O 
-  CFLAGS := $(CFLAGS) -O3
+  CFLAGS := $(CFLAGS) 
 else
-  CFLAGS := $(CFLAGS) -g -check all -ftrapuv
+  CFLAGS := $(CFLAGS) -O3 -check all -ftrapuv
 endif
 
 CFLAGS := $(CFLAGS)
@@ -100,17 +100,17 @@ ifeq ($(TRAP_FPE),yes)
 endif
 
 ifeq ($(OPTIMIZE),yes)
-  FFLAGS = $(FBASE) -O3
-#  FFLAGS = $(FBASE) -g
+  FFLAGS = $(FBASE) 
+#  FFLAGS = $(FBASE) -O3
 else
-  FFLAGS = $(FBASE) -g -check bounds
+  FFLAGS = $(FBASE) -O3 -check bounds
 endif
 
 #DAS4 specific
 FFLAGS := $(FFLAGS) -convert  big_endian
 FFLAGS := $(FFLAGS) -mcmodel=medium -shared-intel -i-dynamic
 #-i-dynamic
-#FFLAGS := $(FFLAGS) 
+
  
 
 #----------------------------------------------------------------------------
@@ -119,13 +119,19 @@ FFLAGS := $(FFLAGS) -mcmodel=medium -shared-intel -i-dynamic
 #
 #----------------------------------------------------------------------------
 
-CUFLAGS = -Xptxas=-v -arch=compute_20 -code=sm_20
+CUFLAGS = -gencode arch=compute_35,code=sm_35 -Xptxas=-v -maxrregcount=64
+
+#CUFLAGS = -gencode arch=compute_20,code=sm_20 -Xptxas=-v
+
 #-prec-sqrt=true -fmad=false
 
 ifeq ($(OPTIMIZE),yes)
-  CUFLAGS := $(CUFLAGS) -O3
+  CUFLAGS := $(CUFLAGS)
 endif
- 
+
+CUFLAGS := $(CUFLAGS) 
+
+
 #----------------------------------------------------------------------------
 #
 #                           Loader Flags and Libraries
diff --git a/build/linuxDAS4gnu.gnu b/build/linuxDAS4gnu.gnu
index 78339b1..97716b0 100644
--- a/build/linuxDAS4gnu.gnu
+++ b/build/linuxDAS4gnu.gnu
@@ -11,17 +11,17 @@
 #CUDALIB = -L/cm/shared/apps/cuda40/toolkit/4.0.17/lib64/
 CUDALIB = -L/cm/shared/apps/cuda50/toolkit/current/lib64/
 
-F77 = mpif90 -p -O3
-F90 = mpif90 -p -O3
-LD = mpif90 -p -O3 -lcurl $(CUDALIB) -lcudart -lstdc++
-CC = gcc -p -O3
+F77 = mpif90 -O0 -g
+F90 = mpif90 -O0 -g
+LD = mpif90 -O0 -g  -lcurl $(CUDALIB) -lcudart -lstdc++
+CC = gcc -O0 -g
 Cp = /bin/cp
 Cpp = cpp -P
 AWK = /usr/bin/gawk
 ABI = 
 COMMDIR = mpi
 
-NVCC = nvcc 
+NVCC = nvcc -O0 -g
  
 #  Enable MPI library for parallel code, yes/no.
 
@@ -73,6 +73,7 @@ CFLAGS = $(ABI)
 
 ifeq ($(OPTIMIZE),yes)
 #  CFLAGS := $(CFLAGS) -O 
+  CFLAGS := $(CFLAGS)  
 else
   CFLAGS := $(CFLAGS) -check all -ftrapuv
 endif
@@ -94,7 +95,7 @@ ifeq ($(TRAP_FPE),yes)
 endif
 
 ifeq ($(OPTIMIZE),yes)
-#  FFLAGS = $(FBASE) -O3
+#  FFLAGS = $(FBASE) 
   FFLAGS = $(FBASE)
 else
   FFLAGS = $(FBASE) -check bounds
@@ -112,11 +113,16 @@ FFLAGS := $(FFLAGS)
 #
 #----------------------------------------------------------------------------
 
-CUFLAGS = -Xptxas=-v -arch=compute_20 -code=sm_20
+#CUFLAGS = -Xptxas=-v -arch=compute_20 -code=sm_20
+
+#CUFLAGS = -gencode arch=compute_35,code=sm_35 -Xptxas=-v -maxrregcount=64
+
+CUFLAGS = -gencode arch=compute_20,code=sm_20 -Xptxas=-v
+
 #-prec-sqrt=true -fmad=false
 
 ifeq ($(OPTIMIZE),yes)
-  CUFLAGS := $(CUFLAGS) -O3
+  CUFLAGS := $(CUFLAGS) 
 endif
 
 #----------------------------------------------------------------------------

From f5989bb1a052183d2d4877eb5a5f76f703f7a514 Mon Sep 17 00:00:00 2001
From: benvanwerkhoven <ben@cs.vu.nl>
Date: Wed, 1 May 2013 14:03:04 +0200
Subject: [PATCH 2/6] removed some whitespace

---
 source/prognostic.F90 | 1 -
 1 file changed, 1 deletion(-)

diff --git a/source/prognostic.F90 b/source/prognostic.F90
index 4260cd3..5cbc566 100644
--- a/source/prognostic.F90
+++ b/source/prognostic.F90
@@ -65,7 +65,6 @@ module prognostic
    real (r8), dimension(:,:,:), pointer :: &
       RHOREF    ! 3d density fields used to check for correctness of GPU routines
 
-
    real (r8), dimension(nx_block,ny_block,3,max_blocks_clinic), &
       target :: &
       PSURF,    &! surface pressure for all blocks at 3 time levels

From d49bdcd38229e5e97cc0307d2af8fbe7aced03f1 Mon Sep 17 00:00:00 2001
From: benvanwerkhoven <ben@cs.vu.nl>
Date: Wed, 1 May 2013 14:18:10 +0200
Subject: [PATCH 3/6] eclipse is a bit confused

---
 source/prognostic.F90 | 1 +
 1 file changed, 1 insertion(+)

diff --git a/source/prognostic.F90 b/source/prognostic.F90
index 5cbc566..4260cd3 100644
--- a/source/prognostic.F90
+++ b/source/prognostic.F90
@@ -65,6 +65,7 @@ module prognostic
    real (r8), dimension(:,:,:), pointer :: &
       RHOREF    ! 3d density fields used to check for correctness of GPU routines
 
+
    real (r8), dimension(nx_block,ny_block,3,max_blocks_clinic), &
       target :: &
       PSURF,    &! surface pressure for all blocks at 3 time levels

From c4b55f3db8062a5552f4db1ce881ec70d4ca50b2 Mon Sep 17 00:00:00 2001
From: benvanwerkhoven <ben@cs.vu.nl>
Date: Wed, 1 May 2013 14:30:49 +0200
Subject: [PATCH 4/6] replaced the use of max_blocks_clinic with nblocks_clinic
 in cudahostmalloc routines to save memory when the number of clinic blocks is
 less then max_blocks_clinic

---
 source/gpu_mod.F90 | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/source/gpu_mod.F90 b/source/gpu_mod.F90
index 5d761fc..c880a97 100644
--- a/source/gpu_mod.F90
+++ b/source/gpu_mod.F90
@@ -187,14 +187,14 @@ subroutine init_gpu_mod
   !
   !-----------------------------------------------------------------------
 
-    call my_cudaMallocHost(cptr, (nx_block*ny_block*km*nt*3*max_blocks_clinic))
-    call c_f_pointer(cptr, TRACER, (/ nx_block,ny_block,km,nt,3,max_blocks_clinic /))
+    call my_cudaMallocHost(cptr, (nx_block*ny_block*km*nt*3*nblocks_clinic))
+    call c_f_pointer(cptr, TRACER, (/ nx_block,ny_block,km,nt,3,nblocks_clinic /))
 
-    call my_cudaMallocHost(cptr, (nx_block*ny_block*km*3*max_blocks_clinic))
-    call c_f_pointer(cptr, RHO, (/ nx_block,ny_block,km,3,max_blocks_clinic /))
+    call my_cudaMallocHost(cptr, (nx_block*ny_block*km*3*nblocks_clinic))
+    call c_f_pointer(cptr, RHO, (/ nx_block,ny_block,km,3,nblocks_clinic /))
 
-    call my_cudaMallocHost(cptr, (nx_block*ny_block*km*max_blocks_clinic))
-    call c_f_pointer(cptr, RHOP, (/ nx_block,ny_block,km,max_blocks_clinic /))
+    call my_cudaMallocHost(cptr, (nx_block*ny_block*km*nblocks_clinic))
+    call c_f_pointer(cptr, RHOP, (/ nx_block,ny_block,km,nblocks_clinic /))
 
 !       real (r8), dimension(nx_block,ny_block,km) :: &
 !      DBLOC,      &! buoyancy difference between adjacent levels
@@ -208,8 +208,8 @@ subroutine init_gpu_mod
 
       !allocate (VDC(nx_block,ny_block,0:km+1,2,nblocks_clinic), &
       !          VVC(nx_block,ny_block,km,      nblocks_clinic))
-    call my_cudaMallocHost(cptr, (nx_block*ny_block*(km+2)*2*max_blocks_clinic))
-    call c_f_pointer(cptr, VDC, (/ nx_block,ny_block,(km+2),2,max_blocks_clinic /))
+    call my_cudaMallocHost(cptr, (nx_block*ny_block*(km+2)*2*nblocks_clinic))
+    call c_f_pointer(cptr, VDC, (/ nx_block,ny_block,(km+2),2,nblocks_clinic /))
 
 
     !VDC = RESHAPE(VDC, (/ nx_block,ny_block,0:km+1,2,max_blocks_clinic /))
@@ -219,8 +219,8 @@ subroutine init_gpu_mod
     !apparantly c_f_pointer doesnt like the ':' in the array shape statement
     !call c_f_pointer(cptr, VDC, (/ nx_block,ny_block,0:km+1,2,max_blocks_clinic /))
 
-    call my_cudaMallocHost(cptr, (nx_block*ny_block*km*max_blocks_clinic))
-    call c_f_pointer(cptr, VVC, (/ nx_block,ny_block,km,max_blocks_clinic /))
+    call my_cudaMallocHost(cptr, (nx_block*ny_block*km*nblocks_clinic))
+    call c_f_pointer(cptr, VVC, (/ nx_block,ny_block,km,nblocks_clinic /))
 
 
     ! arrays used for correctness checks

From b73f536a29044f3a0c46be9c1dba9ea54654e5a5 Mon Sep 17 00:00:00 2001
From: benvanwerkhoven <ben@cs.vu.nl>
Date: Wed, 1 May 2013 14:56:48 +0200
Subject: [PATCH 5/6] removed the check from gpu_compare that verifies whether
 the number of values that are close to zero is the same in both arrays.

---
 source/gpu_cmod.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/source/gpu_cmod.cu b/source/gpu_cmod.cu
index 5524031..476fb40 100644
--- a/source/gpu_cmod.cu
+++ b/source/gpu_cmod.cu
@@ -540,9 +540,9 @@ void gpu_compare (double *a1, double *a2, int *pN, int *pName) {
   //if (zero_one > 95*(N/100)) { fprintf(stderr, "Node %d: Error: array1 contains %d zeros\n",my_task, zero_one); }
   //if (zero_two > 95*(N/100)) { fprintf(stderr, "Node %d: Error: array2 contains %d zeros\n",my_task, zero_two); }
 
-  if (zero_one != zero_two) {
-    fprintf(stderr, "Node %d: %s Error: number of zeros in arrays dont correspond zero1=%d, zero2=%d\n", my_task, var_names[vName], zero_one, zero_two);
-  }
+  //if (zero_one != zero_two) {
+  //  fprintf(stderr, "Node %d: %s Error: number of zeros in arrays dont correspond zero1=%d, zero2=%d\n", my_task, var_names[vName], zero_one, zero_two);
+  //}
 
   if (res > 0) {
     if (vName == 0) {

From 60cd1414e16f13c6bd395ffd0304ecf99edc6e9d Mon Sep 17 00:00:00 2001
From: jmaassen <J.Maassen@esciencecenter.nl>
Date: Wed, 1 May 2013 16:14:53 +0200
Subject: [PATCH 6/6] Added gcc+mpi+gpu build file. Changed gpu_domain.h to use
 64x64 blocks.

---
 build/linuxg95_mpi_gpu.gnu | 135 +++++++++++++++++++++++++++++++++++++
 source/gpu_domain.h        |   4 +-
 2 files changed, 137 insertions(+), 2 deletions(-)
 create mode 100644 build/linuxg95_mpi_gpu.gnu

diff --git a/build/linuxg95_mpi_gpu.gnu b/build/linuxg95_mpi_gpu.gnu
new file mode 100644
index 0000000..3e5a06a
--- /dev/null
+++ b/build/linuxg95_mpi_gpu.gnu
@@ -0,0 +1,135 @@
+
+#-----------------------------------------------------------------------
+#
+# File:  sgialtix_mpi.gnu
+#
+#  Contains compiler and loader options for the SGI Altix using the 
+#  intel compiler and specifies the mpi directory for communications 
+#  modules.
+#
+#-----------------------------------------------------------------------
+F77 = mpif77
+F90 = mpif90
+LD = mpif90 
+CC = cc
+
+Cp = /bin/cp
+Cpp = cpp -P
+AWK = /usr/bin/gawk
+ABI = 
+COMMDIR = mpi
+NVCC = nvcc 
+
+#  Enable MPI library for parallel code, yes/no.
+
+MPI = yes
+
+# Adjust these to point to where netcdf is installed
+
+# These have been loaded as a module so no values necessary
+NETCDFINC = -I/cm/shared/apps/netcdf/gcc/64/4.1.1/include
+NETCDFLIB = -L/cm/shared/apps/netcdf/gcc/64/4.1.1/lib
+
+#  Enable trapping and traceback of floating point exceptions, yes/no.
+#  Note - Requires 'setenv TRAP_FPE "ALL=ABORT,TRACE"' for traceback.
+
+TRAP_FPE = no
+
+#------------------------------------------------------------------
+#  precompiler options
+#------------------------------------------------------------------
+
+#DCOUPL              = -Dcoupled
+DHIRES               = -D_HIRES
+#PRINT                = -DJASON_PRINT 
+#PRINT_HALO           = -DJASON_PRINT_HALO
+#PRINT_REDIST         = -DJASON_PRINT_REDIST
+#PRINT_LOOP           = -DJASON_PRINT_LOOP
+#TIMER                = -DJASON_TIMER 
+#FIX_DATA             = -DJASON_FIX_DATA
+#LOG_FILE             = -DJASON_SIMPLE_LOG_FILENAME
+FLOW                 = -D_USE_FLOW_CONTROL
+#SEND                 = -DJASON_PRINT_SEND  
+FLUSH                = -DJASON_FLUSH
+GPU                  = -DBEN_GPU
+
+Cpp_opts =   \
+      $(DCOUPL) $(DHIRES) $(TIMER) $(PRINT) $(PRINT_LOOP) $(LOG_FILE) $(FLOW) $(FIX_DATA) $(SEND) $(FLUSH) $(PRINT_REDIST) $(GPU)
+
+Cpp_opts := $(Cpp_opts) -DPOSIX 
+ 
+#----------------------------------------------------------------------------
+#
+#                           C Flags
+#
+#----------------------------------------------------------------------------
+ 
+CFLAGS = $(ABI) 
+
+ifeq ($(OPTIMIZE),yes)
+  CFLAGS := $(CFLAGS) -O3 
+# -mcmodel=medium
+else
+  CFLAGS := $(CFLAGS) -g -check all -ftrapuv
+endif
+ 
+#----------------------------------------------------------------------------
+#
+#                           FORTRAN Flags
+#
+#----------------------------------------------------------------------------
+
+FBASE = $(ABI) $(NETCDFINC) $(MPI_COMPILE_FLAGS) -I$(DepDir) 
+MODSUF = mod
+
+ifeq ($(TRAP_FPE),yes)
+  FBASE := $(FBASE) 
+endif
+
+ifeq ($(OPTIMIZE),yes)
+  FFLAGS = $(FBASE) -O3 -fconvert=swap 
+#-fmax-stack-var-size=536870912
+#-mcmodel=medium
+else
+  FFLAGS = $(FBASE) -g -check bounds -fconvert=swap
+endif
+
+#----------------------------------------------------------------------------
+#
+#                           CUDA Flags
+#
+#----------------------------------------------------------------------------
+
+CUFLAGS = -gencode arch=compute_35,code=sm_35 -Xptxas=-v -maxrregcount=64
+
+#CUFLAGS = -gencode arch=compute_20,code=sm_20 -Xptxas=-v
+
+#-prec-sqrt=true -fmad=false
+
+ifeq ($(OPTIMIZE),yes)
+  CUFLAGS := $(CUFLAGS)
+endif
+
+CUFLAGS := $(CUFLAGS)
+ 
+#----------------------------------------------------------------------------
+#
+#                           Loader Flags and Libraries
+#
+#----------------------------------------------------------------------------
+ 
+LDFLAGS = $(ABI) 
+ 
+LIBS = $(NETCDFLIB) -L/cm/shared/apps/cuda50/toolkit/current/lib64/ -lnetcdf -lcurl -lcudart -lstdc++ 
+ 
+ifeq ($(MPI),yes)
+  LIBS := $(LIBS) $(MPI_LD_FLAGS) -lmpi 
+endif
+
+ifeq ($(TRAP_FPE),yes)
+  LIBS := $(LIBS) 
+endif
+ 
+LDLIBS = $(LIBS)
+ 
+#----------------------------------------------------------------------------
diff --git a/source/gpu_domain.h b/source/gpu_domain.h
index 4702be1..b9cf3c7 100644
--- a/source/gpu_domain.h
+++ b/source/gpu_domain.h
@@ -1,5 +1,5 @@
 
 #define KM 42
-#define NX_BLOCK 904
-#define NY_BLOCK 604
+#define NX_BLOCK 64
+#define NY_BLOCK 64
 #define NSTREAMS 42