Commit

Merge pull request #591 from LLNL/v0.8.0-rc
V0.8.0 rc
rhornung67 committed Mar 28, 2019
2 parents caa33b3 + b305a3a commit 8d19a8c
Showing 168 changed files with 5,178 additions and 2,068 deletions.
6 changes: 6 additions & 0 deletions .travis.yml
@@ -69,6 +69,12 @@ matrix:
- IMG=nvcc9
- CMAKE_EXTRA_FLAGS="-DCMAKE_BUILD_TYPE=Debug -DENABLE_CUDA=On -DENABLE_WARNINGS=On -DENABLE_WARNINGS_AS_ERRORS=On -DENABLE_TBB=On"
- DO_TEST=no
- compiler: clang-cuda
env:
- COMPILER=clang++
- IMG=clang-cuda
- CMAKE_EXTRA_FLAGS="-DCMAKE_BUILD_TYPE=Release -DENABLE_OPENMP=Off -DENABLE_CLANG_CUDA=On -DBLT_CLANG_CUDA_ARCH=sm_60 -DENABLE_CUDA=On -DCUDA_ARCH=sm_60"
- DO_TEST=no

before_install: # don't try to build and run intel when it's impossible
- |
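
For local experimentation, the new clang-cuda CI entry could be approximated with a CMake cache (host-config) file along the following lines. This is a sketch only; the compiler name is an assumption and should match a local Clang installation with CUDA support:

    # Sketch of a host-config mirroring the clang-cuda CI entry above; adjust for your system.
    set(CMAKE_CXX_COMPILER "clang++" CACHE PATH "")   # assumed compiler name
    set(CMAKE_BUILD_TYPE Release CACHE STRING "")
    set(ENABLE_OPENMP Off CACHE BOOL "")
    set(ENABLE_CUDA On CACHE BOOL "")
    set(ENABLE_CLANG_CUDA On CACHE BOOL "")
    set(BLT_CLANG_CUDA_ARCH sm_60 CACHE STRING "")
    set(CUDA_ARCH sm_60 CACHE STRING "")
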
35 changes: 31 additions & 4 deletions CMakeLists.txt
@@ -17,7 +17,7 @@ cmake_policy(SET CMP0048 NEW)

# Set version number
set(RAJA_VERSION_MAJOR 0)
set(RAJA_VERSION_MINOR 7)
set(RAJA_VERSION_MINOR 8)
set(RAJA_VERSION_PATCHLEVEL 0)

if (RAJA_LOADED AND (NOT RAJA_LOADED STREQUAL "${RAJA_VERSION_MAJOR}.${RAJA_VERSION_MINOR}.${RAJA_VERSION_PATCHLEVEL}"))
@@ -27,7 +27,7 @@ endif()
if (NOT RAJA_LOADED)
set (RAJA_LOADED "${RAJA_VERSION_MAJOR}.${RAJA_VERSION_MINOR}.${RAJA_VERSION_PATCHLEVEL}")

# Promote RAJA_LOADED to PARENT_SCOPE if it exists, which is only if we are bringing
# Promote RAJA_LOADED to PARENT_SCOPE if it exists, which is only if we are bringing
# in RAJA as a subproject to a larger CMake project
get_directory_property(hasParent PARENT_DIRECTORY)
if(hasParent)
@@ -55,9 +55,11 @@ if (NOT RAJA_LOADED)
option(ENABLE_TARGET_OPENMP "Build OpenMP on target device support" Off)
option(ENABLE_CLANG_CUDA "Use Clang's native CUDA support" Off)
set(CUDA_ARCH "sm_35" CACHE STRING "Compute architecture to pass to CUDA builds")
option(ENABLE_EXTERNAL_CUB "Use an external cub for scans" Off)
option(ENABLE_TESTS "Build tests" On)
option(ENABLE_REPRODUCERS "Build issue reproducers" Off)
option(ENABLE_EXAMPLES "Build simple examples" On)
option(ENABLE_EXERCISES "Build exercises " On)
option(ENABLE_MODULES "Enable modules in supporting compilers (clang)" On)
option(ENABLE_WARNINGS "Enable warnings as errors for CI" Off)
option(ENABLE_DOCUMENTATION "Build RAJA documentation" Off)
@@ -128,6 +130,23 @@ if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel" AND CMAKE_CXX_COMPILER_VERSION V
cuda)
endif ()

if (ENABLE_CUDA)
if(ENABLE_EXTERNAL_CUB)
find_package(CUB)
if (CUB_FOUND)
blt_register_library(
NAME cub
INCLUDES ${CUB_INCLUDE_DIRS})
set(raja_depends
${raja_depends}
cub)
else()
message(WARNING "External CUB not found.")
set(ENABLE_EXTERNAL_CUB Off)
endif()
endif ()
endif ()

if (ENABLE_CHAI)
set (raja_depends
${raja_depends}
@@ -162,8 +181,12 @@ if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel" AND CMAKE_CXX_COMPILER_VERSION V
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/tpl/cub>
$<INSTALL_INTERFACE:include>)

install(DIRECTORY include/ DESTINATION include FILES_MATCHING PATTERN *.hpp)
install(DIRECTORY tpl/cub/ DESTINATION include FILES_MATCHING PATTERN *.cuh)
if(ENABLE_EXTERNAL_CUB)
install(DIRECTORY include/ DESTINATION include FILES_MATCHING PATTERN *.hpp)
else()
install(DIRECTORY include/ DESTINATION include FILES_MATCHING PATTERN *.hpp)
install(DIRECTORY tpl/cub/ DESTINATION include FILES_MATCHING PATTERN *.cuh)
endif()

install(FILES
${PROJECT_BINARY_DIR}/include/RAJA/config.hpp
@@ -183,6 +206,10 @@ if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel" AND CMAKE_CXX_COMPILER_VERSION V
add_subdirectory(examples)
endif()

if(ENABLE_EXERCISES)
add_subdirectory(exercises)
endif()

if (ENABLE_DOCUMENTATION)
add_subdirectory(docs)
endif ()
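
The new ENABLE_EXTERNAL_CUB option above lets a build use an external CUB checkout instead of the bundled tpl/cub copy. A minimal cache-file sketch (the CUB location is a placeholder path) might look like:

    # Sketch only: point RAJA at an external CUB instead of the bundled tpl/cub.
    set(ENABLE_CUDA On CACHE BOOL "")
    set(ENABLE_EXTERNAL_CUB On CACHE BOOL "")
    set(CUB_DIR "/path/to/cub" CACHE PATH "")   # placeholder path
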
17 changes: 10 additions & 7 deletions Dockerfile
@@ -1,13 +1,16 @@
FROM nvidia/cuda:8.0-devel-ubuntu16.04
#
#Builds and installs RAJA using the gcc8 compiler
#

FROM rajaorg/compiler:gcc8
MAINTAINER RAJA Development Team <raja-dev@llnl.gov>

RUN apt-get update -y
RUN apt-get install -y git cmake gdb
COPY --chown=raja:raja . /home/raja/workspace

RUN cd /opt/ && git clone https://github.com/LLNL/RAJA.git
WORKDIR /home/raja/workspace

WORKDIR /opt/RAJA
RUN mkdir build && cd build && cmake -DENABLE_CUDA=OFF ..

RUN mkdir build && cd build && cmake -DENABLE_CUDA=ON ..
RUN cd build && sudo make -j 3 && sudo make install

RUN cd build && make -j && make install
CMD ["bash"]
35 changes: 35 additions & 0 deletions RELEASE_NOTES.md
@@ -13,6 +13,41 @@
[comment]: # (For details about use and distribution, please read RAJA/LICENSE.)
[comment]: # (#################################################################)

RAJA v0.8.0 Release Notes
=========================

This release contains one major change and some minor improvements to
compilation and performance.

Major changes include:

* Build system updated to use the latest version of BLT (or close to it).
Depending on how one builds RAJA, this could require changes to how
information is passed to CMake. Content describing how this is done has
been added to the relevant sections of the RAJA User Guide.

Other notable changes include:

* Features (These are not yet documented and should be considered
experimental. Documentation and usage examples will appear in the
next RAJA release.)
* New thread, warp, and bitmask policies for CUDA.
* Added AtomicLocalArray type which returns data elements wrapped
in an AtomicRef object.
* Bug Fixes:
* Fixed issue in RangeStrideSegment iteration.
* Fixed 'align hint' macro to eliminate a compile warning when the XL compiler
is used with nvcc.
* Fixed issues where a CUDA architecture level (i.e., sm_*) set too low
generated compiler warnings/errors. Caveats about which RAJA features
(mostly atomic operations) are available at different CUDA architecture
levels were added to the User Guide.

* Performance Improvements:
* Some performance improvements in RAJA::kernel usage with the CUDA back-end.


RAJA v0.7.0 Release Notes
=========================

2 changes: 1 addition & 1 deletion blt
Submodule blt updated 131 files
33 changes: 7 additions & 26 deletions cmake/SetupCompilers.cmake
@@ -77,44 +77,25 @@ if ( MSVC )
endif()

if (ENABLE_CUDA)
if ( NOT DEFINED RAJA_NVCC_STD )
set(RAJA_NVCC_STD "c++11")
# When we require cmake 3.8+, replace this with setting CUDA_STANDARD
if(CUDA_VERSION_MAJOR GREATER "8")
execute_process(COMMAND ${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc -std c++14 -ccbin ${CMAKE_CXX_COMPILER} .
ERROR_VARIABLE TEST_NVCC_ERR
OUTPUT_QUIET)
if (NOT TEST_NVCC_ERR MATCHES "flag is not supported with the configured host compiler")
set(RAJA_NVCC_STD "c++14")
endif()
else()
endif()
endif()
set(CMAKE_CUDA_STANDARD 11)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -restrict -arch ${CUDA_ARCH} --expt-extended-lambda")

if (NOT RAJA_HOST_CONFIG_LOADED)
list(APPEND RAJA_EXTRA_NVCC_FLAGS -restrict; -arch ${CUDA_ARCH}; -std ${RAJA_NVCC_STD}; --expt-extended-lambda; -ccbin; ${CMAKE_CXX_COMPILER})

set(RAJA_NVCC_FLAGS_RELEASE -O2 CACHE STRING "")
set(RAJA_NVCC_FLAGS_DEBUG -g; -G; -O0 CACHE STRING "")
set(RAJA_NVCC_FLAGS_MINSIZEREL -Os CACHE STRING "")
set(RAJA_NVCC_FLAGS_RELWITHDEBINFO -g; -lineinfo; -O2 CACHE STRING "")
set(CMAKE_CUDA_FLAGS_RELEASE "-O2")
set(CMAKE_CUDA_FLAGS_DEBUG "-g -G -O0")
set(CMAKE_CUDA_FLAGS_MINSIZEREL "-Os")
set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-g -lineinfo -O2")

if(RAJA_ENABLE_COVERAGE)
if (CMAKE_CXX_COMPILER_ID MATCHES GNU)
message(INFO "Coverage analysis enabled")
set(RAJA_EXTRA_NVCC_FLAGS ${RAJA_EXTRA_NVCC_FLAGS}; -Xcompiler -coverage; -Xlinker -coverage)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -coverage -Xlinker -coverage")
set(CMAKE_EXE_LINKER_FLAGS "-coverage ${CMAKE_EXE_LINKER_FLAGS}")
else()
message(WARNING "Code coverage specified but not enabled -- GCC was not detected")
endif()
endif()
endif()
set(RAJA_NVCC_FLAGS ${RAJA_EXTRA_NVCC_FLAGS} CACHE STRING "")
set(CUDA_NVCC_FLAGS ${RAJA_NVCC_FLAGS})
set(CUDA_NVCC_FLAGS_RELEASE ${RAJA_NVCC_FLAGS_RELEASE})
set(CUDA_NVCC_FLAGS_DEBUG ${RAJA_NVCC_FLAGS_DEBUG})
set(CUDA_NVCC_FLAGS_MINSIZEREL ${RAJA_NVCC_FLAGS_MINSIZEREL})
set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ${RAJA_NVCC_FLAGS_RELWITHDEBINFO})
endif()
# end RAJA_ENABLE_CUDA section

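
For host-config files written against the removed RAJA_NVCC_FLAGS* cache variables, the hunk above suggests that per-build-type nvcc options now go through the standard CMake CUDA flag variables. A hedged migration sketch (an interpretation of this hunk, not official guidance):

    # Previously (removed): set(RAJA_NVCC_FLAGS_RELWITHDEBINFO -g; -lineinfo; -O2 CACHE STRING "")
    # Now, per-build-type nvcc options are carried by the standard CMake variables:
    set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-g -lineinfo -O2" CACHE STRING "")
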
18 changes: 18 additions & 0 deletions cmake/thirdparty/FindCUB.cmake
@@ -0,0 +1,18 @@
include (FindPackageHandleStandardArgs)

find_path(CUB_INCLUDE_DIRS
NAMES cub/cub.cuh
HINTS
${CUB_DIR}/
${CUB_DIR}/include)

find_package_handle_standard_args(
CUB
DEFAULT_MSG
CUB_INCLUDE_DIRS)

if (CUB_INCLUDE_DIRS)
set(CUB_FOUND True)
else ()
set(CUB_FOUND False)
endif()
2 changes: 1 addition & 1 deletion docs/sphinx/user_guide/conf.py
@@ -58,7 +58,7 @@

# General information about the project.
project = u'RAJA'
copyright = u'2016-2018'
copyright = u'2016-2019'
author = u'LLNS'

# The version info for the project you're documenting, acts as replacement for
86 changes: 75 additions & 11 deletions docs/sphinx/user_guide/config_options.rst
@@ -48,18 +48,27 @@ Following CMake conventions, RAJA supports three build types: ``Release``,
choose a build type that includes debug information, you do not have to specify
the '-g' compiler flag to generate debugging symbols.

All RAJA options are set like standard CMake variables. For example, to enable
RAJA OpenMP functionality, pass the following argument to cmake::
All RAJA options are set like standard CMake variables. All RAJA settings for
default options, compilers, flags for optimization, etc. can be found in files
in the ``RAJA/cmake`` directory. Configuration variables can be set by passing
arguments to CMake on the command line when CMake is called, or by setting
options in a CMake cache file and passing that file to CMake. For example,
to enable RAJA OpenMP functionality, pass the following argument to cmake::

-DENABLE_OPENMP=On

All RAJA settings for default options, compilers, flags for optimization, etc.
can be found in files in the ``RAJA/cmake`` directory. Next, we
summarize the available options and their defaults
The RAJA repository contains a collection of CMake cache files
(or 'host-config' files) that may be used as a guide for users trying
to set their own options. See :ref:`configopt-raja-hostconfig-label`.

=================================
Available Options and Defaults
=================================
Next, we summarize RAJA options and their defaults.


.. _configopt-raja-features-label:

====================================
Available RAJA Options and Defaults
====================================

RAJA uses a variety of custom variables to control how it is compiled. Many
of these are used internally to control RAJA compilation and do
@@ -96,7 +105,7 @@ and their default settings:
ENABLE_WARNINGS_AS_ERRORS Off
========================= ======================

* **Programming models and compilers**
* **Programming model back-ends**

Variables that control which RAJA programming model back-ends are enabled
are (names are descriptive of what they enable):
@@ -127,8 +136,8 @@ and their default settings:
for RAJA CUDA scans. Since the CUB library is included in RAJA as a
Git submodule, users should not have to set this in most scenarios.

.. note:: When using the NVIDIA nvcc compiler for RAJA CUDA functionality,
the variable 'RAJA_NVCC_FLAGS' should be used to pass flags to nvcc.
.. note:: See :ref:`configopt-raja-backends-label` for more information about
setting compiler flags and other options for RAJA back-ends.

* **Data types, sizes, alignment, etc.**

@@ -281,6 +290,61 @@ and their default settings:
recovery overhead, etc.)
============================= ========================================


.. _configopt-raja-backends-label:

===============================
Setting RAJA Back-End Features
===============================

To access compiler and hardware optimization features, it is often necessary
to pass options to a compiler. This section describes how to do this and
which CMake variables to use in certain cases.

* **OpenMP Compiler Options**

The variable `OpenMP_CXX_FLAGS` is used to pass OpenMP-related flags to a
compiler. Option syntax follows the CMake *list* pattern. Here is an example
showing how to specify OpenMP target back-end options for the clang compiler
as a CMake option::

cmake \
....
-DOpenMP_CXX_FLAGS="-fopenmp;-fopenmp-targets=nvptx64-nvidia-cuda;-fopenmp-implicit-declare-target"
....

* **CUDA Compiler Options**

When using the NVIDIA nvcc compiler for RAJA CUDA functionality, the variables
`CMAKE_CUDA_FLAGS_RELEASE`, `CMAKE_CUDA_FLAGS_DEBUG`, and
`CMAKE_CUDA_FLAGS_RELWITHDEBINFO` (corresponding to the standard CMake build
types) are used to pass flags to nvcc.

.. note:: When nvcc must pass options to the host compiler, the arguments
can be included in these CMake variables. Each host compiler
option must be prepended with the `-Xcompiler` directive.

To set the CUDA architecture level for the nvcc compiler, which should be
chosen based on the NVIDIA GPU hardware you are using, you can use the
`CUDA_ARCH` CMake variable. For example, the CMake option::

-DCUDA_ARCH=sm_60

will tell the compiler to use the `sm_60` SASS architecture in its second
stage of compilation. It will pick the PTX architecture to use in the first
stage of compilation that is suitable for the SASS architecture you specify.

Alternatively, you may specify the PTX and SASS architectures, using
appropriate nvcc options in the `CMAKE_CUDA_FLAGS_*` variables.

.. note:: RAJA requires a minimum CUDA architecture level of `sm_35` to use
all supported CUDA features. Mostly, the architecture level affects
which RAJA CUDA atomic operations are available and how they are
implemented inside RAJA. This is described in :ref:`atomics-label`.

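Putting the CUDA-related variables together, a host-config fragment might
look like the following sketch (the values shown are examples, not
requirements, and the host-compiler option is arbitrary)::

  set(ENABLE_CUDA On CACHE BOOL "")
  set(CUDA_ARCH "sm_60" CACHE STRING "")
  # Host-compiler options are forwarded to nvcc via -Xcompiler:
  set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-g -lineinfo -O2 -Xcompiler -fno-omit-frame-pointer" CACHE STRING "")
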

.. _configopt-raja-hostconfig-label:

=======================
RAJA Host-Config Files
=======================
28 changes: 28 additions & 0 deletions docs/sphinx/user_guide/feature/atomic.rst
@@ -37,6 +37,7 @@ RAJA atomic support includes a variety of the most common atomic operations.
* Each method described in the table below returns the value of
the potentially modified argument (i.e., \*acc) immediately before
the atomic operation is applied, in case it is needed by a user.
* See :ref:`cudaatomics-label` for details about CUDA atomic operations.

^^^^^^^^^^^
Arithmetic
@@ -131,3 +132,30 @@ Atomic Policies

For more information about available RAJA atomic policies, please see
:ref:`atomicpolicy-label`.


.. _cudaatomics-label:

---------------------------------------
CUDA Atomics Architecture Dependencies
---------------------------------------

The internal implementations of RAJA atomic operations may vary depending
on which CUDA architecture is available and/or specified when RAJA
is configured for compilation. The following rules apply, depending on the
CUDA architecture level chosen:

* **CUDA architecture is lower than `sm_35`**

* Certain atomics will be implemented using CUDA `atomicCAS`
(Compare and Swap).

* **CUDA architecture is `sm_35` or higher**

* CUDA native 64-bit unsigned atomicMin, atomicMax, atomicAnd, atomicOr,
atomicXor are used.

* **CUDA architecture is `sm_60` or higher**

* CUDA native 64-bit double `atomicAdd` is used.

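As a configuration sketch only (the value is an example, not a requirement),
requesting `sm_60` or newer in a host-config enables the native double
`atomicAdd` path described above::

  set(CUDA_ARCH "sm_60" CACHE STRING "Compute architecture to pass to CUDA builds")
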