Skip to content

Commit

Permalink
Changes necessary to support Feta.
Browse files Browse the repository at this point in the history
Reviewed-by: Bryce Adelstein Lelbach aka wash <brycelelbach@gmail.com>

Bug 2839527
  • Loading branch information
dkolsen-pgi authored and brycelelbach committed Apr 2, 2020
1 parent 3cfcc0c commit 1669350
Show file tree
Hide file tree
Showing 30 changed files with 364 additions and 278 deletions.
2 changes: 1 addition & 1 deletion dependencies/cub
Submodule cub updated from 629f01 to 35e4f6
36 changes: 18 additions & 18 deletions examples/dot_products_with_zip.cu
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@
#include <thrust/random.h>


// This example shows how thrust::zip_iterator can be used to create a
// 'virtual' array of structures. In this case the structure is a 3d
// vector type (Float3) whose (x,y,z) components will be stored in
// This example shows how thrust::zip_iterator can be used to create a
// 'virtual' array of structures. In this case the structure is a 3d
// vector type (Float3) whose (x,y,z) components will be stored in
// three separate float arrays. The zip_iterator "zips" these arrays
// into a single virtual Float3 array.

Expand Down Expand Up @@ -54,17 +54,17 @@ int main(void)
// We'll store the components of the 3d vectors in separate arrays. One set of
// arrays will store the 'A' vectors and another set will store the 'B' vectors.

// This 'structure of arrays' (SoA) approach is usually more efficient than the
// This 'structure of arrays' (SoA) approach is usually more efficient than the
// 'array of structures' (AoS) approach. The primary reason is that structures,
// like Float3, don't always obey the memory coalescing rules, so they are not
// efficiently transferred to and from memory. Another reason to prefer SoA to
// AoS is that we don't aways want to process all members of the structure. For
// example, if we only need to look at first element of the structure then it
// example, if we only need to look at first element of the structure then it
// is wasteful to load the entire structure from memory. With the SoA approach,
// we can chose which elements of the structure we wish to read.

thrust::device_vector<float> A0 = random_vector(N); // x components of the 'A' vectors
thrust::device_vector<float> A1 = random_vector(N); // y components of the 'A' vectors
thrust::device_vector<float> A1 = random_vector(N); // y components of the 'A' vectors
thrust::device_vector<float> A2 = random_vector(N); // z components of the 'A' vectors

thrust::device_vector<float> B0 = random_vector(N); // x components of the 'B' vectors
Expand All @@ -78,7 +78,7 @@ int main(void)
// We'll now illustrate two ways to use zip_iterator to compute the dot
// products. The first method is verbose but shows how the parts fit together.
// The second method hides these details and is more concise.


// METHOD #1
// Defining a zip_iterator type can be a little cumbersome ...
Expand All @@ -87,24 +87,24 @@ int main(void)
typedef thrust::zip_iterator<FloatIteratorTuple> Float3Iterator;

// Now we'll create some zip_iterators for A and B
Float3Iterator A_first = thrust::make_zip_iterator(make_tuple(A0.begin(), A1.begin(), A2.begin()));
Float3Iterator A_last = thrust::make_zip_iterator(make_tuple(A0.end(), A1.end(), A2.end()));
Float3Iterator B_first = thrust::make_zip_iterator(make_tuple(B0.begin(), B1.begin(), B2.begin()));
Float3Iterator A_first = thrust::make_zip_iterator(thrust::make_tuple(A0.begin(), A1.begin(), A2.begin()));
Float3Iterator A_last = thrust::make_zip_iterator(thrust::make_tuple(A0.end(), A1.end(), A2.end()));
Float3Iterator B_first = thrust::make_zip_iterator(thrust::make_tuple(B0.begin(), B1.begin(), B2.begin()));

// Finally, we pass the zip_iterators into transform() as if they
// were 'normal' iterators for a device_vector<Float3>.
thrust::transform(A_first, A_last, B_first, result.begin(), DotProduct());


// METHOD #2
// Alternatively, we can avoid creating variables for X_first, X_last,
// Alternatively, we can avoid creating variables for X_first, X_last,
// and Y_first and invoke transform() directly.
thrust::transform( thrust::make_zip_iterator(make_tuple(A0.begin(), A1.begin(), A2.begin())),
thrust::make_zip_iterator(make_tuple(A0.end(), A1.end(), A2.end())),
thrust::make_zip_iterator(make_tuple(B0.begin(), B1.begin(), B2.begin())),
thrust::transform( thrust::make_zip_iterator(thrust::make_tuple(A0.begin(), A1.begin(), A2.begin())),
thrust::make_zip_iterator(thrust::make_tuple(A0.end(), A1.end(), A2.end())),
thrust::make_zip_iterator(thrust::make_tuple(B0.begin(), B1.begin(), B2.begin())),
result.begin(),
DotProduct() );



// Finally, we'll print a few results
Expand All @@ -126,8 +126,8 @@ int main(void)
std::cout << "(" << thrust::get<0>(b) << "," << thrust::get<1>(b) << "," << thrust::get<2>(b) << ")";
std::cout << " = ";
std::cout << dot << std::endl;
}
}

return 0;
}

17 changes: 11 additions & 6 deletions thrust/detail/allocator/temporary_allocator.inl
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@
#include <thrust/system/detail/bad_alloc.h>
#include <cassert>

#if defined(__CUDA_ARCH__) && THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
#if (defined(__NVCOMPILER_CUDA__) || defined(__CUDA_ARCH__)) && \
THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
#include <thrust/system/cuda/detail/terminate.h>
#endif

Expand All @@ -45,11 +46,15 @@ __host__ __device__
// note that we pass cnt to deallocate, not a value derived from result.second
deallocate(result.first, cnt);

#if !defined(__CUDA_ARCH__)
throw thrust::system::detail::bad_alloc("temporary_buffer::allocate: get_temporary_buffer failed");
#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
thrust::system::cuda::detail::terminate_with_message("temporary_buffer::allocate: get_temporary_buffer failed");
#endif
if (THRUST_IS_HOST_CODE) {
#if THRUST_INCLUDE_HOST_CODE
throw thrust::system::detail::bad_alloc("temporary_buffer::allocate: get_temporary_buffer failed");
#endif
} else {
#if THRUST_INCLUDE_DEVICE_CODE
thrust::system::cuda::detail::terminate_with_message("temporary_buffer::allocate: get_temporary_buffer failed");
#endif
}
} // end if

return result.first;
Expand Down
21 changes: 19 additions & 2 deletions thrust/detail/config/cpp_compatibility.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,13 +49,13 @@
// FIXME: Combine THRUST_INLINE_CONSTANT and
// THRUST_INLINE_INTEGRAL_MEMBER_CONSTANT into one macro when NVCC properly
// supports `constexpr` globals in host and device code.
#ifdef __CUDA_ARCH__
#if defined(__CUDA_ARCH__) || defined(__NVCOMPILER_CUDA__)
// FIXME: Add this when NVCC supports inline variables.
//# if THRUST_CPP_DIALECT >= 2017
//# define THRUST_INLINE_CONSTANT inline constexpr
//# define THRUST_INLINE_INTEGRAL_MEMBER_CONSTANT inline constexpr
# if THRUST_CPP_DIALECT >= 2011
# define THRUST_INLINE_CONSTANT static constexpr
# define THRUST_INLINE_CONSTANT static const __device__
# define THRUST_INLINE_INTEGRAL_MEMBER_CONSTANT static constexpr
# else
# define THRUST_INLINE_CONSTANT static const __device__
Expand All @@ -75,3 +75,20 @@
# endif
#endif

#if defined(__NVCOMPILER_CUDA__)
# define THRUST_IS_DEVICE_CODE __builtin_is_device_code()
# define THRUST_IS_HOST_CODE (!__builtin_is_device_code())
# define THRUST_INCLUDE_DEVICE_CODE 1
# define THRUST_INCLUDE_HOST_CODE 1
#elif defined(__CUDA_ARCH__)
# define THRUST_IS_DEVICE_CODE 1
# define THRUST_IS_HOST_CODE 0
# define THRUST_INCLUDE_DEVICE_CODE 1
# define THRUST_INCLUDE_HOST_CODE 0
#else
# define THRUST_IS_DEVICE_CODE 0
# define THRUST_IS_HOST_CODE 1
# define THRUST_INCLUDE_DEVICE_CODE 0
# define THRUST_INCLUDE_HOST_CODE 1
#endif

6 changes: 5 additions & 1 deletion thrust/detail/config/exec_check_disable.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,11 @@

#include <thrust/detail/config.h>

#if defined(__CUDACC__) && !(defined(__CUDA__) && defined(__clang__))
// #pragma nv_exec_check_disable is only recognized by NVCC. Having a macro
// expand to a #pragma (rather than _Pragma) only works with NVCC's compilation
// model, not with other compilers.
#if defined(__CUDACC__) && !defined(__NVCOMPILER_CUDA__) && \
!(defined(__CUDA__) && defined(__clang__))

#define __thrust_exec_check_disable__ #pragma nv_exec_check_disable

Expand Down
20 changes: 12 additions & 8 deletions thrust/detail/contiguous_storage.inl
Original file line number Diff line number Diff line change
Expand Up @@ -430,15 +430,19 @@ __host__ __device__
void contiguous_storage<T,Alloc>
::swap_allocators(false_type, Alloc &other)
{
#ifdef __CUDA_ARCH__
// allocators must be equal when swapping containers with allocators that propagate on swap
assert(!is_allocator_not_equal(other));
#else
if (is_allocator_not_equal(other))
{
throw allocator_mismatch_on_swap();
if (THRUST_IS_DEVICE_CODE) {
#if THRUST_INCLUDE_DEVICE_CODE
// allocators must be equal when swapping containers with allocators that propagate on swap
assert(!is_allocator_not_equal(other));
#endif
} else {
#if THRUST_INCLUDE_HOST_CODE
if (is_allocator_not_equal(other))
{
throw allocator_mismatch_on_swap();
}
#endif
}
#endif
thrust::swap(m_allocator, other);
} // end contiguous_storage::swap_allocators()

Expand Down
2 changes: 1 addition & 1 deletion thrust/detail/functional/actor.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ template<typename Eval>
typedef Eval eval_type;

__host__ __device__
actor(void);
THRUST_CONSTEXPR actor();

__host__ __device__
actor(const Eval &base);
Expand Down
4 changes: 2 additions & 2 deletions thrust/detail/functional/actor.inl
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ namespace functional

template<typename Eval>
__host__ __device__
actor<Eval>
::actor(void)
THRUST_CONSTEXPR actor<Eval>
::actor()
: eval_type()
{}

Expand Down
2 changes: 1 addition & 1 deletion thrust/detail/functional/argument.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ template<unsigned int i>
};

__host__ __device__
argument(void){}
THRUST_CONSTEXPR argument(){}

template<typename Env>
__host__ __device__
Expand Down
35 changes: 20 additions & 15 deletions thrust/detail/integer_math.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,22 +32,27 @@ template <typename Integer>
__host__ __device__ __thrust_forceinline__
Integer clz(Integer x)
{
#if __CUDA_ARCH__
return ::__clz(x);
#else
int num_bits = 8 * sizeof(Integer);
int num_bits_minus_one = num_bits - 1;

for (int i = num_bits_minus_one; i >= 0; --i)
{
if ((Integer(1) << i) & x)
{
return num_bits_minus_one - i;
}
Integer result;
if (THRUST_IS_DEVICE_CODE) {
#if THRUST_INCLUDE_DEVICE_CODE
result = ::__clz(x);
#endif
} else {
#if THRUST_INCLUDE_HOST_CODE
int num_bits = 8 * sizeof(Integer);
int num_bits_minus_one = num_bits - 1;
result = num_bits;
for (int i = num_bits_minus_one; i >= 0; --i)
{
if ((Integer(1) << i) & x)
{
result = num_bits_minus_one - i;
break;
}
}
#endif
}

return num_bits;
#endif
return result;
}

template <typename Integer>
Expand Down
8 changes: 2 additions & 6 deletions thrust/detail/seq.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ struct seq_t : thrust::system::detail::sequential::execution_policy<seq_t>,
thrust::system::detail::sequential::execution_policy>
{
__host__ __device__
seq_t() : thrust::system::detail::sequential::execution_policy<seq_t>() {}
THRUST_CONSTEXPR seq_t() : thrust::system::detail::sequential::execution_policy<seq_t>() {}

// allow any execution_policy to convert to seq_t
template<typename DerivedPolicy>
Expand All @@ -45,11 +45,7 @@ struct seq_t : thrust::system::detail::sequential::execution_policy<seq_t>,
} // end detail


#ifdef __CUDA_ARCH__
static const __device__ detail::seq_t seq;
#else
static const detail::seq_t seq;
#endif
THRUST_INLINE_CONSTANT detail::seq_t seq;


} // end thrust
Expand Down
6 changes: 1 addition & 5 deletions thrust/execution_policy.h
Original file line number Diff line number Diff line change
Expand Up @@ -344,11 +344,7 @@ static const detail::host_t host;
* \see host_execution_policy
* \see thrust::device
*/
#ifdef __CUDA_ARCH__
static const __device__ detail::device_t device;
#else
static const detail::device_t device;
#endif
THRUST_INLINE_CONSTANT detail::device_t device;


// define seq for the purpose of Doxygenating it
Expand Down
Loading

0 comments on commit 1669350

Please sign in to comment.