Skip to content

Commit

Permalink
Let the compiler inline the permutation lookup table into registers r…
Browse files Browse the repository at this point in the history
…ather than using __constant__ memory
  • Loading branch information
maddyscientist committed Jul 17, 2019
1 parent d488a6f commit 7611275
Show file tree
Hide file tree
Showing 7 changed files with 19 additions and 68 deletions.
1 change: 0 additions & 1 deletion src/cuda_zfp/CMakeLists.txt
Expand Up @@ -18,7 +18,6 @@ set(cuZFP_sources
type_info.cuh)

set(cuZFP_headers
constant_setup.cuh
shared.h
cuZFP.h
ErrorCheck.h)
Expand Down
39 changes: 0 additions & 39 deletions src/cuda_zfp/constant_setup.cuh

This file was deleted.

6 changes: 3 additions & 3 deletions src/cuda_zfp/constants.h
Expand Up @@ -5,7 +5,7 @@ namespace cuZFP {

#define index_3d(x, y, z) ((x) + 4 * ((y) + 4 * (z)))

static const unsigned char
__device__ static const unsigned char
perm_3d[64] = {
index_3d(0, 0, 0), // 0 : 0

Expand Down Expand Up @@ -94,15 +94,15 @@ perm_3d[64] = {

#undef index_3d

static const unsigned char perm_1[4] =
__device__ static const unsigned char perm_1[4] =
{
0, 1, 2, 3
};

#define index(i, j) ((i) + 4 * (j))

/* order coefficients (i, j) by i + j, then i^2 + j^2 */
static const unsigned char perm_2[16] = {
__device__ static const unsigned char perm_2[16] = {
index(0, 0), /* 0 : 0 */

index(1, 0), /* 1 : 1 */
Expand Down
7 changes: 0 additions & 7 deletions src/cuda_zfp/cuZFP.cu
Expand Up @@ -12,7 +12,6 @@

#include "ErrorCheck.h"

#include "constant_setup.cuh"
#include "pointers.cuh"
#include "type_info.cuh"
#include <iostream>
Expand Down Expand Up @@ -119,7 +118,6 @@ size_t encode(uint dims[3], int3 stride, int bits_per_block, T *d_data, Word *d_
{
int dim = dims[0];
int sx = stride.x;
cuZFP::ConstantSetup::setup_1d();
stream_size = cuZFP::encode1<T>(dim, sx, d_data, d_stream, bits_per_block);
}
else if(d == 2)
Expand All @@ -128,7 +126,6 @@ size_t encode(uint dims[3], int3 stride, int bits_per_block, T *d_data, Word *d_
int2 s;
s.x = stride.x;
s.y = stride.y;
cuZFP::ConstantSetup::setup_2d();
stream_size = cuZFP::encode2<T>(ndims, s, d_data, d_stream, bits_per_block);
}
else if(d == 3)
Expand All @@ -138,7 +135,6 @@ size_t encode(uint dims[3], int3 stride, int bits_per_block, T *d_data, Word *d_
s.y = stride.y;
s.z = stride.z;
uint3 ndims = make_uint3(dims[0], dims[1], dims[2]);
cuZFP::ConstantSetup::setup_3d();
stream_size = cuZFP::encode<T>(ndims, s, d_data, d_stream, bits_per_block);
}

Expand Down Expand Up @@ -172,15 +168,13 @@ size_t decode(uint ndims[3], int3 stride, int bits_per_block, Word *stream, T *o
s.y = stride.y;
s.z = stride.z;

cuZFP::ConstantSetup::setup_3d();
stream_bytes = cuZFP::decode3<T>(dims, s, stream, out, bits_per_block);
}
else if(d == 1)
{
uint dim = ndims[0];
int sx = stride.x;

cuZFP::ConstantSetup::setup_1d();
stream_bytes = cuZFP::decode1<T>(dim, sx, stream, out, bits_per_block);

}
Expand All @@ -194,7 +188,6 @@ size_t decode(uint ndims[3], int3 stride, int bits_per_block, Word *stream, T *o
s.x = stride.x;
s.y = stride.y;

cuZFP::ConstantSetup::setup_2d();
stream_bytes = cuZFP::decode2<T>(dims, s, stream, out, bits_per_block);
}
else std::cerr<<" d == "<<d<<" not implemented\n";
Expand Down
2 changes: 1 addition & 1 deletion src/cuda_zfp/decode.cuh
Expand Up @@ -234,7 +234,7 @@ __device__ void zfp_decode(BlockReader<BlockSize> &reader, Scalar *fblock, uint
decode_ints<Scalar, BlockSize, UInt>(reader, maxbits, ublock);

Int iblock[BlockSize];
unsigned char *perm = get_perm<BlockSize>();
const unsigned char *perm = get_perm<BlockSize>();
#if (CUDART_VERSION < 8000)
#pragma unroll
#else
Expand Down
5 changes: 3 additions & 2 deletions src/cuda_zfp/encode.cuh
Expand Up @@ -117,7 +117,7 @@ quantize_factor<double>(const int &exponent, double)
template<typename Scalar, typename Int, int BlockSize>
void __device__ fwd_cast(Int *iblock, const Scalar *fblock, int emax)
{
Scalar s = quantize_factor(emax, Scalar());
Scalar s = quantize_factor(emax, Scalar());
for(int i = 0; i < BlockSize; ++i)
{
iblock[i] = (Int) (s * fblock[i]);
Expand Down Expand Up @@ -184,7 +184,8 @@ struct transform<4>
template<typename Int, typename UInt, int BlockSize>
__device__ void fwd_order(UInt *ublock, const Int *iblock)
{
unsigned char *perm = get_perm<BlockSize>();
const unsigned char *perm = get_perm<BlockSize>();

for(int i = 0; i < BlockSize; ++i)
{
ublock[i] = int2uint(iblock[perm[i]]);
Expand Down
27 changes: 12 additions & 15 deletions src/cuda_zfp/shared.h
Expand Up @@ -7,6 +7,7 @@ typedef unsigned long long Word;

#include "type_info.cuh"
#include "zfp.h"
#include "constants.h"
#include <stdio.h>

#define MAX(x, y) ((x) > (y) ? (x) : (y))
Expand All @@ -17,10 +18,6 @@ typedef unsigned long long Word;

#define NBMASK 0xaaaaaaaaaaaaaaaaull

__constant__ unsigned char c_perm_1[4];
__constant__ unsigned char c_perm_2[16];
__constant__ unsigned char c_perm[64];

namespace cuZFP
{

Expand Down Expand Up @@ -245,28 +242,28 @@ inv_lift(Int* p)


template<int BlockSize>
__device__
unsigned char* get_perm();
__device__ inline
const unsigned char* get_perm();

template<>
__device__
unsigned char* get_perm<64>()
__device__ inline
const unsigned char* get_perm<64>()
{
return c_perm;
return perm_3d;
}

template<>
__device__
unsigned char* get_perm<16>()
__device__ inline
const unsigned char* get_perm<16>()
{
return c_perm_2;
return perm_2;
}

template<>
__device__
unsigned char* get_perm<4>()
__device__ inline
const unsigned char* get_perm<4>()
{
return c_perm_1;
return perm_1;
}


Expand Down

0 comments on commit 7611275

Please sign in to comment.