add sinusoidal init method
Wenjing Zhu authored and minseokl committed Sep 13, 2022
1 parent 42c67fe commit a2c3b52
Showing 8 changed files with 193 additions and 59 deletions.
15 changes: 15 additions & 0 deletions HugeCTR/embedding_storage/common.hpp
@@ -34,6 +34,20 @@ using core::Tensor;
using core::TensorList;
using core::TensorScalarType;

struct UniformParams {
float up_bound;
};
struct SinusoidalParams {
int ev_size;
int max_sequence_len;
};

struct EmbeddingTableInitParams {
HugeCTR::Initializer_t initializer_type;
UniformParams uniform_params;
SinusoidalParams sinus_params;
};

struct EmbeddingTableParam {
int table_id;
int max_vocabulary_size; // -1 means dynamic
@@ -42,5 +56,6 @@ struct EmbeddingTableParam {
int64_t max_key;

HugeCTR::OptParams opt_param;
EmbeddingTableInitParams init_param;
};
} // namespace embedding
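For orientation, a minimal sketch of how the new init_param might be populated for a positional-embedding table, using only the fields visible in this hunk; the variable name and the concrete sizes are illustrative, not taken from the repository:

// Sketch only: assumes the fields shown above; sizes are illustrative.
embedding::EmbeddingTableParam position_table{};
position_table.table_id = 0;
position_table.max_vocabulary_size = 512;  // one row per sequence position
position_table.init_param.initializer_type = HugeCTR::Initializer_t::Sinusoidal;
position_table.init_param.sinus_params.ev_size = 128;           // embedding vector size
position_table.init_param.sinus_params.max_sequence_len = 512;  // must match the row count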
81 changes: 64 additions & 17 deletions HugeCTR/embedding_storage/ragged_static_embedding.cu
@@ -172,24 +172,71 @@ RaggedStaticEmbeddingTable::RaggedStaticEmbeddingTable(
emb_table_ev_offset_.copy_from(cpu_emb_table_ev_offset);
local_ev_size_list_.copy_from(cpu_local_ev_size_list);

auto uniform_init_table = [&](const curandGenerator_t &generator) {
const size_t num_tables = cpu_local_id_space_list.size();
for (size_t embedding = 0; embedding < num_tables; embedding++) {
index_t num_keys = cpu_id_space_offset[embedding + 1] - cpu_id_space_offset[embedding];
float up_bound = sqrt(1.f / num_keys);
size_t offset = cpu_emb_table_ev_offset[embedding];
size_t num_elements =
cpu_emb_table_ev_offset[embedding + 1] - cpu_emb_table_ev_offset[embedding];

HugeCTR::UniformGenerator::fill(emb_table_.get<float>() + offset, num_elements, -up_bound,
up_bound, gpu_resource.get_sm_count(), generator,
gpu_resource.get_stream());
for (size_t embedding = 0; embedding < cpu_local_id_space_list.size(); embedding++) {
int id_space = cpu_local_id_space_list[embedding];
const auto &init_param = global_emb_table_param_list[id_space].init_param;
if (init_param.initializer_type == HugeCTR::Initializer_t::Default) {
auto default_init_table = [&](const curandGenerator_t &generator) {
index_t num_keys = cpu_id_space_offset[embedding + 1] - cpu_id_space_offset[embedding];
float up_bound = sqrt(1.f / num_keys);
size_t offset = cpu_emb_table_ev_offset[embedding];
size_t num_elements =
cpu_emb_table_ev_offset[embedding + 1] - cpu_emb_table_ev_offset[embedding];

HugeCTR::UniformGenerator::fill(emb_table_.get<float>() + offset, num_elements,
-up_bound, up_bound, gpu_resource.get_sm_count(),
generator, gpu_resource.get_stream());
};

// data parallel table should use same curand seed across all gpus
if (sharding_param.table_placement_strategy == TablePlacementStrategy::DataParallel) {
default_init_table(gpu_resource.get_replica_uniform_curand_generator());
} else {
default_init_table(gpu_resource.get_replica_variant_curand_generator());
}
} else if (init_param.initializer_type == HugeCTR::Initializer_t::Uniform) {
auto uniform_init_table = [&](const curandGenerator_t &generator) {
float up_bound = init_param.uniform_params.up_bound;
size_t offset = cpu_emb_table_ev_offset[embedding];
size_t num_elements =
cpu_emb_table_ev_offset[embedding + 1] - cpu_emb_table_ev_offset[embedding];

HugeCTR::UniformGenerator::fill(emb_table_.get<float>() + offset, num_elements,
-up_bound, up_bound, gpu_resource.get_sm_count(),
generator, gpu_resource.get_stream());
};

// data parallel table should use same curand seed across all gpus
if (sharding_param.table_placement_strategy == TablePlacementStrategy::DataParallel) {
uniform_init_table(gpu_resource.get_replica_uniform_curand_generator());
} else {
uniform_init_table(gpu_resource.get_replica_variant_curand_generator());
}
} else if (init_param.initializer_type == HugeCTR::Initializer_t::Sinusoidal) {
auto sinusoidal_init_table = [&] {
int max_sequence_len = init_param.sinus_params.max_sequence_len;
int ev_size = init_param.sinus_params.ev_size;
size_t offset = cpu_emb_table_ev_offset[embedding];
size_t num_elements =
cpu_emb_table_ev_offset[embedding + 1] - cpu_emb_table_ev_offset[embedding];

HCTR_CHECK_HINT(max_sequence_len * ev_size == static_cast<int>(num_elements),
"max_sequent_len * ev_size %d should equal to num_elements %d",
max_sequence_len * ev_size, static_cast<int>(num_elements));
HugeCTR::SinusoidalGenerator::fill(
emb_table_.get<float>() + offset, num_elements, ev_size, max_sequence_len,
gpu_resource.get_sm_count(), gpu_resource.get_stream());
};

// sinusoidal init is deterministic (no curand); only data-parallel placement is supported here
if (sharding_param.table_placement_strategy == TablePlacementStrategy::DataParallel) {
sinusoidal_init_table();
} else {
HCTR_OWN_THROW(HugeCTR::Error_t::IllegalCall, "initializer not implemented");
}
} else {
HCTR_OWN_THROW(HugeCTR::Error_t::IllegalCall, "initializer not implemented");
}
};
if (sharding_param.table_placement_strategy == TablePlacementStrategy::DataParallel) {
uniform_init_table(gpu_resource.get_replica_uniform_curand_generator());
} else {
uniform_init_table(gpu_resource.get_replica_variant_curand_generator());
}
});
});
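Stripped of the GPU plumbing, the per-table dispatch added above reduces to the following standalone sketch. Everything here (InitializerType, init_table, the std::function callbacks) is a local stand-in rather than a HugeCTR symbol; the real code captures curand generators, the CUDA stream, and per-table offsets inside the per-branch lambdas instead.

#include <cmath>
#include <functional>
#include <stdexcept>

enum class InitializerType { Default, Uniform, Sinusoidal };

// Sketch of the initializer dispatch performed for each local embedding table.
void init_table(InitializerType type, bool data_parallel, size_t num_keys, float user_bound,
                const std::function<void(float /*bound*/, bool /*same_seed*/)>& fill_uniform,
                const std::function<void()>& fill_sinusoidal) {
  switch (type) {
    case InitializerType::Default:
      // Same rule as above: bound = sqrt(1 / num_keys); a data-parallel table must draw
      // identical values on every GPU, hence the shared-seed flag.
      fill_uniform(std::sqrt(1.f / num_keys), /*same_seed=*/data_parallel);
      break;
    case InitializerType::Uniform:
      // Bound supplied by the user via uniform_params.up_bound.
      fill_uniform(user_bound, /*same_seed=*/data_parallel);
      break;
    case InitializerType::Sinusoidal:
      // Deterministic (no RNG); the branch above only supports data-parallel placement.
      if (!data_parallel) throw std::runtime_error("initializer not implemented");
      fill_sinusoidal();
      break;
  }
}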
4 changes: 2 additions & 2 deletions HugeCTR/include/common.hpp
@@ -170,7 +170,7 @@ enum class Embedding_t {
None
};

enum class Initializer_t { Default, Uniform, XavierNorm, XavierUniform, Zero };
enum class Initializer_t { Default, Uniform, XavierNorm, XavierUniform, Sinusoidal, Zero };

enum class TrainState_t {
Init,
@@ -316,4 +316,4 @@ struct DenseLayerSwitchs {
DenseLayerSwitchs(bool fuse_wb_ = false) : fuse_wb(fuse_wb_) {}
};

} // namespace HugeCTR
} // namespace HugeCTR
7 changes: 7 additions & 0 deletions HugeCTR/include/data_simulator.hpp
@@ -30,6 +30,13 @@ class UniformGenerator {
const curandGenerator_t& generator, const cudaStream_t& stream);
};

class SinusoidalGenerator {
public:
template <typename T>
static void fill(T* ptr, size_t num_elements, int ev_size, int max_sequence_len, size_t sm_count,
const cudaStream_t& stream);
};
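A minimal host-side usage sketch for the fill declared above; the include path, device index 0, and the helper name fill_position_table are assumptions, while the fill signature itself is the one shown in this hunk:

#include <cuda_runtime.h>

#include "HugeCTR/include/data_simulator.hpp"  // path as listed in this commit; adjust to the build's include dirs

// Sketch only: fills a row-major [max_sequence_len, ev_size] float table on the device.
void fill_position_table(int max_sequence_len, int ev_size, cudaStream_t stream) {
  size_t num_elements = static_cast<size_t>(max_sequence_len) * ev_size;
  float* d_table = nullptr;
  cudaMalloc(&d_table, num_elements * sizeof(float));

  int sm_count = 0;  // the production code takes this from the GPU resource instead
  cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, /*device=*/0);

  HugeCTR::SinusoidalGenerator::fill(d_table, num_elements, ev_size, max_sequence_len,
                                     static_cast<size_t>(sm_count), stream);
  cudaStreamSynchronize(stream);
  cudaFree(d_table);
}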

class HostUniformGenerator {
public:
template <typename T>
1 change: 1 addition & 0 deletions HugeCTR/include/pybind/embedding_collection.hpp
@@ -39,6 +39,7 @@ class EmbeddingTableConfig {
} else {
param_.opt_param.optimizer = Optimizer_t::NOT_INITIALIZED;
}
param_.init_param.initializer_type = HugeCTR::Initializer_t::Default;
}
};

21 changes: 21 additions & 0 deletions HugeCTR/src/data_simulator.cu
@@ -35,6 +35,27 @@ void UniformGenerator::fill<float>(float* ptr, size_t num_elements, float a, flo
transform_array<<<sm_count * 2, 1024, 0, stream>>>(ptr, ptr, num_elements, op);
}

template <typename T>
__global__ void sinusoidal_kernel(T* output, int ev_size, int max_sequence_len) {
int row = blockIdx.x;
int col = threadIdx.x;
int offset = row * ev_size + col;
float log_result = __logf(10000) / (ev_size);
float exp_result = __expf(((col >> 1) << 1) * -1 * log_result);

if (col < ev_size) {
output[offset] = (col % 2) ? (T)__cosf(exp_result * row) : (T)__sinf(exp_result * row);
}
}

template <>
void SinusoidalGenerator::fill<float>(float* ptr, size_t num_elements, int ev_size,
int max_sequence_len, size_t sm_count,
const cudaStream_t& stream) {
sinusoidal_kernel<<<max_sequence_len, max(32, ev_size), 0, stream>>>(ptr, ev_size,
max_sequence_len);
}
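The kernel above writes the standard Transformer-style sinusoidal positional encoding, PE(pos, 2i) = sin(pos / 10000^(2i/d)) and PE(pos, 2i+1) = cos(pos / 10000^(2i/d)) with d = ev_size and pos = blockIdx.x. A host-side reference for CPU verification (a sketch; the function name and use of std::vector are illustrative):

#include <cmath>
#include <vector>

// Reference computation of what sinusoidal_kernel writes, row-major [max_sequence_len, ev_size].
std::vector<float> sinusoidal_reference(int max_sequence_len, int ev_size) {
  std::vector<float> table(static_cast<size_t>(max_sequence_len) * ev_size);
  for (int pos = 0; pos < max_sequence_len; ++pos) {
    for (int i = 0; i < ev_size; ++i) {
      // 10000^(-2k/d) with 2k = i rounded down to an even number, matching ((col >> 1) << 1).
      float inv_freq = std::exp(-((i / 2) * 2) * std::log(10000.0f) / ev_size);
      table[static_cast<size_t>(pos) * ev_size + i] =
          (i % 2) ? std::cos(pos * inv_freq) : std::sin(pos * inv_freq);
    }
  }
  return table;
}

Note that the launch above uses max_sequence_len blocks of max(32, ev_size) threads, so ev_size is implicitly limited by the 1024-thread block cap, and the sm_count argument is not used by this specialization.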

template <>
void UniformGenerator::fill<float>(Tensor2<float>& tensor, float a, float b, size_t sm_count,
const curandGenerator_t& generator, const cudaStream_t& stream) {
