From 8038315ebf6a24f48b473feb82effab4299c8570 Mon Sep 17 00:00:00 2001
From: Xinjing Hu
Date: Thu, 6 Apr 2023 20:59:17 +0800
Subject: [PATCH] perf(encoding): add benchmark for data chunk encoding (#9035)

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 src/common/Cargo.toml                         |  4 ++
 .../benches/bench_data_chunk_encoding.rs      | 58 +++++++++++++++++++
 src/common/benches/bench_hash_key_encoding.rs | 43 ++------------
 src/common/src/test_utils/mod.rs              |  1 +
 src/common/src/test_utils/rand_chunk.rs       | 54 +++++++++++++++++
 5 files changed, 121 insertions(+), 39 deletions(-)
 create mode 100644 src/common/benches/bench_data_chunk_encoding.rs
 create mode 100644 src/common/src/test_utils/rand_chunk.rs

diff --git a/src/common/Cargo.toml b/src/common/Cargo.toml
index dc7b0189cdc4..6e52c0666048 100644
--- a/src/common/Cargo.toml
+++ b/src/common/Cargo.toml
@@ -120,6 +120,10 @@ harness = false
 name = "bench_hash_key_encoding"
 harness = false
 
+[[bench]]
+name = "bench_data_chunk_encoding"
+harness = false
+
 [[bin]]
 name = "example-config"
 path = "src/bin/default_config.rs"
diff --git a/src/common/benches/bench_data_chunk_encoding.rs b/src/common/benches/bench_data_chunk_encoding.rs
new file mode 100644
index 000000000000..f6aec610e9de
--- /dev/null
+++ b/src/common/benches/bench_data_chunk_encoding.rs
@@ -0,0 +1,58 @@
+// Copyright 2023 RisingWave Labs
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use criterion::{criterion_group, criterion_main, Criterion};
+use risingwave_common::test_utils::rand_chunk;
+use risingwave_common::types::DataType;
+
+static SEED: u64 = 998244353u64;
+static CHUNK_SIZES: &[usize] = &[128, 1024];
+static NULL_RATIOS: &[f64] = &[0.0, 0.01, 0.1];
+
+struct DataChunkBenchCase {
+    pub name: String,
+    pub data_types: Vec<DataType>,
+}
+
+impl DataChunkBenchCase {
+    pub fn new(name: &str, data_types: Vec<DataType>) -> Self {
+        Self {
+            name: name.to_string(),
+            data_types,
+        }
+    }
+}
+
+fn bench_data_chunk_encoding(c: &mut Criterion) {
+    let test_cases = vec![
+        DataChunkBenchCase::new("Int16", vec![DataType::Int16]),
+        DataChunkBenchCase::new("String", vec![DataType::Varchar]),
+        DataChunkBenchCase::new("Int16 and String", vec![DataType::Int16, DataType::Varchar]),
+    ];
+    for case in test_cases {
+        for null_ratio in NULL_RATIOS {
+            for chunk_size in CHUNK_SIZES {
+                let id = format!(
+                    "data chunk encoding: {}, {} rows, Pr[null]={}",
+                    case.name, chunk_size, null_ratio
+                );
+                let chunk = rand_chunk::gen_chunk(&case.data_types, *chunk_size, SEED, *null_ratio);
+                c.bench_function(&id, |b| b.iter(|| chunk.serialize()));
+            }
+        }
+    }
+}
+
+criterion_group!(benches, bench_data_chunk_encoding);
+criterion_main!(benches);
diff --git a/src/common/benches/bench_hash_key_encoding.rs b/src/common/benches/bench_hash_key_encoding.rs
index d75e17ce32ba..349cad10fefc 100644
--- a/src/common/benches/bench_hash_key_encoding.rs
+++ b/src/common/benches/bench_hash_key_encoding.rs
@@ -14,14 +14,9 @@
 
 use criterion::{criterion_group, criterion_main, Criterion};
 use itertools::Itertools;
-use risingwave_common::array::column::Column;
-use risingwave_common::array::serial_array::SerialArray;
-use risingwave_common::array::{
-    ArrayBuilderImpl, BoolArray, DataChunk, DateArray, DecimalArray, F32Array, F64Array, I16Array,
-    I32Array, I64Array, IntervalArray, TimeArray, TimestampArray, Utf8Array,
-};
+use risingwave_common::array::{ArrayBuilderImpl, DataChunk};
 use risingwave_common::hash::{calc_hash_key_kind, HashKey, HashKeyDispatcher};
-use risingwave_common::test_utils::rand_array::seed_rand_array_ref;
+use risingwave_common::test_utils::rand_chunk;
 use risingwave_common::types::DataType;
 
 static SEED: u64 = 998244353u64;
@@ -56,7 +51,8 @@ impl HashKeyDispatcher for HashKeyBenchCaseBuilder {
                     calc_hash_key_kind(self.data_types()),
                     null_ratio
                 );
-                let input_chunk = gen_chunk(self.data_types(), *chunk_size, SEED, *null_ratio);
+                let input_chunk =
+                    rand_chunk::gen_chunk(self.data_types(), *chunk_size, SEED, *null_ratio);
                 ret.push(Box::new(HashKeyBenchCase::<K>::new(
                     id,
                     input_chunk,
@@ -139,37 +135,6 @@ impl<K: HashKey> Case for HashKeyBenchCase<K> {
     }
 }
 
-fn gen_chunk(data_types: &[DataType], size: usize, seed: u64, null_ratio: f64) -> DataChunk {
-    let mut columns = vec![];
-
-    for d in data_types {
-        columns.push(Column::new(match d {
-            DataType::Boolean => seed_rand_array_ref::<BoolArray>(size, seed, null_ratio),
-            DataType::Int16 => seed_rand_array_ref::<I16Array>(size, seed, null_ratio),
-            DataType::Int32 => seed_rand_array_ref::<I32Array>(size, seed, null_ratio),
-            DataType::Int64 => seed_rand_array_ref::<I64Array>(size, seed, null_ratio),
-            DataType::Float32 => seed_rand_array_ref::<F32Array>(size, seed, null_ratio),
-            DataType::Float64 => seed_rand_array_ref::<F64Array>(size, seed, null_ratio),
-            DataType::Decimal => seed_rand_array_ref::<DecimalArray>(size, seed, null_ratio),
-            DataType::Date => seed_rand_array_ref::<DateArray>(size, seed, null_ratio),
-            DataType::Varchar => seed_rand_array_ref::<Utf8Array>(size, seed, null_ratio),
-            DataType::Time => seed_rand_array_ref::<TimeArray>(size, seed, null_ratio),
-            DataType::Serial => seed_rand_array_ref::<SerialArray>(size, seed, null_ratio),
-            DataType::Timestamp => seed_rand_array_ref::<TimestampArray>(size, seed, null_ratio),
-            DataType::Timestamptz => seed_rand_array_ref::<I64Array>(size, seed, null_ratio),
-            DataType::Interval => seed_rand_array_ref::<IntervalArray>(size, seed, null_ratio),
-            DataType::Struct(_) | DataType::Bytea | DataType::Jsonb => {
-                todo!()
-            }
-            DataType::List { datatype: _ } => {
-                todo!()
-            }
-        }));
-    }
-    risingwave_common::util::schema_check::schema_check(data_types, &columns).unwrap();
-    DataChunk::new(columns, size)
-}
-
 fn case_builders() -> Vec<HashKeyBenchCaseBuilder> {
     vec![
         HashKeyBenchCaseBuilder {
diff --git a/src/common/src/test_utils/mod.rs b/src/common/src/test_utils/mod.rs
index bd2c56a0f791..9b0943ebc763 100644
--- a/src/common/src/test_utils/mod.rs
+++ b/src/common/src/test_utils/mod.rs
@@ -13,4 +13,5 @@
 // limitations under the License.
 
 pub mod rand_array;
+pub mod rand_chunk;
 pub mod test_stream_chunk;
diff --git a/src/common/src/test_utils/rand_chunk.rs b/src/common/src/test_utils/rand_chunk.rs
new file mode 100644
index 000000000000..3c7bed5d9e08
--- /dev/null
+++ b/src/common/src/test_utils/rand_chunk.rs
@@ -0,0 +1,54 @@
+// Copyright 2023 RisingWave Labs
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use crate::array::column::Column;
+use crate::array::serial_array::SerialArray;
+use crate::array::{
+    BoolArray, DataChunk, DateArray, DecimalArray, F32Array, F64Array, I16Array, I32Array,
+    I64Array, IntervalArray, TimeArray, TimestampArray, Utf8Array,
+};
+use crate::test_utils::rand_array::seed_rand_array_ref;
+use crate::types::DataType;
+use crate::util::schema_check;
+
+pub fn gen_chunk(data_types: &[DataType], size: usize, seed: u64, null_ratio: f64) -> DataChunk {
+    let mut columns = vec![];
+
+    for d in data_types {
+        columns.push(Column::new(match d {
+            DataType::Boolean => seed_rand_array_ref::<BoolArray>(size, seed, null_ratio),
+            DataType::Int16 => seed_rand_array_ref::<I16Array>(size, seed, null_ratio),
+            DataType::Int32 => seed_rand_array_ref::<I32Array>(size, seed, null_ratio),
+            DataType::Int64 => seed_rand_array_ref::<I64Array>(size, seed, null_ratio),
+            DataType::Float32 => seed_rand_array_ref::<F32Array>(size, seed, null_ratio),
+            DataType::Float64 => seed_rand_array_ref::<F64Array>(size, seed, null_ratio),
+            DataType::Decimal => seed_rand_array_ref::<DecimalArray>(size, seed, null_ratio),
+            DataType::Date => seed_rand_array_ref::<DateArray>(size, seed, null_ratio),
+            DataType::Varchar => seed_rand_array_ref::<Utf8Array>(size, seed, null_ratio),
+            DataType::Time => seed_rand_array_ref::<TimeArray>(size, seed, null_ratio),
+            DataType::Serial => seed_rand_array_ref::<SerialArray>(size, seed, null_ratio),
+            DataType::Timestamp => seed_rand_array_ref::<TimestampArray>(size, seed, null_ratio),
+            DataType::Timestamptz => seed_rand_array_ref::<I64Array>(size, seed, null_ratio),
+            DataType::Interval => seed_rand_array_ref::<IntervalArray>(size, seed, null_ratio),
+            DataType::Struct(_) | DataType::Bytea | DataType::Jsonb => {
+                todo!()
+            }
+            DataType::List { datatype: _ } => {
+                todo!()
+            }
+        }));
+    }
+    schema_check::schema_check(data_types, &columns).unwrap();
+    DataChunk::new(columns, size)
+}