From 3af7d217151dcd69a8a2654ecf9ebfe49817a047 Mon Sep 17 00:00:00 2001 From: Noel Kwan <47273164+kwannoel@users.noreply.github.com> Date: Thu, 6 Apr 2023 20:14:34 +0800 Subject: [PATCH] feat(sqlsmith): Generate more join expressions (#8395) --- src/tests/sqlsmith/README.md | 7 + src/tests/sqlsmith/src/lib.rs | 1 - src/tests/sqlsmith/src/runner.rs | 11 +- src/tests/sqlsmith/src/sql_gen/expr.rs | 10 +- src/tests/sqlsmith/src/sql_gen/insert.rs | 1 - src/tests/sqlsmith/src/sql_gen/mod.rs | 15 -- src/tests/sqlsmith/src/sql_gen/relation.rs | 263 ++++++++++++++++----- src/tests/sqlsmith/src/sql_gen/scalar.rs | 1 - src/tests/sqlsmith/src/sql_gen/types.rs | 52 +++- 9 files changed, 270 insertions(+), 91 deletions(-) diff --git a/src/tests/sqlsmith/README.md b/src/tests/sqlsmith/README.md index 4ced2616990c..048839621999 100644 --- a/src/tests/sqlsmith/README.md +++ b/src/tests/sqlsmith/README.md @@ -15,6 +15,13 @@ This test will be run as a unit test: ./risedev test -E "package(risingwave_sqlsmith)" --features enable_sqlsmith_unit_test ``` +## Generate snapshots + +Take a look at [`gen_queries.sh`](scripts/gen_queries.sh). + +Caveat: Even with a given snapshot, certain parts of the system are non-determninistic. +For instance with scheduler errors, the same query may not trigger errors when executed. + ## E2E In the second mode, it will test the entire query handling end-to-end. We provide a CLI tool that represents a Postgres client. You can run this tool via: diff --git a/src/tests/sqlsmith/src/lib.rs b/src/tests/sqlsmith/src/lib.rs index 232e0b75ba1e..6de404a09bb5 100644 --- a/src/tests/sqlsmith/src/lib.rs +++ b/src/tests/sqlsmith/src/lib.rs @@ -42,7 +42,6 @@ pub fn sql_gen(rng: &mut impl Rng, tables: Vec) -> String { } /// Generate `INSERT` -#[allow(dead_code)] pub fn insert_sql_gen(rng: &mut impl Rng, tables: Vec
, count: usize) -> Vec { let mut gen = SqlGenerator::new(rng, vec![]); tables diff --git a/src/tests/sqlsmith/src/runner.rs b/src/tests/sqlsmith/src/runner.rs index b0151089b613..7877bcf360c4 100644 --- a/src/tests/sqlsmith/src/runner.rs +++ b/src/tests/sqlsmith/src/runner.rs @@ -155,15 +155,16 @@ pub async fn run(client: &Client, testdata: &str, count: usize, seed: Option) -> impl Rng { } } -#[allow(dead_code)] async fn populate_tables( client: &Client, rng: &mut R, @@ -274,7 +274,6 @@ async fn test_session_variable(client: &Client, rng: &mut R) -> String { } /// Expects at least 50% of inserted rows included. -#[allow(dead_code)] async fn test_population_count(client: &Client, base_tables: Vec
, expected_count: usize) { let mut actual_count = 0; for t in base_tables { diff --git a/src/tests/sqlsmith/src/sql_gen/expr.rs b/src/tests/sqlsmith/src/sql_gen/expr.rs index 52df98f17aa1..b1ac42c96b04 100644 --- a/src/tests/sqlsmith/src/sql_gen/expr.rs +++ b/src/tests/sqlsmith/src/sql_gen/expr.rs @@ -239,12 +239,9 @@ impl<'a, R: Rng> SqlGenerator<'a, R> { let casts = EXPLICIT_CAST_TABLE.get(ret)?; let cast_sig = casts.choose(&mut self.rng).unwrap(); - use CastContext as T; match cast_sig.context { - T::Explicit => { - let expr = self - .gen_expr(&cast_sig.from_type, context.set_inside_explicit_cast()) - .into(); + CastContext::Explicit => { + let expr = self.gen_expr(&cast_sig.from_type, context).into(); let data_type = data_type_to_ast_data_type(&cast_sig.to_type); Some(Expr::Cast { expr, data_type }) } @@ -693,6 +690,9 @@ pub(crate) fn sql_null() -> Expr { Expr::Value(Value::Null) } +// TODO(kwannoel): +// Add variadic function signatures. Can add these functions +// to a FUNC_TABLE too. pub fn print_function_table() -> String { let func_str = func_sigs() .map(|sign| { diff --git a/src/tests/sqlsmith/src/sql_gen/insert.rs b/src/tests/sqlsmith/src/sql_gen/insert.rs index fe8bc29fe84a..b5c2aa575fed 100644 --- a/src/tests/sqlsmith/src/sql_gen/insert.rs +++ b/src/tests/sqlsmith/src/sql_gen/insert.rs @@ -21,7 +21,6 @@ use crate::sql_gen::SqlGenerator; use crate::Table; impl<'a, R: Rng> SqlGenerator<'a, R> { - #[allow(dead_code)] pub(crate) fn gen_insert_stmt(&mut self, table: Table, row_count: usize) -> Statement { let table_name = ObjectName(vec![table.name.as_str().into()]); let data_types = table diff --git a/src/tests/sqlsmith/src/sql_gen/mod.rs b/src/tests/sqlsmith/src/sql_gen/mod.rs index 11b73021e306..2a73d791fe76 100644 --- a/src/tests/sqlsmith/src/sql_gen/mod.rs +++ b/src/tests/sqlsmith/src/sql_gen/mod.rs @@ -78,16 +78,13 @@ pub(crate) struct SqlGeneratorContext { // Used in top level, where we want to test queries // without aggregates. inside_agg: bool, - inside_explicit_cast: bool, } -#[allow(dead_code)] impl SqlGeneratorContext { pub fn new() -> Self { SqlGeneratorContext { can_agg: true, inside_agg: false, - inside_explicit_cast: false, } } @@ -95,7 +92,6 @@ impl SqlGeneratorContext { Self { can_agg, inside_agg: false, - inside_explicit_cast: false, } } @@ -106,17 +102,6 @@ impl SqlGeneratorContext { } } - pub fn set_inside_explicit_cast(self) -> Self { - Self { - inside_explicit_cast: true, - ..self - } - } - - pub fn can_implicit_cast(self) -> bool { - !self.inside_explicit_cast - } - pub fn can_gen_agg(self) -> bool { self.can_agg && !self.inside_agg } diff --git a/src/tests/sqlsmith/src/sql_gen/relation.rs b/src/tests/sqlsmith/src/sql_gen/relation.rs index 86e57497bf53..3624d852fce4 100644 --- a/src/tests/sqlsmith/src/sql_gen/relation.rs +++ b/src/tests/sqlsmith/src/sql_gen/relation.rs @@ -19,17 +19,18 @@ use risingwave_sqlparser::ast::{ Ident, ObjectName, TableAlias, TableFactor, TableWithJoins, Value, }; +use crate::sql_gen::types::BINARY_INEQUALITY_OP_TABLE; use crate::sql_gen::{Column, SqlGenerator, SqlGeneratorContext}; use crate::{BinaryOperator, Expr, Join, JoinConstraint, JoinOperator, Table}; -fn create_equi_expr(left: String, right: String) -> Expr { +fn create_binary_expr(op: BinaryOperator, left: String, right: String) -> Expr { let left = Box::new(Expr::Identifier(Ident::new_unchecked(left))); let right = Box::new(Expr::Identifier(Ident::new_unchecked(right))); - Expr::BinaryOp { - left, - op: BinaryOperator::Eq, - right, - } + Expr::BinaryOp { left, op, right } +} + +fn create_equi_expr(left: String, right: String) -> Expr { + create_binary_expr(BinaryOperator::Eq, left, right) } impl<'a, R: Rng> SqlGenerator<'a, R> { @@ -39,13 +40,22 @@ impl<'a, R: Rng> SqlGenerator<'a, R> { match self.rng.gen_range(0..=range) { 0..=0 => self.gen_simple_table(), 1..=1 => self.gen_time_window_func(), - 2..=3 => self.gen_join_clause(), - 4..=4 => self.gen_table_subquery(), + 2..=3 => self + .gen_simple_join_clause() + .unwrap_or_else(|| self.gen_simple_table()), + 4..=4 => self.gen_more_joins(), + 5..=5 => self.gen_table_subquery(), + // TODO(kwannoel): cycles, bushy joins. _ => unreachable!(), } } fn gen_simple_table(&mut self) -> (TableWithJoins, Vec
) { + let (table_with_joins, table) = self.gen_simple_table_inner(); + (table_with_joins, vec![table]) + } + + fn gen_simple_table_inner(&mut self) -> (TableWithJoins, Table) { let (relation, _, table) = self.gen_simple_table_factor(); ( @@ -57,7 +67,7 @@ impl<'a, R: Rng> SqlGenerator<'a, R> { ) } - fn gen_simple_table_factor(&mut self) -> (TableFactor, Vec, Vec
) { + fn gen_simple_table_factor(&mut self) -> (TableFactor, Vec, Table) { let alias = self.gen_table_name_with_prefix("t"); let mut table = self.tables.choose(&mut self.rng).unwrap().clone(); let table_factor = TableFactor::Table { @@ -70,10 +80,10 @@ impl<'a, R: Rng> SqlGenerator<'a, R> { }; table.name = alias; // Rename the table. let columns = table.get_qualified_columns(); - (table_factor, columns, vec![table]) + (table_factor, columns, table) } - fn gen_table_factor(&mut self) -> (TableFactor, Vec, Vec
) { + fn gen_table_factor(&mut self) -> (TableFactor, Vec, Table) { let current_context = self.new_local_context(); let factor = self.gen_table_factor_inner(); self.restore_context(current_context); @@ -82,97 +92,179 @@ impl<'a, R: Rng> SqlGenerator<'a, R> { /// Generates a table factor, and provides bound columns. /// Generated column names should be qualified by table name. - fn gen_table_factor_inner(&mut self) -> (TableFactor, Vec, Vec
) { + fn gen_table_factor_inner(&mut self) -> (TableFactor, Vec, Table) { // TODO: TableFactor::Derived, TableFactor::TableFunction, TableFactor::NestedJoin self.gen_simple_table_factor() } - /// TODO: - /// Generate equi join with columns not of the same type, - /// use functions to transform to same type. fn gen_equi_join_columns( &mut self, left_columns: Vec, right_columns: Vec, - ) -> Option<(Column, Column)> { + ) -> Vec<(Column, Column)> { let mut available_join_on_columns = vec![]; for left_column in &left_columns { for right_column in &right_columns { if left_column.data_type == right_column.data_type { - available_join_on_columns.push((left_column, right_column)) + available_join_on_columns.push((left_column.clone(), right_column.clone())) } } } + available_join_on_columns + } + + fn gen_bool_with_tables(&mut self, tables: Vec
) -> Expr { + let old_context = self.new_local_context(); + self.add_relations_to_context(tables); + let expr = self.gen_expr(&Boolean, SqlGeneratorContext::new_with_can_agg(false)); + self.restore_context(old_context); + expr + } + + fn gen_single_equi_join_expr( + &mut self, + left_columns: Vec, + right_columns: Vec, + ) -> Option<(Expr, Vec<(Column, Column)>)> { + let mut available_join_on_columns = self.gen_equi_join_columns(left_columns, right_columns); if available_join_on_columns.is_empty() { return None; } - let i = self.rng.gen_range(0..available_join_on_columns.len()); - let (left_column, right_column) = available_join_on_columns[i]; - Some((left_column.clone(), right_column.clone())) + available_join_on_columns.shuffle(&mut self.rng); + let remaining_columns = available_join_on_columns.split_off(1); + let (left_column, right_column) = available_join_on_columns.drain(..).next().unwrap(); + let join_on_expr = create_equi_expr(left_column.name, right_column.name); + Some((join_on_expr, remaining_columns)) + } + + fn gen_non_equi_expr(&mut self, available_join_on_columns: Vec<(Column, Column)>) -> Expr { + let expr = Expr::Value(Value::Boolean(true)); + if available_join_on_columns.is_empty() { + return expr; + } + let n = self.rng.gen_range(0..available_join_on_columns.len()); + let mut count = 0; + for (l_col, r_col) in available_join_on_columns { + if count >= n { + break; + } + let Some(inequality_ops) = BINARY_INEQUALITY_OP_TABLE.get(&(l_col.data_type, r_col.data_type)) else { + continue; + }; + let inequality_op = inequality_ops.choose(&mut self.rng).unwrap(); + let _non_equi_expr = create_binary_expr(inequality_op.clone(), l_col.name, r_col.name); + count += 1; + } + expr + } + + fn gen_more_equi_join_exprs( + &mut self, + mut available_join_on_columns: Vec<(Column, Column)>, + ) -> Expr { + let mut expr = Expr::Value(Value::Boolean(true)); + if available_join_on_columns.is_empty() { + return expr; + } + let n_join_cols = available_join_on_columns.len(); + let n = if n_join_cols < 2 { + n_join_cols + } else { + match self.rng.gen_range(0..100) { + 0..=10 => self.rng.gen_range(n_join_cols / 2..n_join_cols), + 11..=100 => self.rng.gen_range(0..n_join_cols / 2), + _ => unreachable!(), + } + }; + + for (l_col, r_col) in available_join_on_columns.drain(0..n) { + let equi_expr = create_equi_expr(l_col.name, r_col.name); + expr = Expr::BinaryOp { + left: Box::new(expr), + op: BinaryOperator::And, + right: Box::new(equi_expr), + } + } + expr + } + + fn gen_arbitrary_bool(&mut self, left_table: Table, right_table: Table) -> Option { + let expr = self.gen_bool_with_tables(vec![left_table, right_table]); + + // FIXME(noel): This is a hack to reduce streaming nested loop join occurrences. + // ... JOIN ON x=y AND false => ... JOIN ON x=y + // We can use const folding, then remove the right expression, + // if it evaluates to `false` after const folding. + // Have to first bind `Expr`, since it is AST form. + // Then if successfully bound, use `eval_row_const` to constant fold it. + // Take a look at . + if expr != Expr::Value(Value::Boolean(false)) { + Some(expr) + } else { + None + } } /// Generates the `ON` clause in `t JOIN t2 ON ...` /// It will generate at least one equi join condition /// This will reduce chance of nested loop join from being generated. - /// TODO: Generate equi-join on different types. fn gen_join_on_expr( &mut self, left_columns: Vec, - left_table: Vec
, + left_table: Table, right_columns: Vec, - right_table: Vec
, + right_table: Table, ) -> Option { // We always generate an equi join, to avoid stream nested loop join. - let Some((l, r)) = self.gen_equi_join_columns(left_columns, right_columns) else { - return None; + let Some((base_join_on_expr, remaining_equi_columns)) = self.gen_single_equi_join_expr(left_columns, right_columns) else { + return None }; - let mut join_on_expr = create_equi_expr(l.name, r.name); - // Add extra boolean expressions - if self.flip_coin() { - let old_context = self.new_local_context(); - self.add_relations_to_context(left_table); - self.add_relations_to_context(right_table); - let expr = self.gen_expr(&Boolean, SqlGeneratorContext::new_with_can_agg(false)); - self.restore_context(old_context); - // FIXME(noel): Hack to reduce streaming nested loop join occurrences. - // ... JOIN ON x=y AND false => ... JOIN ON x=y - // We can use const folding, then remove the right expression, - // if it evaluates to `false` after const folding. - if expr != Expr::Value(Value::Boolean(false)) { - join_on_expr = Expr::BinaryOp { - left: Box::new(join_on_expr), - op: BinaryOperator::And, - right: Box::new(expr), - } - } + + // Add more expressions + let extra_expr = match self.rng.gen_range(1..=100) { + 1..=25 => None, + 26..=50 => Some(self.gen_non_equi_expr(remaining_equi_columns)), + 51..=75 => Some(self.gen_more_equi_join_exprs(remaining_equi_columns)), + 76..=100 => self.gen_arbitrary_bool(left_table, right_table), + _ => unreachable!(), + }; + if let Some(extra_expr) = extra_expr { + Some(Expr::BinaryOp { + left: Box::new(base_join_on_expr), + op: BinaryOperator::And, + right: Box::new(extra_expr), + }) + } else { + Some(base_join_on_expr) } - Some(join_on_expr) } fn gen_join_constraint( &mut self, left_columns: Vec, - left_table: Vec
, + left_table: Table, right_columns: Vec, - right_table: Vec
, + right_table: Table, ) -> Option { let expr = self.gen_join_on_expr(left_columns, left_table, right_columns, right_table)?; Some(JoinConstraint::On(expr)) } /// Generates t1 JOIN t2 ON ... - fn gen_join_clause(&mut self) -> (TableWithJoins, Vec
) { - let (left_factor, left_columns, mut left_table) = self.gen_table_factor(); - let (right_factor, right_columns, mut right_table) = self.gen_table_factor(); - - let join_constraint = self.gen_join_constraint( + fn gen_join_operator( + &mut self, + left_columns: Vec, + left_table: Table, + right_columns: Vec, + right_table: Table, + ) -> Option { + let Some(join_constraint) = self.gen_join_constraint( left_columns, - left_table.clone(), + left_table, right_columns, - right_table.clone(), - ); - let Some(join_constraint) = join_constraint else { - return self.gen_simple_table(); + right_table, + ) else { + return None; }; // NOTE: INNER JOIN works fine, usually does not encounter `StreamNestedLoopJoin` much. @@ -187,18 +279,67 @@ impl<'a, R: Rng> SqlGenerator<'a, R> { // _ => JoinOperator::CrossJoin, }; + Some(join_operator) + } + + /// Generates t1 JOIN t2 ON ... + fn gen_simple_join_clause(&mut self) -> Option<(TableWithJoins, Vec
)> { + let (left_factor, left_columns, left_table) = self.gen_table_factor(); + let (right_factor, right_columns, right_table) = self.gen_table_factor(); + let Some(join_operator) = + self.gen_join_operator(left_columns, left_table.clone(), right_columns, right_table.clone()) else { + return None; + }; + let right_factor_with_join = Join { relation: right_factor, join_operator, }; - // TODO: Different structures of joins (bushy, left, right, cycles) - left_table.append(&mut right_table); - ( + Some(( TableWithJoins { relation: left_factor, joins: vec![right_factor_with_join], }, - left_table, + vec![left_table, right_table], + )) + } + + fn gen_more_joins(&mut self) -> (TableWithJoins, Vec
) { + // gen left + let Some((left_table_with_join, mut left_tables)) = self.gen_simple_join_clause() else { + return self.gen_simple_table(); + }; + let left_columns = left_tables + .iter() + .flat_map(|t| t.get_qualified_columns()) + .collect(); + + // gen right + let (right_factor, right_columns, right_table) = self.gen_table_factor(); + + // gen join + let left_table = left_tables.choose(&mut self.rng).unwrap(); + let Some(join_operator) = + self.gen_join_operator(left_columns, left_table.clone(), right_columns, right_table.clone()) else { + return (left_table_with_join, left_tables); + }; + + // build result + let mut tables = vec![]; + tables.append(&mut left_tables); + tables.push(right_table); + + let right_join = Join { + relation: right_factor, + join_operator, + }; + + ( + TableWithJoins { + relation: TableFactor::NestedJoin(Box::new(left_table_with_join)), + joins: vec![right_join], + }, + tables, ) } diff --git a/src/tests/sqlsmith/src/sql_gen/scalar.rs b/src/tests/sqlsmith/src/sql_gen/scalar.rs index e79fdba9b4dc..2383e361c4a5 100644 --- a/src/tests/sqlsmith/src/sql_gen/scalar.rs +++ b/src/tests/sqlsmith/src/sql_gen/scalar.rs @@ -108,7 +108,6 @@ impl<'a, R: Rng> SqlGenerator<'a, R> { } /// Generates a list of [`n`] simple scalar values of a specific [`type`]. - #[allow(dead_code)] fn gen_simple_scalar_list(&mut self, ty: &DataType, n: usize) -> Vec { (0..n).map(|_| self.gen_simple_scalar(ty)).collect() } diff --git a/src/tests/sqlsmith/src/sql_gen/types.rs b/src/tests/sqlsmith/src/sql_gen/types.rs index 561b38d5c26a..cc88a4095238 100644 --- a/src/tests/sqlsmith/src/sql_gen/types.rs +++ b/src/tests/sqlsmith/src/sql_gen/types.rs @@ -25,7 +25,7 @@ use risingwave_expr::sig::agg::{agg_func_sigs, AggFuncSig as RwAggFuncSig}; use risingwave_expr::sig::cast::{cast_sigs, CastContext, CastSig as RwCastSig}; use risingwave_expr::sig::func::{func_sigs, FuncSign as RwFuncSig}; use risingwave_frontend::expr::ExprType; -use risingwave_sqlparser::ast::{DataType as AstDataType, StructField}; +use risingwave_sqlparser::ast::{BinaryOperator, DataType as AstDataType, StructField}; pub(super) fn data_type_to_ast_data_type(data_type: &DataType) -> AstDataType { match data_type { @@ -244,3 +244,53 @@ pub(crate) static IMPLICIT_CAST_TABLE: LazyLock>> .for_each(|cast| casts.entry(cast.to_type.clone()).or_default().push(cast)); casts }); + +fn expr_type_to_inequality_op(typ: ExprType) -> Option { + match typ { + ExprType::GreaterThan => Some(BinaryOperator::Gt), + ExprType::GreaterThanOrEqual => Some(BinaryOperator::GtEq), + ExprType::LessThan => Some(BinaryOperator::Lt), + ExprType::LessThanOrEqual => Some(BinaryOperator::LtEq), + ExprType::NotEqual => Some(BinaryOperator::NotEq), + _ => None, + } +} + +/// Build set of binary inequality functions like `>`, `<`, etc... +/// Maps from LHS, RHS argument to Inequality Operation +/// For instance: +/// GreaterThanOrEqual(Int16, Int64) -> Boolean +/// Will store an entry of: +/// Key: Int16, Int64 +/// Value: `BinaryOp::GreaterThanOrEqual` +/// in the table. +pub(crate) static BINARY_INEQUALITY_OP_TABLE: LazyLock< + HashMap<(DataType, DataType), Vec>, +> = LazyLock::new(|| { + let mut funcs = HashMap::<(DataType, DataType), Vec>::new(); + func_sigs() + .filter(|func| { + !FUNC_BAN_LIST.contains(&func.func) + && func.ret_type == DataTypeName::Boolean + && func.inputs_type.len() == 2 + && func + .inputs_type + .iter() + .all(|t| *t != DataTypeName::Timestamptz) + }) + .filter_map(|func| { + let Some(lhs) = data_type_name_to_ast_data_type(&func.inputs_type[0]) else { + return None; + }; + let Some(rhs) = data_type_name_to_ast_data_type(&func.inputs_type[1]) else { + return None; + }; + let args = (lhs, rhs); + let Some(op) = expr_type_to_inequality_op(func.func) else { + return None; + }; + Some((args, op)) + }) + .for_each(|(args, op)| funcs.entry(args).or_default().push(op)); + funcs +});