From d62885aba78489d92d77b1656d2b97648b274dd2 Mon Sep 17 00:00:00 2001 From: Alexander Alexandrov Date: Wed, 22 Mar 2023 13:32:57 +0200 Subject: [PATCH] transform: rework `column_knowledge` to make it `recursion_safe` --- src/transform/src/column_knowledge.rs | 86 +++++++- .../{ => transform}/column_knowledge.slt | 188 ++++++++++++++++++ 2 files changed, 266 insertions(+), 8 deletions(-) rename test/sqllogictest/{ => transform}/column_knowledge.slt (70%) diff --git a/src/transform/src/column_knowledge.rs b/src/transform/src/column_knowledge.rs index 359de89a19a1..459583b7f8ea 100644 --- a/src/transform/src/column_knowledge.rs +++ b/src/transform/src/column_knowledge.rs @@ -11,7 +11,7 @@ use std::collections::BTreeMap; -use itertools::Itertools; +use itertools::{zip_eq, Itertools}; use mz_expr::visit::Visit; use mz_expr::JoinImplementation::IndexedFilter; @@ -43,6 +43,10 @@ impl CheckedRecursion for ColumnKnowledge { } impl crate::Transform for ColumnKnowledge { + fn recursion_safe(&self) -> bool { + true + } + /// Transforms an expression through accumulated knowledge. #[tracing::instrument( target = "optimizer" @@ -106,13 +110,50 @@ impl ColumnKnowledge { } Ok(body_knowledge) } - MirRelationExpr::LetRec { - ids: _, - values: _, - body: _, - } => { - // TODO - Err(crate::TransformError::LetRecUnsupported)? + MirRelationExpr::LetRec { ids, values, body } => { + // As a first approximation, we treat LetRec blocks as + // barriers for this optimization. + + // A map to hold knowledge from shadowed bindings + let mut shadowed_knowledge = BTreeMap::new(); + + // We set knowledge[i][j] = DatumKnowledge::empty() for each + // column j and CTE i. 
+ for (id, value) in zip_eq(ids.iter(), values.iter()) { + let id = mz_expr::Id::Local(id.clone()); + let knowledge_new = vec![DatumKnowledge::empty(); value.arity()]; + if let Some(knowledge_old) = knowledge.insert(id, knowledge_new) { + shadowed_knowledge.insert(id, knowledge_old); + }; + } + + // For the first loop, we sequentially absorb into + // knowledge[i][j] the result of descending into values[i]. + // + // This is consistent with the WMR evaluation semantics of: + // 1. sequential updates, and + // 2. at least one iteration. + for (id, value) in zip_eq(ids.iter(), values.iter_mut()) { + let id = mz_expr::Id::Local(id.clone()); + let next_knowledge = self.harvest(value, knowledge, knowledge_stack)?; + let prev_knowledge = knowledge.get_mut(&id).unwrap(); + for (prev, next) in zip_eq(prev_knowledge.iter_mut(), next_knowledge) { + prev.absorb(&next) // Use absorb for the first iteration. + } + } + + // Descend into the body with the knowledge corresponding to one WMR iteration. + let body_knowledge = self.harvest(body, knowledge, knowledge_stack)?; + + // Restore shadowed bindings. + for id in ids.iter() { + let id = mz_expr::Id::Local(id.clone()); + if let Some(old_knowledge) = shadowed_knowledge.remove(&id) { + knowledge.insert(id, old_knowledge); + } + } + + Ok(body_knowledge) } MirRelationExpr::Project { input, outputs } => { let input_knowledge = self.harvest(input, knowledge, knowledge_stack)?; @@ -389,6 +430,21 @@ pub struct DatumKnowledge { } impl DatumKnowledge { + /// Constructs knowledge for a column that is known to come from an empty + /// collection. Useful when initializing knowledge for LetRec bindings. + /// + /// Note that DatumKnowledge::default() provides the bottom element w.r.t. + /// the induced partial DatumKnowledge order (see the field docs). 
+ fn empty() -> Self { + // Strictly speaking we should evolve DatumKnowledge to an enum with two + // variants: Empty and NonEmpty and tweak the `union` operation such + // that `Empty union x = x`. + Self { + value: None, + nullable: false, // This is safe because the datum will never exist. + } + } + // Intersects (strengthens) the possible states of a column. fn absorb(&mut self, other: &Self) { self.nullable &= other.nullable; @@ -543,3 +599,17 @@ pub fn optimize( TransformError::Internal(String::from("unexpectedly empty stack in optimize")) }) } + +#[allow(dead_code)] // keep debugging method around +fn print_knowledge_vec<'a>( + knowledge: &BTreeMap>, + ids: impl Iterator, +) { + for id in ids { + let id = mz_expr::Id::Local(id.clone()); + for (i, k) in knowledge.get(&id).unwrap().iter().enumerate() { + println!("{id}.#{i}: {k:?}"); + } + } + println!("--------------"); +} diff --git a/test/sqllogictest/column_knowledge.slt b/test/sqllogictest/transform/column_knowledge.slt similarity index 70% rename from test/sqllogictest/column_knowledge.slt rename to test/sqllogictest/transform/column_knowledge.slt index 746757036293..9e11a4cca4e5 100644 --- a/test/sqllogictest/column_knowledge.slt +++ b/test/sqllogictest/transform/column_knowledge.slt @@ -568,3 +568,191 @@ Explained Query: Get materialize.public.json_table // { arity: 1, types: "(jsonb?)" } EOF + +# WITH MUTUALLY RECURSIVE support +# ------------------------------- + +# Single binding, value knowledge +query T multiline +EXPLAIN WITH(arity, types) +WITH MUTUALLY RECURSIVE + c0(f1 integer, f2 integer) AS ( + SELECT * FROM ( + SELECT * FROM t1 + UNION + SELECT * FROM c0 + ) WHERE f1 = 3 AND f2 = 5 + ) +SELECT f1, f2, f1 + f2 FROM c0; +---- +Explained Query: + Return // { arity: 3, types: "(integer, integer, integer)" } + Map (8) // { arity: 3, types: "(integer, integer, integer)" } + Get l0 // { arity: 2, types: "(integer, integer)" } + With Mutually Recursive + cte l0 = + Distinct group_by=[3, 5] // { 
arity: 2, types: "(integer, integer)" } + Union // { arity: 0, types: "()" } + Project () // { arity: 0, types: "()" } + Filter (#0 = 3) AND (#1 = 5) // { arity: 2, types: "(integer, integer)" } + Get materialize.public.t1 // { arity: 2, types: "(integer, integer?)" } + Project () // { arity: 0, types: "()" } + Filter (#0 = 3) AND (#1 = 5) // { arity: 2, types: "(integer, integer)" } + Get l0 // { arity: 2, types: "(integer?, integer?)" } + +Source materialize.public.t1 + filter=((#0 = 3) AND (#1 = 5)) + +EOF + +# Single binding, NOT NULL knowledge +query T multiline +EXPLAIN WITH(arity, types) +WITH MUTUALLY RECURSIVE + c0(f1 integer, f2 integer) AS ( + SELECT * FROM ( + SELECT * FROM t1 + UNION + SELECT * FROM c0 + ) WHERE f1 IS NOT NULL AND f2 IS NOT NULL + ) +SELECT f1, f2, f1 IS NOT NULL, f2 IS NULL FROM c0; +---- +Explained Query: + Return // { arity: 4, types: "(integer?, integer?, boolean, boolean)" } + Map (true, false) // { arity: 4, types: "(integer?, integer?, boolean, boolean)" } + Get l0 // { arity: 2, types: "(integer?, integer?)" } + With Mutually Recursive + cte l0 = + Distinct group_by=[#0, #1] // { arity: 2, types: "(integer?, integer?)" } + Union // { arity: 2, types: "(integer?, integer?)" } + Filter (#1) IS NOT NULL // { arity: 2, types: "(integer, integer)" } + Get materialize.public.t1 // { arity: 2, types: "(integer, integer?)" } + Get l0 // { arity: 2, types: "(integer?, integer?)" } + +Source materialize.public.t1 + filter=((#1) IS NOT NULL) + +EOF + +# Single binding, NOT NULL knowledge inside a UNION branch +query T multiline +EXPLAIN WITH(arity, types) +WITH MUTUALLY RECURSIVE + c0(f1 integer, f2 integer) AS ( + SELECT * FROM ( + SELECT * FROM t1 + UNION + SELECT * FROM c0 WHERE f1 IS NOT NULL AND f2 IS NOT NULL + ) + ) +SELECT f1, f2, f1 IS NOT NULL, f2 IS NULL FROM c0; +---- +Explained Query: + Return // { arity: 4, types: "(integer?, integer?, boolean, boolean)" } + Map (true, false) // { arity: 4, types: "(integer?, integer?, 
boolean, boolean)" } + Get l0 // { arity: 2, types: "(integer?, integer?)" } + With Mutually Recursive + cte l0 = + Distinct group_by=[#0, #1] // { arity: 2, types: "(integer?, integer?)" } + Union // { arity: 2, types: "(integer?, integer?)" } + Get materialize.public.t1 // { arity: 2, types: "(integer, integer?)" } + Get l0 // { arity: 2, types: "(integer?, integer?)" } + +EOF + +# Multiple bindings, value knowledge +# +# This also illustrates a missed opportunity here, because if we are a bit +# smarter we will know that l1 can only have '2' in its first component. +query T multiline +EXPLAIN WITH(arity, types) +WITH MUTUALLY RECURSIVE + it(count integer) AS ( + SELECT 1 UNION SELECT * FROM it WHERE count = 1 + ), + c0(count integer, f1 integer, f2 integer) AS ( + SELECT * FROM ( + SELECT count * 2, f1, f2 FROM it, t1 + UNION + SELECT * FROM c0 + ) + ) +SELECT * FROM c0; +---- +Explained Query: + Return // { arity: 3, types: "(integer?, integer?, integer?)" } + Get l1 // { arity: 3, types: "(integer?, integer?, integer?)" } + With Mutually Recursive + cte l1 = + Distinct group_by=[#0..=#2] // { arity: 3, types: "(integer?, integer?, integer?)" } + Union // { arity: 3, types: "(integer?, integer?, integer?)" } + Project (#2, #0, #1) // { arity: 3, types: "(integer, integer, integer?)" } + Map (2) // { arity: 3, types: "(integer, integer?, integer)" } + CrossJoin type=differential // { arity: 2, types: "(integer, integer?)" } + ArrangeBy keys=[[]] // { arity: 0, types: "()" } + Project () // { arity: 0, types: "()" } + Get l0 // { arity: 1, types: "(integer)" } + ArrangeBy keys=[[]] // { arity: 2, types: "(integer, integer?)" } + Get materialize.public.t1 // { arity: 2, types: "(integer, integer?)" } + Get l1 // { arity: 3, types: "(integer?, integer?, integer?)" } + cte l0 = + Distinct group_by=[1] // { arity: 1, types: "(integer)" } + Union // { arity: 0, types: "()" } + Constant // { arity: 0, types: "()" } + - () + Project () // { arity: 0, types: "()" } + 
Filter (#0 = 1) // { arity: 1, types: "(integer)" } + Get l0 // { arity: 1, types: "(integer?)" } + +EOF + + +# Multiple bindings, NOT NULL knowledge +# +# This is currently masked by identical work done by "non_nullable" at the +# moment. I had to swap the order of "non_nullable" and "column_knowledge" to +# see this transform in action in https://optimizer-trace.dev.materialize.com/. +# +# This also illustrates a missed opportunity here, because if we are a bit +# smarter we will know that l1 can only have 'false' in its first component. +query T multiline +EXPLAIN WITH(arity, types) +WITH MUTUALLY RECURSIVE + it(count integer) AS ( + SELECT 1 UNION SELECT * FROM it WHERE count IS NOT NULL + ), + c0(count_is_null boolean, f1 integer, f2 integer) AS ( + SELECT * FROM ( + SELECT count IS NULL, f1, f2 FROM it, t1 + UNION + SELECT * FROM c0 + ) + ) +SELECT * FROM c0; +---- +Explained Query: + Return // { arity: 3, types: "(boolean?, integer?, integer?)" } + Get l1 // { arity: 3, types: "(boolean?, integer?, integer?)" } + With Mutually Recursive + cte l1 = + Distinct group_by=[#0..=#2] // { arity: 3, types: "(boolean?, integer?, integer?)" } + Union // { arity: 3, types: "(boolean?, integer?, integer?)" } + Project (#2, #0, #1) // { arity: 3, types: "(boolean, integer, integer?)" } + Map (false) // { arity: 3, types: "(integer, integer?, boolean)" } + CrossJoin type=differential // { arity: 2, types: "(integer, integer?)" } + ArrangeBy keys=[[]] // { arity: 0, types: "()" } + Project () // { arity: 0, types: "()" } + Get l0 // { arity: 1, types: "(integer?)" } + ArrangeBy keys=[[]] // { arity: 2, types: "(integer, integer?)" } + Get materialize.public.t1 // { arity: 2, types: "(integer, integer?)" } + Get l1 // { arity: 3, types: "(boolean?, integer?, integer?)" } + cte l0 = + Distinct group_by=[#0] // { arity: 1, types: "(integer?)" } + Union // { arity: 1, types: "(integer?)" } + Map (1) // { arity: 1, types: "(integer)" } + Constant // { arity: 0, types: "()" } + 
- () + Get l0 // { arity: 1, types: "(integer?)" } + +EOF