Skip to content

Commit

Permalink
transform: rework column_knowledge to make it recursion_safe
Browse files Browse the repository at this point in the history
  • Loading branch information
aalexandrov committed Mar 24, 2023
1 parent beefd9c commit d62885a
Show file tree
Hide file tree
Showing 2 changed files with 266 additions and 8 deletions.
86 changes: 78 additions & 8 deletions src/transform/src/column_knowledge.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

use std::collections::BTreeMap;

use itertools::Itertools;
use itertools::{zip_eq, Itertools};

use mz_expr::visit::Visit;
use mz_expr::JoinImplementation::IndexedFilter;
Expand Down Expand Up @@ -43,6 +43,10 @@ impl CheckedRecursion for ColumnKnowledge {
}

impl crate::Transform for ColumnKnowledge {
/// Always returns `true`.
///
/// NOTE(review): presumably this tells the transform framework that the
/// pass may run on plans containing `MirRelationExpr::LetRec` nodes, which
/// `harvest` handles in a dedicated match arm -- confirm against the
/// `Transform` trait documentation.
fn recursion_safe(&self) -> bool {
true
}

/// Transforms an expression through accumulated knowledge.
#[tracing::instrument(
target = "optimizer"
Expand Down Expand Up @@ -106,13 +110,50 @@ impl ColumnKnowledge {
}
Ok(body_knowledge)
}
MirRelationExpr::LetRec {
ids: _,
values: _,
body: _,
} => {
// TODO
Err(crate::TransformError::LetRecUnsupported)?
MirRelationExpr::LetRec { ids, values, body } => {
// As a first approximation, we treat LetRec blocks as
// barriers for this optimization.

// A map to hold knowledge from shadowed bindings
let mut shadowed_knowledge = BTreeMap::new();

// We set knowledge[i][j] = DatumKnowledge::empty() for each
// column j and CTE i.
for (id, value) in zip_eq(ids.iter(), values.iter()) {
let id = mz_expr::Id::Local(id.clone());
let knowledge_new = vec![DatumKnowledge::empty(); value.arity()];
if let Some(knowledge_old) = knowledge.insert(id, knowledge_new) {
shadowed_knowledge.insert(id, knowledge_old);
};
}

// For the first loop, we sequentially absorb into
// knowledge[i][j] the result of descending into values[i].
//
// This is consistent with the WMR evaluation semantics of:
// 1. sequential updates, and
// 2. at least one iteration.
for (id, value) in zip_eq(ids.iter(), values.iter_mut()) {
let id = mz_expr::Id::Local(id.clone());
let next_knowledge = self.harvest(value, knowledge, knowledge_stack)?;
let prev_knowledge = knowledge.get_mut(&id).unwrap();
for (prev, next) in zip_eq(prev_knowledge.iter_mut(), next_knowledge) {
prev.absorb(&next) // Use absorb for the first iteration.
}
}

// Descend into the body with the knowledge corresponding to one WMR iteration.
let body_knowledge = self.harvest(body, knowledge, knowledge_stack)?;

// Restore shadowed bindings.
for id in ids.iter() {
let id = mz_expr::Id::Local(id.clone());
if let Some(old_knowledge) = shadowed_knowledge.remove(&id) {
knowledge.insert(id, old_knowledge);
}
}

Ok(body_knowledge)
}
MirRelationExpr::Project { input, outputs } => {
let input_knowledge = self.harvest(input, knowledge, knowledge_stack)?;
Expand Down Expand Up @@ -389,6 +430,21 @@ pub struct DatumKnowledge {
}

impl DatumKnowledge {
/// Builds the knowledge for a column whose enclosing collection is known to
/// be empty. This is the starting point used when seeding knowledge for
/// LetRec bindings.
///
/// This is distinct from DatumKnowledge::default(), which provides the
/// bottom element w.r.t. the induced partial DatumKnowledge order (see the
/// field docs).
fn empty() -> Self {
    // A more faithful model would turn DatumKnowledge into an enum with
    // `Empty` and `NonEmpty` variants and adjust the `union` operation so
    // that `Empty union x = x`.
    let value = None;
    // Claiming non-nullability is safe here: no datum will ever exist.
    let nullable = false;
    Self { value, nullable }
}

// Intersects (strengthens) the possible states of a column.
fn absorb(&mut self, other: &Self) {
self.nullable &= other.nullable;
Expand Down Expand Up @@ -543,3 +599,17 @@ pub fn optimize(
TransformError::Internal(String::from("unexpectedly empty stack in optimize"))
})
}

/// Debugging helper that dumps the per-column knowledge of the given local
/// bindings to stdout.
///
/// For every id in `ids`, one line of the form `<id>.#<column>: <knowledge>`
/// is printed per column, and a dashed separator line is printed at the end.
///
/// # Panics
///
/// Panics if any of the `ids` has no entry in `knowledge`.
#[allow(dead_code)] // keep debugging method around
fn print_knowledge_vec<'a>(
    knowledge: &BTreeMap<mz_expr::Id, Vec<DatumKnowledge>>,
    ids: impl Iterator<Item = &'a mz_expr::LocalId>,
) {
    ids.map(|local| mz_expr::Id::Local(local.clone()))
        .for_each(|key| {
            // Every requested binding is expected to be present in the map.
            let columns = knowledge.get(&key).unwrap();
            for (pos, datum) in columns.iter().enumerate() {
                println!("{key}.#{pos}: {datum:?}");
            }
        });
    println!("--------------");
}
Original file line number Diff line number Diff line change
Expand Up @@ -568,3 +568,191 @@ Explained Query:
Get materialize.public.json_table // { arity: 1, types: "(jsonb?)" }

EOF

# WITH MUTUALLY RECURSIVE support
# -------------------------------

# Single binding, value knowledge
query T multiline
EXPLAIN WITH(arity, types)
WITH MUTUALLY RECURSIVE
c0(f1 integer, f2 integer) AS (
SELECT * FROM (
SELECT * FROM t1
UNION
SELECT * FROM c0
) WHERE f1 = 3 AND f2 = 5
)
SELECT f1, f2, f1 + f2 FROM c0;
----
Explained Query:
Return // { arity: 3, types: "(integer, integer, integer)" }
Map (8) // { arity: 3, types: "(integer, integer, integer)" }
Get l0 // { arity: 2, types: "(integer, integer)" }
With Mutually Recursive
cte l0 =
Distinct group_by=[3, 5] // { arity: 2, types: "(integer, integer)" }
Union // { arity: 0, types: "()" }
Project () // { arity: 0, types: "()" }
Filter (#0 = 3) AND (#1 = 5) // { arity: 2, types: "(integer, integer)" }
Get materialize.public.t1 // { arity: 2, types: "(integer, integer?)" }
Project () // { arity: 0, types: "()" }
Filter (#0 = 3) AND (#1 = 5) // { arity: 2, types: "(integer, integer)" }
Get l0 // { arity: 2, types: "(integer?, integer?)" }

Source materialize.public.t1
filter=((#0 = 3) AND (#1 = 5))

EOF

# Single binding, NOT NULL knowledge
query T multiline
EXPLAIN WITH(arity, types)
WITH MUTUALLY RECURSIVE
c0(f1 integer, f2 integer) AS (
SELECT * FROM (
SELECT * FROM t1
UNION
SELECT * FROM c0
) WHERE f1 IS NOT NULL AND f2 IS NOT NULL
)
SELECT f1, f2, f1 IS NOT NULL, f2 IS NULL FROM c0;
----
Explained Query:
Return // { arity: 4, types: "(integer?, integer?, boolean, boolean)" }
Map (true, false) // { arity: 4, types: "(integer?, integer?, boolean, boolean)" }
Get l0 // { arity: 2, types: "(integer?, integer?)" }
With Mutually Recursive
cte l0 =
Distinct group_by=[#0, #1] // { arity: 2, types: "(integer?, integer?)" }
Union // { arity: 2, types: "(integer?, integer?)" }
Filter (#1) IS NOT NULL // { arity: 2, types: "(integer, integer)" }
Get materialize.public.t1 // { arity: 2, types: "(integer, integer?)" }
Get l0 // { arity: 2, types: "(integer?, integer?)" }

Source materialize.public.t1
filter=((#1) IS NOT NULL)

EOF

# Single binding, NOT NULL knowledge inside a UNION branch
query T multiline
EXPLAIN WITH(arity, types)
WITH MUTUALLY RECURSIVE
c0(f1 integer, f2 integer) AS (
SELECT * FROM (
SELECT * FROM t1
UNION
SELECT * FROM c0 WHERE f1 IS NOT NULL AND f2 IS NOT NULL
)
)
SELECT f1, f2, f1 IS NOT NULL, f2 IS NULL FROM c0;
----
Explained Query:
Return // { arity: 4, types: "(integer?, integer?, boolean, boolean)" }
Map (true, false) // { arity: 4, types: "(integer?, integer?, boolean, boolean)" }
Get l0 // { arity: 2, types: "(integer?, integer?)" }
With Mutually Recursive
cte l0 =
Distinct group_by=[#0, #1] // { arity: 2, types: "(integer?, integer?)" }
Union // { arity: 2, types: "(integer?, integer?)" }
Get materialize.public.t1 // { arity: 2, types: "(integer, integer?)" }
Get l0 // { arity: 2, types: "(integer?, integer?)" }

EOF

# Multiple bindings, value knowledge
#
# This also illustrates a missed opportunity: if we were a bit smarter, we
# would know that l1 can only have '2' in its first component.
query T multiline
EXPLAIN WITH(arity, types)
WITH MUTUALLY RECURSIVE
it(count integer) AS (
SELECT 1 UNION SELECT * FROM it WHERE count = 1
),
c0(count integer, f1 integer, f2 integer) AS (
SELECT * FROM (
SELECT count * 2, f1, f2 FROM it, t1
UNION
SELECT * FROM c0
)
)
SELECT * FROM c0;
----
Explained Query:
Return // { arity: 3, types: "(integer?, integer?, integer?)" }
Get l1 // { arity: 3, types: "(integer?, integer?, integer?)" }
With Mutually Recursive
cte l1 =
Distinct group_by=[#0..=#2] // { arity: 3, types: "(integer?, integer?, integer?)" }
Union // { arity: 3, types: "(integer?, integer?, integer?)" }
Project (#2, #0, #1) // { arity: 3, types: "(integer, integer, integer?)" }
Map (2) // { arity: 3, types: "(integer, integer?, integer)" }
CrossJoin type=differential // { arity: 2, types: "(integer, integer?)" }
ArrangeBy keys=[[]] // { arity: 0, types: "()" }
Project () // { arity: 0, types: "()" }
Get l0 // { arity: 1, types: "(integer)" }
ArrangeBy keys=[[]] // { arity: 2, types: "(integer, integer?)" }
Get materialize.public.t1 // { arity: 2, types: "(integer, integer?)" }
Get l1 // { arity: 3, types: "(integer?, integer?, integer?)" }
cte l0 =
Distinct group_by=[1] // { arity: 1, types: "(integer)" }
Union // { arity: 0, types: "()" }
Constant // { arity: 0, types: "()" }
- ()
Project () // { arity: 0, types: "()" }
Filter (#0 = 1) // { arity: 1, types: "(integer)" }
Get l0 // { arity: 1, types: "(integer?)" }

EOF


# Multiple bindings, NOT NULL knowledge
#
# This is currently masked by identical work done by "non_nullable". I had to
# swap the order of "non_nullable" and "column_knowledge" to see this transform
# in action in https://optimizer-trace.dev.materialize.com/.
#
# This also illustrates a missed opportunity: if we were a bit smarter, we
# would know that l1 can only have 'false' in its first component.
query T multiline
EXPLAIN WITH(arity, types)
WITH MUTUALLY RECURSIVE
it(count integer) AS (
SELECT 1 UNION SELECT * FROM it WHERE count IS NOT NULL
),
c0(count_is_null boolean, f1 integer, f2 integer) AS (
SELECT * FROM (
SELECT count IS NULL, f1, f2 FROM it, t1
UNION
SELECT * FROM c0
)
)
SELECT * FROM c0;
----
Explained Query:
Return // { arity: 3, types: "(boolean?, integer?, integer?)" }
Get l1 // { arity: 3, types: "(boolean?, integer?, integer?)" }
With Mutually Recursive
cte l1 =
Distinct group_by=[#0..=#2] // { arity: 3, types: "(boolean?, integer?, integer?)" }
Union // { arity: 3, types: "(boolean?, integer?, integer?)" }
Project (#2, #0, #1) // { arity: 3, types: "(boolean, integer, integer?)" }
Map (false) // { arity: 3, types: "(integer, integer?, boolean)" }
CrossJoin type=differential // { arity: 2, types: "(integer, integer?)" }
ArrangeBy keys=[[]] // { arity: 0, types: "()" }
Project () // { arity: 0, types: "()" }
Get l0 // { arity: 1, types: "(integer?)" }
ArrangeBy keys=[[]] // { arity: 2, types: "(integer, integer?)" }
Get materialize.public.t1 // { arity: 2, types: "(integer, integer?)" }
Get l1 // { arity: 3, types: "(boolean?, integer?, integer?)" }
cte l0 =
Distinct group_by=[#0] // { arity: 1, types: "(integer?)" }
Union // { arity: 1, types: "(integer?)" }
Map (1) // { arity: 1, types: "(integer)" }
Constant // { arity: 0, types: "()" }
- ()
Get l0 // { arity: 1, types: "(integer?)" }

EOF

0 comments on commit d62885a

Please sign in to comment.