MDEV-31496: Make optimizer handle UCASE(varchar_col)=...

(Review input addressed)
(Added handling of UPDATE/DELETE and partitioning w/o index)

If the properties of the used collation allow, do the following
equivalent rewrites:

1. UPPER(key_col)=expr  ->  key_col=expr
   expr=UPPER(key_col)  ->  expr=key_col
   (also rewrite both sides of the equality at the same time)

2. UPPER(key_col) IN (constant-list)  ->  key_col IN (constant-list)

- Mark utf8mb{3,4}_general_ci as collations that allow this.
- Add optimizer_switch='sargable_casefold=ON' to control this.
  (ON by default in this patch)
- Cover the rewrite in Optimizer Trace, rewrite name is
  "sargable_casefold_removal".
MariaDB · Sep 12, 2023 · e987b93 · e987b93
1 parent 8ad1e26
commit e987b93
Show file tree

Hide file tree

Showing 25 changed files with 858 additions and 24 deletions.
diff --git a/include/m_ctype.h b/include/m_ctype.h
@@ -287,6 +287,7 @@ extern MY_UNI_CTYPE my_uni_ctype[256];
 #define MY_CS_NON1TO1 0x40000  /* Has a complex mapping from characters
                                   to weights, e.g. contractions, expansions,
                                   ignorable characters */
+#define MY_CS_UPPER_EQUAL_AS_EQUAL 0x80000 /* (UPPER(x)=UPPER(y)) <=> (x=y)*/
 #define MY_CHARSET_UNDEFINED 0
 
 /* Character repertoire flags */

diff --git a/libmysqld/CMakeLists.txt b/libmysqld/CMakeLists.txt
@@ -71,6 +71,7 @@ SET(SQL_EMBEDDED_SOURCES emb_qcache.cc libmysqld.c lib_sql.cc
            ../sql/mf_iocache.cc ../sql/my_decimal.cc 
            ../sql/net_serv.cc ../sql/opt_range.cc
            ../sql/opt_rewrite_date_cmp.cc
+           ../sql/opt_rewrite_remove_casefold.cc
            ../sql/opt_sum.cc
            ../sql/parse_file.cc ../sql/procedure.cc ../sql/protocol.cc 
            ../sql/records.cc ../sql/repl_failsafe.cc ../sql/rpl_filter.cc

diff --git a/mysql-test/include/sargable_casefold.inc b/mysql-test/include/sargable_casefold.inc
@@ -0,0 +1,44 @@
+# Check the sargable_casefold rewrite for $collation (must be set by the including test)
+
+eval create table t1 (
+  col1 varchar(32), 
+  col2 varchar(32), 
+  col3 char(32), 
+  col4  text, 
+  key(col1),
+  key(col2),
+  key(col3),
+  key(col4(32))
+) collate $collation;
+
+insert into t1
+select 
+  concat('A-', seq),
+  concat('A-', seq),
+  concat('A-', seq),
+  concat('A-', seq)
+from seq_1_to_100;
+
+analyze table t1 persistent for all;
+
+--echo # Basic examples. All should use ref(col1):
+explain
+select * from t1 where upper(col1)='A-3';
+select * from t1 where upper(col1)='A-3';
+
+explain
+select * from t1 where ucase(col1)='a-3';
+select * from t1 where ucase(col1)='a-3';
+
+explain select * from t1 where 'abc'=upper(col1);
+explain select * from t1 where 'xyz'=ucase(col1);
+
+create view v1 as select * from t1;
+explain select * from v1 where 'abc'=upper(col1);
+drop view v1;
+
+explain select * from t1 where upper(col3)='a-3';
+explain select * from t1 where upper(col4)='a-3';
+
+# t1 is intentionally not dropped here: the including test is expected to run further queries on it and drop it itself.
+
diff --git a/mysql-test/main/mysqld--help.result b/mysql-test/main/mysqld--help.result
@@ -804,7 +804,7 @@ The following specify which files/extra groups are read (specified before remain
  condition_pushdown_for_derived, split_materialized, 
  condition_pushdown_for_subquery, rowid_filter, 
  condition_pushdown_from_having, not_null_range_scan, 
- hash_join_cardinality
+ hash_join_cardinality, sargable_casefold
  --optimizer-trace=name 
  Controls tracing of the Optimizer:
  optimizer_trace=option=val[,option=val...], where option
@@ -1764,7 +1764,7 @@ optimizer-rowid-copy-cost 0.002653
 optimizer-scan-setup-cost 10
 optimizer-search-depth 62
 optimizer-selectivity-sampling-limit 100
-optimizer-switch index_merge=on,index_merge_union=on,index_merge_sort_union=on,index_merge_intersection=on,index_merge_sort_intersection=off,engine_condition_pushdown=off,index_condition_pushdown=on,derived_merge=on,derived_with_keys=on,firstmatch=on,loosescan=on,materialization=on,in_to_exists=on,semijoin=on,partial_match_rowid_merge=on,partial_match_table_scan=on,subquery_cache=on,mrr=off,mrr_cost_based=off,mrr_sort_keys=off,outer_join_with_cache=on,semijoin_with_cache=on,join_cache_incremental=on,join_cache_hashed=on,join_cache_bka=on,optimize_join_buffer_size=on,table_elimination=on,extended_keys=on,exists_to_in=on,orderby_uses_equalities=on,condition_pushdown_for_derived=on,split_materialized=on,condition_pushdown_for_subquery=on,rowid_filter=on,condition_pushdown_from_having=on,not_null_range_scan=off,hash_join_cardinality=on
+optimizer-switch index_merge=on,index_merge_union=on,index_merge_sort_union=on,index_merge_intersection=on,index_merge_sort_intersection=off,engine_condition_pushdown=off,index_condition_pushdown=on,derived_merge=on,derived_with_keys=on,firstmatch=on,loosescan=on,materialization=on,in_to_exists=on,semijoin=on,partial_match_rowid_merge=on,partial_match_table_scan=on,subquery_cache=on,mrr=off,mrr_cost_based=off,mrr_sort_keys=off,outer_join_with_cache=on,semijoin_with_cache=on,join_cache_incremental=on,join_cache_hashed=on,join_cache_bka=on,optimize_join_buffer_size=on,table_elimination=on,extended_keys=on,exists_to_in=on,orderby_uses_equalities=on,condition_pushdown_for_derived=on,split_materialized=on,condition_pushdown_for_subquery=on,rowid_filter=on,condition_pushdown_from_having=on,not_null_range_scan=off,hash_join_cardinality=on,sargable_casefold=on
 optimizer-trace 
 optimizer-trace-max-mem-size 1048576
 optimizer-use-condition-selectivity 4

diff --git a/mysql-test/main/mysqltest_tracking_info.result b/mysql-test/main/mysqltest_tracking_info.result
@@ -38,7 +38,7 @@ SET @@session.session_track_system_variables='optimizer_switch';
 set optimizer_switch='index_merge=off,index_merge_union=off,index_merge_sort_union=off,index_merge_intersection=off,index_merge_sort_intersection=on,engine_condition_pushdown=on,index_condition_pushdown=off,derived_merge=off,derived_with_keys=off,firstmatch=off,loosescan=off,materialization=on,in_to_exists=off,semijoin=off,partial_match_rowid_merge=off,partial_match_table_scan=off,subquery_cache=off,mrr=on,mrr_cost_based=on,mrr_sort_keys=on,outer_join_with_cache=off,semijoin_with_cache=off,join_cache_incremental=off,join_cache_hashed=off,join_cache_bka=off,optimize_join_buffer_size=on,table_elimination=off,extended_keys=off,exists_to_in=off,orderby_uses_equalities=off,condition_pushdown_for_derived=off';
 -- Tracker : SESSION_TRACK_SYSTEM_VARIABLES
 -- optimizer_switch
--- index_merge=off,index_merge_union=off,index_merge_sort_union=off,index_merge_intersection=off,index_merge_sort_intersection=on,engine_condition_pushdown=on,index_condition_pushdown=off,derived_merge=off,derived_with_keys=off,firstmatch=off,loosescan=off,materialization=on,in_to_exists=off,semijoin=off,partial_match_rowid_merge=off,partial_match_table_scan=off,subquery_cache=off,mrr=on,mrr_cost_based=on,mrr_sort_keys=on,outer_join_with_cache=off,semijoin_with_cache=off,join_cache_incremental=off,join_cache_hashed=off,join_cache_bka=off,optimize_join_buffer_size=on,table_elimination=off,extended_keys=off,exists_to_in=off,orderby_uses_equalities=off,condition_pushdown_for_derived=off,split_materialized=on,condition_pushdown_for_subquery=on,rowid_filter=on,condition_pushdown_from_having=on,not_null_range_scan=off,hash_join_cardinality=on
+-- index_merge=off,index_merge_union=off,index_merge_sort_union=off,index_merge_intersection=off,index_merge_sort_intersection=on,engine_condition_pushdown=on,index_condition_pushdown=off,derived_merge=off,derived_with_keys=off,firstmatch=off,loosescan=off,materialization=on,in_to_exists=off,semijoin=off,partial_match_rowid_merge=off,partial_match_table_scan=off,subquery_cache=off,mrr=on,mrr_cost_based=on,mrr_sort_keys=on,outer_join_with_cache=off,semijoin_with_cache=off,join_cache_incremental=off,join_cache_hashed=off,join_cache_bka=off,optimize_join_buffer_size=on,table_elimination=off,extended_keys=off,exists_to_in=off,orderby_uses_equalities=off,condition_pushdown_for_derived=off,split_materialized=on,condition_pushdown_for_subquery=on,rowid_filter=on,condition_pushdown_from_having=on,not_null_range_scan=off,hash_join_cardinality=on,sargable_casefold=on
 
 Warnings:
 Warning	1681	'engine_condition_pushdown=on' is deprecated and will be removed in a future release

diff --git a/mysql-test/main/sargable_casefold.result b/mysql-test/main/sargable_casefold.result
@@ -0,0 +1,278 @@
+set
+@tmp_switch_sarg_casefold=@@optimizer_switch,
+optimizer_switch='sargable_casefold=on';
+create table t1 (
+col1 varchar(32), 
+col2 varchar(32), 
+col3 char(32), 
+col4  text, 
+key(col1),
+key(col2),
+key(col3),
+key(col4(32))
+) collate utf8mb3_general_ci;
+insert into t1
+select 
+concat('A-', seq),
+concat('A-', seq),
+concat('A-', seq),
+concat('A-', seq)
+from seq_1_to_100;
+analyze table t1 persistent for all;
+Table	Op	Msg_type	Msg_text
+test.t1	analyze	status	Engine-independent statistics collected
+test.t1	analyze	Warning	Engine-independent statistics are not collected for column 'col4'
+test.t1	analyze	status	Table is already up to date
+# Basic examples. All should use ref(col1):
+explain
+select * from t1 where upper(col1)='A-3';
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	ref	col1	col1	99	const	1	Using index condition
+select * from t1 where upper(col1)='A-3';
+col1	col2	col3	col4
+A-3	A-3	A-3	A-3
+explain
+select * from t1 where ucase(col1)='a-3';
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	ref	col1	col1	99	const	1	Using index condition
+select * from t1 where ucase(col1)='a-3';
+col1	col2	col3	col4
+A-3	A-3	A-3	A-3
+explain select * from t1 where 'abc'=upper(col1);
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	ref	col1	col1	99	const	1	Using index condition
+explain select * from t1 where 'xyz'=ucase(col1);
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	ref	col1	col1	99	const	1	Using index condition
+create view v1 as select * from t1;
+explain select * from v1 where 'abc'=upper(col1);
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	ref	col1	col1	99	const	1	Using index condition
+drop view v1;
+explain select * from t1 where upper(col3)='a-3';
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	ref	col3	col3	97	const	1	Using index condition
+explain select * from t1 where upper(col4)='a-3';
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	ref	col4	col4	99	const	1	Using where
+# must not be rewritten:
+explain select * from t1 where ucase(col1 collate utf8mb3_bin)='a-3';
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	100	Using where
+# Will not do the rewrite due to collation mismatch:
+explain select * from t1 where ucase(col1)=_utf8mb3'abc' COLLATE utf8mb3_bin;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	100	Using where
+drop table t1;
+create table t1 (
+col1 varchar(32), 
+col2 varchar(32), 
+col3 char(32), 
+col4  text, 
+key(col1),
+key(col2),
+key(col3),
+key(col4(32))
+) collate utf8mb4_general_ci;
+insert into t1
+select 
+concat('A-', seq),
+concat('A-', seq),
+concat('A-', seq),
+concat('A-', seq)
+from seq_1_to_100;
+analyze table t1 persistent for all;
+Table	Op	Msg_type	Msg_text
+test.t1	analyze	status	Engine-independent statistics collected
+test.t1	analyze	Warning	Engine-independent statistics are not collected for column 'col4'
+test.t1	analyze	status	Table is already up to date
+# Basic examples. All should use ref(col1):
+explain
+select * from t1 where upper(col1)='A-3';
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	ref	col1	col1	131	const	1	Using index condition
+select * from t1 where upper(col1)='A-3';
+col1	col2	col3	col4
+A-3	A-3	A-3	A-3
+explain
+select * from t1 where ucase(col1)='a-3';
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	ref	col1	col1	131	const	1	Using index condition
+select * from t1 where ucase(col1)='a-3';
+col1	col2	col3	col4
+A-3	A-3	A-3	A-3
+explain select * from t1 where 'abc'=upper(col1);
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	ref	col1	col1	131	const	1	Using index condition
+explain select * from t1 where 'xyz'=ucase(col1);
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	ref	col1	col1	131	const	1	Using index condition
+create view v1 as select * from t1;
+explain select * from v1 where 'abc'=upper(col1);
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	ref	col1	col1	131	const	1	Using index condition
+drop view v1;
+explain select * from t1 where upper(col3)='a-3';
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	ref	col3	col3	129	const	1	Using index condition
+explain select * from t1 where upper(col4)='a-3';
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	ref	col4	col4	131	const	1	Using where
+# must not be rewritten:
+explain select * from t1 where ucase(col1 collate utf8mb4_bin)='a-3';
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	100	Using where
+# Will not do the rewrite due to collation mismatch:
+explain select * from t1 where ucase(col1)=_utf8mb4'abc' COLLATE utf8mb4_bin;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	100	Using where
+#
+# Check if optimizer_switch turns the rewrite off:
+#
+set 
+@save_os=@@optimizer_switch, 
+optimizer_switch='sargable_casefold=off';
+explain select * from t1 where upper(col1)='A-3';
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	100	Using where
+explain select * from t1 where ucase(col1)='a-3';
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	100	Using where
+set optimizer_switch=@save_os;
+# The following will not do the rewrite because the comparison
+# is done as DOUBLEs. Come to think of it, it won't harm to do
+# the rewrite but it is outside of the scope of this patch:
+explain select * from t1 where ucase(col1)=123.456;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	100	Using where
+select
+coercibility(upper(col1))
+from t1 limit 1;
+coercibility(upper(col1))
+2
+select coercibility(_utf8mb3'abc' COLLATE utf8mb3_bin);
+coercibility(_utf8mb3'abc' COLLATE utf8mb3_bin)
+0
+# This is transformed too even if it doesn't create any new
+# [potential] access paths:
+explain format=json select * from t1 where upper(col1)=upper(col2);
+EXPLAIN
+{
+  "query_block": {
+    "select_id": 1,
+    "cost": 0.0256761,
+    "nested_loop": [
+      {
+        "table": {
+          "table_name": "t1",
+          "access_type": "ALL",
+          "loops": 1,
+          "rows": 100,
+          "cost": 0.0256761,
+          "filtered": 100,
+          "attached_condition": "t1.col2 = t1.col1"
+        }
+      }
+    ]
+  }
+}
+#
+# Check if ref access works
+#
+create table t2 (
+a varchar(32),
+non_key varchar(32),
+key(a)
+) collate utf8mb4_general_ci;
+insert into t2
+select
+concat('A-', seq),
+concat('A-', seq)
+from seq_1_to_10;
+# Must use ref access for t1:
+explain select * from t1, t2 where upper(t1.col1)= t2.non_key;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	10	Using where
+1	SIMPLE	t1	ref	col1	col1	131	test.t2.non_key	1	
+create table t3 (
+a varchar(32),
+b varchar(32),
+key(a),
+key(b)
+) collate utf8mb3_general_ci;
+insert into t3 values ('abc','ABC'), ('xyz','XYZ');
+explain extended
+select a from t3 ignore index(a) where a=b and upper(b)='ABC';
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
+1	SIMPLE	t3	ref	b	b	99	const	1	100.00	Using index condition; Using where
+Warnings:
+Note	1003	select `test`.`t3`.`a` AS `a` from `test`.`t3` IGNORE INDEX (`a`) where `test`.`t3`.`a` = `test`.`t3`.`b` and `test`.`t3`.`b` = 'ABC'
+#
+# Check that rewrite isn't applied for non-applicable collations
+#
+create table t4 (
+col1 varchar(32) collate utf8mb3_bin,
+col2 varchar(32) collate utf8mb3_czech_ci,
+col3 varchar(32) collate latin1_bin,
+key(col1),
+key(col2),
+key(col3)
+);
+insert into t4
+select 
+concat('A-', seq),
+concat('A-', seq),
+concat('A-', seq)
+from seq_1_to_100;
+analyze table t4 persistent for all;
+Table	Op	Msg_type	Msg_text
+test.t4	analyze	status	Engine-independent statistics collected
+test.t4	analyze	status	Table is already up to date
+# None should use ref access:
+explain select * from t4 where upper(col1)='A-3';
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t4	ALL	NULL	NULL	NULL	NULL	100	Using where
+explain select * from t4 where upper(col2)='a-3';
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t4	ALL	NULL	NULL	NULL	NULL	100	Using where
+explain select * from t4 where upper(col3)='a-3';
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t4	ALL	NULL	NULL	NULL	NULL	100	Using where
+#
+# Check that rewrite works for UPPER(col) IN (const-list)
+#
+set
+@tmp_ot= @@optimizer_trace,
+optimizer_trace=1;
+# must use range:
+explain
+select * from t1 where upper(col1) IN ('A-3','A-4','a-5');
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	range	col1	col1	131	NULL	3	Using index condition
+select * from t1 where upper(col1) IN ('A-3','A-4','a-5');
+col1	col2	col3	col4
+A-3	A-3	A-3	A-3
+A-4	A-4	A-4	A-4
+A-5	A-5	A-5	A-5
+# Will not use the rewrite:
+explain
+select * from t1 where upper(col1) IN ('A-3','A-4',col2);
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	100	Using where
+#
+# MDEV-31946: Optimizer handle UCASE(varchar_col)=... does not work for UPDATE/DELETE
+#
+explain delete from t1 where upper(col1)='A';
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	range	col1	col1	131	NULL	1	Using where
+explain delete from t1 where upper(col1) IN ('A','B');
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	range	col1	col1	131	NULL	2	Using where
+explain update t1 set col2='ABC' where upper(col1)='A';
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	range	col1	col1	131	NULL	1	Using where
+explain update t1 set col2='ABC' where upper(col1) IN ('A','B');
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	range	col1	col1	131	NULL	2	Using where
+drop table t1,t2,t3,t4;
+set optimizer_switch=@tmp_switch_sarg_casefold;