MDEV-26585 Wrong query results when using index for group-by

The problem was that "group_min_max optimization" does not work if some aggregate functions, like COUNT(*), is used. The function get_best_group_min_max() is using the join->sum_funcs array to check which aggregate functions are used. The bug was that aggregates in HAVING where not yet added to join->sum_funcs at the time get_best_group_min_max() was called. Fixed by populate join->sum_funcs already in prepare, which means that all sum functions will be in join->sum_funcs in get_best_group_min_max(). A benefit of this approach is that we can remove several calls to make_sum_func_list() from the code and simplify the function. I removed some wrong setting of 'sort_and_group'. This variable is set when alloc_group_fields() is called, as part of allocating the cache needed by end_send_group() and does not need to be set by other functions. One problematic thing was that Spider is using *join->sum_funcs to detect at which stage the optimizer is and do internal calculations of aggregate functions. Updating join->sum_funcs early caused Spider to fail when trying to find min/max values in opt_sum_query(). Fixed by temporarily resetting sum_funcs during opt_sum_query(). Reviewer: Sergei Petrunia
MariaDB · Feb 8, 2022 · 38058c0 · 38058c0
1 parent d314bd2
commit 38058c0
Show file tree

Hide file tree

Showing 4 changed files with 97 additions and 32 deletions.
diff --git a/mysql-test/main/group_min_max.result b/mysql-test/main/group_min_max.result
@@ -4043,3 +4043,24 @@ b
 101
 102
 DROP TABLE t1;
+#
+# MDEV-26585 Wrong query results when `using index for group-by`
+#
+CREATE TABLE `t1` (
+`id` int(11) NOT NULL AUTO_INCREMENT,
+`owner_id` int(11) DEFAULT NULL,
+`foo` tinyint(1) DEFAULT 0,
+`whatever` varchar(255) DEFAULT NULL,
+PRIMARY KEY (`id`),
+KEY `index_t1_on_owner_id_and_foo` (`owner_id`,`foo`)
+) engine=InnoDB DEFAULT CHARSET=utf8;
+INSERT INTO t1 (owner_id, foo, whatever)
+VALUES (1, TRUE, "yello"), (1, FALSE, "yello"), (2, TRUE, "yello"),
+(2, TRUE, "yello"), (2, FALSE, "yello");
+EXPLAIN SELECT DISTINCT owner_id FROM t1 WHERE foo = true GROUP BY owner_id HAVING (COUNT(*) = 1);
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	index	NULL	index_t1_on_owner_id_and_foo	7	NULL	5	Using where; Using index
+SELECT DISTINCT owner_id FROM t1 WHERE foo = true GROUP BY owner_id HAVING (COUNT(*) = 1);
+owner_id
+1
+DROP TABLE t1;
diff --git a/mysql-test/main/group_min_max.test b/mysql-test/main/group_min_max.test
@@ -4,7 +4,7 @@
 #
 
 --source include/default_optimizer_switch.inc
-
+--source include/have_innodb.inc
 #
 # TODO:
 # Add queries with:
@@ -1700,3 +1700,23 @@ INSERT INTO t1 VALUES (0,100),(2,100),(2,101),(3,102);
 explain SELECT DISTINCT b FROM t1 WHERE EXISTS ( SELECT 1 FROM DUAL WHERE a > 1 );
 SELECT DISTINCT b FROM t1 WHERE EXISTS ( SELECT 1 FROM DUAL WHERE a > 1 );
 DROP TABLE t1;
+
+--echo #
+--echo # MDEV-26585 Wrong query results when `using index for group-by`
+--echo #
+
+CREATE TABLE `t1` (
+  `id` int(11) NOT NULL AUTO_INCREMENT,
+  `owner_id` int(11) DEFAULT NULL,
+  `foo` tinyint(1) DEFAULT 0,
+  `whatever` varchar(255) DEFAULT NULL,
+  PRIMARY KEY (`id`),
+  KEY `index_t1_on_owner_id_and_foo` (`owner_id`,`foo`)
+) engine=InnoDB DEFAULT CHARSET=utf8;
+
+INSERT INTO t1 (owner_id, foo, whatever)
+VALUES (1, TRUE, "yello"), (1, FALSE, "yello"), (2, TRUE, "yello"),
+       (2, TRUE, "yello"), (2, FALSE, "yello");
+EXPLAIN SELECT DISTINCT owner_id FROM t1 WHERE foo = true GROUP BY owner_id HAVING (COUNT(*) = 1);
+SELECT DISTINCT owner_id FROM t1 WHERE foo = true GROUP BY owner_id HAVING (COUNT(*) = 1);
+DROP TABLE t1;
diff --git a/sql/sql_select.cc b/sql/sql_select.cc
@@ -1589,7 +1589,8 @@ bool JOIN::prepare_stage2()
 #endif
   if (select_lex->olap == ROLLUP_TYPE && rollup_init())
     goto err;
-  if (alloc_func_list())
+  if (alloc_func_list() ||
+      make_sum_func_list(all_fields, fields_list, false))
     goto err;
 
   res= FALSE;
@@ -2204,7 +2205,21 @@ JOIN::optimize_inner()
       If all items were resolved by opt_sum_query, there is no need to
       open any tables.
     */
-    if ((res=opt_sum_query(thd, select_lex->leaf_tables, all_fields, conds)))
+
+    /*
+      The following resetting and restoring of sum_funcs is needed to
+      go around a bug in spider where it assumes that
+      make_sum_func_list() has not been called yet and do logical
+      choices based on this if special handling of min/max functions should
+      be done. We disable this special handling while we are trying to find
+      out if we can replace MIN/MAX values with constants.
+    */
+    Item_sum **save_func_sums= sum_funcs, *tmp_sum_funcs= 0;
+    sum_funcs= &tmp_sum_funcs;
+    res= opt_sum_query(thd, select_lex->leaf_tables, all_fields, conds);
+    sum_funcs= save_func_sums;
+
+    if (res)
     {
       DBUG_ASSERT(res >= 0);
       if (res == HA_ERR_KEY_NOT_FOUND)
@@ -2776,13 +2791,15 @@ int JOIN::optimize_stage2()
     calc_group_buffer(this, group_list);
   }
 
-  if (test_if_subpart(group_list, order) ||
-      (!group_list && tmp_table_param.sum_func_count))
-  {
+  /*
+    We can ignore ORDER BY if it's a prefix of the GROUP BY list
+    (as MariaDB is by default sorting on GROUP BY) or
+    if there is no GROUP BY and aggregate functions are used
+    (as the result will only contain one row).
+  */
+  if (order && (test_if_subpart(group_list, order) ||
+                (!group_list && tmp_table_param.sum_func_count)))
     order=0;
-    if (is_indexed_agg_distinct(this, NULL))
-      sort_and_group= 0;
-  }
 
   // Can't use sort on head table if using join buffering
   if (full_join || hash_join)
@@ -2814,7 +2831,6 @@ int JOIN::optimize_stage2()
   if (select_lex->have_window_funcs())
     simple_order= FALSE;
 
-
   /*
     If the hint FORCE INDEX FOR ORDER BY/GROUP BY is used for the table
     whose columns are required to be returned in a sorted order, then
@@ -3540,7 +3556,7 @@ bool JOIN::make_aggr_tables_info()
       // for the first table
       if (group_list || tmp_table_param.sum_func_count)
       {
-        if (make_sum_func_list(*curr_all_fields, *curr_fields_list, true, true))
+        if (make_sum_func_list(*curr_all_fields, *curr_fields_list, true))
           DBUG_RETURN(true);
         if (prepare_sum_aggregators(sum_funcs,
                                     !join_tab->is_using_agg_loose_index_scan()))
@@ -3650,7 +3666,7 @@ bool JOIN::make_aggr_tables_info()
       last_tab->all_fields= &tmp_all_fields3;
       last_tab->fields= &tmp_fields_list3;
     }
-    if (make_sum_func_list(*curr_all_fields, *curr_fields_list, true, true))
+    if (make_sum_func_list(*curr_all_fields, *curr_fields_list, true))
       DBUG_RETURN(true);
     if (prepare_sum_aggregators(sum_funcs,
                                 !join_tab ||
@@ -3846,8 +3862,6 @@ JOIN::create_postjoin_aggr_table(JOIN_TAB *tab, List<Item> *table_fields,
   }
   else
   {
-    if (make_sum_func_list(all_fields, fields_list, false))
-      goto err;
     if (prepare_sum_aggregators(sum_funcs,
                                 !join_tab->is_using_agg_loose_index_scan()))
       goto err;
@@ -7089,8 +7103,7 @@ void optimize_keyuse(JOIN *join, DYNAMIC_ARRAY *keyuse_array)
   Check for the presence of AGGFN(DISTINCT a) queries that may be subject
   to loose index scan.
 
-
-  Check if the query is a subject to AGGFN(DISTINCT) using loose index scan 
+  Check if the query is a subject to AGGFN(DISTINCT) using loose index scan
   (QUICK_GROUP_MIN_MAX_SELECT).
   Optionally (if out_args is supplied) will push the arguments of 
   AGGFN(DISTINCT) to the list
@@ -7123,14 +7136,11 @@ is_indexed_agg_distinct(JOIN *join, List<Item_field> *out_args)
   Item_sum **sum_item_ptr;
   bool result= false;
 
-  if (join->table_count != 1 ||                    /* reference more than 1 table */
+  if (join->table_count != 1 ||               /* reference more than 1 table */
       join->select_distinct ||                /* or a DISTINCT */
       join->select_lex->olap == ROLLUP_TYPE)  /* Check (B3) for ROLLUP */
     return false;
 
-  if (join->make_sum_func_list(join->all_fields, join->fields_list, true))
-    return false;
-
   Bitmap<MAX_FIELDS> first_aggdistinct_fields;
   bool first_aggdistinct_fields_initialized= false;
   for (sum_item_ptr= join->sum_funcs; *sum_item_ptr; sum_item_ptr++)
@@ -7232,16 +7242,23 @@ add_group_and_distinct_keys(JOIN *join, JOIN_TAB *join_tab)
     while ((item= select_items_it++))
       item->walk(&Item::collect_item_field_processor, 0, &indexed_fields);
   }
-  else if (join->tmp_table_param.sum_func_count &&
-           is_indexed_agg_distinct(join, &indexed_fields))
+  else if (!join->tmp_table_param.sum_func_count ||
+           !is_indexed_agg_distinct(join, &indexed_fields))
   {
-    join->sort_and_group= 1;
-  }
-  else
+    /*
+      There where no GROUP BY fields and also either no aggregate
+      functions or not all aggregate functions where used with the
+      same DISTINCT (or MIN() / MAX() that works similarly).
+      Nothing to do there.
+    */
     return;
+  }
 
   if (indexed_fields.elements == 0)
+  {
+    /* There where no index we could use to satisfy the GROUP BY */
     return;
+  }
 
   /* Intersect the keys of all group fields. */
   cur_item= indexed_fields_it++;
@@ -25692,16 +25709,13 @@ bool JOIN::alloc_func_list()
 
 bool JOIN::make_sum_func_list(List<Item> &field_list,
                               List<Item> &send_result_set_metadata,
-			      bool before_group_by, bool recompute)
+			      bool before_group_by)
 {
   List_iterator_fast<Item> it(field_list);
   Item_sum **func;
   Item *item;
   DBUG_ENTER("make_sum_func_list");
 
-  if (*sum_funcs && !recompute)
-    DBUG_RETURN(FALSE); /* We have already initialized sum_funcs. */
-
   func= sum_funcs;
   while ((item=it++))
   {
@@ -25848,7 +25862,7 @@ change_to_use_tmp_fields(THD *thd, Ref_ptr_array ref_pointer_array,
   Change all funcs to be fields in tmp table.
 
   @param thd                   THD pointer
-  @param ref_pointer_array     array of pointers to top elements of filed list
+  @param ref_pointer_array     array of pointers to top elements of field list
   @param res_selected_fields   new list of items of select item list
   @param res_all_fields        new list of all items
   @param elements              number of elements in select item list

diff --git a/sql/sql_select.h b/sql/sql_select.h
@@ -1196,7 +1196,17 @@ class JOIN :public Sql_alloc
     Indicates that grouping will be performed on the result set during
     query execution. This field belongs to query execution.
 
-    @see make_group_fields, alloc_group_fields, JOIN::exec
+    If 'sort_and_group' is set, then the optimizer is going to use on of
+    the following algorithms to resolve GROUP BY.
+
+    - If one table, sort the table and then calculate groups on the fly.
+    - If more than one table, create a temporary table to hold the join,
+      sort it and then resolve group by on the fly.
+
+    The 'on the fly' calculation is done in end_send_group()
+
+    @see make_group_fields, alloc_group_fields, JOIN::exec,
+         setup_end_select_func
   */
   bool     sort_and_group; 
   bool     first_record,full_join, no_field_update;
@@ -1654,7 +1664,7 @@ class JOIN :public Sql_alloc
   bool make_range_rowid_filters();
   bool init_range_rowid_filters();
   bool make_sum_func_list(List<Item> &all_fields, List<Item> &send_fields,
-			  bool before_group_by, bool recompute= FALSE);
+			  bool before_group_by);
 
   /// Initialzes a slice, see comments for ref_ptrs above.
   Ref_ptr_array ref_ptr_array_slice(size_t slice_num)