Fix off-by-one error in Histogram_json_hb::find_bucket

spetrunia · spetrunia · commit 28ad12858548 · 2022-01-19T18:10:10.000+03:00
diff --git a/mysql-test/main/statistics_json.result b/mysql-test/main/statistics_json.result
@@ -4093,12 +4093,12 @@ test.t2	analyze	status	Engine-independent statistics collected
 test.t2	analyze	status	OK
 explain extended select * from t2 where city = 'Moscow';
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
-1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	101	50.00	Using where
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	101	98.02	Using where
 Warnings:
 Note	1003	select `test`.`t2`.`city` AS `city` from `test`.`t2` where `test`.`t2`.`city` = 'Moscow'
 analyze select * from t2 where city = 'Moscow';
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	r_rows	filtered	r_filtered	Extra
-1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	101	101.00	50.00	98.02	Using where
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	101	101.00	98.02	98.02	Using where
 explain extended select * from t2 where city = 'Helsinki';
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
 1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	101	1.98	Using where
diff --git a/mysql-test/main/statistics_json.test b/mysql-test/main/statistics_json.test
@@ -182,4 +182,3 @@ SET histogram_type= JSON_HB;
 ANALYZE TABLE t1 PERSISTENT FOR ALL;
 SELECT * FROM t1;
 drop table t1;
-
diff --git a/sql/opt_histogram_json.cc b/sql/opt_histogram_json.cc
@@ -483,12 +483,12 @@ double Histogram_json_hb::point_selectivity(Field *field, key_range *endpoint,
 
   // If the value is outside of the histogram's range, this will "clip" it to
   // first or last bucket.
-  int idx= find_bucket(field, key, false);
+  bool equal;
+  int idx= find_bucket(field, key, &equal);
 
   double sel;
 
-  if (buckets[idx].ndv == 1 &&
-      field->key_cmp((uchar*)buckets[idx].start_value.data(), key))
+  if (buckets[idx].ndv == 1 && !equal)
   {
     // The bucket has a single value and it doesn't match! Use the global
     // average.
@@ -550,7 +550,18 @@ double Histogram_json_hb::range_selectivity(Field *field, key_range *min_endp,
 
     // Find the leftmost bucket that contains the lookup value.
     // (If the lookup value is to the left of all buckets, find bucket #0)
-    int idx= find_bucket(field, min_key, exclusive_endp);
+    bool equal;
+    int idx= find_bucket(field, min_key, &equal);
+    if (equal && exclusive_endp && buckets[idx].ndv==1 &&
+        idx < (int)buckets.size()-1)
+    {
+      /*
+        The range is "col > $CONST" and we've found a bucket that contains
+        only the value $CONST. Move to the next bucket.
+        TODO: what if the last value in the histogram is a popular one?
+      */
+      idx++;
+    }
     double left_fract= get_left_fract(idx);
     double sel= position_in_interval(field, min_key, min_key_len,
                                      buckets[idx].start_value,
@@ -573,8 +584,18 @@ double Histogram_json_hb::range_selectivity(Field *field, key_range *min_endp,
       max_key++;
       max_key_len--;
     }
+    bool equal;
+    int idx= find_bucket(field, max_key, &equal);
 
-    int idx= find_bucket(field, max_key, inclusive_endp);
+    if (equal && !inclusive_endp && idx > 0)
+    {
+      /*
+        The range is "col < $CONST" and we've found a bucket starting with
+        $CONST. Move to the previous bucket.
+        TODO: what if the first value is the popular one?
+      */
+      idx--;
+    }
     double left_fract= get_left_fract(idx);
     double sel= position_in_interval(field, max_key, max_key_len,
                                      buckets[idx].start_value,
@@ -616,22 +637,59 @@ void Histogram_json_hb::serialize(Field *field)
 */
 
 int Histogram_json_hb::find_bucket(Field *field, const uchar *lookup_val,
-                                   bool equal_is_less)
+                                   bool *equal)
 {
+  int res; // result of key_cmp(bucket_start, lookup_val); <0: start is less
   int low= 0;
   int high= (int)buckets.size() - 1;
+  *equal= false; // becomes true only if a bucket start equals lookup_val
 
   while (low + 1 < high)
   {
     int middle= (low + high) / 2;
-    int res= field->key_cmp((uchar*)buckets[middle].start_value.data(), lookup_val);
+    res= field->key_cmp((uchar*)buckets[middle].start_value.data(), lookup_val);
     if (!res)
-      res= equal_is_less? -1: 1;
-    if (res < 0)
+    {
+      *equal= true;
+      return middle;
+    }
+    else if (res < 0)
       low= middle;
     else //res > 0
       high= middle;
   }
 
+  /*
+    The loop exits on an exact match or once 'low' and 'high' are adjacent.
+    Any bound that the loop assigned was compared and found not equal, so
+      bucket[low] < lookup_val  (if low moved),
+      lookup_val < bucket[high] (if high moved).
+    The initial bounds low=0 and high=last_bucket were never compared.
+    key_cmp(bucket_start, lookup_val) <= 0 means bucket_start <= lookup_val,
+    i.e. the lookup value lies at or to the right of that bucket's start.
+  */
+  if (low == 0)
+  {
+    res= field->key_cmp((uchar*)buckets[0].start_value.data(), lookup_val);
+    if (!res)
+      *equal= true;
+    else if (res < 0) // buckets[0] < lookup_val, it may belong to 'high'
+    {
+      res= field->key_cmp((uchar*)buckets[high].start_value.data(), lookup_val);
+      if (!res)
+        *equal= true;
+      if (res <= 0) // buckets[high] <= lookup_val: value is in bucket 'high'
+        low= high;
+    }
+  }
+  else if (high == (int)buckets.size() - 1)
+  {
+    res= field->key_cmp((uchar*)buckets[high].start_value.data(), lookup_val);
+    if (!res)
+      *equal= true;
+    if (res <= 0) // buckets[high] <= lookup_val: value is in the last bucket
+      low= high;
+  }
+
   return low;
 }
diff --git a/sql/opt_histogram_json.h b/sql/opt_histogram_json.h
@@ -123,6 +123,6 @@ class Histogram_json_hb : public Histogram_base
 private:
   double get_left_fract(int idx);
   std::string& get_end_value(int idx);
-  int find_bucket(Field *field, const uchar *lookup_val, bool equal_is_less);
+  int find_bucket(Field *field, const uchar *lookup_val, bool *equal);
 };