Use binary search to compute range selectivity

idoqo · spetrunia · commit c129689ddc7d · 2022-01-19T18:10:08.000+03:00
* It also replaces "SELECT * FROM t1" in the test with an "EXPLAIN EXTENDED SELECT"
  statement, so that the fprintf calls print the computed positions to mysqld.1.err.

Signed-off-by: Michael Okoko <okokomichaels@outlook.com>
diff --git a/mysql-test/main/statistics_json.result b/mysql-test/main/statistics_json.result
@@ -67,33 +67,11 @@ test	t1	d	1	25	0.0000	8.0000	1.0000	10	JSON	[
   "21",
   "23"
 ]
-SELECT * FROM t1;
-a	b	c	d
-1	1	1	1
-2	2	2	2
-3	3	3	3
-4	4	4	4
-5	5	5	5
-6	6	6	6
-7	7	7	7
-8	8	8	8
-9	9	9	9
-10	10	10	10
-11	11	11	11
-12	12	12	12
-13	13	13	13
-14	14	14	14
-15	15	15	15
-16	16	16	16
-17	17	17	17
-18	18	18	18
-19	19	19	19
-20	20	20	20
-21	21	21	21
-22	22	22	22
-23	23	23	23
-24	24	24	24
-25	25	25	25
+explain extended select * from t1 where b between '20' and '70';
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	25	10.00	Using where
+Warnings:
+Note	1003	select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b`,`test`.`t1`.`c` AS `c`,`test`.`t1`.`d` AS `d` from `test`.`t1` where `test`.`t1`.`b` between '20' and '70'
 UPDATE mysql.column_stats SET histogram='["1", {"a": "b"}, "2"]' WHERE table_name='t1';
 FLUSH TABLES;
 SELECT * FROM t1;
diff --git a/mysql-test/main/statistics_json.test b/mysql-test/main/statistics_json.test
@@ -28,7 +28,7 @@ set histogram_size=10;
 
 ANALYZE TABLE t1 PERSISTENT FOR ALL;
 SELECT * FROM mysql.column_stats WHERE table_name='t1';
-SELECT * FROM t1;
+explain extended select * from t1 where b between '20' and '70';
 
 # We then test different valid JSON strings that are invalid histograms.
 UPDATE mysql.column_stats SET histogram='["1", {"a": "b"}, "2"]' WHERE table_name='t1';
diff --git a/sql/sql_statistics.cc b/sql/sql_statistics.cc
@@ -1466,90 +1466,95 @@ double Histogram_json::range_selectivity_new(Field *field, key_range *min_endp,
                                              key_range *max_endp)
 {
   fprintf(stderr, "Histogram_json::range_selectivity_new\n");
-
-
-  /*
-    GSOC-TODO: 
-    The code below is NOT what this function have. 
-
-    == WHAT THIS CODE DOES ==
-    At the moment it does a linear walk through histogram_bounds and compares 
-    min_endp to each of histogram bucket's min and max. 
-    ATTENTION:  This is a demo of how key_cmp() is used to compare the values.
-    
-    When it finds the bucket such that BUCKET_START < min_endp < BUCKET_END, 
-    it computes a position of min_endp within the bucket.
-    ATTENTION: calls to pos_in_interval_.... are a demo of how to compute 
-    position of a value within a [min,max] range.
-
-    == WHAT THIS CODE SHOULD DO ==
-    * Use binary search to locate the range  [MIN_BUCKET; MAX_BUCKET] - the
-      set of buckets that overlaps with the search interval {min_endp, max_endp}.
-
-    * If the search interval covers MIN_BUCKET only partially, compute a
-      position of min_endp within the bucket.
-
-    * The same for max_endp.
-
-    * Compute the final selectivity and return it.
-  */
-  std::string prev_s;
-  bool have_prev_s=false;
-  for (auto &s : histogram_bounds)
+  double min_sel, max_sel;
+  if (min_endp)
   {
-    if (!have_prev_s)
-    {
-      prev_s = s;
-      have_prev_s= true;
-      continue;
-    }
-
-    // It's a test code, so we only process min_endp.
-    if (min_endp)
-    {
-      const uchar *min_key= min_endp->key;
-      // TODO: also, properly handle SQL NULLs.
-      // in this test patch, we just assume the values are not SQL NULLs.
-      if (field->real_maybe_null())
-        min_key++;
-
-      int res1= field->key_cmp((uchar*)prev_s.data(), min_key);
-      const char *str1="<";
-      if (res1>0) str1=">";
-      if (res1==0) str1="=";
-
-      int res2= field->key_cmp(min_key, (uchar*)s.data());
-      const char *str2="<";
-      if (res2>0) str2=">";
-      if (res2==0) str2="=";
-      fprintf(stderr, "prev_bound %s min_key %s bound\n", str1, str2);
-
-      if (res1<0 && res2 < 0)
-      {
-        double sel;
-        if (field->pos_through_val_str())
-          sel= pos_in_interval_through_strxfrm(field, (uchar*)prev_s.data(), 
-                                               (uchar*)s.data(), (uchar*)min_key);
-        else
-          sel= pos_in_interval_through_val_real(field, (uchar*)prev_s.data(), 
-                                                (uchar*)s.data(), (uchar*)min_key);
+    const uchar *min_key= min_endp->key;
+    // TODO: also, properly handle SQL NULLs.
+    // in this test patch, we just assume the values are not SQL NULLs.
+    if (field->real_maybe_null())
+      min_key++;
 
-        fprintf(stderr, "  pos_in_interval=%g\n", sel);
-      }
+    min_sel= selection_in_interval(field, min_key);
+    fprintf(stderr, "min pos_in_interval(min_endp)=%g\n", min_sel);
+  }
+  if (max_endp)
+  {
+    const uchar *max_key= max_endp->key;
+    if (field->real_maybe_null())
+      max_key++;
 
-      prev_s= s;
-    }
+    max_sel= selection_in_interval(field, max_key);
+    fprintf(stderr, "max pos_in_interval(min_endp)=%g\n", max_sel);
   }
+
   fprintf(stderr, "Histogram_json::range_selectivity_new ends\n");
   return 0.5;
 }
 
+/*
+  Position of endpoint inside the histogram bucket that find_bucket() picks
+  for it, as computed by the pos_in_interval_through_* helpers. Returns 0
+  when no bucket is found or the bucket has no upper bound to measure against.
+*/
+double Histogram_json::selection_in_interval(Field *field, const uchar* endpoint)
+{
+  int min_bucket_idx= find_bucket(field, endpoint);
+  int max_bucket_idx= min_bucket_idx + 1;
+
+  // A negative index means find_bucket() found nothing. An index on the last
+  // bound has no next bound to interpolate against; passing the buffer of an
+  // empty string instead would make the helpers read past its end.
+  if (min_bucket_idx < 0 || max_bucket_idx >= (int)histogram_bounds.size())
+    return 0;
+
+  // References: the bounds are only read, so copying them is unnecessary.
+  const std::string &min_bucket= histogram_bounds[min_bucket_idx];
+  const std::string &max_bucket= histogram_bounds[max_bucket_idx];
+  uchar *lower= (uchar *) min_bucket.data();
+  uchar *upper= (uchar *) max_bucket.data();
+
+  if (field->pos_through_val_str())
+    return pos_in_interval_through_strxfrm(field, lower, upper, (uchar *) endpoint);
+  return pos_in_interval_through_val_real(field, lower, upper, (uchar *) endpoint);
+}
+
 void Histogram_json::serialize(Field *field)
 {
   field->store((char*)get_values(), strlen((char*)get_values()),
                &my_charset_bin);
 }
 
+/*
+  Binary search over histogram_bounds (assumed sorted ascending by key_cmp).
+  Returns the index of the largest bound that is <= endpoint, or -1 when the
+  histogram is empty or endpoint sorts before every bound.
+*/
+int Histogram_json::find_bucket(Field *field, const uchar *endpoint)
+{
+  int low= 0;
+  int high= (int)histogram_bounds.size() - 1;
+  int min_bucket_index= -1;
+
+  while (low <= high)
+  {
+    int mid= low + (high - low) / 2;  // floor; rounding direction is irrelevant
+    const std::string &mid_val= histogram_bounds[mid];  // no per-probe copy
+    int res= field->key_cmp((uchar*) mid_val.data(), endpoint);
+    if (res == 0)
+      return mid;               // endpoint sits exactly on this bound
+    if (res < 0)
+    {
+      // Only a bound below endpoint may be remembered as the answer so far.
+      min_bucket_index= mid;
+      low= mid + 1;
+    }
+    else
+      high= mid - 1;            // bound is above endpoint: never a candidate
+  }
+  return min_bucket_index;
+}
+
 /*
   An object of the class Index_stat is created to read statistical
   data on tables from the statistical table table_stat, to update
diff --git a/sql/sql_statistics.h b/sql/sql_statistics.h
@@ -419,6 +419,13 @@ class Histogram_json : public Histogram_base
   */
   double range_selectivity_new(Field *field, key_range *min_endp,
                                        key_range *max_endp) override;
+
+  /* Binary search over histogram_bounds. Intended result: index of the largest
+     bound that does not exceed endpoint; -1 means no bucket was found. */
+  int find_bucket(Field *field, const uchar *endpoint);
+
+  /* Position of endpoint within the bucket chosen by find_bucket(). */
+  double selection_in_interval(Field *field, const uchar* endpoint);
 };
 
 class Columns_statistics;