Skip to content

Commit c129689

Browse files
idoqospetrunia
authored and committed
Use binary search to compute range selectivity
* it also adds an "explain select" statement to the test so that the fprintf calls can print the computed intervals to mysqld.1.err Signed-off-by: Michael Okoko <okokomichaels@outlook.com>
1 parent c605285 commit c129689

File tree

4 files changed

+88
-98
lines changed

4 files changed

+88
-98
lines changed

mysql-test/main/statistics_json.result

Lines changed: 5 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -67,33 +67,11 @@ test t1 d 1 25 0.0000 8.0000 1.0000 10 JSON [
6767
"21",
6868
"23"
6969
]
70-
SELECT * FROM t1;
71-
a b c d
72-
1 1 1 1
73-
2 2 2 2
74-
3 3 3 3
75-
4 4 4 4
76-
5 5 5 5
77-
6 6 6 6
78-
7 7 7 7
79-
8 8 8 8
80-
9 9 9 9
81-
10 10 10 10
82-
11 11 11 11
83-
12 12 12 12
84-
13 13 13 13
85-
14 14 14 14
86-
15 15 15 15
87-
16 16 16 16
88-
17 17 17 17
89-
18 18 18 18
90-
19 19 19 19
91-
20 20 20 20
92-
21 21 21 21
93-
22 22 22 22
94-
23 23 23 23
95-
24 24 24 24
96-
25 25 25 25
70+
explain extended select * from t1 where b between '20' and '70';
71+
id select_type table type possible_keys key key_len ref rows filtered Extra
72+
1 SIMPLE t1 ALL NULL NULL NULL NULL 25 10.00 Using where
73+
Warnings:
74+
Note 1003 select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b`,`test`.`t1`.`c` AS `c`,`test`.`t1`.`d` AS `d` from `test`.`t1` where `test`.`t1`.`b` between '20' and '70'
9775
UPDATE mysql.column_stats SET histogram='["1", {"a": "b"}, "2"]' WHERE table_name='t1';
9876
FLUSH TABLES;
9977
SELECT * FROM t1;

mysql-test/main/statistics_json.test

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ set histogram_size=10;
2828

2929
ANALYZE TABLE t1 PERSISTENT FOR ALL;
3030
SELECT * FROM mysql.column_stats WHERE table_name='t1';
31-
SELECT * FROM t1;
31+
explain extended select * from t1 where b between '20' and '70';
3232

3333
# We then test different valid JSON strings that are invalid histograms.
3434
UPDATE mysql.column_stats SET histogram='["1", {"a": "b"}, "2"]' WHERE table_name='t1';

sql/sql_statistics.cc

Lines changed: 75 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -1466,90 +1466,95 @@ double Histogram_json::range_selectivity_new(Field *field, key_range *min_endp,
14661466
key_range *max_endp)
14671467
{
14681468
fprintf(stderr, "Histogram_json::range_selectivity_new\n");
1469-
1470-
1471-
/*
1472-
GSOC-TODO:
1473-
The code below is NOT what this function should have.
1474-
1475-
== WHAT THIS CODE DOES ==
1476-
At the moment it does a linear walk through histogram_bounds and compares
1477-
min_endp to each of histogram bucket's min and max.
1478-
ATTENTION: This is a demo of how key_cmp() is used to compare the values.
1479-
1480-
When it finds the bucket such that BUCKET_START < min_endp < BUCKET_END,
1481-
it computes a position of min_endp within the bucket.
1482-
ATTENTION: calls to pos_in_interval_.... are a demo of how to compute
1483-
position of a value within a [min,max] range.
1484-
1485-
== WHAT THIS CODE SHOULD DO ==
1486-
* Use binary search to locate the range [MIN_BUCKET; MAX_BUCKET] - the
1487-
set of buckets that overlaps with the search interval {min_endp, max_endp}.
1488-
1489-
* If the search interval covers MIN_BUCKET only partially, compute a
1490-
position of min_endp within the bucket.
1491-
1492-
* The same for max_endp.
1493-
1494-
* Compute the final selectivity and return it.
1495-
*/
1496-
std::string prev_s;
1497-
bool have_prev_s=false;
1498-
for (auto &s : histogram_bounds)
1469+
double min_sel, max_sel;
1470+
if (min_endp)
14991471
{
1500-
if (!have_prev_s)
1501-
{
1502-
prev_s = s;
1503-
have_prev_s= true;
1504-
continue;
1505-
}
1506-
1507-
// It's a test code, so we only process min_endp.
1508-
if (min_endp)
1509-
{
1510-
const uchar *min_key= min_endp->key;
1511-
// TODO: also, properly handle SQL NULLs.
1512-
// in this test patch, we just assume the values are not SQL NULLs.
1513-
if (field->real_maybe_null())
1514-
min_key++;
1515-
1516-
int res1= field->key_cmp((uchar*)prev_s.data(), min_key);
1517-
const char *str1="<";
1518-
if (res1>0) str1=">";
1519-
if (res1==0) str1="=";
1520-
1521-
int res2= field->key_cmp(min_key, (uchar*)s.data());
1522-
const char *str2="<";
1523-
if (res2>0) str2=">";
1524-
if (res2==0) str2="=";
1525-
fprintf(stderr, "prev_bound %s min_key %s bound\n", str1, str2);
1526-
1527-
if (res1<0 && res2 < 0)
1528-
{
1529-
double sel;
1530-
if (field->pos_through_val_str())
1531-
sel= pos_in_interval_through_strxfrm(field, (uchar*)prev_s.data(),
1532-
(uchar*)s.data(), (uchar*)min_key);
1533-
else
1534-
sel= pos_in_interval_through_val_real(field, (uchar*)prev_s.data(),
1535-
(uchar*)s.data(), (uchar*)min_key);
1472+
const uchar *min_key= min_endp->key;
1473+
// TODO: also, properly handle SQL NULLs.
1474+
// in this test patch, we just assume the values are not SQL NULLs.
1475+
if (field->real_maybe_null())
1476+
min_key++;
15361477

1537-
fprintf(stderr, " pos_in_interval=%g\n", sel);
1538-
}
1478+
min_sel= selection_in_interval(field, min_key);
1479+
fprintf(stderr, "min pos_in_interval(min_endp)=%g\n", min_sel);
1480+
}
1481+
if (max_endp)
1482+
{
1483+
const uchar *max_key= max_endp->key;
1484+
if (field->real_maybe_null())
1485+
max_key++;
15391486

1540-
prev_s= s;
1541-
}
1487+
max_sel= selection_in_interval(field, max_key);
1488+
fprintf(stderr, "max pos_in_interval(min_endp)=%g\n", max_sel);
15421489
}
1490+
15431491
fprintf(stderr, "Histogram_json::range_selectivity_new ends\n");
15441492
return 0.5;
15451493
}
15461494

1495+
/*
  Compute the position of endpoint inside the histogram bucket that
  find_bucket() locates for it.

  @param field     Field the histogram was built for; decides which
                   pos_in_interval_* helper is used
  @param endpoint  Value to position, in the form find_bucket() and the
                   pos_in_interval_* helpers accept

  @return
    The value returned by pos_in_interval_through_strxfrm() (when
    field->pos_through_val_str() is true) or by
    pos_in_interval_through_val_real() (otherwise) for the bucket
    [histogram_bounds[i], histogram_bounds[i+1]], where i is the index
    returned by find_bucket().
    0 when there is no such bucket, i.e. find_bucket() returned -1 or the
    located bound is the last one and has no upper neighbour.
*/
double Histogram_json::selection_in_interval(Field *field, const uchar* endpoint)
{
  const int min_bucket_idx= find_bucket(field, endpoint);
  const int max_bucket_idx= min_bucket_idx + 1;

  /*
    Without both bounds there is nothing to interpolate between. In
    particular, do not substitute an empty string for a missing upper bound:
    its data would be handed to the helpers as if it were a key image.
    TODO: decide what position an endpoint outside of the histogram's range
    should map to; 0 keeps the fallback that was already used when no bucket
    was found.
  */
  if (min_bucket_idx < 0 || max_bucket_idx >= (int) histogram_bounds.size())
    return 0;

  // Bind by reference, the bounds do not need to be copied
  const std::string &min_bucket= histogram_bounds[min_bucket_idx];
  const std::string &max_bucket= histogram_bounds[max_bucket_idx];

  if (field->pos_through_val_str())
    return pos_in_interval_through_strxfrm(field, (uchar *) min_bucket.data(),
                                           (uchar *) max_bucket.data(),
                                           (uchar *) endpoint);

  return pos_in_interval_through_val_real(field, (uchar *) min_bucket.data(),
                                          (uchar *) max_bucket.data(),
                                          (uchar *) endpoint);
}
1521+
15471522
/*
  Store the buffer returned by get_values() into the given field.

  The number of bytes stored is determined with strlen(), so the buffer is
  treated as a NUL-terminated string. It is stored with my_charset_bin.
*/
void Histogram_json::serialize(Field *field)
{
  const char *json_text= (char*) get_values();
  const size_t json_length= strlen((char*) get_values());

  field->store(json_text, json_length, &my_charset_bin);
}
15521527

1528+
/*
  Binary-search histogram_bounds for the bound that starts the bucket
  containing endpoint.

  @param field     Field whose key_cmp() is used to compare a bound with
                   endpoint
  @param endpoint  Value to look for, in the form key_cmp() accepts

  @return
    Index of the greatest bound that is not greater than endpoint. If a
    bound compares equal to endpoint, the index of the first such bound hit
    by the search is returned.
    -1 if endpoint is smaller than every bound or there are no bounds.

  @note
    Assumes histogram_bounds is sorted in ascending key_cmp() order
    (TODO confirm against the code that fills it).
*/
int Histogram_json::find_bucket(Field *field, const uchar *endpoint)
{
  int low= 0;
  int high= (int) histogram_bounds.size() - 1;
  // Best candidate so far: the last probed bound that was < endpoint
  int bucket_idx= -1;

  while (low <= high)
  {
    // Midpoint rounded up; written this way so that low+high cannot overflow
    const int mid= low + (high - low + 1) / 2;
    const std::string &bound= histogram_bounds[mid];
    const int res= field->key_cmp((uchar*) bound.data(), endpoint);

    if (res == 0)
    {
      // todo: endpoint is on a bucket boundary
      return mid;
    }

    if (res < 0)
    {
      /*
        bound < endpoint: remember it. Only such probes may become the
        result; a probe that was greater than endpoint must not.
      */
      bucket_idx= mid;
      low= mid + 1;
    }
    else
      high= mid - 1;
  }
  return bucket_idx;
}
1557+
15531558
/*
15541559
An object of the class Index_stat is created to read statistical
15551560
data on tables from the statistical table table_stat, to update

sql/sql_statistics.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -419,6 +419,13 @@ class Histogram_json : public Histogram_base
419419
*/
420420
double range_selectivity_new(Field *field, key_range *min_endp,
421421
key_range *max_endp) override;
422+
423+
/*
424+
* Returns the index of the biggest histogram value that is smaller than endpoint
425+
*/
426+
int find_bucket(Field *field, const uchar *endpoint);
427+
428+
double selection_in_interval(Field *field, const uchar* endpoint);
422429
};
423430

424431
class Columns_statistics;

0 commit comments

Comments
 (0)