diff --git a/mysql-test/main/statistics.result b/mysql-test/main/statistics.result index 7556900f4a299..3299c8904a90f 100644 --- a/mysql-test/main/statistics.result +++ b/mysql-test/main/statistics.result @@ -1885,6 +1885,7 @@ t1 id 1 17384 0.0000 4.0000 14.0000 0.15705,0.15711,0.21463,0.15705,0.15711,0.15 explain select * from t1; id select_type table type possible_keys key key_len ref rows Extra 1 SIMPLE t1 ALL NULL NULL NULL NULL 229376 +drop table t0; drop table t1; set analyze_sample_percentage=@save_analyze_sample_percentage; set histogram_size=@save_histogram_size; diff --git a/mysql-test/main/statistics.test b/mysql-test/main/statistics.test index cbb923bee667f..e39a95300af4c 100644 --- a/mysql-test/main/statistics.test +++ b/mysql-test/main/statistics.test @@ -1019,7 +1019,7 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a from mysql.column_stats; explain select * from t1; - +drop table t0; drop table t1; set analyze_sample_percentage=@save_analyze_sample_percentage; set histogram_size=@save_histogram_size; diff --git a/mysql-test/main/statistics_json.result b/mysql-test/main/statistics_json.result index 3cd9bfb181f6e..1e7e0d88e7242 100644 --- a/mysql-test/main/statistics_json.result +++ b/mysql-test/main/statistics_json.result @@ -4148,6 +4148,7 @@ t1 id 1 17384 0.0000 4.0000 14.0000 { explain select * from t1; id select_type table type possible_keys key key_len ref rows Extra 1 SIMPLE t1 ALL NULL NULL NULL NULL 229376 +drop table t0; drop table t1; set analyze_sample_percentage=@save_analyze_sample_percentage; set histogram_size=@save_histogram_size; @@ -7530,3 +7531,104 @@ select c from t1 where c > '1'; id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra 1 SIMPLE t1 ALL NULL NULL NULL NULL 16 16.00 75.00 75.00 Using where drop table t1; +# +# MDEV-26849: JSON Histograms: point selectivity estimates are off for non-existent values +# +# +create table t0(a int); +insert into t0 (a) values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1(a int); +insert into t1 select 100*A.a from t0 A, t0 B, t0 C; +select a, count(*) from t1 group by a order by a; +a count(*) +0 100 +100 100 +200 100 +300 100 +400 100 +500 100 +600 100 +700 100 +800 100 +900 100 +set histogram_type=json_hb, histogram_size=default; +analyze table t1 persistent for all; +Table Op Msg_type Msg_text +test.t1 analyze status Engine-independent statistics collected +test.t1 analyze status OK +select * from mysql.column_stats where table_name='t1'; +db_name table_name column_name min_value max_value nulls_ratio avg_length avg_frequency hist_size hist_type histogram +test t1 a 0 900 0.0000 4.0000 100.0000 10 JSON_HB { + "histogram_hb_v2": [ + { + "start": "0", + "size": 0.1, + "ndv": 1 + }, + { + "start": "100", + "size": 0.1, + "ndv": 1 + }, + { + "start": "200", + "size": 0.1, + "ndv": 1 + }, + { + "start": "300", + "size": 0.1, + "ndv": 1 + }, + { + "start": "400", + "size": 0.1, + "ndv": 1 + }, + { + "start": "500", + "size": 0.1, + "ndv": 1 + }, + { + "start": "600", + "size": 0.1, + "ndv": 1 + }, + { + "start": "700", + "size": 0.1, + "ndv": 1 + }, + { + "start": "800", + "size": 0.1, + "ndv": 1 + }, + { + "start": "900", + "end": "900", + "size": 0.1, + "ndv": 1 + } + ] +} +analyze select * from t1 where a=0; +id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 1000.00 10.00 10.00 Using where +analyze select * from t1 where a=50; +id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 1000.00 0.10 0.00 Using where +analyze select * from t1 where a=70; +id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 1000.00 0.10 0.00 Using where +analyze select * from t1 where a=100; +id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 1000.00 10.00 10.00 Using where +analyze select * from t1 where a=150; +id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 1000.00 0.10 0.00 Using where +analyze select * from t1 where a=200; +id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 1000.00 10.00 10.00 Using where +drop table t0,t1; diff --git a/mysql-test/main/statistics_json.test b/mysql-test/main/statistics_json.test index 352150e8497b1..e968c910f4b9f 100644 --- a/mysql-test/main/statistics_json.test +++ b/mysql-test/main/statistics_json.test @@ -295,3 +295,25 @@ select c from t1 where c > '1'; drop table t1; +--echo # +--echo # MDEV-26849: JSON Histograms: point selectivity estimates are off for non-existent values +--echo # +--echo # + +create table t0(a int); +insert into t0 (a) values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1(a int); +insert into t1 select 100*A.a from t0 A, t0 B, t0 C; +select a, count(*) from t1 group by a order by a; +set histogram_type=json_hb, histogram_size=default; +analyze table t1 persistent for all; +select * from mysql.column_stats where table_name='t1'; +analyze select * from t1 where a=0; +analyze select * from t1 where a=50; +analyze select * from t1 where a=70; +analyze select * from t1 where a=100; +analyze select * from t1 where a=150; +analyze select * from t1 where a=200; + +drop table t0,t1; + diff --git a/sql/opt_histogram_json.cc b/sql/opt_histogram_json.cc index 1793aa7df0d56..89af9fe3c3f3e 100644 --- a/sql/opt_histogram_json.cc +++ b/sql/opt_histogram_json.cc @@ -616,7 +616,7 @@ double position_in_interval(Field *field, const uchar *key, uint key_len, double Histogram_json_hb::point_selectivity(Field *field, key_range *endpoint, - double avg_sel) + double avg_sel, double total_rows) { const uchar *key = endpoint->key; if (field->real_maybe_null()) @@ -631,9 +631,11 @@ double Histogram_json_hb::point_selectivity(Field *field, key_range *endpoint, if (buckets[idx].ndv == 1 && !equal) { - // The bucket has a single value and it doesn't match! Use the global - // average. - sel= avg_sel; + /* + The bucket has a single value and it doesn't match! Return a very + small value. + */ + sel= 1.0 / total_rows; } else { diff --git a/sql/opt_histogram_json.h b/sql/opt_histogram_json.h index df7547b31eeef..6771942e94e15 100644 --- a/sql/opt_histogram_json.h +++ b/sql/opt_histogram_json.h @@ -109,7 +109,8 @@ class Histogram_json_hb : public Histogram_base ulonglong size) override; double point_selectivity(Field *field, key_range *endpoint, - double avg_selection) override; + double avg_selection, + double total_rows) override; double range_selectivity(Field *field, key_range *min_endp, key_range *max_endp) override; diff --git a/sql/sql_statistics.cc b/sql/sql_statistics.cc index c62230a9af1bf..c87330316f45f 100644 --- a/sql/sql_statistics.cc +++ b/sql/sql_statistics.cc @@ -3882,7 +3882,8 @@ double get_column_range_cardinality(Field *field, { res= col_non_nulls * hist->point_selectivity(field, min_endp, - avg_frequency / col_non_nulls); + avg_frequency / col_non_nulls, + tab_records); } } else if (avg_frequency == 0.0) @@ -3973,7 +3974,8 @@ double get_column_range_cardinality(Field *field, */ double Histogram_binary::point_selectivity(Field *field, key_range *endpoint, - double avg_sel) + double avg_sel, + double total_records) { double sel; Column_statistics *col_stats= field->read_stats; diff --git a/sql/sql_statistics.h b/sql/sql_statistics.h index 20a9bb9dd8383..c5b1aa5439938 100644 --- a/sql/sql_statistics.h +++ b/sql/sql_statistics.h @@ -186,7 +186,8 @@ class Histogram_base virtual double point_selectivity(Field *field, key_range *endpoint, - double avg_selection)=0; + double avg_selectivity, + double total_rows)=0; virtual double range_selectivity(Field *field, key_range *min_endp, key_range *max_endp)=0; @@ -355,7 +356,8 @@ class Histogram_binary : public Histogram_base Estimate selectivity of "col=const" using a histogram */ double point_selectivity(Field *field, key_range *endpoint, - double avg_sel) override; + double avg_sel, + double total_rows) override; };