Skip to content

Commit f0773b7

Browse files
committed
Introduce analyze_sample_percentage variable
The variable controls the amount of sampling analyze table performs. If ANALYZE table with histogram collection is too slow, one can reduce the time taken by setting analyze_sample_percentage to a lower value of the total number of rows. Setting it to 0 will use a formula to compute how many rows to sample: The number of rows collected is capped to a minimum of 50000 and increases logarithmically with a coffecient of 4096. The coffecient is chosen so that we expect an error of less than 3% in our estimations according to the paper: "Random Sampling for Histogram Construction: How much is enough?” – Surajit Chaudhuri, Rajeev Motwani, Vivek Narasayya, ACM SIGMOD, 1998. The drawback of sampling is that avg_frequency number is computed imprecisely and will yeild a smaller number than the real one.
1 parent 47f15ea commit f0773b7

File tree

8 files changed

+265
-10
lines changed

8 files changed

+265
-10
lines changed

mysql-test/main/mysqld--help.result

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@ The following specify which files/extra groups are read (specified before remain
1515
--alter-algorithm[=name]
1616
Specify the alter table algorithm. One of: DEFAULT, COPY,
1717
INPLACE, NOCOPY, INSTANT
18+
--analyze-sample-percentage=#
19+
Percentage of rows from the table ANALYZE TABLE will
20+
sample to collect table statistics. Set to 0 to let
21+
MariaDB decide what percentage of rows to sample.
1822
-a, --ansi Use ANSI SQL syntax instead of MySQL syntax. This mode
1923
will also set transaction isolation level 'serializable'.
2024
--auto-increment-increment[=#]
@@ -1385,6 +1389,7 @@ The following specify which files/extra groups are read (specified before remain
13851389
Variables (--variable-name=value)
13861390
allow-suspicious-udfs FALSE
13871391
alter-algorithm DEFAULT
1392+
analyze-sample-percentage 100
13881393
auto-increment-increment 1
13891394
auto-increment-offset 1
13901395
autocommit TRUE

mysql-test/main/statistics.result

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1760,3 +1760,107 @@ DROP TABLE t1;
17601760
# End of 10.2 tests
17611761
#
17621762
set histogram_size=@save_hist_size, histogram_type=@save_hist_type;
1763+
#
1764+
# Start of 10.4 tests
1765+
#
1766+
#
1767+
# Test analyze_sample_percentage system variable.
1768+
#
1769+
set @save_use_stat_tables=@@use_stat_tables;
1770+
set @save_analyze_sample_percentage=@@analyze_sample_percentage;
1771+
set session rand_seed1=42;
1772+
set session rand_seed2=62;
1773+
set use_stat_tables=PREFERABLY;
1774+
set histogram_size=10;
1775+
CREATE TABLE t1 (id int);
1776+
INSERT INTO t1 (id) VALUES (1), (1), (1), (1), (1), (1), (1);
1777+
INSERT INTO t1 (id) SELECT id FROM t1;
1778+
INSERT INTO t1 SELECT id+1 FROM t1;
1779+
INSERT INTO t1 SELECT id+2 FROM t1;
1780+
INSERT INTO t1 SELECT id+4 FROM t1;
1781+
INSERT INTO t1 SELECT id+8 FROM t1;
1782+
INSERT INTO t1 SELECT id+16 FROM t1;
1783+
INSERT INTO t1 SELECT id+32 FROM t1;
1784+
INSERT INTO t1 SELECT id+64 FROM t1;
1785+
INSERT INTO t1 SELECT id+128 FROM t1;
1786+
INSERT INTO t1 SELECT id+256 FROM t1;
1787+
INSERT INTO t1 SELECT id+512 FROM t1;
1788+
INSERT INTO t1 SELECT id+1024 FROM t1;
1789+
INSERT INTO t1 SELECT id+2048 FROM t1;
1790+
INSERT INTO t1 SELECT id+4096 FROM t1;
1791+
INSERT INTO t1 SELECT id+9192 FROM t1;
1792+
#
1793+
# This query will should show a full table scan analysis.
1794+
#
1795+
ANALYZE TABLE t1;
1796+
Table Op Msg_type Msg_text
1797+
test.t1 analyze status Engine-independent statistics collected
1798+
test.t1 analyze status OK
1799+
select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency,
1800+
DECODE_HISTOGRAM(hist_type, histogram)
1801+
from mysql.column_stats;
1802+
table_name column_name min_value max_value nulls_ratio avg_length avg_frequency DECODE_HISTOGRAM(hist_type, histogram)
1803+
t1 id 1 17384 0.0000 4.0000 14.0000 0.082,0.086,0.086,0.086,0.086,0.141,0.086,0.086,0.086,0.086,0.086
1804+
set analyze_sample_percentage=0.1;
1805+
#
1806+
# This query will show an innacurate avg_frequency value.
1807+
#
1808+
ANALYZE TABLE t1;
1809+
Table Op Msg_type Msg_text
1810+
test.t1 analyze status Engine-independent statistics collected
1811+
test.t1 analyze status Table is already up to date
1812+
select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency,
1813+
DECODE_HISTOGRAM(hist_type, histogram)
1814+
from mysql.column_stats;
1815+
table_name column_name min_value max_value nulls_ratio avg_length avg_frequency DECODE_HISTOGRAM(hist_type, histogram)
1816+
t1 id 111 17026 0.0000 4.0000 1.0047 0.039,0.098,0.055,0.118,0.078,0.157,0.082,0.118,0.094,0.063,0.098
1817+
#
1818+
# This query will show a better avg_frequency value.
1819+
#
1820+
set analyze_sample_percentage=25;
1821+
ANALYZE TABLE t1;
1822+
Table Op Msg_type Msg_text
1823+
test.t1 analyze status Engine-independent statistics collected
1824+
test.t1 analyze status Table is already up to date
1825+
select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency,
1826+
DECODE_HISTOGRAM(hist_type, histogram)
1827+
from mysql.column_stats;
1828+
table_name column_name min_value max_value nulls_ratio avg_length avg_frequency DECODE_HISTOGRAM(hist_type, histogram)
1829+
t1 id 1 17384 0.0000 4.0000 3.5736 0.082,0.086,0.086,0.082,0.086,0.145,0.086,0.086,0.082,0.086,0.090
1830+
set analyze_sample_percentage=0;
1831+
#
1832+
# Test self adjusting sampling level.
1833+
#
1834+
ANALYZE TABLE t1;
1835+
Table Op Msg_type Msg_text
1836+
test.t1 analyze status Engine-independent statistics collected
1837+
test.t1 analyze status Table is already up to date
1838+
select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency,
1839+
DECODE_HISTOGRAM(hist_type, histogram)
1840+
from mysql.column_stats;
1841+
table_name column_name min_value max_value nulls_ratio avg_length avg_frequency DECODE_HISTOGRAM(hist_type, histogram)
1842+
t1 id 1 17384 0.0000 4.0000 7.4523 0.082,0.090,0.086,0.082,0.086,0.145,0.086,0.082,0.086,0.086,0.086
1843+
#
1844+
# Test record estimation is working properly.
1845+
#
1846+
select count(*) from t1;
1847+
count(*)
1848+
229376
1849+
explain select * from t1;
1850+
id select_type table type possible_keys key key_len ref rows Extra
1851+
1 SIMPLE t1 ALL NULL NULL NULL NULL 229060
1852+
set analyze_sample_percentage=100;
1853+
ANALYZE TABLE t1;
1854+
Table Op Msg_type Msg_text
1855+
test.t1 analyze status Engine-independent statistics collected
1856+
test.t1 analyze status Table is already up to date
1857+
select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency,
1858+
DECODE_HISTOGRAM(hist_type, histogram)
1859+
from mysql.column_stats;
1860+
table_name column_name min_value max_value nulls_ratio avg_length avg_frequency DECODE_HISTOGRAM(hist_type, histogram)
1861+
t1 id 1 17384 0.0000 4.0000 14.0000 0.082,0.086,0.086,0.086,0.086,0.141,0.086,0.086,0.086,0.086,0.086
1862+
explain select * from t1;
1863+
id select_type table type possible_keys key key_len ref rows Extra
1864+
1 SIMPLE t1 ALL NULL NULL NULL NULL 229376
1865+
set use_stat_tables=@save_use_stat_tables;
1866+
drop table t1;

mysql-test/main/statistics.test

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -901,3 +901,91 @@ DROP TABLE t1;
901901
--echo #
902902
set histogram_size=@save_hist_size, histogram_type=@save_hist_type;
903903

904+
905+
906+
--echo #
907+
--echo # Start of 10.4 tests
908+
--echo #
909+
910+
--echo #
911+
--echo # Test analyze_sample_percentage system variable.
912+
--echo #
913+
set @save_use_stat_tables=@@use_stat_tables;
914+
set @save_analyze_sample_percentage=@@analyze_sample_percentage;
915+
916+
set session rand_seed1=42;
917+
set session rand_seed2=62;
918+
919+
set use_stat_tables=PREFERABLY;
920+
set histogram_size=10;
921+
922+
CREATE TABLE t1 (id int);
923+
INSERT INTO t1 (id) VALUES (1), (1), (1), (1), (1), (1), (1);
924+
INSERT INTO t1 (id) SELECT id FROM t1;
925+
INSERT INTO t1 SELECT id+1 FROM t1;
926+
INSERT INTO t1 SELECT id+2 FROM t1;
927+
INSERT INTO t1 SELECT id+4 FROM t1;
928+
INSERT INTO t1 SELECT id+8 FROM t1;
929+
INSERT INTO t1 SELECT id+16 FROM t1;
930+
INSERT INTO t1 SELECT id+32 FROM t1;
931+
INSERT INTO t1 SELECT id+64 FROM t1;
932+
INSERT INTO t1 SELECT id+128 FROM t1;
933+
INSERT INTO t1 SELECT id+256 FROM t1;
934+
INSERT INTO t1 SELECT id+512 FROM t1;
935+
INSERT INTO t1 SELECT id+1024 FROM t1;
936+
INSERT INTO t1 SELECT id+2048 FROM t1;
937+
INSERT INTO t1 SELECT id+4096 FROM t1;
938+
INSERT INTO t1 SELECT id+9192 FROM t1;
939+
940+
--echo #
941+
--echo # This query will should show a full table scan analysis.
942+
--echo #
943+
ANALYZE TABLE t1;
944+
select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency,
945+
DECODE_HISTOGRAM(hist_type, histogram)
946+
from mysql.column_stats;
947+
948+
set analyze_sample_percentage=0.1;
949+
950+
--echo #
951+
--echo # This query will show an innacurate avg_frequency value.
952+
--echo #
953+
ANALYZE TABLE t1;
954+
select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency,
955+
DECODE_HISTOGRAM(hist_type, histogram)
956+
from mysql.column_stats;
957+
958+
--echo #
959+
--echo # This query will show a better avg_frequency value.
960+
--echo #
961+
set analyze_sample_percentage=25;
962+
ANALYZE TABLE t1;
963+
select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency,
964+
DECODE_HISTOGRAM(hist_type, histogram)
965+
from mysql.column_stats;
966+
967+
968+
set analyze_sample_percentage=0;
969+
--echo #
970+
--echo # Test self adjusting sampling level.
971+
--echo #
972+
ANALYZE TABLE t1;
973+
select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency,
974+
DECODE_HISTOGRAM(hist_type, histogram)
975+
from mysql.column_stats;
976+
-- echo #
977+
-- echo # Test record estimation is working properly.
978+
-- echo #
979+
select count(*) from t1;
980+
explain select * from t1;
981+
982+
set analyze_sample_percentage=100;
983+
ANALYZE TABLE t1;
984+
select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency,
985+
DECODE_HISTOGRAM(hist_type, histogram)
986+
from mysql.column_stats;
987+
explain select * from t1;
988+
989+
set use_stat_tables=@save_use_stat_tables;
990+
991+
drop table t1;

mysql-test/suite/sys_vars/r/sysvars_server_embedded.result

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,20 @@ NUMERIC_BLOCK_SIZE NULL
4040
ENUM_VALUE_LIST DEFAULT,COPY,INPLACE,NOCOPY,INSTANT
4141
READ_ONLY NO
4242
COMMAND_LINE_ARGUMENT OPTIONAL
43+
VARIABLE_NAME ANALYZE_SAMPLE_PERCENTAGE
44+
SESSION_VALUE 100.000000
45+
GLOBAL_VALUE 100.000000
46+
GLOBAL_VALUE_ORIGIN COMPILE-TIME
47+
DEFAULT_VALUE 100.000000
48+
VARIABLE_SCOPE SESSION
49+
VARIABLE_TYPE DOUBLE
50+
VARIABLE_COMMENT Percentage of rows from the table ANALYZE TABLE will sample to collect table statistics. Set to 0 to let MariaDB decide what percentage of rows to sample.
51+
NUMERIC_MIN_VALUE 0
52+
NUMERIC_MAX_VALUE 100
53+
NUMERIC_BLOCK_SIZE NULL
54+
ENUM_VALUE_LIST NULL
55+
READ_ONLY NO
56+
COMMAND_LINE_ARGUMENT REQUIRED
4357
VARIABLE_NAME AUTOCOMMIT
4458
SESSION_VALUE ON
4559
GLOBAL_VALUE ON

mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,20 @@ NUMERIC_BLOCK_SIZE NULL
4040
ENUM_VALUE_LIST DEFAULT,COPY,INPLACE,NOCOPY,INSTANT
4141
READ_ONLY NO
4242
COMMAND_LINE_ARGUMENT OPTIONAL
43+
VARIABLE_NAME ANALYZE_SAMPLE_PERCENTAGE
44+
SESSION_VALUE 100.000000
45+
GLOBAL_VALUE 100.000000
46+
GLOBAL_VALUE_ORIGIN COMPILE-TIME
47+
DEFAULT_VALUE 100.000000
48+
VARIABLE_SCOPE SESSION
49+
VARIABLE_TYPE DOUBLE
50+
VARIABLE_COMMENT Percentage of rows from the table ANALYZE TABLE will sample to collect table statistics. Set to 0 to let MariaDB decide what percentage of rows to sample.
51+
NUMERIC_MIN_VALUE 0
52+
NUMERIC_MAX_VALUE 100
53+
NUMERIC_BLOCK_SIZE NULL
54+
ENUM_VALUE_LIST NULL
55+
READ_ONLY NO
56+
COMMAND_LINE_ARGUMENT REQUIRED
4357
VARIABLE_NAME AUTOCOMMIT
4458
SESSION_VALUE ON
4559
GLOBAL_VALUE ON

sql/sql_class.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -622,6 +622,7 @@ typedef struct system_variables
622622
ulong optimizer_selectivity_sampling_limit;
623623
ulong optimizer_use_condition_selectivity;
624624
ulong use_stat_tables;
625+
double sample_percentage;
625626
ulong histogram_size;
626627
ulong histogram_type;
627628
ulong preload_buff_size;

sql/sql_statistics.cc

Lines changed: 30 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2729,12 +2729,28 @@ int collect_statistics_for_table(THD *thd, TABLE *table)
27292729
Field *table_field;
27302730
ha_rows rows= 0;
27312731
handler *file=table->file;
2732+
double sample_fraction= thd->variables.sample_percentage / 100;
2733+
const ha_rows MIN_THRESHOLD_FOR_SAMPLING= 50000;
27322734

27332735
DBUG_ENTER("collect_statistics_for_table");
27342736

27352737
table->collected_stats->cardinality_is_null= TRUE;
27362738
table->collected_stats->cardinality= 0;
27372739

2740+
if (thd->variables.sample_percentage == 0)
2741+
{
2742+
if (file->records() < MIN_THRESHOLD_FOR_SAMPLING)
2743+
{
2744+
sample_fraction= 1;
2745+
}
2746+
else
2747+
{
2748+
sample_fraction= std::fmin(
2749+
(MIN_THRESHOLD_FOR_SAMPLING + 4096 *
2750+
log(200 * file->records())) / file->records(), 1);
2751+
}
2752+
}
2753+
27382754
for (field_ptr= table->field; *field_ptr; field_ptr++)
27392755
{
27402756
table_field= *field_ptr;
@@ -2747,7 +2763,7 @@ int collect_statistics_for_table(THD *thd, TABLE *table)
27472763

27482764
/* Perform a full table scan to collect statistics on 'table's columns */
27492765
if (!(rc= file->ha_rnd_init(TRUE)))
2750-
{
2766+
{
27512767
DEBUG_SYNC(table->in_use, "statistics_collection_start");
27522768

27532769
while ((rc= file->ha_rnd_next(table->record[0])) != HA_ERR_END_OF_FILE)
@@ -2758,17 +2774,20 @@ int collect_statistics_for_table(THD *thd, TABLE *table)
27582774
if (rc)
27592775
break;
27602776

2761-
for (field_ptr= table->field; *field_ptr; field_ptr++)
2777+
if (thd_rnd(thd) <= sample_fraction)
27622778
{
2763-
table_field= *field_ptr;
2764-
if (!bitmap_is_set(table->read_set, table_field->field_index))
2765-
continue;
2766-
if ((rc= table_field->collected_stats->add()))
2779+
for (field_ptr= table->field; *field_ptr; field_ptr++)
2780+
{
2781+
table_field= *field_ptr;
2782+
if (!bitmap_is_set(table->read_set, table_field->field_index))
2783+
continue;
2784+
if ((rc= table_field->collected_stats->add()))
2785+
break;
2786+
}
2787+
if (rc)
27672788
break;
2789+
rows++;
27682790
}
2769-
if (rc)
2770-
break;
2771-
rows++;
27722791
}
27732792
file->ha_rnd_end();
27742793
}
@@ -2782,7 +2801,8 @@ int collect_statistics_for_table(THD *thd, TABLE *table)
27822801
if (!rc)
27832802
{
27842803
table->collected_stats->cardinality_is_null= FALSE;
2785-
table->collected_stats->cardinality= rows;
2804+
table->collected_stats->cardinality=
2805+
static_cast<ha_rows>(rows / sample_fraction);
27862806
}
27872807

27882808
bitmap_clear_all(table->write_set);

sql/sys_vars.cc

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -350,6 +350,15 @@ static Sys_var_long Sys_pfs_connect_attrs_size(
350350

351351
#endif /* WITH_PERFSCHEMA_STORAGE_ENGINE */
352352

353+
static Sys_var_double Sys_analyze_sample_percentage(
354+
"analyze_sample_percentage",
355+
"Percentage of rows from the table ANALYZE TABLE will sample "
356+
"to collect table statistics. Set to 0 to let MariaDB decide "
357+
"what percentage of rows to sample.",
358+
SESSION_VAR(sample_percentage),
359+
CMD_LINE(REQUIRED_ARG), VALID_RANGE(0, 100),
360+
DEFAULT(100));
361+
353362
static Sys_var_ulong Sys_auto_increment_increment(
354363
"auto_increment_increment",
355364
"Auto-increment columns are incremented by this",

0 commit comments

Comments
 (0)