Skip to content

Commit

Permalink
MDEV-26764: JSON_HB Histograms: handle BINARY and unassigned characters
Browse files Browse the repository at this point in the history
Encode such characters in hex.
  • Loading branch information
spetrunia committed Jan 19, 2022
1 parent 748b293 commit d8d57d2
Show file tree
Hide file tree
Showing 4 changed files with 130 additions and 22 deletions.
32 changes: 29 additions & 3 deletions mysql-test/main/statistics_json.result
Original file line number Diff line number Diff line change
Expand Up @@ -7896,16 +7896,41 @@ a
drop table t1;
#
# Another testcase: use a character that cannot be represented in utf8:
# Also, now it's testcase for:
# MDEV-26764: JSON_HB Histograms: handle BINARY and unassigned characters
#
create table t1 ( a varchar(100) character set cp1251);
insert into t1 values ( _cp1251 x'88'),( _cp1251 x'98');
insert into t1 values ( _cp1251 x'88'),( _cp1251 x'88'), ( _cp1251 x'88');
insert into t1 values ( _cp1251 x'98'),( _cp1251 x'98');
analyze table t1 persistent for all;
Table Op Msg_type Msg_text
test.t1 analyze status Operation failed
test.t1 analyze status Engine-independent statistics collected
test.t1 analyze status OK
select hist_type, histogram
from mysql.column_stats
where db_name=database() and table_name='t1';
hist_type histogram
JSON_HB {
"target_histogram_size": 10,
"collected_at": "REPLACED",
"collected_by": "REPLACED",
"histogram_hb": [
{
"start": "€",
"size": 0.6,
"ndv": 1
},
{
"start_hex": "98",
"end_hex": "98",
"size": 0.4,
"ndv": 1
}
]
}
analyze select * from t1 where a=_cp1251 x'88';
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 5 5.00 60.00 60.00 Using where
drop table t1;
#
# ASAN use-after-poison my_strnxfrm_simple_internal / Histogram_json_hb::range_selectivity ...
Expand Down Expand Up @@ -8102,7 +8127,8 @@ set histogram_type= JSON_HB, histogram_size= 1;
insert into t1 values ('foo'),(unhex('9C'));
analyze table t1 persistent for all;
Table Op Msg_type Msg_text
test.t1 analyze status Operation failed
test.t1 analyze status Engine-independent statistics collected
test.t1 analyze status OK
select * from t1;
a
foo
Expand Down
7 changes: 6 additions & 1 deletion mysql-test/main/statistics_json.test
Original file line number Diff line number Diff line change
Expand Up @@ -227,16 +227,21 @@ drop table t1;

--echo #
--echo # Another testcase: use a character that cannot be represented in utf8:
--echo # Also, now it's testcase for:
--echo # MDEV-26764: JSON_HB Histograms: handle BINARY and unassigned characters
--echo #
create table t1 ( a varchar(100) character set cp1251);
insert into t1 values ( _cp1251 x'88'),( _cp1251 x'98');
insert into t1 values ( _cp1251 x'88'),( _cp1251 x'88'), ( _cp1251 x'88');
insert into t1 values ( _cp1251 x'98'),( _cp1251 x'98');
analyze table t1 persistent for all;

--source include/histogram_replaces.inc
select hist_type, histogram
from mysql.column_stats
where db_name=database() and table_name='t1';

analyze select * from t1 where a=_cp1251 x'88';

drop table t1;

--echo #
Expand Down
100 changes: 83 additions & 17 deletions sql/opt_histogram_json.cc
Original file line number Diff line number Diff line change
Expand Up @@ -70,11 +70,11 @@ static bool json_unescape_to_string(const char *val, int val_len, String* out)
succeeds.
*/

static bool json_escape_to_string(const String *str, String* out)
static int json_escape_to_string(const String *str, String* out)
{
// Make sure 'out' has some memory allocated.
if (!out->alloced_length() && out->alloc(128))
return true;
return JSON_ERROR_OUT_OF_SPACE;

while (1)
{
Expand All @@ -90,15 +90,15 @@ static bool json_escape_to_string(const String *str, String* out)
if (res >= 0)
{
out->length(res);
return false; // Ok
return 0; // Ok
}

if (res != JSON_ERROR_OUT_OF_SPACE)
return true; // Some conversion error
return res; // Some conversion error

// Out of space error. Try with a bigger buffer
if (out->alloc(out->alloced_length()*2))
return true;
return JSON_ERROR_OUT_OF_SPACE;
}
}

Expand Down Expand Up @@ -208,8 +208,7 @@ class Histogram_json_builder : public Histogram_builder
*/
bool finalize_bucket_with_end_value(void *elem)
{
writer.add_member("end");
if (append_column_value(elem))
if (append_column_value(elem, false))
return true;
finalize_bucket();
return false;
Expand All @@ -224,19 +223,18 @@ class Histogram_json_builder : public Histogram_builder
{
DBUG_ASSERT(bucket.size == 0);
writer.start_object();
writer.add_member("start");
if (append_column_value(elem))
if (append_column_value(elem, true))
return true;

bucket.ndv= 1;
bucket.size= cnt;
return false;
}

/*
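Append the passed value into the JSON writer as a "start"/"end" member
(chosen by is_start). If the value cannot be escaped for JSON because it
has an illegal symbol, write it hex-encoded as "start_hex"/"end_hex".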
*/
bool append_column_value(void *elem)
bool append_column_value(void *elem, bool is_start)
{
StringBuffer<MAX_FIELD_WIDTH> val;

Expand All @@ -246,12 +244,21 @@ class Histogram_json_builder : public Histogram_builder

// Escape the value for JSON
StringBuffer<MAX_FIELD_WIDTH> escaped_val;
if (json_escape_to_string(str, &escaped_val))
return true;

// Note: The Json_writer does NOT do escapes (perhaps this should change?)
writer.add_str(escaped_val.c_ptr_safe());
return false;
int rc= json_escape_to_string(str, &escaped_val);
if (!rc)
{
writer.add_member(is_start? "start": "end");
writer.add_str(escaped_val.c_ptr_safe());
return false;
}
if (rc == JSON_ERROR_ILLEGAL_SYMBOL)
{
escaped_val.set_hex(val.ptr(), val.length());
writer.add_member(is_start? "start_hex": "end_hex");
writer.add_str(escaped_val.c_ptr_safe());
return false;
}
return true;
}

/*
Expand Down Expand Up @@ -496,6 +503,41 @@ bool read_bucket_endpoint(json_engine_t *je, Field *field, String *out,
}


/*
  @brief
    Read a hex-encoded bucket endpoint (the value of a "start_hex" or
    "end_hex" member) and convert it into a key image of the field.

  @param je     JSON parser; json_read_value() is called on it to get the value
  @param field  Field the decoded bytes are stored into; its key image is
                what gets returned
  @param out    OUT  Receives the key image
  @param err    OUT  Set to an error message when the value is not a
                     well-formed hex string

  @return
    false  OK
    true   Error. *err is only assigned for a malformed hex string; JSON
           read errors and allocation failures return true without
           touching it.
*/
bool read_hex_bucket_endpoint(json_engine_t *je, Field *field, String *out,
                              const char **err)
{
  if (json_read_value(je))
    return true;

  // Every byte is written as two hex digits, so the value must be a plain
  // (escape-free) string of even length.
  if (je->value_type != JSON_VALUE_STRING || je->value_escaped ||
      (je->value_len & 1))
  {
    *err= "Expected a hex string";
    return true;
  }

  StringBuffer<128> buf;
  const auto value_end= je->value + je->value_len;

  for (auto pc= je->value; pc < value_end; pc+= 2)
  {
    const int high_nibble= hexchar_to_int(pc[0]);
    const int low_nibble= hexchar_to_int(pc[1]);
    if (high_nibble == -1 || low_nibble == -1)
    {
      *err= "Expected a hex string";
      return true;
    }
    // The append can need to grow the buffer; do not continue with a
    // truncated value if that fails.
    if (buf.append(static_cast<char>((high_nibble << 4) | low_nibble)))
      return true;
  }

  // The bytes are already in the column's own character set.
  field->store_text(buf.ptr(), buf.length(), field->charset());

  // alloc() returns true on failure (see json_escape_to_string); without
  // this check get_key_image() would write into an unallocated buffer.
  if (out->alloc(field->pack_length()))
    return true;

  uint bytes= field->get_key_image((uchar*)out->ptr(),
                                   field->key_length(), Field::itRAW);
  out->length(bytes);
  return false;
}


/*
@brief Parse a JSON representation for one histogram bucket
Expand Down Expand Up @@ -619,6 +661,30 @@ int Histogram_json_hb::parse_bucket(json_engine_t *je, Field *field,
}
save1.restore_to(je);

// Less common endpoints:
Json_string start_hex_str("start_hex");
if (json_key_matches(je, start_hex_str.get()))
{
if (read_hex_bucket_endpoint(je, field, &value_buf, err))
return 1;

have_start= true;
continue;
}
save1.restore_to(je);

Json_string end_hex_str("end_hex");
if (json_key_matches(je, end_hex_str.get()))
{
if (read_hex_bucket_endpoint(je, field, &value_buf, err))
return 1;
last_bucket_end_endp.assign(value_buf.ptr(), value_buf.length());
*assigned_last_end= true;
continue;
}
save1.restore_to(je);


// Some unknown member. Skip it.
if (json_skip_key(je))
return 1;
Expand Down
13 changes: 12 additions & 1 deletion sql/opt_histogram_json.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,18 @@
"histogram_hb": [
{ "start": "value", "size":nnn.nn, "ndv": nnn },
...
{ "start": "value", "size":nnn.nn, "ndv": nnn, "end": "value"}
// Optionally, start and/or end can be replaced with _hex variant
{ "start_hex": "value", "size":nnn.nn, "ndv":nnn},
...
{ "start": "value", "size":nnn.nn, "ndv": nnn, "end": "value"},
]
}
The histogram is an object with single member named Histogram_json_hb::
JSON_NAME. The value of that member is an array of buckets.
Each bucket is an object with these members:
"start" - the first value in the bucket.
"size" - fraction of table rows that is contained in the bucket.
Expand All @@ -51,6 +57,11 @@
The exception is single-point buckets where last value is the same as the
first value.
start/end can be replaced with start_hex/end_hex. In _hex variant, the
constant is encoded in hex. This encoding is used to handle so called
"unassigned characters": some non-UTF8 charsets have byte combinations that
are not mapped to any UTF8 character.
*/

class Histogram_json_hb : public Histogram_base
Expand Down

0 comments on commit d8d57d2

Please sign in to comment.