MDEV-32578 row_merge_fts_doc_tokenize() handles parser plugin inconsistently

When mysql/mysql-server@0c954c2
added a plugin interface for FULLTEXT INDEX tokenization to MySQL 5.7,
fts_tokenize_ctx::processed_len got a second meaning, which is only
partly implemented in row_merge_fts_doc_tokenize().

This inconsistency could cause a crash when using FULLTEXT...WITH PARSER.
A test case that would crash MySQL 8.0 when using an n-gram parser and
single-character words would fail to crash in MySQL 5.7, because the
buf_full condition in row_merge_fts_doc_tokenize() was not met.

This change is inspired by
mysql/mysql-server@38e9a07
that appeared in MySQL 5.7.44.
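
The two meanings of processed_len can be contrasted in a minimal C++ sketch (assumed names and simplified types; not the actual InnoDB declarations):

#include <cstddef>
#include <list>
#include <string>

// One field, two meanings -- the inconsistency behind MDEV-32578.
struct TokenizeCtx {
  std::size_t processed_len = 0;     // bytes consumed OR parser-call count
  std::list<std::string> token_list; // tokens cached by a plugin parser
};

// Built-in tokenizer: processed_len is a byte offset into the document,
// so there is more work while it is short of the document length.
bool more_work_builtin(const TokenizeCtx& ctx, std::size_t doc_len) {
  return ctx.processed_len < doc_len;
}

// Plugin parser: processed_len only counts parser invocations, so the
// loop must instead run until the parser has been called at least once
// and the cached token list has been drained -- the condition this fix adopts.
bool more_work_parser(const TokenizeCtx& ctx) {
  return ctx.processed_len == 0 || !ctx.token_list.empty();
}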
dr-m committed Oct 27, 2023
1 parent 728bca4 commit 15ae97b
Showing 2 changed files with 13 additions and 4 deletions.
6 changes: 5 additions & 1 deletion storage/innobase/include/row0ftsort.h
@@ -108,7 +108,10 @@ typedef UT_LIST_BASE_NODE_T(row_fts_token_t) fts_token_list_t;
 
 /** Structure stores information from string tokenization operation */
 struct fts_tokenize_ctx {
-	ulint		processed_len;	/*!< processed string length */
+	/** the processed string length in bytes
+	(when using the built-in tokenizer),
+	or the number of row_merge_fts_doc_tokenize_by_parser() calls */
+	ulint		processed_len;
 	ulint		init_pos;	/*!< doc start position */
 	ulint		buf_used;	/*!< the sort buffer (ID) when
 					tokenization stops, which
@@ -119,6 +122,7 @@ struct fts_tokenize_ctx {
 	ib_rbt_t*	cached_stopword;/*!< in: stopword list */
 	dfield_t	sort_field[FTS_NUM_FIELDS_SORT];
 					/*!< in: sort field */
+	/** parsed tokens (when using an external parser) */
 	fts_token_list_t	fts_token_list;
 
 	fts_tokenize_ctx() :
11 changes: 8 additions & 3 deletions storage/innobase/row/row0ftsort.cc
@@ -506,7 +506,10 @@ row_merge_fts_doc_tokenize(
 
 	/* Tokenize the data and add each word string, its corresponding
	doc id and position to sort buffer */
-	while (t_ctx->processed_len < doc->text.f_len) {
+	while (parser
+	       ? (!t_ctx->processed_len
+		  || UT_LIST_GET_LEN(t_ctx->fts_token_list))
+	       : t_ctx->processed_len < doc->text.f_len) {
 		ulint		idx = 0;
 		ulint		cur_len;
 		doc_id_t	write_doc_id;
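
To see why the new loop condition terminates on the parser path, here is a toy model (hypothetical helper, not InnoDB code): the first iteration stands in for the single call to row_merge_fts_doc_tokenize_by_parser() and caches the tokens; later iterations drain the cache into the sort buffer.

#include <cstddef>
#include <deque>
#include <string>

// Toy model of the fixed loop on the parser path (assumed names).
std::size_t tokenize_with_parser(const std::string& doc) {
  std::size_t processed_len = 0;      // counts parser calls, not bytes
  std::deque<std::string> token_list; // stands in for fts_token_list
  std::size_t rows_added = 0;

  while (processed_len == 0 || !token_list.empty()) {
    if (processed_len == 0) {
      // "parser plugin": tokenize the whole document in one call
      std::string word;
      for (char c : doc) {
        if (c == ' ') {
          if (!word.empty()) token_list.push_back(word);
          word.clear();
        } else {
          word += c;
        }
      }
      if (!word.empty()) token_list.push_back(word);
      ++processed_len;
    } else {
      token_list.pop_front(); // move one token to the sort buffer
      ++rows_added;
    }
  }
  return rows_added; // loop ends once the parser ran and the cache is empty
}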
@@ -847,7 +850,8 @@ DECLARE_THREAD(fts_parallel_tokenization)(
 		/* Not yet finish processing the "doc" on hand,
		continue processing it */
 		ut_ad(doc.text.f_str);
-		ut_ad(t_ctx.processed_len < doc.text.f_len);
+		ut_ad(buf[0]->index->parser
+		      || t_ctx.processed_len < doc.text.f_len);
 	}
 
 	processed = row_merge_fts_doc_tokenize(
@@ -857,7 +861,8 @@
 
 	/* Current sort buffer full, need to recycle */
 	if (!processed) {
-		ut_ad(t_ctx.processed_len < doc.text.f_len);
+		ut_ad(buf[0]->index->parser
+		      || t_ctx.processed_len < doc.text.f_len);
 		ut_ad(t_ctx.rows_added[t_ctx.buf_used]);
 		break;
 	}
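
Both relaxed assertions encode the same invariant, sketched below with assumed parameter names: if tokenization stopped mid-document, the built-in tokenizer must still have unconsumed bytes, whereas a plugin parser's processed_len is a call counter, so the byte comparison does not apply.

#include <cassert>
#include <cstddef>

// Sketch of the invariant behind both ut_ad() changes (assumed names):
// the byte-offset check only holds when no plugin parser is attached.
void assert_unfinished_doc(bool has_parser,
                           std::size_t processed_len,
                           std::size_t doc_byte_len) {
  assert(has_parser || processed_len < doc_byte_len);
}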
