Skip to content

Commit 15ae97b

Browse files
committed
MDEV-32578 row_merge_fts_doc_tokenize() handles parser plugin inconsistently
When mysql/mysql-server@0c954c2 added a plugin interface for FULLTEXT INDEX tokenization to MySQL 5.7, fts_tokenize_ctx::processed_len got a second meaning, which is only partly implemented in row_merge_fts_doc_tokenize(). This inconsistency could cause a crash when using FULLTEXT...WITH PARSER. A test case that would crash MySQL 8.0 when using an n-gram parser and single-character words would fail to crash in MySQL 5.7, because the buf_full condition in row_merge_fts_doc_tokenize() was not met. This change is inspired by mysql/mysql-server@38e9a07 that appeared in MySQL 5.7.44.
1 parent 728bca4 commit 15ae97b

File tree

2 files changed

+13
-4
lines changed

2 files changed

+13
-4
lines changed

storage/innobase/include/row0ftsort.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,10 @@ typedef UT_LIST_BASE_NODE_T(row_fts_token_t) fts_token_list_t;
108108

109109
/** Structure stores information from string tokenization operation */
110110
struct fts_tokenize_ctx {
111-
ulint processed_len; /*!< processed string length */
111+
/** the processed string length in bytes
112+
(when using the built-in tokenizer),
113+
or the number of row_merge_fts_doc_tokenize_by_parser() calls */
114+
ulint processed_len;
112115
ulint init_pos; /*!< doc start position */
113116
ulint buf_used; /*!< the sort buffer (ID) when
114117
tokenization stops, which
@@ -119,6 +122,7 @@ struct fts_tokenize_ctx {
119122
ib_rbt_t* cached_stopword;/*!< in: stopword list */
120123
dfield_t sort_field[FTS_NUM_FIELDS_SORT];
121124
/*!< in: sort field */
125+
/** parsed tokens (when using an external parser) */
122126
fts_token_list_t fts_token_list;
123127

124128
fts_tokenize_ctx() :

storage/innobase/row/row0ftsort.cc

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -506,7 +506,10 @@ row_merge_fts_doc_tokenize(
506506

507507
/* Tokenize the data and add each word string, its corresponding
508508
doc id and position to sort buffer */
509-
while (t_ctx->processed_len < doc->text.f_len) {
509+
while (parser
510+
? (!t_ctx->processed_len
511+
|| UT_LIST_GET_LEN(t_ctx->fts_token_list))
512+
: t_ctx->processed_len < doc->text.f_len) {
510513
ulint idx = 0;
511514
ulint cur_len;
512515
doc_id_t write_doc_id;
@@ -847,7 +850,8 @@ DECLARE_THREAD(fts_parallel_tokenization)(
847850
/* Not yet finish processing the "doc" on hand,
848851
continue processing it */
849852
ut_ad(doc.text.f_str);
850-
ut_ad(t_ctx.processed_len < doc.text.f_len);
853+
ut_ad(buf[0]->index->parser
854+
|| t_ctx.processed_len < doc.text.f_len);
851855
}
852856

853857
processed = row_merge_fts_doc_tokenize(
@@ -857,7 +861,8 @@ DECLARE_THREAD(fts_parallel_tokenization)(
857861

858862
/* Current sort buffer full, need to recycle */
859863
if (!processed) {
860-
ut_ad(t_ctx.processed_len < doc.text.f_len);
864+
ut_ad(buf[0]->index->parser
865+
|| t_ctx.processed_len < doc.text.f_len);
861866
ut_ad(t_ctx.rows_added[t_ctx.buf_used]);
862867
break;
863868
}

0 commit comments

Comments (0)