Skip to content
Permalink
Browse files
MDEV-20797 FULLTEXT search with apostrophe, and mandatory words
- InnoDB should ignore a single-character word followed by an apostrophe
while tokenising the document. For example, if the input string is O'brien,
InnoDB currently separates it into two tokens, O and brien. With
this patch, InnoDB ignores the token 'O' and considers
only 'brien'.
  • Loading branch information
Thirunarayanan committed Jul 26, 2022
1 parent a8a27f1 commit f076dc2
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 1 deletion.
@@ -732,4 +732,32 @@ ALTER TABLE t1 DROP KEY `ftidx` ;
INSERT INTO t1 (col_int, col_text) VALUES ( 1255, NULL);
DROP TABLE t1;
SET @@global.innodb_file_per_table = @save;
#
# MDEV-20797 FULLTEXT search with apostrophe,
# and mandatory words
#
CREATE TABLE t1(f1 TINYTEXT NOT NULL, FULLTEXT(f1))ENGINE=InnoDB;
INSERT INTO t1 VALUES('O''Brien'), ('O Brien'), ('�''Brien');
INSERT INTO t1 VALUES('Brien'), ('O ''Brien'), ('O'' Brien');
INSERT INTO t1 VALUES('Doh''nuts');
SELECT * FROM t1 WHERE MATCH (f1) AGAINST ("+O'Brien" IN BOOLEAN MODE);
f1
O'Brien
O Brien
�'Brien
Brien
O 'Brien
O' Brien
SELECT * FROM t1 WHERE MATCH (f1) AGAINST ("+Doh'nuts" IN BOOLEAN MODE);
f1
Doh'nuts
SELECT * FROM t1 WHERE MATCH (f1) AGAINST ("+�''Brien" IN BOOLEAN MODE);
f1
O'Brien
O Brien
�'Brien
Brien
O 'Brien
O' Brien
DROP TABLE t1;
# End of 10.3 tests
@@ -757,4 +757,17 @@ INSERT INTO t1 (col_int, col_text) VALUES ( 1255, NULL);
DROP TABLE t1;
SET @@global.innodb_file_per_table = @save;

--echo #
--echo # MDEV-20797 FULLTEXT search with apostrophe,
--echo # and mandatory words
--echo #
# Regression test for MDEV-20797: FULLTEXT boolean-mode search where the
# search term contains an apostrophe after a single leading character.
# The fixture stores variants of the name with an apostrophe, a space, or
# both between the leading character and "Brien", plus a bare "Brien" row.
CREATE TABLE t1(f1 TINYTEXT NOT NULL, FULLTEXT(f1))ENGINE=InnoDB;
INSERT INTO t1 VALUES('O''Brien'), ('O Brien'), ('�''Brien');
INSERT INTO t1 VALUES('Brien'), ('O ''Brien'), ('O'' Brien');
# Control row: the part before the apostrophe is longer than one character.
INSERT INTO t1 VALUES('Doh''nuts');
# The single character before the apostrophe is not kept as a token, so the
# mandatory term reduces to "Brien"; the recorded result lists all six rows
# that contain "Brien".
SELECT * FROM t1 WHERE MATCH (f1) AGAINST ("+O'Brien" IN BOOLEAN MODE);
# Multi-character prefix before the apostrophe; the recorded result lists
# only the 'Doh''nuts' row.
SELECT * FROM t1 WHERE MATCH (f1) AGAINST ("+Doh'nuts" IN BOOLEAN MODE);
# Same search with a non-ASCII leading character and two consecutive
# apostrophes in the double-quoted search string; the recorded result again
# lists all six "Brien" rows.
# NOTE(review): the leading character appears mis-encoded in this copy of
# the test -- confirm the intended character and file encoding.
SELECT * FROM t1 WHERE MATCH (f1) AGAINST ("+�''Brien" IN BOOLEAN MODE);
DROP TABLE t1;

--echo # End of 10.3 tests
@@ -6912,7 +6912,8 @@ innobase_mysql_fts_get_token(

ulint mwc = 0;
ulint length = 0;

bool reset_token_str = false;
reset:
token->f_str = const_cast<byte*>(doc);

while (doc < end) {
@@ -6923,6 +6924,9 @@ innobase_mysql_fts_get_token(
cs, &ctype, (uchar*) doc, (uchar*) end);
if (true_word_char(ctype, *doc)) {
mwc = 0;
} else if (*doc == '\'' && length == 1) {
/* Could be apostrophe */
reset_token_str = true;
} else if (!misc_word_char(*doc) || mwc) {
break;
} else {
@@ -6932,6 +6936,14 @@ innobase_mysql_fts_get_token(
++length;

doc += mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1);
if (reset_token_str) {
/* Reset the token if the single character
followed by apostrophe */
mwc = 0;
length = 0;
reset_token_str = false;
goto reset;
}
}

token->f_len = (uint) (doc - token->f_str) - mwc;

0 comments on commit f076dc2

Please sign in to comment.