Skip to content

Commit

Permalink
A follow-up patch MDEV-27266 Improve UCA collation performance for ut…
Browse files Browse the repository at this point in the history
…f8mb3 and utf8mb4

Moving these members:

   CHARSET_INFO *cs;
   const MY_UCA_WEIGHT_LEVEL *level;

from my_uca_scanner to a new separate structure my_uca_scanner_param.

Rationale:

During a comparison of two strings, these members were initialized twice
(once for each string).

After the change, these members are initialized only once, inside
a shared instance of my_uca_scanner_param, and that instance is
shared between the two scanners (its const address is passed as a new parameter
to the underlying scanner functions).

This change gives a slight performance improvement (~5%).
  • Loading branch information
abarkov committed Sep 2, 2022
1 parent e71aca8 commit f6118ac
Show file tree
Hide file tree
Showing 3 changed files with 109 additions and 72 deletions.
29 changes: 16 additions & 13 deletions strings/ctype-uca-scanner_next.inl
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,12 @@ static inline
#ifdef SCANNER_NEXT_NCHARS
weight_and_nchars_t
MY_FUNCTION_NAME(scanner_next_with_nchars)(my_uca_scanner *scanner,
const my_uca_scanner_param *param,
size_t nchars)
#else
int
MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner,
const my_uca_scanner_param *param)
#endif
{
#ifdef SCANNER_NEXT_NCHARS
Expand Down Expand Up @@ -82,7 +84,7 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
if (scanner->sbeg + 1 < scanner->send)
{
const MY_UCA_2BYTES_ITEM *ww;
ww= my_uca_level_booster_2bytes_item_addr_const(scanner->level->booster,
ww= my_uca_level_booster_2bytes_item_addr_const(param->level->booster,
scanner->sbeg[0],
scanner->sbeg[1]);
if (my_uca_2bytes_item_is_applicable(ww))
Expand Down Expand Up @@ -126,9 +128,10 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
scanner->sbeg+= 1;

#if MY_UCA_COMPILE_CONTRACTIONS
if (my_uca_needs_context_handling(scanner->level, currwc))
if (my_uca_needs_context_handling(param->level, currwc))
{
const MY_CONTRACTION *cnt= my_uca_context_weight_find(scanner, currwc,
const MY_CONTRACTION *cnt= my_uca_context_weight_find(scanner, param,
currwc,
LOCAL_MAX_CONTRACTION_LENGTH);
if (cnt)
{
Expand All @@ -141,15 +144,15 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)

scanner->page= 0;
scanner->code= (int) currwc;
cweight= scanner->level->weights[0] + scanner->code * scanner->level->lengths[0];
cweight= param->level->weights[0] + scanner->code * param->level->lengths[0];
if ((weight= my_uca_scanner_set_weight(scanner, cweight)))
SCANNER_NEXT_RETURN(weight, ignorable_nchars + 1);
continue; /* Ignorable character */
}
else
#endif
/* Get next MB character */
if (((mblen= MY_MB_WC(scanner, &currwc, scanner->sbeg,
if (((mblen= MY_MB_WC(scanner, param, &currwc, scanner->sbeg,
scanner->send)) <= 0))
{
if (scanner->sbeg >= scanner->send)
Expand All @@ -161,7 +164,7 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
There are some more bytes left. Non-positive mb_len means that
we got an incomplete or a bad byte sequence. Consume mbminlen bytes.
*/
if ((scanner->sbeg+= scanner->cs->mbminlen) > scanner->send)
if ((scanner->sbeg+= param->cs->mbminlen) > scanner->send)
{
/* For safety purposes don't go beyond the string range. */
scanner->sbeg= scanner->send;
Expand All @@ -175,16 +178,16 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
}

scanner->sbeg+= mblen;
if (currwc > scanner->level->maxchar)
if (currwc > param->level->maxchar)
{
SCANNER_NEXT_RETURN(my_uca_scanner_set_weight_outside_maxchar(scanner),
ignorable_nchars + 1);
}

#if MY_UCA_COMPILE_CONTRACTIONS
if (my_uca_needs_context_handling(scanner->level, currwc))
if (my_uca_needs_context_handling(param->level, currwc))
{
const MY_CONTRACTION *cnt= my_uca_context_weight_find(scanner, currwc,
const MY_CONTRACTION *cnt= my_uca_context_weight_find(scanner, param, currwc,
LOCAL_MAX_CONTRACTION_LENGTH);
if (cnt)
{
Expand All @@ -200,12 +203,12 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
scanner->code= currwc & 0xFF;

/* If weight page for w[0] does not exist, then calculate algorithmically */
if (!(wpage= scanner->level->weights[scanner->page]))
SCANNER_NEXT_RETURN(my_uca_scanner_next_implicit(scanner),
if (!(wpage= param->level->weights[scanner->page]))
SCANNER_NEXT_RETURN(my_uca_scanner_next_implicit(scanner, param),
ignorable_nchars + 1);

/* Calculate pointer to w[0]'s weight, using page and offset */
cweight= wpage + scanner->code * scanner->level->lengths[scanner->page];
cweight= wpage + scanner->code * param->level->lengths[scanner->page];
if ((weight= my_uca_scanner_set_weight(scanner, cweight)))
SCANNER_NEXT_RETURN(weight, ignorable_nchars + 1);
continue; /* Ignorable character */
Expand Down
79 changes: 48 additions & 31 deletions strings/ctype-uca.c
Original file line number Diff line number Diff line change
Expand Up @@ -31312,6 +31312,13 @@ my_uca1400_info_tailored[MY_CS_ENCODING_LAST+1]
[MY_UCA1400_COLLATION_DEFINITION_COUNT];


/*
  Scanner parameters that do not depend on the string being scanned.
  They are kept apart from my_uca_scanner so that a single instance can be
  initialized once and then passed (by const pointer) to the scanner
  functions of both strings taking part in a comparison.
*/
typedef struct my_uca_scanner_param_st
{
const MY_UCA_WEIGHT_LEVEL *level; /* Weight level: weights, lengths, contractions */
CHARSET_INFO *cs; /* Character set: mb_wc conversion, mbminlen, uca->version */
} my_uca_scanner_param;


/*
Unicode Collation Algorithm:
Collation element (weight) scanner,
Expand All @@ -31323,11 +31330,9 @@ typedef struct my_uca_scanner_st
const uint16 *wbeg; /* Beginning of the current weight string */
const uchar *sbeg; /* Beginning of the input string */
const uchar *send; /* End of the input string */
const MY_UCA_WEIGHT_LEVEL *level;
uint16 implicit[2];
int page;
int code;
CHARSET_INFO *cs;
} my_uca_scanner;


Expand Down Expand Up @@ -31870,6 +31875,7 @@ my_uca_contraction_find(const MY_CONTRACTIONS *list, my_wc_t *wc, size_t len)

static const MY_CONTRACTION *
my_uca_scanner_contraction_hash_find(my_uca_scanner *scanner,
const my_uca_scanner_param *param,
my_wc_t currwc,
size_t max_char_length)
{
Expand All @@ -31887,10 +31893,10 @@ my_uca_scanner_contraction_hash_find(my_uca_scanner *scanner,
flag<<= 1)
{
int mblen;
if ((mblen= my_ci_mb_wc(scanner->cs, &wc[clen], s, scanner->send)) <= 0)
if ((mblen= my_ci_mb_wc(param->cs, &wc[clen], s, scanner->send)) <= 0)
break;
beg[clen]= s= s + mblen;
if (!my_uca_can_be_contraction_part(&scanner->level->contractions,
if (!my_uca_can_be_contraction_part(&param->level->contractions,
wc[clen++], flag))
break;
}
Expand All @@ -31899,9 +31905,9 @@ my_uca_scanner_contraction_hash_find(my_uca_scanner *scanner,
for ( ; clen > 1; clen--)
{
const MY_CONTRACTION *cnt;
if (my_uca_can_be_contraction_tail(&scanner->level->contractions,
if (my_uca_can_be_contraction_tail(&param->level->contractions,
wc[clen - 1]) &&
(cnt= my_uca_contraction_hash_find(&scanner->level->contraction_hash,
(cnt= my_uca_contraction_hash_find(&param->level->contraction_hash,
wc, clen)))
{
scanner->sbeg= beg[clen - 1];
Expand Down Expand Up @@ -31959,12 +31965,14 @@ my_uca_previous_context_find(const MY_CONTRACTIONS *list,
@retval non null pointer - the address of MY_CONTRACTION found
*/
static inline const MY_CONTRACTION *
my_uca_context_weight_find(my_uca_scanner *scanner, my_wc_t currwc,
my_uca_context_weight_find(my_uca_scanner *scanner,
const my_uca_scanner_param *param,
my_wc_t currwc,
size_t max_char_length)
{
const MY_CONTRACTION *cnt;
my_wc_t prevwc;
DBUG_ASSERT(scanner->level->contractions.nitems);
DBUG_ASSERT(param->level->contractions.nitems);
/*
If we have scanned a character which can have previous context,
and there were some more characters already before,
Expand All @@ -31974,23 +31982,23 @@ my_uca_context_weight_find(my_uca_scanner *scanner, my_wc_t currwc,
Note, we support only 2-character long sequences with previous
context at the moment. CLDR does not have longer sequences.
*/
if (my_uca_can_be_previous_context_tail(&scanner->level->contractions,
if (my_uca_can_be_previous_context_tail(&param->level->contractions,
currwc) &&
scanner->wbeg != nochar && /* if not the very first character */
my_uca_can_be_previous_context_head(&scanner->level->contractions,
my_uca_can_be_previous_context_head(&param->level->contractions,
(prevwc= ((scanner->page << 8) +
scanner->code))) &&
(cnt= my_uca_previous_context_find(&scanner->level->contractions,
(cnt= my_uca_previous_context_find(&param->level->contractions,
prevwc, currwc)))
{
scanner->page= scanner->code= 0; /* Clear for the next character */
return cnt;
}
else if (my_uca_can_be_contraction_head(&scanner->level->contractions,
else if (my_uca_can_be_contraction_head(&param->level->contractions,
currwc))
{
/* Check if w[0] starts a contraction */
if ((cnt= my_uca_scanner_contraction_hash_find(scanner, currwc,
if ((cnt= my_uca_scanner_contraction_hash_find(scanner, param, currwc,
max_char_length)))
return cnt;
}
Expand Down Expand Up @@ -32026,10 +32034,11 @@ my_uca_implicit_weight_put(uint16 *to, const MY_UCA_INFO *src_uca,
*/

static inline int
my_uca_scanner_next_implicit_primary(my_uca_scanner *scanner)
my_uca_scanner_next_implicit_primary(my_uca_scanner *scanner,
const my_uca_scanner_param *param)
{
my_wc_t wc= (scanner->page << 8) + scanner->code;
uint version= scanner->cs->uca->version;
uint version= param->cs->uca->version;
MY_UCA_IMPLICIT_WEIGHT weight= my_uca_implicit_weight_primary(version, wc);
scanner->implicit[0]= weight.weight[1]; /* The second weight */
scanner->implicit[1]= 0; /* 0 terminator */
Expand All @@ -32040,14 +32049,15 @@ my_uca_scanner_next_implicit_primary(my_uca_scanner *scanner)

/**
Return an implicit weight for the current level
(according to scanner->level->levelno).
(according to param->level->levelno).

*/
static inline int
my_uca_scanner_next_implicit(my_uca_scanner *scanner)
my_uca_scanner_next_implicit(my_uca_scanner *scanner,
const my_uca_scanner_param *param)
{
switch (scanner->level->levelno) {
case 0: return my_uca_scanner_next_implicit_primary(scanner);/* Primary level*/
switch (param->level->levelno) {
case 0: return my_uca_scanner_next_implicit_primary(scanner, param);/* Primary level*/
case 1: scanner->wbeg= nochar; return 0x0020; /* Secondary level */
case 2: scanner->wbeg= nochar; return 0x0002; /* Tertiary level */
default: scanner->wbeg= nochar; break;
Expand All @@ -32056,21 +32066,28 @@ my_uca_scanner_next_implicit(my_uca_scanner *scanner)
return 0;
}


/**
  Fill in a scanner parameter block.

  @param param  - the structure to initialize
  @param cs     - the character set, stored into param->cs
  @param level  - the weight level, stored into param->level
*/
static void
my_uca_scanner_param_init(my_uca_scanner_param *param,
                          CHARSET_INFO *cs,
                          const MY_UCA_WEIGHT_LEVEL *level)
{
  param->level= level;
  param->cs= cs;
}


/*
The same two functions for any character set
*/
static void
my_uca_scanner_init_any(my_uca_scanner *scanner,
CHARSET_INFO *cs,
const MY_UCA_WEIGHT_LEVEL *level,
const uchar *str, size_t length)
{
/* Note, no need to initialize scanner->wbeg */
scanner->sbeg= str;
scanner->send= str + length;
scanner->wbeg= nochar;
scanner->level= level;
scanner->cs= cs;
}


Expand Down Expand Up @@ -34648,7 +34665,7 @@ static void my_uca_handler_map(struct charset_info_st *cs,
instead of generic.
*/
#define MY_FUNCTION_NAME(x) my_uca_ ## x ## _generic
#define MY_MB_WC(scanner, wc, beg, end) (my_ci_mb_wc(scanner->cs, wc, beg, end))
#define MY_MB_WC(scanner, param, wc, beg, end) (my_ci_mb_wc(param->cs, wc, beg, end))
#define MY_LIKE_RANGE my_like_range_generic
#define MY_UCA_ASCII_OPTIMIZE 0
#define MY_UCA_COMPILE_CONTRACTIONS 1
Expand Down Expand Up @@ -34813,7 +34830,7 @@ create_tailoring(struct charset_info_st *cs,

#include "ctype-ucs2.h"
#define MY_FUNCTION_NAME(x) my_uca_ ## x ## _ucs2
#define MY_MB_WC(scanner, wc, beg, end) (my_mb_wc_ucs2_quick(wc, beg, end))
#define MY_MB_WC(scanner, param, wc, beg, end) (my_mb_wc_ucs2_quick(wc, beg, end))
#define MY_LIKE_RANGE my_like_range_generic
#define MY_UCA_ASCII_OPTIMIZE 0
#define MY_UCA_COMPILE_CONTRACTIONS 1
Expand Down Expand Up @@ -35775,15 +35792,15 @@ my_uca_coll_init_utf8mb3(struct charset_info_st *cs, MY_CHARSET_LOADER *loader);

#include "ctype-utf8.h"
#define MY_FUNCTION_NAME(x) my_uca_ ## x ## _utf8mb3
#define MY_MB_WC(scanner, wc, beg, end) (my_mb_wc_utf8mb3_quick(wc, beg, end))
#define MY_MB_WC(scanner, param, wc, beg, end) (my_mb_wc_utf8mb3_quick(wc, beg, end))
#define MY_LIKE_RANGE my_like_range_mb
#define MY_UCA_ASCII_OPTIMIZE 1
#define MY_UCA_COMPILE_CONTRACTIONS 1
#define MY_UCA_COLL_INIT my_uca_coll_init_utf8mb3
#include "ctype-uca.inl"

#define MY_FUNCTION_NAME(x) my_uca_ ## x ## _no_contractions_utf8mb3
#define MY_MB_WC(scanner, wc, beg, end) (my_mb_wc_utf8mb3_quick(wc, beg, end))
#define MY_MB_WC(scanner, param, wc, beg, end) (my_mb_wc_utf8mb3_quick(wc, beg, end))
#define MY_LIKE_RANGE my_like_range_mb
#define MY_UCA_ASCII_OPTIMIZE 1
#define MY_UCA_COMPILE_CONTRACTIONS 0
Expand Down Expand Up @@ -36780,15 +36797,15 @@ my_uca_coll_init_utf8mb4(struct charset_info_st *cs, MY_CHARSET_LOADER *loader);


#define MY_FUNCTION_NAME(x) my_uca_ ## x ## _utf8mb4
#define MY_MB_WC(scanner, wc, beg, end) (my_mb_wc_utf8mb4_quick(wc, beg, end))
#define MY_MB_WC(scanner, param, wc, beg, end) (my_mb_wc_utf8mb4_quick(wc, beg, end))
#define MY_LIKE_RANGE my_like_range_mb
#define MY_UCA_ASCII_OPTIMIZE 1
#define MY_UCA_COMPILE_CONTRACTIONS 1
#define MY_UCA_COLL_INIT my_uca_coll_init_utf8mb4
#include "ctype-uca.inl"

#define MY_FUNCTION_NAME(x) my_uca_ ## x ## _no_contractions_utf8mb4
#define MY_MB_WC(scanner, wc, beg, end) (my_mb_wc_utf8mb4_quick(wc, beg, end))
#define MY_MB_WC(scanner, param, wc, beg, end) (my_mb_wc_utf8mb4_quick(wc, beg, end))
#define MY_LIKE_RANGE my_like_range_mb
#define MY_UCA_ASCII_OPTIMIZE 1
#define MY_UCA_COMPILE_CONTRACTIONS 0
Expand Down Expand Up @@ -37756,7 +37773,7 @@ struct charset_info_st my_charset_utf8mb4_unicode_520_nopad_ci=

#include "ctype-utf32.h"
#define MY_FUNCTION_NAME(x) my_uca_ ## x ## _utf32
#define MY_MB_WC(scanner, wc, beg, end) (my_mb_wc_utf32_quick(wc, beg, end))
#define MY_MB_WC(scanner, param, wc, beg, end) (my_mb_wc_utf32_quick(wc, beg, end))
#define MY_LIKE_RANGE my_like_range_generic
#define MY_UCA_ASCII_OPTIMIZE 0
#define MY_UCA_COMPILE_CONTRACTIONS 1
Expand Down Expand Up @@ -38713,7 +38730,7 @@ struct charset_info_st my_charset_utf32_unicode_520_nopad_ci=

#include "ctype-utf16.h"
#define MY_FUNCTION_NAME(x) my_uca_ ## x ## _utf16
#define MY_MB_WC(scanner, wc, beg, end) (my_mb_wc_utf16_quick(wc, beg, end))
#define MY_MB_WC(scanner, param, wc, beg, end) (my_mb_wc_utf16_quick(wc, beg, end))
#define MY_LIKE_RANGE my_like_range_generic
#define MY_UCA_ASCII_OPTIMIZE 0
#define MY_UCA_COMPILE_CONTRACTIONS 1
Expand Down
Loading

0 comments on commit f6118ac

Please sign in to comment.