Skip to content

Commit

Permalink
MDEV-17474 Change Unicode collation implementation from "handler" to …
Browse files Browse the repository at this point in the history
…"inline" style (part#2)

Additional changes:

1. Adding a fast path for ASCII characters
2. Adding dedicated MY_COLLATION_HANDLERs for collations with no contractions
   (for utf8 and for utf8mb4 character sets). The choice between
   the full-featured handler and the "no contraction" handler is
   made at the collation initialization time.
  • Loading branch information
abarkov committed Oct 18, 2018
1 parent d88c136 commit 475c6ec
Show file tree
Hide file tree
Showing 2 changed files with 239 additions and 32 deletions.
183 changes: 183 additions & 0 deletions strings/ctype-uca.c
Original file line number Diff line number Diff line change
Expand Up @@ -31409,6 +31409,28 @@ my_uca_can_be_previous_context_tail(const MY_CONTRACTIONS *list, my_wc_t wc)
}


/**
  Tell whether a character requires previous/next context handling,
  i.e. whether its flag entry marks it as either of:
  - a previous context tail
  - a contraction head

  A level that has no contractions at all never needs context handling,
  and in that case the flag table is not consulted.

  @param level   Pointer to UCA weight level data
  @param wc      Code point

  @return
  @retval FALSE - does not need context handling
  @retval TRUE  - needs context handling
*/

static inline my_bool
my_uca_needs_context_handling(const MY_UCA_WEIGHT_LEVEL *level, my_wc_t wc)
{
  my_bool needed= level->contractions.nitems > 0;

  if (needed)
  {
    /* Only look at the flag table when the level has contractions */
    needed= (level->contractions.flags[wc & MY_UCA_CNT_FLAG_MASK] &
             (MY_UCA_PREVIOUS_CONTEXT_TAIL | MY_UCA_CNT_HEAD)) != 0;
  }
  return needed;
}


/**
Compare two wide character strings, wide analog to strncmp().

Expand Down Expand Up @@ -31543,6 +31565,60 @@ my_uca_previous_context_find(my_uca_scanner *scanner,
return NULL;
}


/*
  Find a context dependent weight of a character.

  @param scanner - UCA weight scanner. The caller should set
                   its members "page" and "code" to the previous character
                   (or to zeros if there is no previous character).
  @param wc      - an array of wide characters which has at least
                   MY_UCA_MAX_CONTRACTION elements, where wc[0] is set
                   to the current character (whose weight is being resolved).
                   The values of wc[i>0] are not important, but if wc[0]
                   appears to be a known contraction head, the function
                   will collect further contraction parts into wc[i>0].
                   If wc[0] can be a previous context tail and it is not
                   the very first character, wc[1] is overwritten with
                   the previous character.

  @retval NULL if could not find any contextual weights for wc[0]
  @retval non null pointer to a zero-terminated weight string otherwise
*/
static inline uint16 *
my_uca_context_weight_find(my_uca_scanner *scanner, my_wc_t *wc)
{
  uint16 *found;
  DBUG_ASSERT(scanner->level->contractions.nitems);

  /*
    Step 1: previous context.
    If wc[0] can be a previous context tail and it is not the very first
    character, rebuild the code point of the previous character from
    "page" and "code" into wc[1], then check that {wc[1], wc[0]} really
    is a known previous context pair.
    Note, only 2-character long sequences with previous context are
    supported at the moment. CLDR does not have longer sequences.
  */
  if (my_uca_can_be_previous_context_tail(&scanner->level->contractions,
                                          wc[0]) &&
      scanner->wbeg != nochar)
  {
    wc[1]= ((scanner->page << 8) + scanner->code);
    if (my_uca_can_be_previous_context_head(&scanner->level->contractions,
                                            wc[1]))
    {
      found= my_uca_previous_context_find(scanner, wc[1], wc[0]);
      if (found)
      {
        /* Clear for the next character */
        scanner->code= 0;
        scanner->page= 0;
        return found;
      }
    }
  }

  /*
    Step 2: contraction.
    Reached whenever step 1 did not produce a weight.
  */
  if (!my_uca_can_be_contraction_head(&scanner->level->contractions, wc[0]))
    return NULL;

  /* wc[0] may start a contraction; the lookup result is NULL if it does not */
  found= my_uca_scanner_contraction_find(scanner, wc);
  return found;
}


/****************************************************************/

/**
Expand Down Expand Up @@ -31934,6 +32010,23 @@ int my_wildcmp_uca(CHARSET_INFO *cs,
}


/*
  Decide whether the optimized "no contraction" handler is usable
  for the given collation.

  Every weight level taking part in ordering (cs->levels_for_order of them)
  is checked; a single level reporting contractions rules the
  optimization out.

  @param cs  Collation to examine

  @retval TRUE   no examined level has contractions
  @retval FALSE  at least one examined level has contractions
*/
static my_bool
my_uca_collation_can_optimize_no_contractions(CHARSET_INFO *cs)
{
  uint i= 0;

  while (i < cs->levels_for_order)
  {
    if (my_uca_have_contractions_quick(&cs->uca->level[i]))
    {
      return FALSE;
    }
    i++;
  }
  return TRUE;
}


/*
Collation language is implemented according to
subset of ICU Collation Customization (tailorings):
Expand Down Expand Up @@ -33644,6 +33737,31 @@ static size_t my_strnxfrmlen_any_uca_multilevel(CHARSET_INFO *cs, size_t len)
}


/*
This structure is used at the collation initialization time, to switch
from a full-featured collation handler to a "no contraction" collation
handler if the collation is known not to have any contractions.

Each member is one handler slot. my_uca_handler_map() compares cs->coll
against the members of a "from" package and, on a match, replaces it with
the same-named member of a "to" package.

NOTE(review): the member names suggest PAD SPACE / NO PAD and
single-level / multi-level handler variants - confirm against the
package initializers, which are not visible in this part of the file.
*/
typedef struct
{
MY_COLLATION_HANDLER *pad;
MY_COLLATION_HANDLER *nopad;
MY_COLLATION_HANDLER *multilevel_pad;
MY_COLLATION_HANDLER *multilevel_nopad;
} MY_COLLATION_HANDLER_PACKAGE;


/*
  Replace the collation handler of "cs" with its counterpart
  from another handler package.

  cs->coll is compared against the members of "from" in the order
  pad, nopad, multilevel_pad, multilevel_nopad. On the first match it is
  replaced with the same-named member of "to". If cs->coll matches
  none of them, "cs" is left unchanged.

  @param cs    Collation whose handler is to be switched
  @param from  Package that cs->coll is expected to belong to
  @param to    Package providing the replacement handlers
*/
static void my_uca_handler_map(struct charset_info_st *cs,
                               const MY_COLLATION_HANDLER_PACKAGE *from,
                               const MY_COLLATION_HANDLER_PACKAGE *to)
{
  if (cs->coll == from->pad)
  {
    cs->coll= to->pad;
    return;
  }
  if (cs->coll == from->nopad)
  {
    cs->coll= to->nopad;
    return;
  }
  if (cs->coll == from->multilevel_pad)
  {
    cs->coll= to->multilevel_pad;
    return;
  }
  if (cs->coll == from->multilevel_nopad)
  {
    cs->coll= to->multilevel_nopad;
  }
}


/*
Define generic collation handlers for multi-level collations with tailoring:

Expand All @@ -33656,6 +33774,9 @@ static size_t my_strnxfrmlen_any_uca_multilevel(CHARSET_INFO *cs, size_t len)
/*
  Parameters for one inclusion of the "ctype-uca.ic" template:
  - MY_FUNCTION_NAME(x) builds identifiers of the form my_uca_<x>_generic.
  - MY_MB_WC decodes a character through the character set's own
    mb_wc() method (scanner->cs->cset->mb_wc).
  - MY_LIKE_RANGE selects my_like_range_generic.
  - MY_UCA_ASCII_OPTIMIZE 0: presumably disables the ASCII fast path
    in the included code - confirm in ctype-uca.ic.
  - MY_UCA_COMPILE_CONTRACTIONS 1: presumably compiles the contraction
    handling code in - confirm in ctype-uca.ic.
  - MY_UCA_COLL_INIT names my_coll_init_uca; presumably used by the
    included code as the collation initialization function - confirm.
*/
#define MY_FUNCTION_NAME(x) my_uca_ ## x ## _generic
#define MY_MB_WC(scanner, wc, beg, end) (scanner->cs->cset->mb_wc(scanner->cs, wc, beg, end))
#define MY_LIKE_RANGE my_like_range_generic
#define MY_UCA_ASCII_OPTIMIZE 0
#define MY_UCA_COMPILE_CONTRACTIONS 1
#define MY_UCA_COLL_INIT my_coll_init_uca
#include "ctype-uca.ic"


Expand Down Expand Up @@ -33758,6 +33879,9 @@ create_tailoring(struct charset_info_st *cs,
/*
  Parameters for the ucs2 inclusion of the "ctype-uca.ic" template:
  - MY_FUNCTION_NAME(x) builds identifiers of the form my_uca_<x>_ucs2.
  - MY_MB_WC decodes a character with my_mb_wc_ucs2_quick().
  - MY_LIKE_RANGE selects my_like_range_generic.
  - MY_UCA_ASCII_OPTIMIZE 0 and MY_UCA_COMPILE_CONTRACTIONS 1:
    presumably "no ASCII fast path" and "contraction support compiled in"
    - confirm in ctype-uca.ic.
  - MY_UCA_COLL_INIT names my_coll_init_uca; presumably used by the
    included code as the collation initialization function - confirm.
*/
#define MY_FUNCTION_NAME(x) my_uca_ ## x ## _ucs2
#define MY_MB_WC(scanner, wc, beg, end) (my_mb_wc_ucs2_quick(wc, beg, end))
#define MY_LIKE_RANGE my_like_range_generic
#define MY_UCA_ASCII_OPTIMIZE 0
#define MY_UCA_COMPILE_CONTRACTIONS 1
#define MY_UCA_COLL_INIT my_coll_init_uca
#include "ctype-uca.ic"


Expand Down Expand Up @@ -34711,12 +34835,38 @@ struct charset_info_st my_charset_ucs2_unicode_520_nopad_ci=

#ifdef HAVE_CHARSET_utf8

/*
  Forward declaration: the function is named by MY_UCA_COLL_INIT in the
  template inclusions below, but is defined only after them.
*/
static my_bool
my_uca_coll_init_utf8mb3(struct charset_info_st *cs, MY_CHARSET_LOADER *loader);

#include "ctype-utf8.h"
/*
  First utf8mb3 inclusion of "ctype-uca.ic": identifiers of the form
  my_uca_<x>_utf8mb3, characters decoded with my_mb_wc_utf8mb3_quick().
  MY_UCA_ASCII_OPTIMIZE 1 and MY_UCA_COMPILE_CONTRACTIONS 1: presumably
  "ASCII fast path enabled" and "contraction support compiled in"
  (the full-featured variant) - confirm in ctype-uca.ic.
*/
#define MY_FUNCTION_NAME(x) my_uca_ ## x ## _utf8mb3
#define MY_MB_WC(scanner, wc, beg, end) (my_mb_wc_utf8mb3_quick(wc, beg, end))
#define MY_LIKE_RANGE my_like_range_mb
#define MY_UCA_ASCII_OPTIMIZE 1
#define MY_UCA_COMPILE_CONTRACTIONS 1
#define MY_UCA_COLL_INIT my_uca_coll_init_utf8mb3
#include "ctype-uca.ic"

/*
  Second utf8mb3 inclusion: identifiers of the form
  my_uca_<x>_no_contractions_utf8mb3. It differs from the first inclusion
  only in the name suffix and in MY_UCA_COMPILE_CONTRACTIONS being 0,
  which presumably leaves contraction handling out - confirm.
  NOTE(review): the macros are re-defined here without a visible #undef,
  so ctype-uca.ic presumably undefines them at its end - confirm.
*/
#define MY_FUNCTION_NAME(x) my_uca_ ## x ## _no_contractions_utf8mb3
#define MY_MB_WC(scanner, wc, beg, end) (my_mb_wc_utf8mb3_quick(wc, beg, end))
#define MY_LIKE_RANGE my_like_range_mb
#define MY_UCA_ASCII_OPTIMIZE 1
#define MY_UCA_COMPILE_CONTRACTIONS 0
#define MY_UCA_COLL_INIT my_uca_coll_init_utf8mb3
#include "ctype-uca.ic"


/*
  Initialize a utf8mb3 UCA collation and, when possible, switch it
  to the "no contractions" handler.

  The generic initialization my_coll_init_uca() is run first. If it
  succeeds and no level used for ordering has contractions, cs->coll is
  remapped from the my_uca_package_utf8mb3 handlers to the matching
  my_uca_package_no_contractions_utf8mb3 handlers.

  @param cs      Collation being initialized
  @param loader  Character set loader, passed through to my_coll_init_uca()

  @retval FALSE  success
  @retval TRUE   my_coll_init_uca() failed; the handler is not remapped
*/
static my_bool
my_uca_coll_init_utf8mb3(struct charset_info_st *cs, MY_CHARSET_LOADER *loader)
{
  if (!my_coll_init_uca(cs, loader))
  {
    if (my_uca_collation_can_optimize_no_contractions(cs))
    {
      my_uca_handler_map(cs, &my_uca_package_utf8mb3,
                             &my_uca_package_no_contractions_utf8mb3);
    }
    return FALSE;
  }
  return TRUE;
}


/*
We consider bytes with code more than 127 as a letter.
Expand Down Expand Up @@ -35690,12 +35840,39 @@ struct charset_info_st my_charset_utf8_unicode_520_nopad_ci=

#ifdef HAVE_CHARSET_utf8mb4

/*
  Forward declaration: the function is named by MY_UCA_COLL_INIT in the
  template inclusions below, but is defined only after them.
*/
static my_bool
my_uca_coll_init_utf8mb4(struct charset_info_st *cs, MY_CHARSET_LOADER *loader);


/*
  First utf8mb4 inclusion of "ctype-uca.ic": identifiers of the form
  my_uca_<x>_utf8mb4, characters decoded with my_mb_wc_utf8mb4_quick().
  MY_UCA_ASCII_OPTIMIZE 1 and MY_UCA_COMPILE_CONTRACTIONS 1: presumably
  "ASCII fast path enabled" and "contraction support compiled in"
  (the full-featured variant) - confirm in ctype-uca.ic.
*/
#define MY_FUNCTION_NAME(x) my_uca_ ## x ## _utf8mb4
#define MY_MB_WC(scanner, wc, beg, end) (my_mb_wc_utf8mb4_quick(wc, beg, end))
#define MY_LIKE_RANGE my_like_range_mb
#define MY_UCA_ASCII_OPTIMIZE 1
#define MY_UCA_COMPILE_CONTRACTIONS 1
#define MY_UCA_COLL_INIT my_uca_coll_init_utf8mb4
#include "ctype-uca.ic"

/*
  Second utf8mb4 inclusion: identifiers of the form
  my_uca_<x>_no_contractions_utf8mb4. It differs from the first inclusion
  only in the name suffix and in MY_UCA_COMPILE_CONTRACTIONS being 0,
  which presumably leaves contraction handling out - confirm.
  NOTE(review): the macros are re-defined here without a visible #undef,
  so ctype-uca.ic presumably undefines them at its end - confirm.
*/
#define MY_FUNCTION_NAME(x) my_uca_ ## x ## _no_contractions_utf8mb4
#define MY_MB_WC(scanner, wc, beg, end) (my_mb_wc_utf8mb4_quick(wc, beg, end))
#define MY_LIKE_RANGE my_like_range_mb
#define MY_UCA_ASCII_OPTIMIZE 1
#define MY_UCA_COMPILE_CONTRACTIONS 0
#define MY_UCA_COLL_INIT my_uca_coll_init_utf8mb4
#include "ctype-uca.ic"


/*
  Initialize a utf8mb4 UCA collation and, when possible, switch it
  to the "no contractions" handler.

  The generic initialization my_coll_init_uca() is run first. If it
  succeeds and no level used for ordering has contractions, cs->coll is
  remapped from the my_uca_package_utf8mb4 handlers to the matching
  my_uca_package_no_contractions_utf8mb4 handlers.

  @param cs      Collation being initialized
  @param loader  Character set loader, passed through to my_coll_init_uca()

  @retval FALSE  success
  @retval TRUE   my_coll_init_uca() failed; the handler is not remapped
*/
static my_bool
my_uca_coll_init_utf8mb4(struct charset_info_st *cs, MY_CHARSET_LOADER *loader)
{
  if (!my_coll_init_uca(cs, loader))
  {
    if (my_uca_collation_can_optimize_no_contractions(cs))
    {
      my_uca_handler_map(cs, &my_uca_package_utf8mb4,
                             &my_uca_package_no_contractions_utf8mb4);
    }
    return FALSE;
  }
  return TRUE;
}


/* The utf8mb4 character set handler is defined in another translation unit */
extern MY_CHARSET_HANDLER my_charset_utf8mb4_handler;

/*
  Flags for utf8mb4 UCA collations: the common UCA flags
  plus MY_CS_UNICODE_SUPPLEMENT.
*/
#define MY_CS_UTF8MB4_UCA_FLAGS (MY_CS_COMMON_UCA_FLAGS|MY_CS_UNICODE_SUPPLEMENT)
Expand Down Expand Up @@ -36646,6 +36823,9 @@ struct charset_info_st my_charset_utf8mb4_unicode_520_nopad_ci=
/*
  Parameters for the utf32 inclusion of the "ctype-uca.ic" template:
  - MY_FUNCTION_NAME(x) builds identifiers of the form my_uca_<x>_utf32.
  - MY_MB_WC decodes a character with my_mb_wc_utf32_quick().
  - MY_LIKE_RANGE selects my_like_range_generic.
  - MY_UCA_ASCII_OPTIMIZE 0 and MY_UCA_COMPILE_CONTRACTIONS 1:
    presumably "no ASCII fast path" and "contraction support compiled in"
    - confirm in ctype-uca.ic.
  - MY_UCA_COLL_INIT names my_coll_init_uca; presumably used by the
    included code as the collation initialization function - confirm.
*/
#define MY_FUNCTION_NAME(x) my_uca_ ## x ## _utf32
#define MY_MB_WC(scanner, wc, beg, end) (my_mb_wc_utf32_quick(wc, beg, end))
#define MY_LIKE_RANGE my_like_range_generic
#define MY_UCA_ASCII_OPTIMIZE 0
#define MY_UCA_COMPILE_CONTRACTIONS 1
#define MY_UCA_COLL_INIT my_coll_init_uca
#include "ctype-uca.ic"


Expand Down Expand Up @@ -37601,6 +37781,9 @@ struct charset_info_st my_charset_utf32_unicode_520_nopad_ci=
/*
  Parameters for the utf16 inclusion of the "ctype-uca.ic" template:
  - MY_FUNCTION_NAME(x) builds identifiers of the form my_uca_<x>_utf16.
  - MY_MB_WC decodes a character with my_mb_wc_utf16_quick().
  - MY_LIKE_RANGE selects my_like_range_generic.
  - MY_UCA_ASCII_OPTIMIZE 0 and MY_UCA_COMPILE_CONTRACTIONS 1:
    presumably "no ASCII fast path" and "contraction support compiled in"
    - confirm in ctype-uca.ic.
  - MY_UCA_COLL_INIT names my_coll_init_uca; presumably used by the
    included code as the collation initialization function - confirm.
*/
#define MY_FUNCTION_NAME(x) my_uca_ ## x ## _utf16
#define MY_MB_WC(scanner, wc, beg, end) (my_mb_wc_utf16_quick(wc, beg, end))
#define MY_LIKE_RANGE my_like_range_generic
#define MY_UCA_ASCII_OPTIMIZE 0
#define MY_UCA_COMPILE_CONTRACTIONS 1
#define MY_UCA_COLL_INIT my_coll_init_uca
#include "ctype-uca.ic"


Expand Down
Loading

0 comments on commit 475c6ec

Please sign in to comment.