Skip to content

Commit

Permalink
MDEV-8419 utf32: compare broken bytes as "greater than any non-broken…
Browse files Browse the repository at this point in the history
… character"
  • Loading branch information
Alexander Barkov committed Jul 7, 2015
1 parent a5f4412 commit e4f8cea
Show file tree
Hide file tree
Showing 4 changed files with 120 additions and 232 deletions.
18 changes: 18 additions & 0 deletions mysql-test/r/ctype_utf32.result
Original file line number Diff line number Diff line change
Expand Up @@ -2206,3 +2206,21 @@ DEALLOCATE PREPARE stmt;
#
# End of 10.0 tests
#
#
# Start of 10.1 tests
#
#
# MDEV-8419 utf32: compare broken bytes as "greater than any non-broken character"
#
CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET utf32, KEY(a));
INSERT INTO t1 VALUES (0x10000),(0x10001),(0x10002);
SELECT COUNT(DISTINCT a) FROM t1;
COUNT(DISTINCT a)
1
DROP TABLE t1;
SELECT _utf32 0x10001=_utf32 0x10002;
_utf32 0x10001=_utf32 0x10002
1
#
# End of 10.1 tests
#
17 changes: 17 additions & 0 deletions mysql-test/t/ctype_utf32.test
Original file line number Diff line number Diff line change
Expand Up @@ -956,3 +956,20 @@ DEALLOCATE PREPARE stmt;
--echo # End of 10.0 tests
--echo #

--echo #
--echo # Start of 10.1 tests
--echo #

--echo #
--echo # MDEV-8419 utf32: compare broken bytes as "greater than any non-broken character"
--echo #
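# Make sure that all non-BMP characters are compared as equal:
# utf32_general_ci gives every code point above U+FFFF the same
# replacement-character weight, so the three values collapse into one.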
CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET utf32, KEY(a));
INSERT INTO t1 VALUES (0x10000),(0x10001),(0x10002);
SELECT COUNT(DISTINCT a) FROM t1;
DROP TABLE t1;
SELECT _utf32 0x10001=_utf32 0x10002;

--echo #
--echo # End of 10.1 tests
--echo #
263 changes: 31 additions & 232 deletions strings/ctype-ucs2.c
Original file line number Diff line number Diff line change
Expand Up @@ -1892,14 +1892,42 @@ struct charset_info_st my_charset_utf16le_bin=
*/
#define IS_UTF32_MBHEAD4(b0,b1) (!(b0) && ((uchar) (b1) <= 0x10))

#define IS_MB4_CHAR(b0,b1,b2,b3) (IS_UTF32_MBHEAD4(b0,b1))

/*
  Assemble a code point from four bytes given in big-endian order
  (b0 is the most significant byte).

  The leading byte is widened to my_wc_t before it is shifted: an unsigned
  char promotes to int, and shifting a value >= 0x80 left by 24 bits
  overflows a signed int, which is undefined behaviour.
  Every argument is parenthesized so that expression arguments
  (for example "x | y") bind the way the caller wrote them.
*/
#define MY_UTF32_WC4(b0,b1,b2,b3) ((((my_wc_t) (b0)) << 24) + \
                                   ((b1) << 16) + ((b2) << 8) + (b3))

/*
  Sort weight of one 4-byte character for the utf32_general_ci comparison.

  @param b0..b3  The four bytes of the character, most significant first.

  @return  For code points up to 0xFFFF: the "sort" value from the
           my_unicase_default page covering the code point, or the code
           point itself when that page is absent.
           For anything larger: MY_CS_REPLACEMENT_CHARACTER, so all such
           characters share a single weight.
*/
static inline int my_weight_utf32_general_ci(uchar b0, uchar b1,
                                             uchar b2, uchar b3)
{
  my_wc_t code= MY_UTF32_WC4(b0, b1, b2, b3);
  MY_UNICASE_CHARACTER *row;

  /* Outside the 16-bit range: one common weight */
  if (code > 0xFFFF)
    return MY_CS_REPLACEMENT_CHARACTER;

  row= my_unicase_default.page[code >> 8];
  if (!row)
    return (int) code;                 /* No page: weight is the code itself */
  return (int) row[code & 0xFF].sort;
}
/*
  Instantiate the comparison routines for utf32_general_ci by including
  the "strcoll.ic" template.

  MY_FUNCTION_NAME(x)  builds each generated name as my_<x>_utf32_general_ci
                       (the collation handler further down refers to
                       my_strnncoll_utf32_general_ci and
                       my_strnncollsp_utf32_general_ci).
  WEIGHT_ILSEQ(x)      weight used for a byte that is not part of a valid
                       character: 0xFF0000 plus the byte value.
  WEIGHT_MB4(...)      weight of a valid 4-byte character.

  NOTE(review): the macros are redefined below without an #undef here, so
  strcoll.ic presumably undefines them itself - confirm against strcoll.ic.
*/
#define MY_FUNCTION_NAME(x) my_ ## x ## _utf32_general_ci
#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
#define WEIGHT_MB4(b0,b1,b2,b3) my_weight_utf32_general_ci(b0, b1, b2, b3)
#include "strcoll.ic"

/*
  Same template, instantiated for utf32_bin: the weight of a valid
  character is its code point. IS_MB4_CHAR only accepts b0 == 0 and
  b1 <= 0x10, so a valid weight is at most 0x10FFFF and therefore always
  below the 0xFF0000 base of WEIGHT_ILSEQ.
*/
#define MY_FUNCTION_NAME(x) my_ ## x ## _utf32_bin
#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
#define WEIGHT_MB4(b0,b1,b2,b3) ((int) MY_UTF32_WC4(b0, b1, b2, b3))
#include "strcoll.ic"

/*
  Drop the character-recognition macros once the utf32 instantiations are
  done. NOTE(review): IS_MB2_CHAR is not defined in the visible part of the
  file; presumably it comes from an earlier section - confirm.
*/
#undef IS_MB2_CHAR
#undef IS_MB4_CHAR


/*
  Decode one character from a UTF-32 byte sequence stored most significant
  byte first.

  @param cs   Unused.
  @param pwc  Receives the decoded value (written even when the value is
              rejected as out of range).
  @param s    Start of the character.
  @param e    End of the input buffer.

  @return  MY_CS_TOOSMALL4 when fewer than 4 bytes are available,
           MY_CS_ILSEQ when the decoded value exceeds 0x10FFFF,
           otherwise 4 (the number of bytes consumed).
*/
static int
my_utf32_uni(CHARSET_INFO *cs __attribute__((unused)),
my_wc_t *pwc, const uchar *s, const uchar *e)
{
/* A complete 4-byte unit must fit before "e" */
if (s + 4 > e)
return MY_CS_TOOSMALL4;
/*
  NOTE(review): the two assignments below compute the same value; the
  first looks like the pre-change line and the second its replacement,
  both shown because this text is a rendered diff - confirm.
*/
*pwc= (s[0] << 24) + (s[1] << 16) + (s[2] << 8) + (s[3]);
*pwc= MY_UTF32_WC4(s[0], s[1], s[2], s[3]);
/* 0x10FFFF is the largest value accepted as a character */
return *pwc > 0x10FFFF ? MY_CS_ILSEQ : 4;
}

Expand Down Expand Up @@ -2029,144 +2057,6 @@ my_casedn_utf32(CHARSET_INFO *cs, char *src, size_t srclen,
}


/*
  Compare two utf32 strings character by character using the sort
  weights produced by my_tosort_utf32().

  @param cs           Character set; its caseinfo is passed to
                      my_tosort_utf32().
  @param s, slen      First string and its length in bytes.
  @param t, tlen      Second string and its length in bytes.
  @param t_is_prefix  When set, only what is left of "t" decides the
                      result once one of the strings is exhausted.

  @return  Negative, zero or positive. If either string cannot be decoded
           at the current position, the remainders are compared with
           my_bincmp() instead.
*/
static int
my_strnncoll_utf32(CHARSET_INFO *cs,
                   const uchar *s, size_t slen,
                   const uchar *t, size_t tlen,
                   my_bool t_is_prefix)
{
  my_wc_t UNINIT_VAR(left_wc), UNINIT_VAR(right_wc);
  const uchar *s_end= s + slen;
  const uchar *t_end= t + tlen;
  MY_UNICASE_INFO *folding= cs->caseinfo;

  for ( ; s < s_end && t < t_end; )
  {
    int left_len= my_utf32_uni(cs, &left_wc, s, s_end);
    int right_len= my_utf32_uni(cs, &right_wc, t, t_end);

    /* Undecodable input on either side: fall back to a byte comparison */
    if (left_len <= 0 || right_len <= 0)
      return my_bincmp(s, s_end, t, t_end);

    my_tosort_utf32(folding, &left_wc);
    my_tosort_utf32(folding, &right_wc);

    if (left_wc > right_wc)
      return 1;
    if (left_wc < right_wc)
      return -1;

    s+= left_len;
    t+= right_len;
  }

  if (t_is_prefix)
    return (int) (t - t_end);
  return (int) ((s_end - s) - (t_end - t));
}


/**
Compare strings, discarding end space
If one string is shorter than the other, then we space extend the shorter one
so that the strings have equal length.
This will ensure that the following things hold:
"a" == "a "
"a\0" < "a"
"a\0" < "a "
@param cs Character set pointer.
@param a First string to compare.
@param a_length Length of 'a'.
@param b Second string to compare.
@param b_length Length of 'b'.
IMPLEMENTATION
@return Comparison result.
@retval Negative number, if a less than b.
@retval 0, if a is equal to b
@retval Positive number, if a > b
*/


static int
my_strnncollsp_utf32(CHARSET_INFO *cs,
                     const uchar *s, size_t slen,
                     const uchar *t, size_t tlen,
                     my_bool diff_if_only_endspace_difference)
{
  my_wc_t UNINIT_VAR(left_wc), UNINIT_VAR(right_wc);
  const uchar *s_end= s + slen;
  const uchar *t_end= t + tlen;
  const uchar *tail, *tail_end;
  MY_UNICASE_INFO *folding= cs->caseinfo;
  int sign, result;

  DBUG_ASSERT((slen % 4) == 0);
  DBUG_ASSERT((tlen % 4) == 0);

#ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE
  diff_if_only_endspace_difference= FALSE;
#endif

  /* Phase 1: compare the part both strings have in common */
  for ( ; s < s_end && t < t_end; )
  {
    int left_len= my_utf32_uni(cs, &left_wc, s, s_end);
    int right_len= my_utf32_uni(cs, &right_wc, t, t_end);

    /* Undecodable input on either side: fall back to a byte comparison */
    if (left_len <= 0 || right_len <= 0)
      return my_bincmp(s, s_end, t, t_end);

    my_tosort_utf32(folding, &left_wc);
    my_tosort_utf32(folding, &right_wc);

    if (left_wc > right_wc)
      return 1;
    if (left_wc < right_wc)
      return -1;

    s+= left_len;
    t+= right_len;
  }

  slen= (size_t) (s_end - s);
  tlen= (size_t) (t_end - t);
  if (slen == tlen)
    return 0;

  /*
    Phase 2: exactly one string has characters left. Compare that tail
    against spaces; "sign" flips the answer when the tail belongs to "t".
  */
  result= diff_if_only_endspace_difference ? 1 : 0;  /* Assume 's' is bigger */
  if (slen < tlen)
  {
    tail= t;
    tail_end= t_end;
    sign= -1;
    result= -result;
  }
  else
  {
    tail= s;
    tail_end= s_end;
    sign= 1;
  }

  while (tail < tail_end)
  {
    int len= my_utf32_uni(cs, &left_wc, tail, tail_end);
    if (len < 0)
    {
      DBUG_ASSERT(0);
      return 0;
    }
    if (left_wc != ' ')
      return (left_wc < ' ') ? -sign : sign;
    tail+= len;
  }
  return result;
}


static uint
my_ismbchar_utf32(CHARSET_INFO *cs __attribute__((unused)),
const char *b,
Expand Down Expand Up @@ -2578,97 +2468,6 @@ my_wildcmp_utf32_bin(CHARSET_INFO *cs,
}


/*
  Compare two utf32 strings by the numeric value of their characters.

  @param cs           Character set, passed through to my_utf32_uni().
  @param s, slen      First string and its length in bytes.
  @param t, tlen      Second string and its length in bytes.
  @param t_is_prefix  When set, only what is left of "t" decides the
                      result once one of the strings is exhausted.

  @return  Negative, zero or positive. If either string cannot be decoded
           at the current position, the remainders are compared with
           my_bincmp() instead.
*/
static int
my_strnncoll_utf32_bin(CHARSET_INFO *cs,
                       const uchar *s, size_t slen,
                       const uchar *t, size_t tlen,
                       my_bool t_is_prefix)
{
  my_wc_t UNINIT_VAR(left_wc), UNINIT_VAR(right_wc);
  const uchar *s_end= s + slen;
  const uchar *t_end= t + tlen;

  for ( ; s < s_end && t < t_end; )
  {
    int left_len= my_utf32_uni(cs, &left_wc, s, s_end);
    int right_len= my_utf32_uni(cs, &right_wc, t, t_end);

    /* Undecodable input on either side: fall back to a byte comparison */
    if (left_len <= 0 || right_len <= 0)
      return my_bincmp(s, s_end, t, t_end);

    if (left_wc > right_wc)
      return 1;
    if (left_wc < right_wc)
      return -1;

    s+= left_len;
    t+= right_len;
  }

  if (t_is_prefix)
    return (int) (t - t_end);
  return (int) ((s_end - s) - (t_end - t));
}


/*
  Read four bytes at "s" as one value, most significant byte first.
  No bounds or validity checks are made; the caller must guarantee that
  four bytes are readable.
*/
static inline my_wc_t
my_utf32_get(const uchar *s)
{
  my_wc_t value= s[0];

  /* Shift in the remaining three bytes one at a time */
  value= (value << 8) + s[1];
  value= (value << 8) + s[2];
  value= (value << 8) + s[3];
  return value;
}


/*
  Compare two utf32 strings by character value, treating the shorter
  string as if it were padded with spaces.

  @param cs          Unused.
  @param s, slen     First string and its length in bytes (multiple of 4).
  @param t, tlen     Second string and its length in bytes (multiple of 4).
  @param diff_if_only_endspace_difference  Unused.

  @return  Negative, zero or positive.
*/
static int
my_strnncollsp_utf32_bin(CHARSET_INFO *cs __attribute__((unused)),
                         const uchar *s, size_t slen,
                         const uchar *t, size_t tlen,
                         my_bool diff_if_only_endspace_difference
                         __attribute__((unused)))
{
  const uchar *s_end, *t_end;
  const uchar *tail, *tail_end;
  size_t remaining;
  int sign;

  DBUG_ASSERT((slen % 4) == 0);
  DBUG_ASSERT((tlen % 4) == 0);

  s_end= s + slen;
  t_end= t + tlen;

  /* Phase 1: compare the part both strings have in common */
  remaining= MY_MIN(slen, tlen);
  while (remaining)
  {
    my_wc_t left_wc= my_utf32_get(s);
    my_wc_t right_wc= my_utf32_get(t);

    if (left_wc > right_wc)
      return 1;
    if (left_wc < right_wc)
      return -1;

    s+= 4;
    t+= 4;
    remaining-= 4;
  }

  if (slen == tlen)
    return 0;

  /*
    Phase 2: compare the tail of the longer string against spaces;
    "sign" flips the answer when the tail belongs to "t".
  */
  if (slen < tlen)
  {
    tail= t;
    tail_end= t_end;
    sign= -1;
  }
  else
  {
    tail= s;
    tail_end= s_end;
    sign= 1;
  }

  while (tail < tail_end)
  {
    my_wc_t wc= my_utf32_get(tail);
    if (wc != ' ')
      return (wc < ' ') ? -sign : sign;
    tail+= 4;
  }
  return 0;
}


static size_t
my_scan_utf32(CHARSET_INFO *cs,
const char *str, const char *end, int sequence_type)
Expand Down Expand Up @@ -2696,8 +2495,8 @@ my_scan_utf32(CHARSET_INFO *cs,
static MY_COLLATION_HANDLER my_collation_utf32_general_ci_handler =
{
NULL, /* init */
my_strnncoll_utf32,
my_strnncollsp_utf32,
my_strnncoll_utf32_general_ci,
my_strnncollsp_utf32_general_ci,
my_strnxfrm_unicode,
my_strnxfrmlen_unicode,
my_like_range_generic,
Expand Down
Loading

0 comments on commit e4f8cea

Please sign in to comment.