Skip to content
Permalink
Browse files
Simplify caseup() and casedn() in charsets
After the MDEV-13118 fix there's no code in the server that
wants caseup/casedn to change the argument in place for simple
charsets.  Let's remove this logic and always return the result in a
new string for all charsets, both simple and complex.

1. Removing the optimization that *some* character sets used in casedn()
  and caseup(), which allowed (and required) changing the case in-place,
  overwriting the string passed as the "src" argument.
  Now all CHARSET_INFO's work in the same way:
  none of them change the source string in-place; all of them now convert
  case from the source string to the destination string, leaving
  the source string untouched.

2. Adding "const" qualifier to the "char *src" parameter
   to caseup() and casedn().

3. Removing duplicate implementations in ctype-mb.c.
  Now both caseup() and casedn() implementations for all CJK character sets
  internally use the same function, my_casefold_mb()
  (formerly my_casefold_mb_varlen()).

4. Removing the "unused" attribute from parameters of some my_case{up|dn}_xxx()
   implementations, as the affected parameters are now *used* in the code.
   Previously these parameters were used only in DBUG_ASSERT().
  • Loading branch information
abarkov committed Jul 19, 2018
1 parent ab58493 commit e2ac409
Show file tree
Hide file tree
Showing 9 changed files with 106 additions and 177 deletions.
@@ -365,7 +365,7 @@ typedef int (*my_charset_conv_mb_wc)(CHARSET_INFO *, my_wc_t *,
typedef int (*my_charset_conv_wc_mb)(CHARSET_INFO *, my_wc_t,
uchar *, uchar *);
typedef size_t (*my_charset_conv_case)(CHARSET_INFO *,
char *, size_t, char *, size_t);
const char *, size_t, char *, size_t);


/* See strings/CHARSET_INFO.txt about information on this structure */
@@ -565,9 +565,11 @@ extern uint my_instr_simple(CHARSET_INFO *,
/* Functions for 8bit */
extern size_t my_caseup_str_8bit(CHARSET_INFO *, char *);
extern size_t my_casedn_str_8bit(CHARSET_INFO *, char *);
extern size_t my_caseup_8bit(CHARSET_INFO *, char *src, size_t srclen,
extern size_t my_caseup_8bit(CHARSET_INFO *,
const char *src, size_t srclen,
char *dst, size_t dstlen);
extern size_t my_casedn_8bit(CHARSET_INFO *, char *src, size_t srclen,
extern size_t my_casedn_8bit(CHARSET_INFO *,
const char *src, size_t srclen,
char *dst, size_t dstlen);

extern int my_strcasecmp_8bit(CHARSET_INFO * cs, const char *, const char *);
@@ -658,17 +660,17 @@ uint my_mbcharlen_8bit(CHARSET_INFO *, uint c);
/* Functions for multibyte charsets */
extern size_t my_caseup_str_mb(CHARSET_INFO *, char *);
extern size_t my_casedn_str_mb(CHARSET_INFO *, char *);
extern size_t my_caseup_mb(CHARSET_INFO *, char *src, size_t srclen,
char *dst, size_t dstlen);
extern size_t my_casedn_mb(CHARSET_INFO *, char *src, size_t srclen,
char *dst, size_t dstlen);
extern size_t my_caseup_mb_varlen(CHARSET_INFO *, char *src, size_t srclen,
char *dst, size_t dstlen);
extern size_t my_casedn_mb_varlen(CHARSET_INFO *, char *src, size_t srclen,
char *dst, size_t dstlen);
extern size_t my_caseup_ujis(CHARSET_INFO *, char *src, size_t srclen,
extern size_t my_caseup_mb(CHARSET_INFO *,
const char *src, size_t srclen,
char *dst, size_t dstlen);
extern size_t my_casedn_mb(CHARSET_INFO *,
const char *src, size_t srclen,
char *dst, size_t dstlen);
extern size_t my_caseup_ujis(CHARSET_INFO *,
const char *src, size_t srclen,
char *dst, size_t dstlen);
extern size_t my_casedn_ujis(CHARSET_INFO *, char *src, size_t srclen,
extern size_t my_casedn_ujis(CHARSET_INFO *,
const char *src, size_t srclen,
char *dst, size_t dstlen);
extern int my_strcasecmp_mb(CHARSET_INFO * cs,const char *, const char *);

@@ -1572,19 +1572,10 @@ String *Item_str_conv::val_str(String *str)
str->alloc((alloced_length= res->length() * multiply)))))
return 0;

if (multiply == 1)
{
str->copy(*res); // Should not fail (it was alloced above)
len= converter(collation.collation, (char*) str->ptr(), str->length(),
(char*) str->ptr(), alloced_length);
}
else
{
len= converter(collation.collation, (char*) res->ptr(), res->length(),
(char*) str->ptr(), alloced_length);
str->set_charset(collation.collation);
}
len= converter(collation.collation, (char*) res->ptr(), res->length(),
(char*) str->ptr(), alloced_length);
DBUG_ASSERT(len <= alloced_length);
str->set_charset(collation.collation);
str->length(len);
return str;
}
@@ -220,11 +220,11 @@ static size_t my_case_str_bin(CHARSET_INFO *cs __attribute__((unused)),


static size_t my_case_bin(CHARSET_INFO *cs __attribute__((unused)),
char *src __attribute__((unused)),
size_t srclen,
char *dst __attribute__((unused)),
size_t dstlen __attribute__((unused)))
const char *src, size_t srclen,
char *dst, size_t dstlen)
{
DBUG_ASSERT(srclen <= dstlen);
memcpy(dst, src, srclen);
return srclen;
}

@@ -9994,8 +9994,8 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_mb_ctype_mb,
my_caseup_str_mb,
my_casedn_str_mb,
my_caseup_mb_varlen, /* UPPER() can reduce length: Turkish DOTLESS i -> I */
my_casedn_mb, /* LOWER() does not change length, use simple version*/
my_caseup_mb, /* UPPER() can reduce length: Turkish DOTLESS i -> I */
my_casedn_mb, /* LOWER() does not change length */
my_snprintf_8bit,
my_long10_to_str_8bit,
my_longlong10_to_str_8bit,
@@ -71,95 +71,23 @@ get_case_info_for_ch(const CHARSET_INFO *cs, uint page, uint offs)


/*
For character sets which don't change octet length in case conversion.
*/
/*
  Upper-case a string in place.

  The bytes addressed by "src" are overwritten; "dst" and "dstlen" are
  consulted only by the debug assertions, which require them to equal
  "src" and "srclen".

  @param cs      Character set; supplies the single-byte to_upper map and
                 is passed to my_ismbchar() / get_case_info_for_ch().
  @param src     String to convert (modified in place).
  @param srclen  Length of "src" in bytes.
  @param dst     Must be the same pointer as "src" (asserted only).
  @param dstlen  Must be equal to "srclen" (asserted only).

  @return srclen, unchanged.
*/
size_t my_caseup_mb(CHARSET_INFO * cs, char *src, size_t srclen,
                    char *dst __attribute__((unused)),
                    size_t dstlen __attribute__((unused)))
{
  char *const end= src + srclen;
  const uchar *const upper_map= cs->to_upper;
  char *pos= src;

  DBUG_ASSERT(cs->caseup_multiply == 1);
  DBUG_ASSERT(src == dst && srclen == dstlen);
  DBUG_ASSERT(cs->mbmaxlen == 2);

  while (pos < end)
  {
    /* A non-zero result is used as the byte length of the character. */
    uint32 mblen= my_ismbchar(cs, pos, end);
    MY_UNICASE_CHARACTER *info;

    if (!mblen)
    {
      /* Single-byte character: translate through the 8-bit map. */
      *pos= (char) upper_map[(uchar) *pos];
      pos++;
      continue;
    }

    info= get_case_info_for_ch(cs, (uchar) pos[0], (uchar) pos[1]);
    if (!info)
    {
      /* No case information for this character: leave it as is. */
      pos+= mblen;
      continue;
    }

    /* Store the upper-case code, high byte first, over the two bytes. */
    pos[0]= info->toupper >> 8;
    pos[1]= info->toupper & 0xFF;
    pos+= 2;
  }
  return srclen;
}


/*
  Lower-case a string in place.

  The bytes addressed by "src" are overwritten; "dst" and "dstlen" are
  consulted only by the debug assertions, which require them to equal
  "src" and "srclen".

  @param cs      Character set; supplies the single-byte to_lower map and
                 is passed to my_ismbchar() / get_case_info_for_ch().
  @param src     String to convert (modified in place).
  @param srclen  Length of "src" in bytes.
  @param dst     Must be the same pointer as "src" (asserted only).
  @param dstlen  Must be equal to "srclen" (asserted only).

  @return srclen, unchanged.
*/
size_t my_casedn_mb(CHARSET_INFO * cs, char *src, size_t srclen,
                    char *dst __attribute__((unused)),
                    size_t dstlen __attribute__((unused)))
{
  char *const end= src + srclen;
  const uchar *const lower_map= cs->to_lower;
  char *pos= src;

  DBUG_ASSERT(cs->casedn_multiply == 1);
  DBUG_ASSERT(src == dst && srclen == dstlen);
  DBUG_ASSERT(cs->mbmaxlen == 2);

  while (pos < end)
  {
    /* A non-zero result is used as the byte length of the character. */
    uint32 mblen= my_ismbchar(cs, pos, end);
    MY_UNICASE_CHARACTER *info;

    if (!mblen)
    {
      /* Single-byte character: translate through the 8-bit map. */
      *pos= (char) lower_map[(uchar) *pos];
      pos++;
      continue;
    }

    info= get_case_info_for_ch(cs, (uchar) pos[0], (uchar) pos[1]);
    if (!info)
    {
      /* No case information for this character: leave it as is. */
      pos+= mblen;
      continue;
    }

    /* Store the lower-case code, high byte first, over the two bytes. */
    pos[0]= info->tolower >> 8;
    pos[1]= info->tolower & 0xFF;
    pos+= 2;
  }
  return srclen;
}


/*
Case folding functions for character set
where case conversion can change string octet length.
Case folding functions for CJK character set.
Case conversion can optionally reduce string octet length.
For example, in EUCKR,
_euckr 0xA9A5 == "LATIN LETTER DOTLESS I" (Turkish letter)
is upper-cased to
_euckr 0x49 "LATIN CAPITAL LETTER I" ('usual' letter I)
Length is reduced in this example from two bytes to one byte.
*/
static size_t
my_casefold_mb_varlen(CHARSET_INFO *cs,
char *src, size_t srclen,
char *dst, size_t dstlen __attribute__((unused)),
const uchar *map,
size_t is_upper)
my_casefold_mb(CHARSET_INFO *cs,
const char *src, size_t srclen,
char *dst, size_t dstlen __attribute__((unused)),
const uchar *map,
size_t is_upper)
{
char *srcend= src + srclen, *dst0= dst;
const char *srcend= src + srclen;
char *dst0= dst;

DBUG_ASSERT(cs->mbmaxlen == 2);

@@ -193,22 +121,22 @@ my_casefold_mb_varlen(CHARSET_INFO *cs,


size_t
my_casedn_mb_varlen(CHARSET_INFO * cs, char *src, size_t srclen,
my_casedn_mb(CHARSET_INFO * cs, const char *src, size_t srclen,
char *dst, size_t dstlen)
{
DBUG_ASSERT(dstlen >= srclen * cs->casedn_multiply);
DBUG_ASSERT(src != dst || cs->casedn_multiply == 1);
return my_casefold_mb_varlen(cs, src, srclen, dst, dstlen, cs->to_lower, 0);
return my_casefold_mb(cs, src, srclen, dst, dstlen, cs->to_lower, 0);
}


size_t
my_caseup_mb_varlen(CHARSET_INFO * cs, char *src, size_t srclen,
char *dst, size_t dstlen)
my_caseup_mb(CHARSET_INFO * cs, const char *src, size_t srclen,
char *dst, size_t dstlen)
{
DBUG_ASSERT(dstlen >= srclen * cs->caseup_multiply);
DBUG_ASSERT(src != dst || cs->caseup_multiply == 1);
return my_casefold_mb_varlen(cs, src, srclen, dst, dstlen, cs->to_upper, 1);
return my_casefold_mb(cs, src, srclen, dst, dstlen, cs->to_upper, 1);
}


@@ -214,28 +214,26 @@ size_t my_casedn_str_8bit(CHARSET_INFO * cs,char *str)
}


size_t my_caseup_8bit(CHARSET_INFO * cs, char *src, size_t srclen,
char *dst __attribute__((unused)),
size_t dstlen __attribute__((unused)))
size_t my_caseup_8bit(CHARSET_INFO * cs, const char *src, size_t srclen,
char *dst, size_t dstlen)
{
char *end= src + srclen;
const char *end= src + srclen;
register const uchar *map= cs->to_upper;
DBUG_ASSERT(src == dst && srclen == dstlen);
DBUG_ASSERT(srclen <= dstlen);
for ( ; src != end ; src++)
*src= (char) map[(uchar) *src];
*dst++= (char) map[(uchar) *src];
return srclen;
}


size_t my_casedn_8bit(CHARSET_INFO * cs, char *src, size_t srclen,
char *dst __attribute__((unused)),
size_t dstlen __attribute__((unused)))
size_t my_casedn_8bit(CHARSET_INFO * cs, const char *src, size_t srclen,
char *dst, size_t dstlen)
{
char *end= src + srclen;
const char *end= src + srclen;
register const uchar *map=cs->to_lower;
DBUG_ASSERT(src == dst && srclen == dstlen);
DBUG_ASSERT(srclen <= dstlen);
for ( ; src != end ; src++)
*src= (char) map[(uchar) *src];
*dst++= (char) map[(uchar) *src];
return srclen;
}

0 comments on commit e2ac409

Please sign in to comment.