Skip to content

Commit

Permalink
MDEV-27009 Add UCA-14.0.0 collations - adding uca-dump into build tar…
Browse files Browse the repository at this point in the history
…gets

- Adding uca-dump into build targets
- Adding ctype-uca.h and moving implicit weight related routines there
- Reusing implicit weight routines in ctype-uca.c and uca-dump.c
- Adding handling of command line arguments to uca-dump
- Fixing some compile-time warnings in uca-dump.c
  • Loading branch information
abarkov authored and sanja-byelkin committed Aug 10, 2022
1 parent 45e0373 commit bb84f61
Show file tree
Hide file tree
Showing 5 changed files with 309 additions and 113 deletions.
1 change: 1 addition & 0 deletions debian/not-installed
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ usr/lib/x86_64-linux-gnu/libidbboot.a # ColumnStore header file
usr/lib/x86_64-linux-gnu/libprocessor.a # ColumnStore header file
usr/lib/x86_64-linux-gnu/libwe_xml.a # ColumnStore header file
usr/bin/test-connect-t
usr/bin/uca-dump
usr/bin/wsrep_sst_backup
usr/lib/mysql/plugin/type_test.so
usr/lib/sysusers.d/mariadb.conf # Not used (yet) in Debian systemd
Expand Down
2 changes: 2 additions & 0 deletions strings/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,5 @@ MAYBE_DISABLE_IPO(strings)
ADD_EXECUTABLE(conf_to_src EXCLUDE_FROM_ALL conf_to_src.c)
SET_TARGET_PROPERTIES(conf_to_src PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD TRUE)
TARGET_LINK_LIBRARIES(conf_to_src mysys strings)

ADD_EXECUTABLE(uca-dump uca-dump.c)
65 changes: 9 additions & 56 deletions strings/ctype-uca.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@

#include "strings_def.h"
#include <m_ctype.h>
#include "ctype-uca.h"

typedef struct
{
Expand Down Expand Up @@ -31689,62 +31690,13 @@ my_uca_context_weight_find(my_uca_scanner *scanner, my_wc_t currwc,

/****************************************************************/

/**
Implicit weights for a code CP are constructed as follows:
[.AAAA.0020.0002][.BBBB.0000.0000]

where:
AAAA= BASE + (CP >> 15);
BBBB= (CP & 0x7FFF) | 0x8000;

There are two weights in the primary level (AAAA followed by BBBB).
There is one weight on other levels:
- 0020 on the secondary level
- 0002 on the tertiary level
*/


/**
Return BASE for an implicit weight on the primary level

According to UCA, BASE is calculated as follows:
- FB40 for Unified_Ideograph=True AND
((Block=CJK_Unified_Ideograph) OR
(Block=CJK_Compatibility_Ideographs))
- FB80 for Unified_Ideograph=True AND NOT
((Block=CJK_Unified_Ideograph) OR
(Block=CJK_Compatibility_Ideographs))
- FBC0 for any other code point
TODO: it seems we're not handling BASE correctly:
- check what are those blocks
- there are more Unified Ideograph blocks in the latest Unicode versions
*/
static inline uint16
my_uca_implicit_weight_base(my_wc_t code)
{
if (code >= 0x3400 && code <= 0x4DB5)
return 0xFB80;
if (code >= 0x4E00 && code <= 0x9FA5)
return 0xFB40;
return 0xFBC0;
}


static inline void
my_uca_implicit_weight_put(uint16 *to, my_wc_t code, uint level)
{
switch (level) {
case 1: to[0]= 0x0020; to[1]= 0; break; /* Secondary level */
case 2: to[0]= 0x0002; to[1]= 0; break; /* Tertiary level */
case 3: to[0]= 0x0001; to[1]= 0; break; /* Quaternary level */
default:
DBUG_ASSERT(0);
case 0:
break;
}
/* Primary level */
to[0]= (uint16)(code >> 15) + my_uca_implicit_weight_base(code);
to[1]= (code & 0x7FFF) | 0x8000;
MY_UCA_IMPLICIT_WEIGHT weight;
weight= my_uca_520_implicit_weight_on_level(code, level);
to[0]= weight.weight[0];
to[1]= weight.weight[1];
to[2]= 0;
}

Expand All @@ -31766,10 +31718,11 @@ static inline int
my_uca_scanner_next_implicit_primary(my_uca_scanner *scanner)
{
my_wc_t wc= (scanner->page << 8) + scanner->code;
scanner->implicit[0]= (wc & 0x7FFF) | 0x8000; /* The second weight */
scanner->implicit[1]= 0; /* 0 terminator */
MY_UCA_IMPLICIT_WEIGHT weight= my_uca_520_implicit_weight_primary(wc);
scanner->implicit[0]= weight.weight[1]; /* The second weight */
scanner->implicit[1]= 0; /* 0 terminator */
scanner->wbeg= scanner->implicit;
return my_uca_implicit_weight_base(wc) + (wc >> 15);
return weight.weight[0]; /* The first weight */
}


Expand Down
155 changes: 155 additions & 0 deletions strings/ctype-uca.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
#ifndef CTYPE_UCA_H
#define CTYPE_UCA_H
/* Copyright (c) 2021, MariaDB
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public
License as published by the Free Software Foundation; version 2
of the License.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with this library; if not, write to the Free
Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
MA 02110-1335 USA */


/*
Implicit weight handling is done according to
the section "Computing Implicit Weights" in
https://unicode.org/reports/tr10/#Values_For_Base_Table
(as of Unicode 14.0.0)
Implicit weights for a code CP are constructed as follows:
[.AAAA.0020.0002][.BBBB.0000.0000]
- There are two primary weights, depending on the character type and block.
- There is one weight on the secondary and tertiary levels.
AAAA and BBBB are computed using different formulas for:
- Siniform ideographic scripts
- Han
- Unassigned characters
*/

/*
  A pair of 16-bit weights describing an implicit weight on one level.

  - The primary level helpers in this file fill both weight[0] and weight[1].
  - The secondary, tertiary and quaternary helpers in this file
    fill weight[0] only and set weight[1] to 0.

  NOTE(review): the struct tag is spelled "implict" (sic). It is left as is,
  because code outside of this file may refer to the tag by name.
*/
typedef struct my_uca_implict_weight_t
{
uint16 weight[2];
} MY_UCA_IMPLICIT_WEIGHT;


/**
  Build the two primary level implicit weights for a code point
  using the default formula.

  An implicit weight for a code point CP has the form:
    [.AAAA.0020.0002][.BBBB.0000.0000]
  where the default formula gives:
    AAAA= BASE + (CP >> 15);
    BBBB= (CP & 0x7FFF) | 0x8000;

  The default formula is used for these implicit weight subtypes,
  each of which supplies its own BASE:
  - Core Han Unified Ideographs
  - All other Han Unified Ideographs
  - Unassigned characters

  Siniform ideographic scripts are not covered by this formula;
  they are handled by separate functions.

  @param base  BASE value of the implicit weight subtype
  @param code  The code point (CP)
  @return      AAAA in weight[0], BBBB in weight[1]
*/
static inline MY_UCA_IMPLICIT_WEIGHT
my_uca_implicit_weight_primary_default(uint16 base, my_wc_t code)
{
  const my_wc_t high_part= code >> 15;    /* Bits 15 and above of CP */
  const my_wc_t low_part= code & 0x7FFF;  /* The lowest 15 bits of CP */
  MY_UCA_IMPLICIT_WEIGHT weight;
  weight.weight[0]= (uint16) (base + high_part);   /* AAAA */
  weight.weight[1]= (uint16) (low_part | 0x8000);  /* BBBB */
  return weight;
}


/**
  Calculate the Unicode-5.2.0 implicit weight on the primary level.

  According to UCA, BASE should be:
  - FB40 for Unified_Ideograph=True AND
         ((Block=CJK_Unified_Ideograph) OR
          (Block=CJK_Compatibility_Ideographs))
  - FB80 for Unified_Ideograph=True AND NOT
         ((Block=CJK_Unified_Ideograph) OR
          (Block=CJK_Compatibility_Ideographs))
  - FBC0 for any other code point

  For Unicode-5.2.0 and Unicode-4.0.0 a simplified formula is used instead,
  as it was implemented before. It chooses BASE by two fixed ranges only:
    3400..4DB5  <CJK Ideograph Extension A, First..Last>  -> FB80
    4E00..9FA5  <CJK Ideograph, First..Last>              -> FB40
    anything else                                         -> FBC0

  @param code  The code point
  @return      Two primary weights built with the default formula
*/
static inline MY_UCA_IMPLICIT_WEIGHT
my_uca_520_implicit_weight_primary(my_wc_t code)
{
  /* The two ranges do not overlap, so the order of the checks is not important */
  if (code >= 0x4E00 && code <= 0x9FA5)
    return my_uca_implicit_weight_primary_default(0xFB40, code);

  if (code >= 0x3400 && code <= 0x4DB5)
    return my_uca_implicit_weight_primary_default(0xFB80, code);

  return my_uca_implicit_weight_primary_default(0xFBC0, code);
}


/**
  Return the implicit weight on the secondary level.

  The parameter list is declared as (void) to make it a real prototype:
  an empty list "()" in C leaves the parameters unspecified, so the compiler
  would not diagnose a call with arguments, and -Wstrict-prototypes warns.

  @return  0x0020 in weight[0] (the secondary weight in the implicit
           weight pattern [.AAAA.0020.0002]), 0 in weight[1]
*/
static inline MY_UCA_IMPLICIT_WEIGHT
my_uca_implicit_weight_secondary(void)
{
  MY_UCA_IMPLICIT_WEIGHT res;
  res.weight[0]= 0x0020;
  res.weight[1]= 0;
  return res;
}


/**
  Return the implicit weight on the tertiary level.

  The parameter list is declared as (void) to make it a real prototype:
  an empty list "()" in C leaves the parameters unspecified, so the compiler
  would not diagnose a call with arguments, and -Wstrict-prototypes warns.

  @return  0x0002 in weight[0] (the tertiary weight in the implicit
           weight pattern [.AAAA.0020.0002]), 0 in weight[1]
*/
static inline MY_UCA_IMPLICIT_WEIGHT
my_uca_implicit_weight_tertiary(void)
{
  MY_UCA_IMPLICIT_WEIGHT res;
  res.weight[0]= 0x0002;
  res.weight[1]= 0;
  return res;
}


/**
  Return the implicit weight on the quaternary level.

  The parameter list is declared as (void) to make it a real prototype:
  an empty list "()" in C leaves the parameters unspecified, so the compiler
  would not diagnose a call with arguments, and -Wstrict-prototypes warns.

  @return  0x0001 in weight[0], 0 in weight[1]
*/
static inline MY_UCA_IMPLICIT_WEIGHT
my_uca_implicit_weight_quaternary(void)
{
  MY_UCA_IMPLICIT_WEIGHT res;
  res.weight[0]= 0x0001;
  res.weight[1]= 0;
  return res;
}


/**
  Return the Unicode-5.2.0 implicit weight for a code point on a given level.

  @param code   The code point. It affects the result only on level 0.
  @param level  0 - primary, 1 - secondary, 2 - tertiary.
                Any other value gives the quaternary weight.
  @return       The implicit weight for the requested level
*/
static inline MY_UCA_IMPLICIT_WEIGHT
my_uca_520_implicit_weight_on_level(my_wc_t code, uint level)
{
  if (level == 0)
    return my_uca_520_implicit_weight_primary(code);

  if (level == 1)
    return my_uca_implicit_weight_secondary();

  if (level == 2)
    return my_uca_implicit_weight_tertiary();

  /* Level 3 and everything above it */
  return my_uca_implicit_weight_quaternary();
}


#endif /* CTYPE_UCA_H */
Loading

0 comments on commit bb84f61

Please sign in to comment.