Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ The C library is found in this directory after successful compilation
and is named `libutf8proc.a` (for the static library) and
`libutf8proc.so` (for the dynamic library).

The Unicode version supported is 10.0.0.
The Unicode version supported is 11.0.0.

For Unicode normalizations, the following options are used:

Expand Down
11 changes: 7 additions & 4 deletions data/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@ CURLFLAGS = --retry 5 --location

.DELETE_ON_ERROR:

utf8proc_data.c.new: data_generator.rb UnicodeData.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt CharWidths.txt
utf8proc_data.c.new: data_generator.rb UnicodeData.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt CharWidths.txt emoji-data.txt
$(RUBY) data_generator.rb < UnicodeData.txt > $@

# GNU Unifont version for font metric calculations:
UNIFONT_VERSION=10.0.07
UNIFONT_VERSION=11.0.01

unifont.ttf:
$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://mirrors.kernel.org/gnu/unifont/unifont-$(UNIFONT_VERSION)/unifont-$(UNIFONT_VERSION).ttf
Expand All @@ -35,7 +35,7 @@ CharWidths.txt: charwidths.jl unifont.sfd unifont_upper.sfd EastAsianWidth.txt
$(JULIA) charwidths.jl > $@

# Unicode data version
UNICODE_VERSION=10.0.0
UNICODE_VERSION=11.0.0

UnicodeData.txt:
$(CURL) $(CURLFLAGS) -o $@ -O http://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/UnicodeData.txt
Expand All @@ -61,6 +61,9 @@ NormalizationTest.txt:
GraphemeBreakTest.txt:
$(CURL) $(CURLFLAGS) $(URLCACHE)http://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/auxiliary/GraphemeBreakTest.txt | $(PERL) -pe 's,÷,/,g;s,×,+,g' > $@

emoji-data.txt:
$(CURL) $(CURLFLAGS) -o $@ -O $(URLCACHE)http://unicode.org/Public/emoji/`echo $(UNICODE_VERSION) | cut -d. -f1-2`/emoji-data.txt

clean:
rm -f UnicodeData.txt EastAsianWidth.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt CharWidths.txt unifont*.ttf unifont*.sfd
rm -f UnicodeData.txt EastAsianWidth.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt CharWidths.txt unifont*.ttf unifont*.sfd emoji-data.txt
rm -f utf8proc_data.c.new
13 changes: 13 additions & 0 deletions data/data_generator.rb
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,19 @@
end
end

$emoji_data_list = File.read("emoji-data.txt")
$emoji_data_list.each_line do |entry|
if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*Extended_Pictographic\W/
$1.hex.upto($2.hex) { |e2| $grapheme_boundclass[e2] = "UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC" }
elsif entry =~ /^([0-9A-F]+)\s*;\s*Extended_Pictographic\W/
$grapheme_boundclass[$1.hex] = "UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC"
elsif entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*Emoji_Modifier\W/
$1.hex.upto($2.hex) { |e2| $grapheme_boundclass[e2] = "UTF8PROC_BOUNDCLASS_EXTEND" }
elsif entry =~ /^([0-9A-F]+)\s*;\s*Emoji_Modifier\W/
$grapheme_boundclass[$1.hex] = "UTF8PROC_BOUNDCLASS_EXTEND"
end
end

$charwidth_list = File.read("CharWidths.txt")
$charwidth = Hash.new(0)
$charwidth_list.each_line do |entry|
Expand Down
4 changes: 3 additions & 1 deletion test/case.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@ int main(int argc, char **argv)
check(u == c || utf8proc_codepoint_valid(u), "invalid toupper");
check(t == c || utf8proc_codepoint_valid(t), "invalid totitle");

if (utf8proc_codepoint_valid(c) && (l == u) != (l == t)) {
if (utf8proc_codepoint_valid(c) && (l == u) != (l == t) &&
/* Unicode 11: Georgian Mkhedruli chars have uppercase but no titlecase. */
!(((c >= 0x10d0 && c <= 0x10fa) || c >= (0x10fd && c <= 0x10ff)) && l != u)) {
fprintf(stderr, "unexpected titlecase %x for lowercase %x / uppercase %x\n", t, l, c);
++error;
}
Expand Down
23 changes: 11 additions & 12 deletions utf8proc.c
Original file line number Diff line number Diff line change
Expand Up @@ -271,12 +271,8 @@ static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
tbc == UTF8PROC_BOUNDCLASS_ZWJ || // ---
tbc == UTF8PROC_BOUNDCLASS_SPACINGMARK || // GB9a
lbc == UTF8PROC_BOUNDCLASS_PREPEND) ? false : // GB9b
((lbc == UTF8PROC_BOUNDCLASS_E_BASE || // GB10 (requires additional handling below)
lbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) && // ----
tbc == UTF8PROC_BOUNDCLASS_E_MODIFIER) ? false : // ----
(lbc == UTF8PROC_BOUNDCLASS_ZWJ && // GB11
(tbc == UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ || // ----
tbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ)) ? false : // ----
(lbc == UTF8PROC_BOUNDCLASS_E_ZWG && // GB11 (requires additional handling below)
tbc == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) ? false : // ----
(lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR && // GB12/13 (requires additional handling below)
tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false : // ----
true; // GB999
Expand All @@ -295,12 +291,15 @@ static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t
// forbidden by a different rule such as GB9).
if (*state == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
*state = UTF8PROC_BOUNDCLASS_OTHER;
// Special support for GB10. Fold any EXTEND codepoints into the previous
// boundclass if we're dealing with an emoji base boundclass.
else if ((*state == UTF8PROC_BOUNDCLASS_E_BASE ||
*state == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) &&
tbc == UTF8PROC_BOUNDCLASS_EXTEND)
*state = UTF8PROC_BOUNDCLASS_E_BASE;
// Special support for GB11 (emoji extend* zwj / emoji)
else if (*state == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) {
if (tbc == UTF8PROC_BOUNDCLASS_EXTEND) // fold EXTEND codepoints into emoji
*state = UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC;
else if (tbc == UTF8PROC_BOUNDCLASS_ZWJ)
*state = UTF8PROC_BOUNDCLASS_E_ZWG; // state to record emoji+zwg combo
else
*state = tbc;
}
else
*state = tbc;
}
Expand Down
8 changes: 8 additions & 0 deletions utf8proc.h
Original file line number Diff line number Diff line change
Expand Up @@ -382,10 +382,18 @@ typedef enum {
UTF8PROC_BOUNDCLASS_SPACINGMARK = 12, /**< Spacingmark */
UTF8PROC_BOUNDCLASS_PREPEND = 13, /**< Prepend */
UTF8PROC_BOUNDCLASS_ZWJ = 14, /**< Zero Width Joiner */

/* the following are no longer used in Unicode 11, but we keep
the constants here for backward compatibility */
UTF8PROC_BOUNDCLASS_E_BASE = 15, /**< Emoji Base */
UTF8PROC_BOUNDCLASS_E_MODIFIER = 16, /**< Emoji Modifier */
UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ = 17, /**< Glue_After_ZWJ */
UTF8PROC_BOUNDCLASS_E_BASE_GAZ = 18, /**< E_BASE + GLUE_AFTER_ZJW */

/* the Extended_Pictographic property is used in the Unicode 11
grapheme-boundary rules, so we store it in the boundclass field */
UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC = 19,
UTF8PROC_BOUNDCLASS_E_ZWG = 20, /* UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC + ZWJ */
} utf8proc_boundclass_t;

/**
Expand Down
Loading