Browse files

export utf8proc functionality in Julia (followup to #5462 and #5434)

  • Loading branch information...
1 parent c19a8e9 commit 6039a46fca6d7e5357b733d028cf2b0cb5bbee21 @stevengj stevengj committed Jan 26, 2014
Showing with 126 additions and 2 deletions.
  1. +0 −2 base/char.jl
  2. +2 −0 base/exports.jl
  3. +2 −0 base/sysimg.jl
  4. +87 −0 base/utf8proc.jl
  5. +35 −0 doc/stdlib/base.rst
View
2 base/char.jl
@@ -1,8 +1,6 @@
char(x) = convert(Char, x)
char(x::FloatingPoint) = char(iround(x))
-is_valid_char(c) = !('\ud800' <= c <= '\udfff' || '\U10ffff' < c)
-
integer(x::Char) = int(x)
unsigned(x::Char) = uint(x)
View
2 base/exports.jl
@@ -766,6 +766,7 @@ export
hex2bytes,
ind2chr,
info,
+ is_assigned_char,
is_valid_ascii,
is_valid_char,
is_valid_utf8,
@@ -793,6 +794,7 @@ export
matchall,
ndigits,
nextind,
+ normalize_string,
oct,
parsefloat,
parseint,
View
2 base/sysimg.jl
@@ -75,6 +75,8 @@ include("utf8.jl")
include("utf16.jl")
include("iobuffer.jl")
include("string.jl")
+include("utf8proc.jl")
+importall .UTF8proc
include("regex.jl")
include("base64.jl")
importall .Base64
View
87 base/utf8proc.jl
@@ -0,0 +1,87 @@
+# Various Unicode functionality from the utf8proc library
+module UTF8proc
+
+import Base: show, showcompact, ==, string, symbol, isless, hash
+
+# also exported by Base:
+export normalize_string, is_valid_char, is_assigned_char
+
+# Report whether `c` (a char or an integer) is a valid Unicode code point.
+# The decision is delegated to utf8proc's `utf8proc_codepoint_valid`, which
+# receives `c` as an Int32; its result is converted to a Bool.
+function is_valid_char(c)
+    valid = ccall(:utf8proc_codepoint_valid, Cchar, (Int32,), c)
+    return bool(valid)
+end
+
+# Option bits accepted by utf8proc_map; combine them with `|`.
+# NOTE(review): NULLTERM presumably marks the input as NUL-terminated -- confirm
+# against the utf8proc header.
+const UTF8PROC_NULLTERM  = 1 << 0
+const UTF8PROC_STABLE    = 1 << 1   # enforce Unicode Versioning Stability
+const UTF8PROC_COMPAT    = 1 << 2   # canonicalize compatibility equivalents
+const UTF8PROC_COMPOSE   = 1 << 3   # canonical composition
+const UTF8PROC_DECOMPOSE = 1 << 4   # canonical decomposition
+const UTF8PROC_IGNORE    = 1 << 5   # strip "default ignorable" characters
+const UTF8PROC_REJECTNA  = 1 << 6   # error on unassigned code points
+# newline conversion targets: LS, PS, or (both bits together) LF
+const UTF8PROC_NLF2LS    = 1 << 7
+const UTF8PROC_NLF2PS    = 1 << 8
+const UTF8PROC_NLF2LF    = UTF8PROC_NLF2LS | UTF8PROC_NLF2PS
+const UTF8PROC_STRIPCC   = 1 << 9   # strip control characters
+const UTF8PROC_CASEFOLD  = 1 << 10  # Unicode case folding
+const UTF8PROC_CHARBOUND = 1 << 11  # not used yet; see the grapheme TODO below
+const UTF8PROC_LUMP      = 1 << 12  # lump similar-looking characters together
+const UTF8PROC_STRIPMARK = 1 << 13  # strip diacritical marks
+
+# utf8proc_map(s, flags): run the utf8proc transformation selected by `flags`
+# (an OR of the UTF8PROC_* option bits) over `s` and return the result as a
+# new ByteString.  Raises an error carrying utf8proc's own message when the
+# library reports a failure.
+let
+    # One-element scratch array through which utf8proc_map hands back the
+    # pointer to its output buffer; it is shared by all calls.
+    const p = Array(Ptr{Uint8}, 1)
+    global utf8proc_map
+    function utf8proc_map(s::String, flags::Integer)
+        bs = bytestring(s)
+        # Pass the explicit byte length instead of forcing UTF8PROC_NULLTERM
+        # with a length of 0: a string containing an embedded NUL byte is
+        # then processed in full rather than being cut off at the NUL.
+        result = ccall(:utf8proc_map, Cssize_t,
+                       (Ptr{Uint8}, Cssize_t, Ptr{Ptr{Uint8}}, Cint),
+                       bs, length(bs.data), p, flags)
+        # A negative result is an error code; turn it into a message.
+        result < 0 && error(bytestring(ccall(:utf8proc_errmsg, Ptr{Uint8},
+                                             (Cssize_t,), result)))
+        # Wrap the `result` output bytes in a Julia byte vector.
+        # NOTE(review): the trailing `true` is presumably the "own buffer"
+        # argument, i.e. Julia takes over freeing the memory -- confirm.
+        a = ccall(:jl_ptr_to_array_1d, Vector{Uint8},
+                  (Any, Ptr{Uint8}, Csize_t, Cint),
+                  Vector{Uint8}, p[1], result, true)
+        ccall(:jl_array_to_string, Any, (Any,), a)::ByteString
+    end
+end
+
+# Normalize `s` with individually selectable transformations.  Every keyword
+# is a Bool defaulting to false and switches on the utf8proc option of the
+# matching name.  `compat` and `stripmark` additionally request composition
+# unless `decompose` is set.  Throws ArgumentError when both `compose` and
+# `decompose` are requested, or when more than one newline conversion is.
+function normalize_string(s::String;
+                          stable::Bool=false, compat::Bool=false,
+                          compose::Bool=false, decompose::Bool=false,
+                          stripignore::Bool=false, rejectna::Bool=false,
+                          newline2ls::Bool=false, newline2ps::Bool=false,
+                          newline2lf::Bool=false, stripcc::Bool=false,
+                          casefold::Bool=false, lump::Bool=false,
+                          stripmark::Bool=false)
+    # composition implied by compat/stripmark when no decomposition is wanted
+    implied = decompose ? 0 : UTF8PROC_COMPOSE
+    opts = 0
+    stable  && (opts |= UTF8PROC_STABLE)
+    compat  && (opts |= UTF8PROC_COMPAT | implied)
+    compose && (opts |= UTF8PROC_COMPOSE)
+    if decompose
+        if compose
+            throw(ArgumentError("compose=true and decompose=true cannot both be specified"))
+        end
+        opts |= UTF8PROC_DECOMPOSE
+    end
+    stripignore && (opts |= UTF8PROC_IGNORE)
+    rejectna    && (opts |= UTF8PROC_REJECTNA)
+    # the three newline conversions are mutually exclusive
+    if newline2ls + newline2ps + newline2lf > 1
+        throw(ArgumentError("only one newline conversion may be specified"))
+    end
+    newline2ls && (opts |= UTF8PROC_NLF2LS)
+    newline2ps && (opts |= UTF8PROC_NLF2PS)
+    newline2lf && (opts |= UTF8PROC_NLF2LF)
+    stripcc    && (opts |= UTF8PROC_STRIPCC)
+    casefold   && (opts |= UTF8PROC_CASEFOLD)
+    lump       && (opts |= UTF8PROC_LUMP)
+    stripmark  && (opts |= UTF8PROC_STRIPMARK | implied)
+    return utf8proc_map(s, opts)
+end
+
+# Normalize `s` to one of the four Unicode normal forms named by `nf`
+# (:NFC, :NFD, :NFKC or :NFKD).  Any other symbol raises an ArgumentError
+# before utf8proc is called.
+function normalize_string(s::String, nf::Symbol)
+    if nf == :NFC
+        opts = UTF8PROC_STABLE | UTF8PROC_COMPOSE
+    elseif nf == :NFD
+        opts = UTF8PROC_STABLE | UTF8PROC_DECOMPOSE
+    elseif nf == :NFKC
+        opts = UTF8PROC_STABLE | UTF8PROC_COMPOSE | UTF8PROC_COMPAT
+    elseif nf == :NFKD
+        opts = UTF8PROC_STABLE | UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT
+    else
+        throw(ArgumentError(":$nf is not one of :NFC, :NFD, :NFKC, :NFKD"))
+    end
+    return utf8proc_map(s, opts)
+end
+
+# returns UTF8PROC_CATEGORY code in 0..30 giving Unicode category;
+# any `c` outside 0..0x10FFFF yields 0, the same value that is reported
+# for unassigned code points
+function category_code(c)
+    # note: utf8proc returns 0, not UTF8PROC_CATEGORY_CN, for unassigned c
+    # utf8proc_get_property may only be called with a code point in
+    # 0..0x10FFFF (see its docs), so negative integers must be rejected
+    # here just like values above the Unicode range
+    (c < 0 || c > 0x10FFFF) && return 0x0000
+    # the category code is read as the leading 16-bit field of the
+    # property record that utf8proc returns a pointer to
+    unsafe_load(ccall(:utf8proc_get_property, Ptr{Uint16}, (Int32,), c))
+end
+
+# A char or integer counts as an assigned code point exactly when its
+# utf8proc category code is nonzero.
+function is_assigned_char(c)
+    return category_code(c) != 0
+end
+
+# TODO: use UTF8PROC_CHARBOUND to extract graphemes from a string, e.g. to iterate over graphemes?
+
+end # module
View
35 doc/stdlib/base.rst
@@ -945,6 +945,37 @@ Strings
Convert a string to a contiguous UTF-8 string (all characters must be valid UTF-8 characters).
+.. function:: normalize_string(s, normalform::Symbol)
+
+ Normalize the string ``s`` according to one of the four "normal
+ forms" of the Unicode standard: ``normalform`` can be ``:NFC``,
+ ``:NFD``, ``:NFKC``, or ``:NFKD``. Normal forms C (canonical
+ composition) and D (canonical decomposition) convert different
+ visually identical representations of the same abstract string into
+ a single canonical form, with form C being more compact. Normal
+ forms KC and KD additionally canonicalize "compatibility
+ equivalents": they convert characters that are abstractly similar
+ but visually distinct into a single canonical choice (e.g. they expand
+ ligatures into the individual characters), with form KC being more compact.
+
+ Alternatively, finer control and additional transformations may
+ be obtained by calling ``normalize_string(s; keywords...)``, where
+ any number of the following boolean keyword options (which all default
+ to ``false``) are specified:
+
+ * ``compose=true`` or ``decompose=true``: canonical composition or decomposition, respectively
+ * ``compat=true``: compatibility equivalents are canonicalized (implies ``compose=true`` unless ``decompose=true`` was specified)
+ * ``casefold=true``: perform Unicode case folding, e.g. for case-insensitive string comparison
+ * ``lump=true``: non-standard canonicalization of various similar-looking characters into a single ASCII character, as defined by the utf8proc library (e.g. fraction and division slashes, space characters, dash characters, etcetera)
+ * ``newline2lf=true``, ``newline2ls=true``, or ``newline2ps=true``: convert various newline sequences (LF, CRLF, CR, NEL) into a linefeed (LF), line-separation (LS), or paragraph-separation (PS) character, respectively
+ * ``stripmark=true``: strip diacritical marks (e.g. accents) (implies ``compose=true`` unless ``decompose=true`` was specified)
+ * ``stripignore=true``: strip Unicode's "default ignorable" characters (e.g. the soft hyphen or the left-to-right marker)
+ * ``stripcc=true``: strip control characters; horizontal tabs and form feeds are converted to spaces; newlines are also converted to spaces unless a newline-conversion flag was specified
+ * ``rejectna=true``: throw an error if unassigned code points are found
+ * ``stable=true``: enforce Unicode Versioning Stability
+
+ For example, NFKC corresponds to the options ``compose=true, compat=true, stable=true``.
+
.. function:: is_valid_ascii(s) -> Bool
Returns true if the string or byte vector is valid ASCII, false otherwise.
@@ -957,6 +988,10 @@ Strings
Returns true if the given char or integer is a valid Unicode code point.
+.. function:: is_assigned_char(c) -> Bool
+
+ Returns true if the given char or integer is an assigned Unicode code point.
+
.. function:: ismatch(r::Regex, s::String) -> Bool
Test whether a string contains a match of the given regular expression.

0 comments on commit 6039a46

Please sign in to comment.