-
-
Notifications
You must be signed in to change notification settings - Fork 5.5k
/
Unicode.jl
84 lines (67 loc) · 3.29 KB
/
Unicode.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# This file is a part of Julia. License is MIT: https://julialang.org/license
module Unicode
export graphemes
"""
Unicode.normalize(s::AbstractString, normalform::Symbol)
Normalize the string `s` according to one of the four "normal forms" of the Unicode
standard: `normalform` can be `:NFC`, `:NFD`, `:NFKC`, or `:NFKD`. Normal forms C
(canonical composition) and D (canonical decomposition) convert different visually identical
representations of the same abstract string into a single canonical form, with form C being
more compact. Normal forms KC and KD additionally canonicalize "compatibility equivalents":
they convert characters that are abstractly similar but visually distinct into a single
canonical choice (e.g. they expand ligatures into the individual characters), with form KC
being more compact.
Alternatively, finer control and additional transformations may be be obtained by calling
`Unicode.normalize(s; keywords...)`, where any number of the following boolean keywords
options (which all default to `false` except for `compose`) are specified:
* `compose=false`: do not perform canonical composition
* `decompose=true`: do canonical decomposition instead of canonical composition
(`compose=true` is ignored if present)
* `compat=true`: compatibility equivalents are canonicalized
* `casefold=true`: perform Unicode case folding, e.g. for case-insensitive string comparison
* `newline2lf=true`, `newline2ls=true`, or `newline2ps=true`: convert various newline
sequences (LF, CRLF, CR, NEL) into a linefeed (LF), line-separation (LS), or
paragraph-separation (PS) character, respectively
* `stripmark=true`: strip diacritical marks (e.g. accents)
* `stripignore=true`: strip Unicode's "default ignorable" characters (e.g. the soft hyphen
or the left-to-right marker)
* `stripcc=true`: strip control characters; horizontal tabs and form feeds are converted to
spaces; newlines are also converted to spaces unless a newline-conversion flag was
specified
* `rejectna=true`: throw an error if unassigned code points are found
* `stable=true`: enforce Unicode Versioning Stability
For example, NFKC corresponds to the options `compose=true, compat=true, stable=true`.
# Examples
```jldoctest
julia> "μ" == Unicode.normalize("µ", compat=true) #LHS: Unicode U+03bc, RHS: Unicode U+00b5
true
julia> Unicode.normalize("JuLiA", casefold=true)
"julia"
julia> Unicode.normalize("JúLiA", stripmark=true)
"JuLiA"
```
"""
function normalize end
normalize(s::AbstractString, nf::Symbol) = Base.Unicode.normalize(s, nf)
normalize(s::AbstractString; kwargs...) = Base.Unicode.normalize(s; kwargs...)
"""
Unicode.isassigned(c) -> Bool
Returns `true` if the given char or integer is an assigned Unicode code point.
# Examples
```jldoctest
julia> Unicode.isassigned(101)
true
julia> Unicode.isassigned('\\x01')
true
```
"""
isassigned(c) = Base.Unicode.isassigned(c)
"""
graphemes(s::AbstractString) -> GraphemeIterator
Returns an iterator over substrings of `s` that correspond to the extended graphemes in the
string, as defined by Unicode UAX #29. (Roughly, these are what users would perceive as
single characters, even though they may contain more than one codepoint; for example a
letter combined with an accent mark is a single grapheme.)
"""
graphemes(s::AbstractString) = Base.Unicode.GraphemeIterator{typeof(s)}(s)
end