Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

char predicates using libmojibake #8110

Closed
wants to merge 10 commits into from
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ Library improvements

* Efficient `mean` and `median` for ranges ([#8089]).

* Character predicates such as `islower()`, `isspace()`, etc. use `utf8proc`/`libmojibake`
to provide uniform cross-platform behavior and up-to-date, locale-independent support
for Unicode standards ([#5939]).

Julia v0.3.0 Release Notes
==========================

Expand Down Expand Up @@ -901,6 +905,7 @@ Too numerous to mention.
[#5832]: https://github.com/JuliaLang/julia/issues/5832
[#5927]: https://github.com/JuliaLang/julia/issues/5927
[#5936]: https://github.com/JuliaLang/julia/issues/5936
[#5939]: https://github.com/JuliaLang/julia/issues/5939
[#5970]: https://github.com/JuliaLang/julia/issues/5970
[#6056]: https://github.com/JuliaLang/julia/issues/6056
[#6057]: https://github.com/JuliaLang/julia/issues/6057
Expand Down
6 changes: 5 additions & 1 deletion base/deprecated.jl
Original file line number Diff line number Diff line change
Expand Up @@ -168,4 +168,8 @@ scale!{T<:Base.LinAlg.BlasReal}(X::Array{T}, s::Complex) = error("scale!: Cannot

@deprecate rsplit(x,y,l::Integer,k::Bool) rsplit(x,y;limit=l,keep=k)
@deprecate rsplit(x,y,l::Integer) rsplit(x,y;limit=l)
@deprecate rsplit(x,y,k::Bool) rsplit(x,y;keep=k)
@deprecate rsplit(x,y,k::Bool) rsplit(x,y;keep=k)

# isblank is deprecated in favour of spelling out the space-or-tab check:
# directly on the character for a Char, and over every character for a String.
@deprecate isblank(c::Char) c == ' ' || c == '\t'
@deprecate isblank(s::String) all(c -> c == ' ' || c == '\t', s)

2 changes: 1 addition & 1 deletion base/exports.jl
Original file line number Diff line number Diff line change
Expand Up @@ -823,12 +823,12 @@ export
isalnum,
isalpha,
isascii,
isblank,
iscntrl,
isdigit,
isgraph,
islower,
ismatch,
isnumber,
isprint,
ispunct,
isspace,
Expand Down
16 changes: 2 additions & 14 deletions base/string.jl
Original file line number Diff line number Diff line change
Expand Up @@ -541,22 +541,10 @@ strwidth(s::String) = (w=0; for c in s; w += charwidth(c); end; w)
strwidth(s::ByteString) = int(ccall(:u8_strwidth, Csize_t, (Ptr{Uint8},), s.data))
# TODO: implement and use u8_strnwidth that takes a length argument

## libc character class predicates ##

isascii(c::Char) = c < 0x80
isascii(s::String) = all(isascii, s)
isascii(s::ASCIIString) = true

for name = ("alnum", "alpha", "cntrl", "digit", "graph",
"lower", "print", "punct", "space", "upper")
f = symbol(string("is",name))
@eval ($f)(c::Char) = bool(ccall($(string("isw",name)), Int32, (Cwchar_t,), c))
@eval $f(s::String) = all($f, s)
end

isblank(c::Char) = c==' ' || c=='\t'
isblank(s::String) = all(isblank, s)

## generic string uses only endof and next ##

immutable GenericString <: String
Expand Down Expand Up @@ -987,7 +975,7 @@ end
function indentation(s::String)
count = 0
for c in s
if isblank(c)
if c == ' ' || c == '\t'
count += blank_width(c)
else
return count, false
Expand All @@ -1005,7 +993,7 @@ function unindent(s::String, indent::Int)
cut = 0
while !done(s,i)
c,i_ = next(s,i)
if cutting && isblank(c)
if cutting && (c == ' ' || c == '\t')
a = i_
cut += blank_width(c)
if cut == indent
Expand Down
60 changes: 59 additions & 1 deletion base/utf8proc.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@ module UTF8proc
import Base: show, showcompact, ==, string, symbol, isless

# also exported by Base:
export normalize_string, is_valid_char, is_assigned_char
export normalize_string, is_valid_char, is_assigned_char,
islower, isupper, isalpha, isdigit, isnumber, isalnum,
iscntrl, ispunct, isspace, isprint, isgraph, isblank


# Report whether `c` is a valid Unicode code point, as decided by
# utf8proc_codepoint_valid (the argument is passed to C as an Int32).
function is_valid_char(c)
    verdict = ccall(:utf8proc_codepoint_valid, Cchar, (Int32,), c)
    return bool(verdict)
end
Expand Down Expand Up @@ -115,8 +118,63 @@ function category_code(c)
cat == 0 ? UTF8PROC_CATEGORY_CN : cat
end

# Raw category lookup used by the character class predicates.
# Unlike category_code(), the value read from the utf8proc property record is
# returned as-is: a stored 0 is not remapped to UTF8PROC_CATEGORY_CN, which
# saves a branch on this hot path.
function _catcode(c)
    if c > 0x10FFFF
        return 0x0000 # see utf8proc_get_property docs
    end
    return unsafe_load(ccall(:utf8proc_get_property, Ptr{Uint16}, (Int32,), c))
end

# A code point counts as assigned when category_code() reports anything other
# than UTF8PROC_CATEGORY_CN.
function is_assigned_char(c)
    return category_code(c) != UTF8PROC_CATEGORY_CN
end

# TODO: use UTF8PROC_CHARBOUND to extract graphemes from a string, e.g. to iterate over graphemes?

## libc character class predicates ##

# Lowercase test: true only when the category code of `c` is
# UTF8PROC_CATEGORY_LL.
function islower(c::Char)
    return _catcode(c) == UTF8PROC_CATEGORY_LL
end

# Uppercase test: accepts both UTF8PROC_CATEGORY_LU and UTF8PROC_CATEGORY_LT,
# so characters in the LT category are reported as uppercase too.
function isupper(c::Char)
    cat = _catcode(c)   # looked up once, compared against both categories
    if cat == UTF8PROC_CATEGORY_LU
        return true
    end
    return cat == UTF8PROC_CATEGORY_LT
end

# Alphabetic test: true when the category code of `c` lies in the inclusive
# range UTF8PROC_CATEGORY_LU .. UTF8PROC_CATEGORY_LO.
function isalpha(c::Char)
    cat = _catcode(c)
    return UTF8PROC_CATEGORY_LU <= cat && cat <= UTF8PROC_CATEGORY_LO
end

# Digit test restricted to the ASCII characters '0' through '9'; no Unicode
# category lookup is involved.
function isdigit(c::Char)
    return c >= '0' && c <= '9'
end

# Numeric test: true when the category code of `c` lies in the inclusive
# range UTF8PROC_CATEGORY_ND .. UTF8PROC_CATEGORY_NO.
function isnumber(c::Char)
    cat = _catcode(c)
    return UTF8PROC_CATEGORY_ND <= cat && cat <= UTF8PROC_CATEGORY_NO
end

# Alphanumeric test: true when the category code of `c` falls in either the
# letter range (LU..LO) or the number range (ND..NO). The category is looked
# up once and checked against both ranges.
function isalnum(c::Char)
    cat = _catcode(c)
    if UTF8PROC_CATEGORY_LU <= cat <= UTF8PROC_CATEGORY_LO
        return true
    end
    return UTF8PROC_CATEGORY_ND <= cat <= UTF8PROC_CATEGORY_NO
end

# Control-character test: true for U+0000 through U+001F and for U+007F
# through U+009F; no Unicode category lookup is involved.
# The bounds are written as Char literals, the same way isdigit and isspace
# compare characters, so `c` never has to be converted to an integer.
function iscntrl(c::Char)
    return c <= '\x1f' || '\x7f' <= c <= '\u9f'
end

# Punctuation test: true when the category code of `c` lies in the inclusive
# range UTF8PROC_CATEGORY_PC .. UTF8PROC_CATEGORY_PO.
function ispunct(c::Char)
    cat = _catcode(c)
    return UTF8PROC_CATEGORY_PC <= cat && cat <= UTF8PROC_CATEGORY_PO
end

# Whitespace test. True for:
#   * the ASCII space,
#   * the characters '\t' through '\r' (tab, newline, vertical tab, form feed,
#     carriage return),
#   * U+0085,
#   * any character whose category code is UTF8PROC_CATEGORY_ZS.
# Every fixed code point is compared as a Char literal, so the result does not
# depend on how a Char compares against an integer. The cheap literal checks
# come first so common ASCII whitespace never reaches the category lookup.
function isspace(c::Char)
    return c == ' ' || '\t' <= c <= '\r' || c == '\u85' ||
           _catcode(c) == UTF8PROC_CATEGORY_ZS
end

# Printable test: true when the category code of `c` lies in the inclusive
# range UTF8PROC_CATEGORY_LU .. UTF8PROC_CATEGORY_ZS.
function isprint(c::Char)
    cat = _catcode(c)
    return UTF8PROC_CATEGORY_LU <= cat && cat <= UTF8PROC_CATEGORY_ZS
end
# Graphic test: true when the category code of `c` lies in the inclusive
# range UTF8PROC_CATEGORY_LU .. UTF8PROC_CATEGORY_SO.
function isgraph(c::Char)
    cat = _catcode(c)
    return UTF8PROC_CATEGORY_LU <= cat && cat <= UTF8PROC_CATEGORY_SO
end

# Lift every per-character predicate to strings. The generated String method
# stops at the first character that fails the predicate and returns true when
# no character fails (which includes the empty string).
for cls = ("alnum", "alpha", "cntrl", "digit", "number", "graph",
           "lower", "print", "punct", "space", "upper")
    pred = symbol(string("is", cls))
    @eval function $pred(s::String)
        for ch in s
            $pred(ch) || return false
        end
        return true
    end
end


end # module
137 changes: 137 additions & 0 deletions test/strings.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1105,3 +1105,140 @@ let
@test srep[7] == 'β'
@test_throws BoundsError srep[8]
end

# issue #5939: utf8proc/libmojibake character predicates
let
    # --- letters -----------------------------------------------------------
    alower=['a', 'd', 'j', 'y', 'z']
    ulower=['α', 'β', 'γ', 'δ', 'ф', 'я']
    for c in vcat(alower,ulower)
        @test islower(c) == true
        @test isupper(c) == false
        @test isdigit(c) == false
        @test isnumber(c) == false
    end

    aupper=['A', 'D', 'J', 'Y', 'Z']
    uupper= ['Δ', 'Γ', 'Π', 'Ψ', 'Dž', 'Ж', 'Д']

    for c in vcat(aupper,uupper)
        @test islower(c) == false
        @test isupper(c) == true
        @test isdigit(c) == false
        @test isnumber(c) == false
    end

    # letters from scripts without a case distinction
    nocase=['電', '仮', 'ऊ', 'א','ﺵ']
    alphas=vcat(alower,ulower,aupper,uupper,nocase)

    for c in alphas
        @test isalpha(c) == true
        @test isnumber(c) == false
    end

    # --- numbers -----------------------------------------------------------
    # isdigit is ASCII-only, isnumber also accepts non-ASCII numerals
    anumber=['0', '1', '5', '9']
    unumber=['٣', '٥', '٨', '೬', '¹', 'ⅳ' ]

    for c in anumber
        @test isdigit(c) == true
        @test isnumber(c) == true
    end
    for c in unumber
        @test isdigit(c) == false
        @test isnumber(c) == true
    end

    alnums=vcat(alphas,anumber,unumber)
    for c in alnums
        @test isalnum(c) == true
        @test ispunct(c) == false
    end

    # --- symbols and punctuation --------------------------------------------
    asymbol = ['(',')', '~', '$' ]
    usymbol = ['∪', '∩', '⊂', '⊃', '√', '€', '¥', '↰', '△', '§']

    apunct =['.',',',';',':','&']
    upunct =['‡', '؟', '჻' ]

    for c in vcat(apunct,upunct)
        @test ispunct(c) == true
        @test isalnum(c) == false
    end

    for c in vcat(alnums,asymbol,usymbol,apunct,upunct)
        @test isprint(c) == true
        @test isgraph(c) == true
        @test isspace(c) == false
        @test iscntrl(c) == false
    end

    # --- whitespace ----------------------------------------------------------
    NBSP = char(0x0000A0)
    ENSPACE = char(0x002002)
    EMSPACE = char(0x002003)
    THINSPACE = char(0x002009)
    ZWSPACE = char(0x00200B)     # U+200B ZERO WIDTH SPACE
    WORDJOINER = char(0x002060)  # U+2060 WORD JOINER (zero-width, but not a space)

    uspace = [NBSP, ENSPACE, EMSPACE, THINSPACE]
    aspace = [' ']
    acntrl_space = ['\t', '\n', '\v', '\f', '\r']
    # visible-width spaces: whitespace and printable, but not graphic
    for c in vcat(aspace,uspace)
        @test isspace(c) == true
        @test isprint(c) == true
        @test isgraph(c) == false
    end

    # ASCII control characters that count as whitespace are not printable
    for c in acntrl_space
        @test isspace(c) == true
        @test isprint(c) == false
        @test isgraph(c) == false
    end

    # zero-width format characters must not be reported as whitespace
    @test isspace(ZWSPACE) == false
    @test isspace(WORDJOINER) == false

    # --- control characters --------------------------------------------------
    acontrol = [ char(0x001c), char(0x001d), char(0x001e), char(0x001f)]
    latincontrol = [ char(0x0080), char(0x0085) ]
    ucontrol = [ char(0x200E), char(0x202E) ]

    for c in vcat(acontrol, acntrl_space, latincontrol)
        @test iscntrl(c) == true
        @test isalnum(c) == false
        @test isprint(c) == false
        @test isgraph(c) == false
    end

    # directional marks outside Latin-1: neither control nor space nor printable
    for c in ucontrol
        @test iscntrl(c) == false
        @test isspace(c) == false
        @test isalnum(c) == false
        @test isprint(c) == false
        @test isgraph(c) == false
    end

end

# String methods: a predicate holds for a string only when it holds for every
# one of its characters.

# mixed spaces and ASCII control whitespace
let blanks = " \t \n \r "
    @test isspace(blanks)
    @test !isgraph(blanks)
    @test !isprint(blanks)
    @test !isalpha(blanks)
    @test !isnumber(blanks)
    @test !ispunct(blanks)
end

# Greek letters of mixed case
let greek = "ΣβΣβ"
    @test !isspace(greek)
    @test isalpha(greek)
    @test isgraph(greek)
    @test isprint(greek)
    @test !isupper(greek)
    @test !islower(greek)
    @test !isnumber(greek)
    @test !iscntrl(greek)
    @test !ispunct(greek)
end

# ASCII digits
let digits = "23435"
    @test isnumber(digits)
    @test isdigit(digits)
    @test isalnum(digits)
    @test !isalpha(digits)
end

@test iscntrl(string(char(0x0080)))
@test ispunct("‡؟჻")