JuliaLang · catawbasam · Aug 24, 2014 · Aug 25, 2014 · Aug 25, 2014 · Aug 25, 2014
diff --git a/NEWS.md b/NEWS.md
@@ -27,6 +27,10 @@ Library improvements
 
   * Efficient `mean` and `median` for ranges ([#8089]).
 
+  * Character predicates such as `islower()`, `isspace()`, etc. use `utf8proc`/`libmojibake`
+    to provide uniform cross-platform behavior and up-to-date, locale-independent support
+    for Unicode standards ([#5939]).
+
 Julia v0.3.0 Release Notes
 ==========================
 
@@ -901,6 +905,7 @@ Too numerous to mention.
 [#5832]: https://github.com/JuliaLang/julia/issues/5832
 [#5927]: https://github.com/JuliaLang/julia/issues/5927
 [#5936]: https://github.com/JuliaLang/julia/issues/5936
+[#5939]: https://github.com/JuliaLang/julia/issues/5939
 [#5970]: https://github.com/JuliaLang/julia/issues/5970
 [#6056]: https://github.com/JuliaLang/julia/issues/6056
 [#6057]: https://github.com/JuliaLang/julia/issues/6057

diff --git a/base/deprecated.jl b/base/deprecated.jl
@@ -168,4 +168,8 @@ scale!{T<:Base.LinAlg.BlasReal}(X::Array{T}, s::Complex) = error("scale!: Cannot
 
 @deprecate rsplit(x,y,l::Integer,k::Bool) rsplit(x,y;limit=l,keep=k)
 @deprecate rsplit(x,y,l::Integer) rsplit(x,y;limit=l)
-@deprecate rsplit(x,y,k::Bool) rsplit(x,y;keep=k)
+@deprecate rsplit(x,y,k::Bool) rsplit(x,y;keep=k)
+
+@deprecate isblank(c::Char) c == ' ' || c == '\t'
+@deprecate isblank(s::String) all(c -> c == ' ' || c == '\t', s)
+
diff --git a/base/exports.jl b/base/exports.jl
@@ -823,12 +823,12 @@ export
     isalnum,
     isalpha,
     isascii,
-    isblank,
     iscntrl,
     isdigit,
     isgraph,
     islower,
     ismatch,
+    isnumber,
     isprint,
     ispunct,
     isspace,

diff --git a/base/string.jl b/base/string.jl
@@ -541,22 +541,10 @@ strwidth(s::String) = (w=0; for c in s; w += charwidth(c); end; w)
 strwidth(s::ByteString) = int(ccall(:u8_strwidth, Csize_t, (Ptr{Uint8},), s.data))
 # TODO: implement and use u8_strnwidth that takes a length argument
 
-## libc character class predicates ##
-
 isascii(c::Char) = c < 0x80
 isascii(s::String) = all(isascii, s)
 isascii(s::ASCIIString) = true
 
-for name = ("alnum", "alpha", "cntrl", "digit", "graph",
-            "lower", "print", "punct", "space", "upper")
-    f = symbol(string("is",name))
-    @eval ($f)(c::Char) = bool(ccall($(string("isw",name)), Int32, (Cwchar_t,), c))
-    @eval $f(s::String) = all($f, s)
-end
-
-isblank(c::Char) = c==' ' || c=='\t'
-isblank(s::String) = all(isblank, s)
-
 ## generic string uses only endof and next ##
 
 immutable GenericString <: String
@@ -987,7 +975,7 @@ end
 function indentation(s::String)
     count = 0
     for c in s
-        if isblank(c)
+        if c == ' ' || c == '\t'
             count += blank_width(c)
         else
             return count, false
@@ -1005,7 +993,7 @@ function unindent(s::String, indent::Int)
     cut = 0
     while !done(s,i)
         c,i_ = next(s,i)
-        if cutting && isblank(c)
+        if cutting && (c == ' ' || c == '\t')
             a = i_
             cut += blank_width(c)
             if cut == indent

diff --git a/base/utf8proc.jl b/base/utf8proc.jl
@@ -4,7 +4,10 @@ module UTF8proc
 import Base: show, showcompact, ==, string, symbol, isless
 
 # also exported by Base:
-export normalize_string, is_valid_char, is_assigned_char
+export normalize_string, is_valid_char, is_assigned_char,
+   islower, isupper, isalpha, isdigit, isnumber, isalnum,
+   iscntrl, ispunct, isspace, isprint, isgraph
+
 
 # whether codepoints are valid Unicode
 is_valid_char(c) = bool(ccall(:utf8proc_codepoint_valid, Cchar, (Int32,), c))
@@ -115,8 +118,63 @@ function category_code(c)
     cat == 0 ? UTF8PROC_CATEGORY_CN : cat
 end
 
+# Like category_code(), but without remapping category 0 to UTF8PROC_CATEGORY_CN;
+# the character class predicates use it to skip that extra check for speed.
+function _catcode(c)
+    c > 0x10FFFF && return 0x0000 # see utf8proc_get_property docs
+    return unsafe_load(ccall(:utf8proc_get_property, Ptr{Uint16}, (Int32,), c))
+end
+
 is_assigned_char(c) = category_code(c) != UTF8PROC_CATEGORY_CN
 
 # TODO: use UTF8PROC_CHARBOUND to extract graphemes from a string, e.g. to iterate over graphemes?
 
+## character class predicates (locale-independent; replace the libc isw* calls) ##
+
+islower(c::Char) = (_catcode(c)==UTF8PROC_CATEGORY_LL)
+
+function isupper(c::Char)  # uppercase (LU) or titlecase (LT) letter
+    cc = _catcode(c)
+    return cc == UTF8PROC_CATEGORY_LU || cc == UTF8PROC_CATEGORY_LT
+end
+
+isalpha(c::Char) = (UTF8PROC_CATEGORY_LU <= _catcode(c) <=
+                                            UTF8PROC_CATEGORY_LO)
+
+isdigit(c::Char) = ('0' <= c <= '9')
+
+isnumber(c::Char) = (UTF8PROC_CATEGORY_ND <= _catcode(c) <=
+                                            UTF8PROC_CATEGORY_NO)
+
+function isalnum(c::Char)  # letter (LU..LO) or number (ND..NO) category
+    cc = _catcode(c)
+    (UTF8PROC_CATEGORY_LU <= cc <= UTF8PROC_CATEGORY_LO) && return true
+    return UTF8PROC_CATEGORY_ND <= cc <= UTF8PROC_CATEGORY_NO
+end
+
+iscntrl(c::Char) = (uint(c)<= 0x1f || 0x7f<=uint(c)<=0x9f)
+
+ispunct(c::Char) = (UTF8PROC_CATEGORY_PC <=_catcode(c) <= UTF8PROC_CATEGORY_PO)
+
+isspace(c::Char) = c==' ' || '\t'<=c<='\r' || c==0x85 || _catcode(c)==UTF8PROC_CATEGORY_ZS
+
+isprint(c::Char) = (UTF8PROC_CATEGORY_LU <= _catcode(c) <= UTF8PROC_CATEGORY_ZS)
+isgraph(c::Char) = (UTF8PROC_CATEGORY_LU <= _catcode(c) <= UTF8PROC_CATEGORY_SO)
+
+# Generate a String method for each predicate: true iff every character
+# of the string satisfies the Char method (so true for the empty string).
+for suffix in ("alnum", "alpha", "cntrl", "digit", "number", "graph",
+               "lower", "print", "punct", "space", "upper")
+    pred = symbol(string("is", suffix))
+    @eval begin
+        function $pred(s::String)
+            for ch in s
+                $pred(ch) || return false
+            end
+            return true
+        end
+    end
+end
+
+
 end # module
diff --git a/test/strings.jl b/test/strings.jl
@@ -1105,3 +1105,140 @@ let
     @test srep[7] == 'β'
     @test_throws BoundsError srep[8]
 end
+
+#issue #5939  utf8proc/libmojibake character predicates
+let
+    alower=['a', 'd', 'j', 'y', 'z']
+    ulower=['α', 'β', 'γ', 'δ', 'ф', 'я']
+    for c in vcat(alower,ulower)
+        @test islower(c) == true
+        @test isupper(c) == false
+        @test isdigit(c) == false
+        @test isnumber(c) == false
+    end
+
+    aupper=['A', 'D', 'J', 'Y', 'Z']
+    uupper= ['Δ', 'Γ', 'Π', 'Ψ', 'ǅ', 'Ж', 'Д']
+
+    for c in vcat(aupper,uupper)
+        @test islower(c) == false
+        @test isupper(c) == true
+        @test isdigit(c) == false
+        @test isnumber(c) == false
+    end
+
+    nocase=['電', '仮', 'ऊ', 'א','ﺵ']
+    alphas=vcat(alower,ulower,aupper,uupper,nocase)
+
+    for c in alphas
+         @test isalpha(c) == true
+         @test isnumber(c) == false
+    end
+
+
+    anumber=['0', '1', '5', '9']
+    unumber=['٣', '٥', '٨', '೬', '¹', 'ⅳ' ]
+
+    for c in anumber
+         @test isdigit(c) == true
+         @test isnumber(c) == true
+    end
+    for c in unumber
+         @test isdigit(c) == false
+         @test isnumber(c) == true
+    end
+
+    alnums=vcat(alphas,anumber,unumber)
+    for c in alnums
+         @test isalnum(c) == true
+         @test ispunct(c) == false
+    end
+
+    asymbol = ['(',')', '~', '$' ]
+    usymbol = ['∪', '∩', '⊂', '⊃', '√', '€', '¥', '↰', '△', '§']
+
+    apunct =['.',',',';',':','&']
+    upunct =['‡', '؟', '჻' ]
+
+    for c in vcat(apunct,upunct)
+         @test ispunct(c) == true
+         @test isalnum(c) == false
+    end
+
+    for c in vcat(alnums,asymbol,usymbol,apunct,upunct)
+        @test isprint(c) == true
+        @test isgraph(c) == true
+        @test isspace(c) == false
+        @test iscntrl(c) == false
+    end
+
+    NBSP = char(0x0000A0)      # U+00A0 NO-BREAK SPACE
+    ENSPACE = char(0x002002)   # U+2002 EN SPACE
+    EMSPACE = char(0x002003)   # U+2003 EM SPACE
+    THINSPACE = char(0x002009) # U+2009 THIN SPACE
+    ZWSPACE = char(0x00200B)   # U+200B ZERO WIDTH SPACE (U+2060 is WORD JOINER)
+
+    uspace = [ENSPACE, EMSPACE, THINSPACE]
+    aspace = [' ']
+    acntrl_space = ['\t', '\n', '\v', '\f', '\r']
+    for c in vcat(aspace,uspace)
+        #println(c," ",uint(c),":  ",category_code_assigned(c))
+        @test isspace(c) == true
+        @test isprint(c) == true
+        @test isgraph(c) == false
+    end
+
+    for c in vcat(acntrl_space)
+        @test isspace(c) == true
+        @test isprint(c) == false
+        @test isgraph(c) == false
+    end
+
+    @test isspace(ZWSPACE) == false # zero-width space
+
+    acontrol = [ char(0x001c), char(0x001d), char(0x001e), char(0x001f)]
+    latincontrol = [ char(0x0080), char(0x0085) ]
+    ucontrol = [ char(0x200E), char(0x202E) ]
+
+    for c in vcat(acontrol, acntrl_space, latincontrol)
+        @test iscntrl(c) == true
+        @test isalnum(c) == false
+        @test isprint(c) == false
+        @test isgraph(c) == false
+    end
+
+    for c in ucontrol  #non-latin1 controls
+        if c!=char(0x0085)
+            @test iscntrl(c) == false
+            @test isspace(c) == false
+            @test isalnum(c) == false
+            @test isprint(c) == false
+            @test isgraph(c) == false
+        end
+    end
+
+end
+
+@test isspace("  \t   \n   \r  ")==true
+@test isgraph("  \t   \n   \r  ")==false
+@test isprint("  \t   \n   \r  ")==false
+@test isalpha("  \t   \n   \r  ")==false
+@test isnumber("  \t   \n   \r  ")==false
+@test ispunct("  \t   \n   \r  ")==false
+
+@test isspace("ΣβΣβ")==false
+@test isalpha("ΣβΣβ")==true
+@test isgraph("ΣβΣβ")==true
+@test isprint("ΣβΣβ")==true
+@test isupper("ΣβΣβ")==false
+@test islower("ΣβΣβ")==false
+@test isnumber("ΣβΣβ")==false
+@test iscntrl("ΣβΣβ")==false
+@test ispunct("ΣβΣβ")==false
+
+@test isnumber("23435")==true
+@test isdigit("23435")==true
+@test isalnum("23435")==true
+@test isalpha("23435")==false
+@test iscntrl( string(char(0x0080))) == true
+@test ispunct( "‡؟჻") ==true