JuliaLang · StefanKarpinski · Jul 11, 2016 · May 25, 2016 · Jul 2, 2016 · Jul 6, 2016
diff --git a/base/deprecated.jl b/base/deprecated.jl
@@ -488,16 +488,6 @@ end
     end
 )
 
-if sizeof(Cwchar_t) == 2
-    @deprecate_binding WString UTF16String
-    @deprecate_binding wstring utf16
-    utf16(s::Cwstring) = utf16(convert(Ptr{Cwchar_t}, s))
-elseif sizeof(Cwchar_t) == 4
-    @deprecate_binding WString UTF32String
-    @deprecate_binding wstring utf32
-    utf32(s::Cwstring) = utf32(convert(Ptr{Cwchar_t}, s))
-end
-
 @deprecate ==(x::Char, y::Integer) UInt32(x) == y
 @deprecate ==(x::Integer, y::Char) x == UInt32(y)
 @deprecate isless(x::Char, y::Integer) UInt32(x) < y

diff --git a/base/docs/helpdb/Base.jl b/base/docs/helpdb/Base.jl
@@ -95,32 +95,6 @@ Get the step size of a [`Range`](:obj:`Range`) object.
 """
 step
 
-"""
-    utf32(s)
-
-Create a UTF-32 string from a byte array, array of `Char` or `UInt32`, or any other string
-type. (Conversions of byte arrays check for a byte-order marker in the first four bytes, and
-do not include it in the resulting string.)
-
-Note that the resulting `UTF32String` data is terminated by the NUL codepoint (32-bit zero),
-which is not treated as a character in the string (so that it is mostly invisible in Julia);
-this allows the string to be passed directly to external functions requiring NUL-terminated
-data. This NUL is appended automatically by the `utf32(s)` conversion function. If you have
-a `Char` or `UInt32` array `A` that is already NUL-terminated UTF-32 data, then you can
-instead use `UTF32String(A)` to construct the string without making a copy of the data and
-treating the NUL as a terminator rather than as part of the string.
-"""
-utf32(s)
-
-"""
-    utf32(::Union{Ptr{Char},Ptr{UInt32},Ptr{Int32}} [, length])
-
-Create a string from the address of a NUL-terminated UTF-32 string. A copy is made; the
-pointer can be safely freed. If `length` is specified, the string does not have to be
-NUL-terminated.
-"""
-utf32(::Union{Ptr{Char},Ptr{UInt32},Ptr{Int32}}, length=?)
-
 """
     takebuf_array(b::IOBuffer)
 
@@ -3620,32 +3594,6 @@ Compute ``\\sin(\\pi x) / (\\pi x)`` if ``x \\neq 0``, and ``1`` if ``x = 0``.
 """
 sinc
 
-"""
-    utf16(s)
-
-Create a UTF-16 string from a byte array, array of `UInt16`, or any other string type. (Data
-must be valid UTF-16. Conversions of byte arrays check for a byte-order marker in the first
-two bytes, and do not include it in the resulting string.)
-
-Note that the resulting `UTF16String` data is terminated by the NUL codepoint (16-bit zero),
-which is not treated as a character in the string (so that it is mostly invisible in Julia);
-this allows the string to be passed directly to external functions requiring NUL-terminated
-data. This NUL is appended automatically by the `utf16(s)` conversion function. If you have
-a `UInt16` array `A` that is already NUL-terminated valid UTF-16 data, then you can instead
-use `UTF16String(A)` to construct the string without making a copy of the data and treating
-the NUL as a terminator rather than as part of the string.
-"""
-utf16(s)
-
-"""
-    utf16(::Union{Ptr{UInt16},Ptr{Int16}} [, length])
-
-Create a string from the address of a NUL-terminated UTF-16 string. A copy is made; the
-pointer can be safely freed. If `length` is specified, the string does not have to be
-NUL-terminated.
-"""
-utf16(::Union{Ptr{UInt16},Ptr{Int16}}, length=?)
-
 """
     median(v[, region])
 
@@ -8908,19 +8856,17 @@ vecnorm
 """
     isvalid(value) -> Bool
 
-Returns `true` if the given value is valid for its type, which currently can be one of
-`Char`, `String`, `UTF16String`, or `UTF32String`.
+Returns `true` if the given value is valid for its type, which currently can be either
+`Char` or `String`.
 """
 isvalid(value)
 
 """
     isvalid(T, value) -> Bool
 
-Returns `true` if the given value is valid for that type. Types currently can be `Char`,
-`String`, `UTF16String`, or `UTF32String` Values for `Char` can be of
-type `Char` or `UInt32` Values for `String` can be of that type, or
-`Vector{UInt8}` Values for `UTF16String` can be `UTF16String` or `Vector{UInt16}` Values for
-`UTF32String` can be `UTF32String`, `Vector{Char}` or `Vector{UInt32}`
+Returns `true` if the given value is valid for that type. Types currently can
+be either `Char` or `String`. Values for `Char` can be of type `Char` or `UInt32`.
+Values for `String` can be of that type, or `Vector{UInt8}`.
 """
 isvalid(T,value)
 

diff --git a/base/exports.jl b/base/exports.jl
@@ -119,8 +119,6 @@ export
     Tridiagonal,
     UnitRange,
     UpperTriangular,
-    UTF16String,
-    UTF32String,
     Val,
     VecOrMat,
     Vector,
@@ -878,8 +876,6 @@ export
     ucfirst,
     unescape_string,
     uppercase,
-    utf16,
-    utf32,
     warn,
 
 # random numbers

diff --git a/base/replutil.jl b/base/replutil.jl
@@ -233,14 +233,24 @@ end
 showerror(io::IO, ::DivideError) = print(io, "DivideError: integer division error")
 showerror(io::IO, ::StackOverflowError) = print(io, "StackOverflowError:")
 showerror(io::IO, ::UndefRefError) = print(io, "UndefRefError: access to undefined reference")
-showerror(io::IO, ex::UndefVarError) = print(io, "UndefVarError: $(ex.var) not defined")
 showerror(io::IO, ::EOFError) = print(io, "EOFError: read end of file")
 showerror(io::IO, ex::ErrorException) = print(io, ex.msg)
 showerror(io::IO, ex::KeyError) = print(io, "KeyError: key $(repr(ex.key)) not found")
 showerror(io::IO, ex::InterruptException) = print(io, "InterruptException:")
 showerror(io::IO, ex::ArgumentError) = print(io, "ArgumentError: $(ex.msg)")
 showerror(io::IO, ex::AssertionError) = print(io, "AssertionError: $(ex.msg)")
 
+# Show an `UndefVarError`; names that moved to LegacyStrings.jl get an install hint.
+function showerror(io::IO, ex::UndefVarError)
+    moved = (:UTF16String, :UTF32String, :WString, :utf16, :utf32, :wstring)
+    ex.var in moved || return print(io, "UndefVarError: $(ex.var) not defined")
+    print(io, """
+    `$(ex.var)` has been moved to the package LegacyStrings.jl:
+    Run Pkg.add("LegacyStrings") to install LegacyStrings on Julia v0.5-;
+    Then do `using LegacyStrings` to get `$(ex.var)`.
+    """)
+end
+
 function showerror(io::IO, ex::MethodError)
     # ex.args is a tuple type if it was thrown from `invoke` and is
     # a tuple of the arguments otherwise.

diff --git a/base/serialize.jl b/base/serialize.jl
@@ -21,8 +21,7 @@ const TAGS = Any[
     Symbol, Tuple, Expr,  # dummy entries, intentionally shadowed by earlier ones
     LineNumberNode, Slot, LabelNode, GotoNode,
     QuoteNode, :reserved23 #=was TopNode=#, TypeVar, Core.Box, LambdaInfo,
-    Module, #=UndefRefTag=#Symbol, Task, String,
-    UTF16String, UTF32String, Float16,
+    Module, #=UndefRefTag=#Symbol, Task, String, Float16,
     SimpleVector, #=BackrefTag=#Symbol, Method, GlobalRef,
 
     (), Bool, Any, :Any, Bottom, :reserved21, :reserved22, Type,
@@ -42,7 +41,7 @@ const TAGS = Any[
     28, 29, 30, 31, 32
 ]
 
-const ser_version = 3 # do not make changes without bumping the version #!
+const ser_version = 4 # do not make changes without bumping the version #!
 
 const NTAGS = length(TAGS)
 

diff --git a/base/strings/errors.jl b/base/strings/errors.jl
@@ -3,23 +3,7 @@
 ##    Error messages for Unicode / UTF support
 
 const UTF_ERR_SHORT             = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> missing one or more continuation bytes)"
-const UTF_ERR_CONT              = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> is not a continuation byte)"
-const UTF_ERR_LONG              = "invalid UTF-8 sequence, overlong encoding starting at index <<1>> (0x<<2>>)"
-const UTF_ERR_NOT_LEAD          = "not a leading Unicode surrogate code unit at index <<1>> (0x<<2>>)"
-const UTF_ERR_NOT_TRAIL         = "not a trailing Unicode surrogate code unit at index <<1>> (0x<<2>>)"
-const UTF_ERR_NOT_SURROGATE     = "not a valid Unicode surrogate code unit at index <<1>> (0x<<2>>)"
-const UTF_ERR_MISSING_SURROGATE = "missing trailing Unicode surrogate code unit after index <<1>> (0x<<2>>)"
-const UTF_ERR_INVALID           = "invalid Unicode character starting at index <<1>> (0x<<2>> > 0x10ffff)"
-const UTF_ERR_SURROGATE         = "surrogate encoding not allowed in UTF-8 or UTF-32, at index <<1>> (0x<<2>>)"
-const UTF_ERR_NULL_16_TERMINATE = "UTF16String data must be NULL-terminated"
-const UTF_ERR_NULL_32_TERMINATE = "UTF32String data must be NULL-terminated"
-const UTF_ERR_ODD_BYTES_16      = "UTF16String can't have odd number of bytes <<1>>"
-const UTF_ERR_ODD_BYTES_32      = "UTF32String must have multiple of 4 bytes <<1>>"
-const UTF_ERR_INVALID_CHAR      = "invalid Unicode character (0x<<2>> > 0x10ffff)"
-const UTF_ERR_INVALID_8         = "invalid UTF-8 data"
-const UTF_ERR_INVALID_16        = "invalid UTF-16 data"
 const UTF_ERR_INVALID_INDEX     = "invalid character index"
-const UTF_ERR_MAP_CHAR          = "map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead"
 
 type UnicodeError <: Exception
     errmsg::AbstractString      ##< A UTF_ERR_ message

diff --git a/base/strings/io.jl b/base/strings/io.jl
@@ -324,3 +324,22 @@ function unindent(str::AbstractString, indent::Int; tabwidth=8)
     end
     takebuf_string(buf)
 end
+
+# Build a `String` from `chars`, merging UTF-16 surrogate pairs into single code points.
+function convert(::Type{String}, chars::AbstractVector{Char})
+    sprint(length(chars), io->begin
+        state = start(chars)
+        while !done(chars, state)
+            c, state = next(chars, state)
+            # After a lead surrogate, peek ahead (if anything is left) without consuming.
+            if '\ud7ff' < c && c + 1024 < '\ue000' && !done(chars, state)
+                d, peeked = next(chars, state)
+                if '\ud7ff' < d - 1024 && d < '\ue000'
+                    c = Char(0x10000 + ((UInt32(c) & 0x03ff) << 10) | (UInt32(d) & 0x03ff))
+                    state = peeked  # trail surrogate: consume it as part of the pair
+                end
+            end
+            write(io, c)
+        end
+    end)
+end
diff --git a/base/strings/string.jl b/base/strings/string.jl
@@ -33,6 +33,8 @@ const utf8_trailing = [
 
 ## required core functionality ##
 
+is_valid_continuation(c) = (c & 0xc0) == 0x80  # true when the top two bits are 10
+
 function endof(s::String)
     d = s.data
     i = length(d)
@@ -102,7 +104,7 @@ function first_utf8_byte(ch::Char)
 end
 
 function reverseind(s::String, i::Integer)
-    j = lastidx(s) + 1 - i
+    j = length(s.data) + 1 - i
     d = s.data
     while is_valid_continuation(d[j])
         j -= 1
@@ -114,8 +116,6 @@ end
 
 sizeof(s::String) = sizeof(s.data)
 
-lastidx(s::String) = length(s.data)
-
 isvalid(s::String, i::Integer) =
     (1 <= i <= endof(s.data)) && !is_valid_continuation(s.data[i])
 
@@ -239,109 +239,10 @@ function reverse(s::String)
     String(buf)
 end
 
-## outputting UTF-8 strings ##
-
 write(io::IO, s::String) = write(io, s.data)
 
 pointer(x::String) = pointer(x.data)
 pointer(x::String, i::Integer) = pointer(x.data)+(i-1)
 
-## transcoding to UTF-8 ##
-
 convert(::Type{String}, s::String) = s
-
-function convert(::Type{String}, dat::Vector{UInt8})
-    # handle zero length string quickly
-    isempty(dat) && return empty_utf8
-    # get number of bytes to allocate
-    len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(dat)
-    if (flags & (UTF_LONG | UTF_SURROGATE)) == 0
-        len = sizeof(dat)
-        @inbounds return String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
-    end
-    # Copy, but eliminate over-long encodings and surrogate pairs
-    len += num2byte + num3byte*2 + num4byte*3
-    buf = Vector{UInt8}(len)
-    out = 0
-    pos = 0
-    @inbounds while out < len
-        ch::UInt32 = dat[pos += 1]
-        # Handle ASCII characters
-        if ch <= 0x7f
-            buf[out += 1] = ch
-        # Handle overlong < 0x100
-        elseif ch < 0xc2
-            buf[out += 1] = ((ch & 3) << 6) | (dat[pos += 1] & 0x3f)
-        # Handle 0x100-0x7ff
-        elseif ch < 0xe0
-            buf[out += 1] = ch
-            buf[out += 1] = dat[pos += 1]
-        elseif ch != 0xed
-            buf[out += 1] = ch
-            buf[out += 1] = dat[pos += 1]
-            buf[out += 1] = dat[pos += 1]
-            # Copy 4-byte encoded value
-            ch >= 0xf0 && (buf[out += 1] = dat[pos += 1])
-        # Handle surrogate pairs
-        else
-            ch = dat[pos += 1]
-            if ch < 0xa0 # not surrogate pairs
-                buf[out += 1] = 0xed
-                buf[out += 1] = ch
-                buf[out += 1] = dat[pos += 1]
-            else
-                # Pick up surrogate pairs (CESU-8 format)
-                ch = ((((((ch & 0x3f) << 6) | (dat[pos + 1] & 0x3f)) << 10)
-                       + (((dat[pos + 3] & 0x3f)%UInt32 << 6) | (dat[pos + 4] & 0x3f)))
-                      - 0x01f0c00)
-                pos += 4
-                output_utf8_4byte!(buf, out, ch)
-                out += 4
-            end
-        end
-    end
-    String(buf)
-end
-
-"""
-Converts an already validated vector of `UInt16` or `UInt32` to a `String`
-
-Input Arguments:
-
-* `dat` Vector of code units (`UInt16` or `UInt32`), explicit `\0` is not converted
-* `len` length of output in bytes
-
-Returns:
-
-* `String`
-"""
-function encode_to_utf8{T<:Union{UInt16, UInt32}}(::Type{T}, dat, len)
-    buf = Vector{UInt8}(len)
-    out = 0
-    pos = 0
-    @inbounds while out < len
-        ch::UInt32 = dat[pos += 1]
-        # Handle ASCII characters
-        if ch <= 0x7f
-            buf[out += 1] = ch
-        # Handle 0x80-0x7ff
-        elseif ch < 0x800
-            buf[out += 1] = 0xc0 | (ch >>> 6)
-            buf[out += 1] = 0x80 | (ch & 0x3f)
-        # Handle 0x10000-0x10ffff (if input is UInt32)
-        elseif ch > 0xffff # this is only for T == UInt32, should not be generated for UInt16
-            output_utf8_4byte!(buf, out, ch)
-            out += 4
-        # Handle surrogate pairs
-        elseif is_surrogate_codeunit(ch)
-            output_utf8_4byte!(buf, out, get_supplementary(ch, dat[pos += 1]))
-            out += 4
-        # Handle 0x800-0xd7ff, 0xe000-0xffff UCS-2 characters
-        else
-            buf[out += 1] = 0xe0 | ((ch >>> 12) & 0x3f)
-            buf[out += 1] = 0x80 | ((ch >>> 6) & 0x3f)
-            buf[out += 1] = 0x80 | (ch & 0x3f)
-        end
-    end
-    String(buf)
-end
+convert(::Type{String}, bytes::Vector{UInt8}) = String(bytes)  # defer to the String constructor
diff --git a/base/strings/types.jl b/base/strings/types.jl
@@ -118,12 +118,11 @@ reverse(s::RevString) = s.string
 
 ## reverse an index i so that reverse(s)[i] == s[reverseind(s,i)]
 
+reverseind(s::AbstractString, i) = chr2ind(s, length(s) + 1 - ind2chr(reverse(s), i))  # generic fallback
 reverseind(s::Union{DirectIndexString,SubString{DirectIndexString}}, i::Integer) = length(s) + 1 - i
 reverseind(s::RevString, i::Integer) = endof(s) - i + 1
-lastidx(s::AbstractString) = nextind(s, endof(s)) - 1
-lastidx(s::DirectIndexString) = length(s)
-reverseind(s::SubString, i::Integer) =
-    reverseind(s.string, lastidx(s.string)-s.offset-s.endof+i) - s.offset
+reverseind(s::SubString{String}, i::Integer) =  # reverse the index in `s.string`, then subtract `s.offset`
+    reverseind(s.string, nextind(s.string, endof(s.string))-s.offset-s.endof+i-1) - s.offset
 
 ## efficient representation of repeated strings ##
 

diff --git a/base/sysimg.jl b/base/sysimg.jl
@@ -145,7 +145,6 @@ include("iobuffer.jl")
 include("char.jl")
 include("intfuncs.jl")
 include("strings/strings.jl")
-include("unicode/unicode.jl")
 include("parse.jl")
 include("shell.jl")
 include("regex.jl")