-
-
Notifications
You must be signed in to change notification settings - Fork 5.5k
/
string.jl
364 lines (308 loc) · 11.6 KB
/
string.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
# This file is a part of Julia. License is MIT: https://julialang.org/license
"""
    StringIndexError(str, i)

An error occurred when trying to access `str` at index `i` that is not valid.
"""
struct StringIndexError <: Exception
# the string that was being indexed
string::AbstractString
# the offending index into `string`
index::Integer
end
# Throw a `StringIndexError` for string `s` at index `i` (converted to `Int`).
# Marked `@noinline` so the throwing code is not inlined into its callers.
@noinline function string_index_err(s::AbstractString, i::Integer)
    throw(StringIndexError(s, Int(i)))
end
# Print a `StringIndexError` to `io`. When the offending index lies inside the
# code-unit range of the string, also print the nearest valid index (or the two
# nearest valid indices) together with the characters found there.
function Base.showerror(io::IO, exc::StringIndexError)
    s = exc.string
    print(io, "StringIndexError: ", "invalid index [$(exc.index)]")
    # Outside the code-unit range there is no nearby character to suggest.
    firstindex(s) <= exc.index <= ncodeunits(s) || return nothing
    iprev = thisind(s, exc.index)
    inext = nextind(s, iprev)
    hint = if inext <= ncodeunits(s)
        ", valid nearby indices [$iprev]=>'$(s[iprev])', [$inext]=>'$(s[inext])'"
    else
        ", valid nearby index [$iprev]=>'$(s[iprev])'"
    end
    return print(io, hint)
end
# Union of the two byte-vector types, `Vector{UInt8}` and `Vector{Int8}`.
const ByteArray = Union{Vector{UInt8},Vector{Int8}}
# Return `true` when `lo ≤ b ≤ hi`. Uses the non-short-circuiting `&` so both
# comparisons are always evaluated.
@inline function between(b::T, lo::T, hi::T) where {T<:Integer}
    return (b ≥ lo) & (hi ≥ b)
end
## constructors and conversions ##
# String constructor docstring from boot.jl, workaround for #16730
# and the unavailability of @doc in boot.jl context.
# Two methods follow the docstring: a generic one that copies `v` into a fresh
# `StringVector` of the same length before converting, and a `Vector{UInt8}` one
# that calls the runtime function `jl_array_to_string` directly.
"""
    String(v::AbstractVector{UInt8})

Create a new `String` object from a byte vector `v` containing UTF-8 encoded
characters. If `v` is `Vector{UInt8}` it will be truncated to zero length and
future modification of `v` cannot affect the contents of the resulting string.
To avoid truncation of `Vector{UInt8}` data, use `String(copy(v))`; for other
`AbstractVector` types, `String(v)` already makes a copy.

When possible, the memory of `v` will be used without copying when the `String`
object is created. This is guaranteed to be the case for byte vectors returned
by [`take!`](@ref) on a writable [`IOBuffer`](@ref) and by calls to
[`read(io, nb)`](@ref). This allows zero-copy conversion of I/O data to strings.
In other cases, `Vector{UInt8}` data may be copied, but `v` is truncated anyway
to guarantee consistent behavior.
"""
String(v::AbstractVector{UInt8}) = String(copyto!(StringVector(length(v)), v))
String(v::Vector{UInt8}) = ccall(:jl_array_to_string, Ref{String}, (Any,), v)
"""
    unsafe_string(p::Ptr{UInt8}, [length::Integer])

Copy a string from the address of a C-style (NUL-terminated) string encoded as UTF-8.
(The pointer can be safely freed afterwards.) If `length` is specified
(the length of the data in bytes), the string does not have to be NUL-terminated.

This function is labeled "unsafe" because it will crash if `p` is not
a valid memory address to data of the requested length.
"""
function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}}, len::Integer)
    # A NULL pointer is rejected up front instead of being passed to the runtime.
    if p == C_NULL
        throw(ArgumentError("cannot convert NULL to string"))
    end
    return ccall(:jl_pchar_to_string, Ref{String}, (Ptr{UInt8}, Int), p, len)
end
# Length-less variant: reads a NUL-terminated C string starting at `p`.
# Throws `ArgumentError` when `p` is NULL.
function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}})
    if p == C_NULL
        throw(ArgumentError("cannot convert NULL to string"))
    end
    return ccall(:jl_cstr_to_string, Ref{String}, (Ptr{UInt8},), p)
end
# Allocate a `String` of `n` code units through the runtime function
# `jl_alloc_string`; `n` is passed as a `Csize_t`.
function _string_n(n::Integer)
    return ccall(:jl_alloc_string, Ref{String}, (Csize_t,), n)
end
# The method below delegates to `print_to_string`.
"""
    String(s::AbstractString)

Convert a string to a contiguous byte array representation encoded as UTF-8 bytes.
This representation is often appropriate for passing strings to C.
"""
String(s::AbstractString) = print_to_string(s)
# Build a `String` from a `Symbol` by copying from the pointer obtained via `unsafe_convert`.
@pure String(s::Symbol) = unsafe_string(unsafe_convert(Ptr{UInt8}, s))
# Obtain a `Vector{UInt8}` for `s` through the runtime function `jl_string_to_array`.
# NOTE(review): presumably the vector aliases the memory of `s` instead of copying — confirm.
unsafe_wrap(::Type{Vector{UInt8}}, s::String) = ccall(:jl_string_to_array, Ref{Vector{UInt8}}, (Any,), s)
# Copy the code units into a newly allocated `Vector{UInt8}` of the same length.
Vector{UInt8}(s::CodeUnits{UInt8,String}) = copyto!(Vector{UInt8}(undef, length(s)), s)
# Both `String` conversions go through `codeunits(s)` and the copying method above.
Vector{UInt8}(s::String) = Vector{UInt8}(codeunits(s))
Array{UInt8}(s::String) = Vector{UInt8}(codeunits(s))
# Return the string stored in the `s` field of the `CodeUnits` wrapper.
String(s::CodeUnits{UInt8,String}) = s.s
## low-level functions ##
# Pointer to the first code unit of `s`.
function pointer(s::String)
    return unsafe_convert(Ptr{UInt8}, s)
end

# Pointer to code unit `i` of `s` (1-based); no bounds check is performed.
function pointer(s::String, i::Integer)
    base = pointer(s)
    return base + Int(i)::Int - 1
end
# Number of code units of a `String`, taken directly from `Core.sizeof`.
@pure ncodeunits(s::String) = Core.sizeof(s)
# Code unit type of `String`.
function codeunit(s::String)
    return UInt8
end

# Return code unit `i` of `s`. The bounds check can be elided by the caller with
# `@inbounds`; `s` is kept alive while its memory is read through a raw pointer.
@inline function codeunit(s::String, i::Integer)
    @boundscheck checkbounds(s, i)
    return GC.@preserve s unsafe_load(pointer(s, i))
end
## comparison ##
# Compare the first `len` bytes of `a` and `b` with C `memcmp` and return the
# result as an `Int`. `len` is reduced to `Csize_t` before the call.
function _memcmp(a::Union{Ptr{UInt8},AbstractString}, b::Union{Ptr{UInt8},AbstractString}, len)
    nbytes = len % Csize_t
    result = ccall(:memcmp, Cint, (Ptr{UInt8}, Ptr{UInt8}, Csize_t), a, b, nbytes)
    return result % Int
end
# Three-way comparison of two strings: compare the common prefix bytewise, and
# when it is equal fall back to comparing the byte lengths.
function cmp(a::String, b::String)
    na, nb = sizeof(a), sizeof(b)
    order = _memcmp(a, b, min(na, nb))
    if order < 0
        return -1
    elseif order > 0
        return +1
    else
        return cmp(na, nb)
    end
end
# Equality of two strings: the same object is trivially equal; otherwise the
# byte lengths must match and the bytes must compare equal.
function ==(a::String, b::String)
    if pointer_from_objref(a) == pointer_from_objref(b)
        return true
    end
    nbytes = sizeof(a)
    nbytes == sizeof(b) || return false
    return 0 == _memcmp(a, b, nbytes)
end
# The smallest `String` is the empty string.
function typemin(::Type{String})
    return ""
end

# Instance form: forwards to the type form.
function typemin(::String)
    return typemin(String)
end
## thisind, nextind ##
# Return the index of the first code unit of the character that contains code unit `i`.
# `i == 0` and `i == ncodeunits(s) + 1` are returned unchanged; any other out-of-range
# `i` throws `BoundsError` (unless the bounds check is elided).
@propagate_inbounds thisind(s::String, i::Int) = _thisind_str(s, i)
# s should be String or SubString{String}
@inline function _thisind_str(s, i::Int)
i == 0 && return 0
n = ncodeunits(s)
i == n + 1 && return i
@boundscheck between(i, 1, n) || throw(BoundsError(s, i))
@inbounds b = codeunit(s, i)
# byte `i` is not a continuation byte (10xxxxxx), or it is the very first byte:
# `i` itself starts a character
(b & 0xc0 == 0x80) & (i-1 > 0) || return i
@inbounds b = codeunit(s, i-1)
# a 2-, 3- or 4-byte lead byte directly before `i` owns this continuation byte
between(b, 0b11000000, 0b11110111) && return i-1
# keep walking back only while the bytes seen are continuation bytes
(b & 0xc0 == 0x80) & (i-2 > 0) || return i
@inbounds b = codeunit(s, i-2)
# two bytes back, only a 3- or 4-byte lead byte can reach `i`
between(b, 0b11100000, 0b11110111) && return i-2
(b & 0xc0 == 0x80) & (i-3 > 0) || return i
@inbounds b = codeunit(s, i-3)
# three bytes back, only a 4-byte lead byte can reach `i`
between(b, 0b11110000, 0b11110111) && return i-3
# no lead byte covers `i`: it is treated as a character of its own
return i
end
# Return the index of the character start that follows index `i`.
# `i == 0` yields 1; otherwise `i` must lie in `1:ncodeunits(s)` or `BoundsError` is
# thrown (unless the bounds check is elided).
@propagate_inbounds nextind(s::String, i::Int) = _nextind_str(s, i)
# s should be String or SubString{String}
@inline function _nextind_str(s, i::Int)
i == 0 && return 1
n = ncodeunits(s)
@boundscheck between(i, 1, n) || throw(BoundsError(s, i))
@inbounds l = codeunit(s, i)
# ASCII bytes and bytes ≥ 0xf8 always occupy exactly one code unit
(l < 0x80) | (0xf8 ≤ l) && return i+1
if l < 0xc0
# `l` is a continuation byte: locate the start of the character containing it; if
# that start lies before `i`, answer relative to it, otherwise `i` stands alone
i′ = @inbounds thisind(s, i)
return i′ < i ? @inbounds(nextind(s, i′)) : i+1
end
# `l` is a lead byte (0xc0:0xf7): skip the continuation bytes it announces, stopping
# early at the end of the string or at the first byte that is not a continuation byte
# first continuation byte
(i += 1) > n && return i
@inbounds b = codeunit(s, i)
b & 0xc0 ≠ 0x80 && return i
# a lead byte below 0xe0 announces only one continuation byte
((i += 1) > n) | (l < 0xe0) && return i
# second continuation byte
@inbounds b = codeunit(s, i)
b & 0xc0 ≠ 0x80 && return i
# a lead byte below 0xf0 announces only two continuation bytes
((i += 1) > n) | (l < 0xf0) && return i
# third continuation byte
@inbounds b = codeunit(s, i)
# step past the third continuation byte only if it really is one
ifelse(b & 0xc0 ≠ 0x80, i, i+1)
end
## checking UTF-8 & ASCII validity ##
# Classify the bytes of `s` by calling the runtime function `u8_isvalid` on its
# `sizeof(s)` bytes. The meaning of the returned `Int32` is listed below the definition.
byte_string_classify(s::Union{String,Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}}}) =
ccall(:u8_isvalid, Int32, (Ptr{UInt8}, Int), s, sizeof(s))
# 0: neither valid ASCII nor UTF-8
# 1: valid ASCII
# 2: valid UTF-8
# `s` is valid `String` data when `byte_string_classify` reports anything but 0.
function isvalid(::Type{String}, s::Union{Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}},String})
    return byte_string_classify(s) != 0
end

# Validity of an existing `String`: forwards to the method above.
function isvalid(s::String)
    return isvalid(String, s)
end
# `true` when the top two bits of `c` are `10`, i.e. `c` has the form 10xxxxxx.
function is_valid_continuation(c)
    return (c & 0xc0) == 0x80
end
## required core functionality ##
# Iterate over the characters of `s`: return `(char, next_index)` for the character
# starting at code unit `i`, or `nothing` when `i` is outside `1:ncodeunits(s)`.
@inline function iterate(s::String, i::Int=firstindex(s))
# single range check: after conversion to unsigned, any `i ≤ 0` wraps to a huge
# value, so this rejects both `i < 1` and `i > ncodeunits(s)`
(i % UInt) - 1 < ncodeunits(s) || return nothing
b = @inbounds codeunit(s, i)
# the lead byte goes into the top 8 bits of the 32-bit pattern reinterpreted as `Char`
u = UInt32(b) << 24
# bytes outside 0x80:0xf7 form a one-code-unit `Char` on their own (fast path)
between(b, 0x80, 0xf7) || return reinterpret(Char, u), i+1
return iterate_continued(s, i, u)
end
# Slow path of `iterate`: `u` carries the byte at index `i` in its top 8 bits (as passed
# by `iterate`). Appends up to three following continuation bytes to `u` and returns
# the resulting `Char` together with the index of the first code unit not consumed.
function iterate_continued(s::String, i::Int, u::UInt32)
# top byte below 0xc0: not a lead byte, emit it as a single-unit `Char`
u < 0xc0000000 && (i += 1; @goto ret)
n = ncodeunits(s)
# first continuation byte
(i += 1) > n && @goto ret
@inbounds b = codeunit(s, i)
b & 0xc0 == 0x80 || @goto ret
u |= UInt32(b) << 16
# second continuation byte
# (stop at end of string, or when the lead byte announces a 2-byte sequence only)
((i += 1) > n) | (u < 0xe0000000) && @goto ret
@inbounds b = codeunit(s, i)
b & 0xc0 == 0x80 || @goto ret
u |= UInt32(b) << 8
# third continuation byte
# (stop at end of string, or when the lead byte announces a 3-byte sequence only)
((i += 1) > n) | (u < 0xf0000000) && @goto ret
@inbounds b = codeunit(s, i)
b & 0xc0 == 0x80 || @goto ret
u |= UInt32(b); i += 1
@label ret
return reinterpret(Char, u), i
end
# Return the character of `s` that starts at code unit `i`. Lead bytes outside
# 0x80:0xf7 are answered directly; everything else is handled by
# `getindex_continued`. The bounds check happens inside `codeunit`.
@propagate_inbounds function getindex(s::String, i::Int)
    lead = codeunit(s, i)
    # the lead byte occupies the top 8 bits of the `Char` bit pattern
    bits = UInt32(lead) << 24
    if between(lead, 0x80, 0xf7)
        return getindex_continued(s, i, bits)
    end
    return reinterpret(Char, bits)
end
# Slow path of `getindex(s, i)`: `u` carries the byte at index `i` in its top 8 bits
# (as passed by `getindex`). Appends up to three following continuation bytes to `u`
# and returns the resulting `Char`.
function getindex_continued(s::String, i::Int, u::UInt32)
if u < 0xc0000000
# byte `i` is not a lead byte: return it as is when `i` is a valid index,
# otherwise raise `StringIndexError`
# called from `getindex` which checks bounds
@inbounds isvalid(s, i) && @goto ret
string_index_err(s, i)
end
n = ncodeunits(s)
(i += 1) > n && @goto ret
@inbounds b = codeunit(s, i) # cont byte 1
b & 0xc0 == 0x80 || @goto ret
u |= UInt32(b) << 16
# stop at end of string, or when the lead byte announces a 2-byte sequence only
((i += 1) > n) | (u < 0xe0000000) && @goto ret
@inbounds b = codeunit(s, i) # cont byte 2
b & 0xc0 == 0x80 || @goto ret
u |= UInt32(b) << 8
# stop at end of string, or when the lead byte announces a 3-byte sequence only
((i += 1) > n) | (u < 0xf0000000) && @goto ret
@inbounds b = codeunit(s, i) # cont byte 3
b & 0xc0 == 0x80 || @goto ret
u |= UInt32(b)
@label ret
return reinterpret(Char, u)
end
# Range indexing with a non-`Int` integer range: convert both endpoints to `Int`
# and index again with the resulting `UnitRange{Int}`.
function getindex(s::String, r::UnitRange{<:Integer})
    lo = Int(first(r))
    hi = Int(last(r))
    return s[lo:hi]
end
# Return a new `String` holding the characters of `s` whose start indices span `r`.
# An empty range gives `""`. Unless bounds checking is elided, `r` must be in
# bounds and both endpoints must be valid character indices; an invalid endpoint
# raises `StringIndexError`.
@inline function getindex(s::String, r::UnitRange{Int})
    isempty(r) && return ""
    start, stop = first(r), last(r)
    @boundscheck begin
        checkbounds(s, r)
        @inbounds isvalid(s, start) || string_index_err(s, start)
        @inbounds isvalid(s, stop) || string_index_err(s, stop)
    end
    # extend `stop` to the last code unit of the character that begins there
    lastunit = nextind(s, stop) - 1
    nbytes = lastunit - start + 1
    out = _string_n(nbytes)
    GC.@preserve s out unsafe_copyto!(pointer(out), pointer(s, start), nbytes)
    return out
end
# Number of characters in `s`: start from the code-unit count and let
# `length_continued` correct it.
function length(s::String)
    nbytes = ncodeunits(s)
    return length_continued(s, 1, nbytes, nbytes)
end

# Number of characters in `s` with start indices between `i` and `j`.
# Unless bounds checking is elided, requires `0 < i ≤ ncodeunits(s)+1` and
# `0 ≤ j < ncodeunits(s)+1`, throwing `BoundsError` otherwise.
@inline function length(s::String, i::Int, j::Int)
    @boundscheck begin
        limit = ncodeunits(s) + 1
        (0 < i ≤ limit) || throw(BoundsError(s, i))
        (0 ≤ j < limit) || throw(BoundsError(s, j))
    end
    if j < i
        return 0
    end
    # begin scanning at the start of the character containing `i`; that character
    # is only included in the initial count when `i` is itself its start
    start = @inbounds thisind(s, i)
    count = j - start + (start == i)
    return length_continued(s, start, j, count)
end
# Refine the character count `c` supplied by the caller by scanning code units `i:n`
# of `s`. The loop only ever decrements `c`: once for every continuation byte that
# follows a lead byte within the number of bytes that lead byte announces.
@inline function length_continued(s::String, i::Int, n::Int, c::Int)
# fewer than two code units to look at: nothing can be subtracted
i < n || return c
@inbounds b = codeunit(s, i)
@inbounds while true
# scan forward to the next lead byte (0xc0:0xf7); `i` is advanced before `b` is
# tested, so on `break` it already points at the byte after the lead byte
while true
(i += 1) ≤ n || return c
0xc0 ≤ b ≤ 0xf7 && break
b = codeunit(s, i)
end
l = b
b = codeunit(s, i) # cont byte 1
c -= (x = b & 0xc0 == 0x80)
# go on only if that was a continuation byte and the lead byte announces ≥ 3 bytes
x & (l ≥ 0xe0) || continue
(i += 1) ≤ n || return c
b = codeunit(s, i) # cont byte 2
c -= (x = b & 0xc0 == 0x80)
# go on only if that was a continuation byte and the lead byte announces 4 bytes
x & (l ≥ 0xf0) || continue
(i += 1) ≤ n || return c
b = codeunit(s, i) # cont byte 3
c -= (b & 0xc0 == 0x80)
end
end
## overload methods for efficiency ##
# `i` is a valid index into `s` when it is in bounds and is the first code unit
# of the character that contains it.
function isvalid(s::String, i::Int)
    checkbounds(Bool, s, i) || return false
    return thisind(s, i) == i
end
# `true` when every code unit of `s` is below 0x80 (an empty string qualifies).
function isascii(s::String)
    nbytes = ncodeunits(s)
    idx = 1
    @inbounds while idx <= nbytes
        codeunit(s, idx) < 0x80 || return false
        idx += 1
    end
    return true
end
"""
    repeat(c::AbstractChar, r::Integer) -> String

Repeat a character `r` times. This can equivalently be accomplished by calling
[`c^r`](@ref :^(::Union{AbstractString, AbstractChar}, ::Integer)).

# Examples
```jldoctest
julia> repeat('A', 3)
"AAA"
```
"""
function repeat(c::AbstractChar, r::Integer)
    # fallback: convert to `Char` and use the specialized `Char` method
    return repeat(Char(c), r)
end
# Specialized method for `Char`: allocate a string of `n*r` bytes, where `n` is the
# number of code units of `c`, and fill it with `r` copies of those code units.
# Returns `""` for `r == 0` and throws `ArgumentError` for negative `r`.
function repeat(c::Char, r::Integer)
r == 0 && return ""
r < 0 && throw(ArgumentError("can't repeat a character $r times"))
# byte-swap the 32-bit pattern of `c` so that its first code unit sits in the low 8 bits
u = bswap(reinterpret(UInt32, c))
# `u | 0xff` forces the low byte non-zero, so `leading_zeros(...) >> 3` counts the
# unused high bytes (0 to 3); `n` is therefore the code-unit count of `c` (1 to 4)
n = 4 - (leading_zeros(u | 0xff) >> 3)
s = _string_n(n*r)
p = pointer(s)
# NOTE(review): the multi-byte stores below write `u` in host byte order, which
# presumably assumes a little-endian host — confirm.
GC.@preserve s if n == 1
# one code unit: fill all `r` bytes with a single memset
ccall(:memset, Ptr{Cvoid}, (Ptr{UInt8}, Cint, Csize_t), p, u % UInt8, r)
elseif n == 2
# two code units: store the low 16 bits of `u` `r` times
p16 = reinterpret(Ptr{UInt16}, p)
for i = 1:r
unsafe_store!(p16, u % UInt16, i)
end
elseif n == 3
# three code units: no matching integer width, so store byte by byte
b1 = (u >> 0) % UInt8
b2 = (u >> 8) % UInt8
b3 = (u >> 16) % UInt8
for i = 0:r-1
unsafe_store!(p, b1, 3i + 1)
unsafe_store!(p, b2, 3i + 2)
unsafe_store!(p, b3, 3i + 3)
end
elseif n == 4
# four code units: store all 32 bits of `u` `r` times
p32 = reinterpret(Ptr{UInt32}, p)
for i = 1:r
unsafe_store!(p32, u, i)
end
end
return s
end