-
Notifications
You must be signed in to change notification settings - Fork 99
/
specialized.jl
144 lines (126 loc) · 4.35 KB
/
specialized.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
"""
    maxsize_buffer(maxsize::Int)

Return a fresh `IOBuffer` created with its `maxsize` keyword set to `maxsize`.
"""
maxsize_buffer(maxsize::Int) = IOBuffer(; maxsize=maxsize)
# Specialized functions for increased performance when JSON is in-memory
"""
    parse_string(ps::MemoryParserState)

Parse the JSON string that starts at `ps.s` (the opening quote) and return it
as a `String`, leaving `ps.s` just past the closing quote.

A dry run with `predict_string` first determines the decoded byte length and
whether the string contains any backslash escapes. Errors raised by that dry
run, or by the two-argument `parse_string` on the slow path, propagate to the
caller.
"""
function parse_string(ps::MemoryParserState)
    # "Dry run": find the length of the string so the right amount of memory
    # can be allocated from the start. Does not do full error checking.
    fastpath, len = predict_string(ps)

    if fastpath
        # Fast path: no escaped characters, which is very common in real-world
        # data (short keys in particular). The bytes can be copied verbatim.
        s = ps.s
        ps.s = s + len + 2 # byte after closing quote
        # Keep the backing buffer rooted for as long as the raw pointer
        # derived from it is in use. The offset `s` skips the opening quote,
        # assuming `ps.s` is a 1-based index into `ps.utf8`.
        utf8 = ps.utf8
        return GC.@preserve utf8 unsafe_string(pointer(utf8) + s, len)
    else
        # Slow path: decode escapes into a buffer capped at the predicted size.
        return String(take!(parse_string(ps, maxsize_buffer(len))))
    end
end
"""
Scan through a string at the current parser state and return a tuple containing
information about the string. This function avoids memory allocation where
possible.
The first element of the returned tuple is a boolean indicating whether the
string may be copied directly from the parser state. Special casing string
parsing when there are no escaped characters leads to substantially increased
performance in common situations.
The second element of the returned tuple is an integer representing the exact
length of the string, in bytes when encoded as UTF-8. This information is useful
for pre-sizing a buffer to contain the parsed string.
This function will throw an error if:
- invalid control characters are found
- an invalid unicode escape is read
- the string is not terminated
No error is thrown when other invalid backslash escapes are encountered.
"""
function predict_string(ps::MemoryParserState)
    stop = length(ps)
    noescapes = true   # stays true while no backslash has been seen
    nbytes = 0         # running count of bytes the decoded string will occupy
    pos = ps.s + 1     # step over the opening quote

    @inbounds while pos <= stop
        byte = ps[pos]
        if byte == BACKSLASH
            noescapes = false
            pos += 1
            pos > stop && break
            if ps[pos] == LATIN_U
                # Unicode escape: temporarily move the parser cursor so that
                # `read_unicode_escape!` can consume it, count the bytes its
                # result would write, then restore the cursor.
                saved = ps.s
                ps.s = pos + 1
                nbytes += write(devnull, read_unicode_escape!(ps))
                pos = ps.s
                ps.s = saved
                continue
            end
            # Any other escaped byte is counted as one output byte below,
            # without validating it here.
        elseif byte == STRING_DELIM
            return noescapes, nbytes
        elseif byte < SPACE
            # Point the parser at the offending byte before reporting it.
            ps.s = pos
            _error(E_BAD_CONTROL, ps)
        end
        nbytes += 1
        pos += 1
    end

    # Ran off the end of the input without seeing a closing quote.
    ps.s = pos
    _error(E_UNEXPECTED_EOF, ps)
end
"""
Parse the string starting at the parser state’s current location into the given
pre-sized IOBuffer. The only correctness checking is for escape sequences, so the
passed-in buffer must exactly represent the amount of space needed for parsing.
"""
function parse_string(ps::MemoryParserState, b::IOBuffer)
    stop = length(ps)
    pos = ps.s + 1        # step over the opening quote
    capacity = b.maxsize

    # Keep decoding until the buffer holds as many bytes as it was sized for.
    @inbounds while b.size < capacity
        byte = ps[pos]
        if byte == BACKSLASH
            pos += 1
            pos > stop && break
            byte = ps[pos]
            if byte == LATIN_U
                # Unicode escape: hand the cursor to `read_unicode_escape!`
                # and resume from wherever it leaves off.
                ps.s = pos + 1
                write(b, read_unicode_escape!(ps))
                pos = ps.s
                continue
            end
            # Single-character escape: translate through the lookup table,
            # where 0x00 stands for "not a recognised escape".
            byte = get(ESCAPES, byte, 0x00)
            if byte == 0x00
                ps.s = pos
                _error(E_BAD_ESCAPE, ps)
            end
        end
        # Bytes of UTF-8 encoded non-ASCII characters are copied verbatim,
        # which is the desired behaviour.
        write(b, byte)
        pos += 1
    end

    # Non-termination and similar edge cases are left to the dry run that
    # sized the buffer; here we only step past the closing quote.
    ps.s = pos + 1
    return b
end
# Scan the run of number-like bytes starting at `ps.s`, advance the parser past
# it, and delegate the actual conversion to `number_from_bytes`.
function parse_number(ps::MemoryParserState)
    start = ps.s
    stop = length(ps)
    pos = start
    integral = true   # cleared once a '+', 'e', 'E' or '.' byte is seen

    # Find the end of the literal by skipping ASCII 0-9, +, -, e, E, and .
    while pos ≤ stop
        byte = @inbounds ps[pos]
        if !(isjsondigit(byte) || MINUS_SIGN == byte)
            if PLUS_SIGN == byte || LATIN_E == byte || LATIN_UPPER_E == byte ||
               DECIMAL_POINT == byte
                integral = false
            else
                break
            end
        end
        pos += 1
    end

    ps.s = pos
    return number_from_bytes(ps, integral, ps, start, pos - 1)
end