Improve memory use and performance of multipart parsing
Memory usage and performance issues were noted in
https://discourse.julialang.org/t/http-parse-multipart-form-does-a-lot-of-memory-allocations/66661.
This change uses some of the same regex tricks as the rest of the HTTP parsing
code: it operates on SubStrings and avoids excessive allocations. We also
streamline byte-buffer comparisons so they no longer allocate, and drop the
unnecessary intermediate arrays from the content-disposition processing.

On my machine, current master parses a simple multipart request from the tests
in `35.022 μs (866 allocations: 70.27 KiB)`; with this PR the same parse takes
`3.603 μs (42 allocations: 2.55 KiB)`.
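
For reference, a minimal, hypothetical sketch of how such a measurement could be reproduced with BenchmarkTools; the boundary and body below are illustrative stand-ins for the fixtures in test/parsemultipart.jl, not the exact benchmark used above.

```julia
# Hypothetical reproduction of the benchmark above. The request is an
# illustrative stand-in for the fixture in test/parsemultipart.jl.
using HTTP, BenchmarkTools

boundary = "------------------------918073721150061572809433"
body = "--$boundary\r\n" *
       "Content-Disposition: form-data; name=\"key1\"\r\n\r\n" *
       "1\r\n" *
       "--$boundary--\r\n"
headers = ["Content-Type" => "multipart/form-data; boundary=$boundary"]
req = HTTP.Request("POST", "/", headers, Vector{UInt8}(body))

@btime HTTP.parse_multipart_form($req)
```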
quinnj committed Aug 31, 2021
1 parent a2b467e commit a52b672
Showing 3 changed files with 101 additions and 155 deletions.
2 changes: 1 addition & 1 deletion src/HTTP.jl
@@ -610,7 +610,7 @@ end
include("download.jl")
include("Servers.jl") ;using .Servers; using .Servers: listen
include("Handlers.jl") ;using .Handlers; using .Handlers: serve
include("parsemultipart.jl")
include("parsemultipart.jl") ;using .MultiPartParsing: parse_multipart_form
include("WebSockets.jl") ;using .WebSockets

import .ConnectionPool: Transaction, Connection
205 changes: 92 additions & 113 deletions src/parsemultipart.jl
@@ -1,17 +1,35 @@
module MultiPartParsing

import ..access_threaded, ..Request, ..Multipart, ..payload
using ..Parsers

export parse_multipart_form

const CR_BYTE = 0x0d # \r
const LF_BYTE = 0x0a # \n
const DASH_BYTE = 0x2d # -
const HTAB_BYTE = 0x09 # \t
const SPACE_BYTE = 0x20
const RETURN_BYTES = [CR_BYTE, LF_BYTE]
const SEMICOLON_BYTE = UInt8(';')
const CRLFCRLF = [CR_BYTE, LF_BYTE, CR_BYTE, LF_BYTE]

"compare byte buffer `a` from index `i` to index `j` with `b` and check if they are byte-equal"
function byte_buffers_eq(a, i, j, b)
l = 1
@inbounds for k = i:j
a[k] == b[l] || return false
l += 1
end
return true
end

"""
find_multipart_boundary(bytes, boundaryDelimiter; start::Int=1)
Find the first and last index of the next boundary delimiting a part, and if
the discovered boundary is the terminating boundary.
"""
function find_multipart_boundary(bytes::AbstractVector{UInt8}, boundaryDelimiter::AbstractVector{UInt8}; start::Int=1)
@inline function find_multipart_boundary(bytes::AbstractVector{UInt8}, boundaryDelimiter::AbstractVector{UInt8}; start::Int=1)
# The boundary delimiter line is prepended with two '-' characters
# The boundary delimiter line starts on a new line, so must be preceded by a \r\n.
# The boundary delimiter line ends with \r\n, and can have "optional linear whitespace" between
@@ -20,26 +38,26 @@ function find_multipart_boundary(bytes::AbstractVector{UInt8}, boundaryDelimiter
# [RFC2046 5.1.1](https://tools.ietf.org/html/rfc2046#section-5.1.1)

i = start
end_index = i+length(boundaryDelimiter)-1
end_index = i + length(boundaryDelimiter) + 1
while end_index <= length(bytes)
if bytes[i:end_index] == boundaryDelimiter
if bytes[i] == DASH_BYTE && bytes[i + 1] == DASH_BYTE && byte_buffers_eq(bytes, i + 2, end_index, boundaryDelimiter)
# boundary delimiter line start on a new line ...
if i > 1
(i == 2 || bytes[i-2] != CR_BYTE || bytes[i-1] != LF_BYTE) && error("boundary delimiter found, but it was not the start of a line")
# the CRLF preceding the boundary delimiter is "conceptually attached
# to the boundary", so account for this with the index
i-=2
i -= 2
end

# need to check if there are enough characters for the CRLF or for two dashes
end_index < length(bytes)-1 || error("boundary delimiter found, but did not end with new line")

is_terminating_delimiter = bytes[end_index+1] == DASH_BYTE && bytes[end_index+2] == DASH_BYTE
is_terminating_delimiter && (end_index+=2)
is_terminating_delimiter && (end_index += 2)

# ... there can be arbitrary SP and HTAB space between the boundary delimiter ...
while (end_index < length(bytes) && (bytes[end_index+1] in [HTAB_BYTE, SPACE_BYTE]))
end_index+=1
while end_index < length(bytes) && (bytes[end_index+1] in (HTAB_BYTE, SPACE_BYTE))
end_index += 1
end
# ... and ends with a new line
newlineEnd = end_index < length(bytes)-1 &&
@@ -89,107 +107,45 @@ start index(1) and the end index. Headers are separated from the body by CRLFCRL
[RFC822 3.1](https://tools.ietf.org/html/rfc822#section-3.1)
"""
function find_header_boundary(bytes::AbstractVector{UInt8})
delimiter = UInt8[CR_BYTE, LF_BYTE, CR_BYTE, LF_BYTE]
length(delimiter) > length(bytes) && (return nothing)
length(CRLFCRLF) > length(bytes) && return nothing

l = length(bytes) - length(delimiter) + 1
l = length(bytes) - length(CRLFCRLF) + 1
i = 1
end_index = length(delimiter)
end_index = length(CRLFCRLF)
while (i <= l)
bytes[i:end_index] == delimiter && (return (1, end_index))
byte_buffers_eq(bytes, i, end_index, CRLFCRLF) && return (1, end_index)
i += 1
end_index += 1
end
error("no delimiter found separating header from multipart body")
end

"""
content_disposition_tokenize(str)
Tokenize the "arguments" for the Content-Disposition declaration. A vector of
strings is returned that contains each token and separator found in the source
string. Tokens are separated by either an equal sign(=) or a semi-colon(;) and
may be quoted or escaped with a backslash(\\). All tokens returned are stripped
of whitespace at the beginning and end of the string, quotes are retained.
"""
function content_disposition_tokenize(str)
retval = Vector{SubString}()
start = 1
quotes = false
escaped = false

for offset in eachindex(str)
if escaped == false
if quotes == true && str[offset] == '"'
quotes = false
elseif str[offset] == '\\'
escaped = true
elseif str[offset] == '"'
quotes = true
elseif quotes == false && (str[offset] == ';' || str[offset] == '=')
prev = prevind(str, offset)
if prev > start
push!(retval, strip(SubString(str, start, prev)))
end
push!(retval, SubString(str, offset, offset))
start = nextind(str, offset)
end
else
escaped = false
end
end

if start != lastindex(str)
push!(retval, strip(SubString(str, start)))
end

return retval
const content_disposition_regex = Parsers.RegexAndMatchData[]
function content_disposition_regex_f()
r = Parsers.RegexAndMatchData(r"^Content-Disposition:[ \t]*form-data;[ \t]*(.*)\r\n"x)
Parsers.init!(r)
end

"""
content_disposition_extract(str)
Extract all the flags and key/value arguments from the Content-Disposition
line. The result is returned as an array of tuples.
In the case of a flag the first value of the tuple is false the second value
is the flag and the third value is nothing.
In the case of a key/value argument the first value is true, the second is the
key, and the third is the value (or nothing if no value was specified).
"""
function content_disposition_extract(str)
retval = Vector{Tuple{Bool, SubString, Union{SubString,Nothing}}}()
tokens = content_disposition_tokenize(str)
total = length(tokens)

function strip_quotes(val)
if val[1] == '"' && val[end] == '"'
SubString(val, 2, lastindex(val) - 1)
else
val
end
end

i = 1
while i < total
if tokens[i] != ';'
pair = (i + 1 <= total && tokens[i + 1] == "=")
key = strip_quotes(tokens[i])
value = (pair && i + 2 <= total && tokens[i + 2] != ";" ? strip_quotes(tokens[i + 2]) : nothing)
const content_disposition_flag_regex = Parsers.RegexAndMatchData[]
function content_disposition_flag_regex_f()
r = Parsers.RegexAndMatchData(r"""^
[ \t]*([!#$%&'*+\-.^_`|~[:alnum:]]+);?
"""x)
Parsers.init!(r)
end

push!(retval, (pair, key, value))
const content_disposition_pair_regex = Parsers.RegexAndMatchData[]
function content_disposition_pair_regex_f()
r = Parsers.RegexAndMatchData(r"""^
[ \t]*([!#$%&'*+\-.^_`|~[:alnum:]]+)[ \t]*=[ \t]*"(.*?)";?
"""x)
Parsers.init!(r)
end

if pair
i += 3
else
i += 1
end
else
i += 1
end
end
return retval
const content_type_regex = Parsers.RegexAndMatchData[]
function content_type_regex_f()
r = Parsers.RegexAndMatchData(r"(?i)Content-Type: (\S*[^;\s])"x)
Parsers.init!(r)
end

"""
@@ -199,33 +155,46 @@ Parse a single multi-part chunk into a Multipart object. This will decode
the header and extract the contents from the byte array.
"""
function parse_multipart_chunk(chunk)
(startIndex, end_index) = find_header_boundary(chunk)

headers = String(view(chunk, startIndex:end_index))
startIndex, end_index = find_header_boundary(chunk)
header = SubString(unsafe_string(pointer(chunk, startIndex), end_index - startIndex + 1))
content = view(chunk, end_index+1:lastindex(chunk))

disposition = match(r"(?i)Content-Disposition: form-data(.*)\r\n", headers)

if disposition === nothing
@warn "Content disposition is not specified dropping the chunk." chunk
return # Specifying content disposition is mandatory
# find content disposition
re = access_threaded(content_disposition_regex_f, content_disposition_regex)
if !Parsers.exec(re, header)
@warn "Content disposition is not specified dropping the chunk." String(chunk)
return nothing # Specifying content disposition is mandatory
end
content_disposition = Parsers.group(1, re, header)

re_flag = access_threaded(content_disposition_flag_regex_f, content_disposition_flag_regex)
re_pair = access_threaded(content_disposition_pair_regex_f, content_disposition_pair_regex)
name = nothing
filename = nothing

for (pair, key, value) in content_disposition_extract(disposition[1])
if pair && key == "name"
name = value
elseif pair && key == "filename"
filename = value
while !isempty(content_disposition)
if Parsers.exec(re_pair, content_disposition)
key = Parsers.group(1, re_pair, content_disposition)
value = Parsers.group(2, re_pair, content_disposition)
if key == "name"
name = value
elseif key == "filename"
filename = value
else
# do stuff with other content disposition key-value pairs
end
content_disposition = Parsers.nextbytes(re_pair, content_disposition)
elseif Parsers.exec(re_flag, content_disposition)
# do stuff with content disposition flags
content_disposition = Parsers.nextbytes(re_flag, content_disposition)
else
break
end
end

name === nothing && return

match_contenttype = match(r"(?i)Content-Type: (\S*[^;\s])", headers)
contenttype = match_contenttype !== nothing ? match_contenttype[1] : "text/plain" # if content_type is not specified, the default text/plain is assumed
re_ct = access_threaded(content_type_regex_f, content_type_regex)
contenttype = Parsers.exec(re_ct, header) ? Parsers.group(1, re_ct, header) : "text/plain"

return Multipart(filename, IOBuffer(content), contenttype, "", name)
end
@@ -238,7 +207,7 @@ chunks which are returned as an array of Multipart objects.
"""
function parse_multipart_body(body::AbstractVector{UInt8}, boundary::AbstractString)::Vector{Multipart}
multiparts = Multipart[]
idxs = find_multipart_boundaries(body, Vector{UInt8}("--$(boundary)"))
idxs = find_multipart_boundaries(body, codeunits(boundary))
length(idxs) > 1 || (return multiparts)

for i in 1:length(idxs)-1
@@ -275,3 +244,13 @@ function parse_multipart_form(req::Request)::Union{Vector{Multipart}, Nothing}

return parse_multipart_body(payload(req), boundary_delimiter)
end

function __init__()
resize!(empty!(content_disposition_regex), Threads.nthreads())
resize!(empty!(content_disposition_flag_regex), Threads.nthreads())
resize!(empty!(content_disposition_pair_regex), Threads.nthreads())
resize!(empty!(content_type_regex), Threads.nthreads())
return
end

end # module MultiPartParsing
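
The central allocation win here is that `byte_buffers_eq` compares a region of `bytes` against the delimiter in place, whereas the previous `bytes[i:end_index] == boundaryDelimiter` sliced out a fresh array on every probe. A self-contained sketch of the difference (the names below are illustrative, not part of HTTP.jl):

```julia
# Illustrative comparison of the two approaches; not part of HTTP.jl itself.
haystack = Vector{UInt8}("xxxx--boundary\r\n")
needle   = Vector{UInt8}("--boundary")

# Old style: slicing copies length(needle) bytes on every call.
slicing_eq(a, i, j, b) = a[i:j] == b

# New style: walk both buffers without allocating.
function inplace_eq(a, i, j, b)
    l = 1
    @inbounds for k = i:j
        a[k] == b[l] || return false
        l += 1
    end
    return true
end

@assert slicing_eq(haystack, 5, 14, needle)
@assert inplace_eq(haystack, 5, 14, needle)
```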
49 changes: 8 additions & 41 deletions test/parsemultipart.jl
@@ -1,12 +1,11 @@
using Test
using HTTP

import HTTP.MultiPartParsing: find_multipart_boundary, find_multipart_boundaries, find_header_boundary, parse_multipart_chunk, parse_multipart_body, parse_multipart_form

function generate_test_body()
Vector{UInt8}("----------------------------918073721150061572809433\r\nContent-Disposition: form-data; name=\"namevalue\"; filename=\"multipart.txt\"\r\nContent-Type: text/plain\r\n\r\nnot much to say\n\r\n----------------------------918073721150061572809433\r\nContent-Disposition: form-data; name=\"key1\"\r\n\r\n1\r\n----------------------------918073721150061572809433\r\nContent-Disposition: form-data; name=\"key2\"\r\n\r\nkey the second\r\n----------------------------918073721150061572809433\r\nContent-Disposition: form-data; name=\"namevalue2\"; filename=\"multipart-leading-newline.txt\"\r\nContent-Type: text/plain\r\n\r\n\nfile with leading newline\n\r\n----------------------------918073721150061572809433--\r\n")
end


function generate_test_request()
headers = [
"User-Agent" => "PostmanRuntime/7.15.2",
@@ -46,35 +45,35 @@ end

# NOTE: this is the start of a "boundary delimiter line" and has two leading
# '-' characters prepended to the boundary delimiter from Content-Type header
delimiter = Vector{UInt8}("----------------------------918073721150061572809433")
delimiter = Vector{UInt8}("--------------------------918073721150061572809433")
body = generate_test_body()
# length of the delimiter, CRLF, and -1 for the end index to be the LF character
endIndexOffset = length(delimiter) + 2 - 1
endIndexOffset = length(delimiter) + 4 - 1

(isTerminatingDelimiter, startIndex, endIndex) = HTTP.find_multipart_boundary(body, delimiter)
(isTerminatingDelimiter, startIndex, endIndex) = find_multipart_boundary(body, delimiter)
@test !isTerminatingDelimiter
@test 1 == startIndex
@test (startIndex + endIndexOffset) == endIndex

# the remaining "boundary delimiter lines" will have a CRLF preceding them
endIndexOffset += 2

(isTerminatingDelimiter, startIndex, endIndex) = HTTP.find_multipart_boundary(body, delimiter, start = startIndex + 1)
(isTerminatingDelimiter, startIndex, endIndex) = find_multipart_boundary(body, delimiter, start = startIndex + 1)
@test !isTerminatingDelimiter
@test 175 == startIndex
@test (startIndex + endIndexOffset) == endIndex

(isTerminatingDelimiter, startIndex, endIndex) = HTTP.find_multipart_boundary(body, delimiter, start = startIndex + 3)
(isTerminatingDelimiter, startIndex, endIndex) = find_multipart_boundary(body, delimiter, start = startIndex + 3)
@test !isTerminatingDelimiter
@test 279 == startIndex
@test (startIndex + endIndexOffset) == endIndex

(isTerminatingDelimiter, startIndex, endIndex) = HTTP.find_multipart_boundary(body, delimiter, start = startIndex + 3)
(isTerminatingDelimiter, startIndex, endIndex) = find_multipart_boundary(body, delimiter, start = startIndex + 3)
@test !isTerminatingDelimiter
@test 396 == startIndex
@test (startIndex + endIndexOffset) == endIndex

(isTerminatingDelimiter, startIndex, endIndex) = HTTP.find_multipart_boundary(body, delimiter, start = startIndex + 3)
(isTerminatingDelimiter, startIndex, endIndex) = find_multipart_boundary(body, delimiter, start = startIndex + 3)
@test isTerminatingDelimiter
@test 600 == startIndex
# +2 because of the two additional '--' characters
@@ -109,35 +108,3 @@ end
@test "\nfile with leading newline\n" === String(read(multiparts[4].data))
end
end

@testset "content_disposition_extract($(v[1])" for v in (
("; filename=abc.txt ; name = xyz", "xyz", "abc.txt"),
("; name=abc ; filename = xyz", "abc", "xyz"),
("""; mno;filename="abc";name=xyz""", "xyz", "abc"),
(""";filename="abc";mno;name=xyz""", "xyz", "abc"),
(""";filename= "abc" ;mno;name=xyz""", "xyz", "abc"),
("; filename=abc.txt ; name = xyz ;", "xyz", "abc.txt"),
("; filename=abc.txt ; name = xyz;", "xyz", "abc.txt"),
("; filename=abc.txt ; name = xyz ; mno", "xyz", "abc.txt"),
("; filename=abc.txt ; name = xyz ; mno ;", "xyz", "abc.txt"),
(";name=\"ab\\\"cdef\"","ab\\\"cdef", nothing),
(";filename=abc\\;xyz", nothing, "abc\\;xyz"),
(";filename=\\\"abc;name=xyz", "xyz", "\\\"abc"),
(";name=xyz;filename=;mno", "xyz", nothing),
(";name=\"xy;z\";filename=;mno", "xy;z", nothing),
(";name=\"x=z\";filename=bbb", "x=z", "bbb")
)
name = nothing
filename = nothing

for (pair, key, value) in HTTP.content_disposition_extract(v[1])
if pair && key == "name"
name = value
elseif pair && key == "filename"
filename = value
end
end

@test name == v[2]
@test filename == v[3]
end
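
For completeness, a hedged sketch of how the parsed parts might be consumed, based on the fields of the `Multipart` struct (`name`, `filename`, `contenttype`, and the `IOBuffer` in `data`); it assumes the `generate_test_request` helper defined earlier in this test file is in scope.

```julia
# Sketch only: relies on generate_test_request() defined earlier in this file.
req = generate_test_request()
parts = parse_multipart_form(req)
for p in parts
    # p.data is an IOBuffer over the part's content bytes.
    println(p.name, " (", p.contenttype, ", ",
            something(p.filename, "no filename"), "): ",
            repr(String(read(p.data))))
end
```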
