Skip to content

Commit

Permalink
Add new stripquoted keyword arg and fix stripwhitespace (#112)
Browse files Browse the repository at this point in the history
* Add new stripquoted keyword arg and fix stripwhitespace

Fixes #109. As noted in that issue, stripping whitespace *within* quoted
strings, IMO, should be considered a bug, since one of the primary
reasons for quoting strings in various applications is to delineate the
exact characters that make up the string. This PR fixes
`stripwhitespace` to preserve whitepace encountered within strings, and
only strip whitespace for non-quoted strings (leading or trailing) and
leading/trailing around quoted fields.

On the other hand, there are legitimate use-cases for also stripping
whitespace within quoted strings, so we add a new opt-in `stripquoted`
keyword argument that allows the additional precision of also stripping
whitespace inside quotes. Note that passing `stripquoted=true` implies
`stripwhitespace=true`, so it can be considered a "stronger" version of
`stripewhitespace`.

* Update src/Parsers.jl

Co-authored-by: Nick Robinson <npr251@gmail.com>

* Update test/runtests.jl

Co-authored-by: Nick Robinson <npr251@gmail.com>

Co-authored-by: Nick Robinson <npr251@gmail.com>
  • Loading branch information
quinnj and nickrobinson251 committed Apr 21, 2022
1 parent 227ffa1 commit ea1259a
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 21 deletions.
15 changes: 9 additions & 6 deletions src/Parsers.jl
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,8 @@ end
* `quoted=false`: whether parsing should check for `openquotechar` and `closequotechar` characters to signal quoted fields
* `comment=nothing`: a string which, if matched at the start of a line, will make parsing consume the rest of the line
* `ignoreemptylines=false`: after parsing a value, if a newline is detected, another immediately proceeding newline will be checked for and consumed
* `stripwhitespace=false`: if true, leading and trailing whitespace is stripped from string fields
* `stripwhitespace=false`: if true, leading and trailing whitespace is stripped from string fields, note that for *quoted* strings however, whitespace is preserved within quotes (but ignored before/after quote characters). To also strip *within* quotes, see `stripquoted`
* `stripquoted=false`: if true, whitespace is also stripped within quoted strings. If true, `stripwhitespace` is also set to true.
* `debug=false`: if `true`, various debug logging statements will be printed while parsing; useful when diagnosing why parsing returns certain `Parsers.ReturnCode` values
"""
struct Options
Expand All @@ -74,6 +75,7 @@ struct Options
dateformat::Union{Nothing, Format}
cmt::Union{Nothing, PtrLen}
stripwhitespace::Bool
stripquoted::Bool
end

prepare(x::Vector{String}) = sort!(map(ptrlen, x), by=x->x[2], rev=true)
Expand All @@ -92,7 +94,7 @@ function Options(
trues::Union{Nothing, Vector{String}},
falses::Union{Nothing, Vector{String}},
dateformat::Union{Nothing, String, Dates.DateFormat, Format},
ignorerepeated, ignoreemptylines, comment, quoted, debug, stripwhitespace=false)
ignorerepeated, ignoreemptylines, comment, quoted, debug, stripwhitespace=false, stripquoted=false)
asciival(wh1) && asciival(wh2) || throw(ArgumentError("whitespace characters must be ASCII"))
asciival(oq) && asciival(cq) && asciival(e) || throw(ArgumentError("openquotechar, closequotechar, and escapechar must be ASCII characters"))
(oq == delim) || (cq == delim) || (e == delim) && throw(ArgumentError("delim argument must be different than openquotechar, closequotechar, and escapechar arguments"))
Expand Down Expand Up @@ -134,7 +136,7 @@ function Options(
cmt = ptrlen(comment)
end
df = dateformat === nothing ? nothing : dateformat isa String ? Format(dateformat) : dateformat isa Dates.DateFormat ? Format(dateformat) : dateformat
return Options(refs, sent, ignorerepeated, ignoreemptylines, wh1 % UInt8, wh2 % UInt8, quoted, oq % UInt8, cq % UInt8, e % UInt8, del, decimal % UInt8, trues, falses, df, cmt, stripwhitespace)
return Options(refs, sent, ignorerepeated, ignoreemptylines, wh1 % UInt8, wh2 % UInt8, quoted, oq % UInt8, cq % UInt8, e % UInt8, del, decimal % UInt8, trues, falses, df, cmt, stripwhitespace || stripquoted, stripquoted)
end

Options(;
Expand All @@ -155,7 +157,8 @@ Options(;
quoted::Bool=false,
debug::Bool=false,
stripwhitespace::Bool=false,
) = Options(sentinel, wh1, wh2, openquotechar, closequotechar, escapechar, delim, decimal, trues, falses, dateformat, ignorerepeated, ignoreemptylines, comment, quoted, debug, stripwhitespace)
stripquoted::Bool=false,
) = Options(sentinel, wh1, wh2, openquotechar, closequotechar, escapechar, delim, decimal, trues, falses, dateformat, ignorerepeated, ignoreemptylines, comment, quoted, debug, stripwhitespace, stripquoted)

const OPTIONS = Options(nothing, UInt8(' '), UInt8('\t'), UInt8('"'), UInt8('"'), UInt8('"'), nothing, UInt8('.'), nothing, nothing, nothing, false, false, nothing, false, false, false)
const XOPTIONS = Options(missing, UInt8(' '), UInt8('\t'), UInt8('"'), UInt8('"'), UInt8('"'), UInt8(','), UInt8('.'), nothing, nothing, nothing, false, false, nothing, true, false, false)
Expand Down Expand Up @@ -206,8 +209,8 @@ A [`Parsers.Result`](@ref) struct is returned, with the following fields:
function xparse end

# for testing purposes only, it's much too slow to dynamically create Options for every xparse call
function xparse(::Type{T}, buf::Union{AbstractVector{UInt8}, AbstractString, IO}; pos::Integer=1, len::Integer=buf isa IO ? 0 : sizeof(buf), sentinel=nothing, wh1::Union{UInt8, Char}=UInt8(' '), wh2::Union{UInt8, Char}=UInt8('\t'), quoted::Bool=true, openquotechar::Union{UInt8, Char}=UInt8('"'), closequotechar::Union{UInt8, Char}=UInt8('"'), escapechar::Union{UInt8, Char}=UInt8('"'), ignorerepeated::Bool=false, ignoreemptylines::Bool=false, delim::Union{UInt8, Char, PtrLen, AbstractString, Nothing}=UInt8(','), decimal::Union{UInt8, Char}=UInt8('.'), comment=nothing, trues=nothing, falses=nothing, dateformat::Union{Nothing, String, Dates.DateFormat}=nothing, debug::Bool=false, stripwhitespace::Bool=false) where {T}
options = Options(sentinel, wh1, wh2, openquotechar, closequotechar, escapechar, delim, decimal, trues, falses, dateformat, ignorerepeated, ignoreemptylines, comment, quoted, debug, stripwhitespace)
function xparse(::Type{T}, buf::Union{AbstractVector{UInt8}, AbstractString, IO}; pos::Integer=1, len::Integer=buf isa IO ? 0 : sizeof(buf), sentinel=nothing, wh1::Union{UInt8, Char}=UInt8(' '), wh2::Union{UInt8, Char}=UInt8('\t'), quoted::Bool=true, openquotechar::Union{UInt8, Char}=UInt8('"'), closequotechar::Union{UInt8, Char}=UInt8('"'), escapechar::Union{UInt8, Char}=UInt8('"'), ignorerepeated::Bool=false, ignoreemptylines::Bool=false, delim::Union{UInt8, Char, PtrLen, AbstractString, Nothing}=UInt8(','), decimal::Union{UInt8, Char}=UInt8('.'), comment=nothing, trues=nothing, falses=nothing, dateformat::Union{Nothing, String, Dates.DateFormat}=nothing, debug::Bool=false, stripwhitespace::Bool=false, stripquoted::Bool=false) where {T}
options = Options(sentinel, wh1, wh2, openquotechar, closequotechar, escapechar, delim, decimal, trues, falses, dateformat, ignorerepeated, ignoreemptylines, comment, quoted, debug, stripwhitespace, stripquoted)
return xparse(T, buf, pos, len, options)
end

Expand Down
6 changes: 3 additions & 3 deletions src/strings.jl
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ function xparse(::Type{T}, source::Union{AbstractVector{UInt8}, IO}, pos, len, o
pos += 1
incr!(source)
vpos = pos
if options.stripwhitespace
if options.stripquoted
vstartpos = pos
end
if eof(source, pos, len)
Expand Down Expand Up @@ -97,7 +97,7 @@ function xparse(::Type{T}, source::Union{AbstractVector{UInt8}, IO}, pos, len, o
code |= INVALID_QUOTED_FIELD | EOF
@goto donedone
end
if options.stripwhitespace && b != options.wh1 && b != options.wh2
if options.stripquoted && b != options.wh1 && b != options.wh2
lastnonwhitespacepos = pos
end
b = peekbyte(source, pos)
Expand Down Expand Up @@ -282,7 +282,7 @@ function xparse(::Type{T}, source::Union{AbstractVector{UInt8}, IO}, pos, len, o
if eof(source, pos, len)
code |= EOF
end
if options.stripwhitespace
if options.stripquoted || (options.stripwhitespace && !quoted)
vpos = lastnonwhitespacepos
end
poslen = PosLen(vstartpos, vpos - vstartpos, ismissing, escapedstring(code))
Expand Down
46 changes: 34 additions & 12 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -195,13 +195,13 @@ testcases = [
(str="1\r\n# \r\n\r\n", kwargs=(ignoreemptylines=true, comment="#",), x=1, code=(OK | NEWLINE | EOF), vpos=1, vlen=1, tlen=10),
(str="1,\r\n# \r\n\r\n,", kwargs=(ignorerepeated=true, ignoreemptylines=true, comment="#", delim=UInt8(',')), x=1, code=(OK | NEWLINE | DELIMITED), vpos=1, vlen=1, tlen=12),
(str="1::\r\n# \r\n\r\n::", kwargs=(ignorerepeated=true, ignoreemptylines=true, comment="#", delim="::"), x=1, code=(OK | NEWLINE | DELIMITED), vpos=1, vlen=1, tlen=14),
# stripwhitespace
(str=" 1", kwargs=(stripwhitespace=true,), x=1, code=(OK | EOF), vpos=2, vlen=1, tlen=2),
(str="{ 1}", kwargs=(stripwhitespace=true,), x=1, code=(OK | QUOTED | EOF), vpos=3, vlen=1, tlen=4),
(str="{1 }", kwargs=(stripwhitespace=true,), x=1, code=(OK | QUOTED | EOF), vpos=2, vlen=1, tlen=4),
(str="1 ", kwargs=(stripwhitespace=true,), x=1, code=(OK | EOF), vpos=1, vlen=1, tlen=2),
(str="1 ,", kwargs=(stripwhitespace=true,delim=UInt8(',')), x=1, code=(OK | DELIMITED), vpos=1, vlen=1, tlen=3),
(str="{1 } ,", kwargs=(stripwhitespace=true,delim=UInt8(',')), x=1, code=(OK | DELIMITED | QUOTED), vpos=2, vlen=1, tlen=6),
# stripquoted
(str=" 1", kwargs=(stripquoted=true,), x=1, code=(OK | EOF), vpos=2, vlen=1, tlen=2),
(str="{ 1}", kwargs=(stripquoted=true,), x=1, code=(OK | QUOTED | EOF), vpos=3, vlen=1, tlen=4),
(str="{1 }", kwargs=(stripquoted=true,), x=1, code=(OK | QUOTED | EOF), vpos=2, vlen=1, tlen=4),
(str="1 ", kwargs=(stripquoted=true,), x=1, code=(OK | EOF), vpos=1, vlen=1, tlen=2),
(str="1 ,", kwargs=(stripquoted=true,delim=UInt8(',')), x=1, code=(OK | DELIMITED), vpos=1, vlen=1, tlen=3),
(str="{1 } ,", kwargs=(stripquoted=true,delim=UInt8(',')), x=1, code=(OK | DELIMITED | QUOTED), vpos=2, vlen=1, tlen=6),
];

for useio in (false, true)
Expand Down Expand Up @@ -241,22 +241,44 @@ end
res = Parsers.xparse(String, "{hey there}"; openquotechar='{', closequotechar='}', stripwhitespace=true)
@test res.val.pos == 2 && res.val.len == 9
res = Parsers.xparse(String, "{hey there }"; openquotechar='{', closequotechar='}', stripwhitespace=true)
@test res.val.pos == 2 && res.val.len == 9
@test res.val.pos == 2 && res.val.len == 10
res = Parsers.xparse(String, "{hey there },"; openquotechar='{', closequotechar='}', delim=',', stripwhitespace=true)
@test res.val.pos == 2 && res.val.len == 9
@test res.val.pos == 2 && res.val.len == 10
res = Parsers.xparse(String, "{hey there } ,"; openquotechar='{', closequotechar='}', delim=',', stripwhitespace=true)
@test res.val.pos == 2 && res.val.len == 9
@test res.val.pos == 2 && res.val.len == 10
res = Parsers.xparse(String, "{hey there } a,"; openquotechar='{', closequotechar='}', delim=',', stripwhitespace=true)
@test res.val.pos == 2 && res.val.len == 9 && Parsers.invaliddelimiter(res.code)
@test res.val.pos == 2 && res.val.len == 10 && Parsers.invaliddelimiter(res.code)
res = Parsers.xparse(String, "{hey there } a "; openquotechar='{', closequotechar='}', delim=nothing, stripwhitespace=true)
@test res.val.pos == 2 && res.val.len == 9 && res.tlen == 13
@test res.val.pos == 2 && res.val.len == 10 && res.tlen == 13
res = Parsers.xparse(String, "hey there ,"; delim=',', stripwhitespace=true)
@test res.val.pos == 1 && res.val.len == 9
res = Parsers.xparse(String, " hey there "; stripwhitespace=true)
@test res.val.pos == 2 && res.val.len == 9
res = Parsers.xparse(String, " hey there "; delim=nothing, stripwhitespace=true)
@test res.val.pos == 2 && res.val.len == 9

res = Parsers.xparse(String, "{hey there}"; openquotechar='{', closequotechar='}', stripquoted=true)
@test res.val.pos == 2 && res.val.len == 9
res = Parsers.xparse(String, "{hey there }"; openquotechar='{', closequotechar='}', stripquoted=true)
@test res.val.pos == 2 && res.val.len == 9
res = Parsers.xparse(String, "{hey there },"; openquotechar='{', closequotechar='}', delim=',', stripquoted=true)
@test res.val.pos == 2 && res.val.len == 9
res = Parsers.xparse(String, "{hey there } ,"; openquotechar='{', closequotechar='}', delim=',', stripquoted=true)
@test res.val.pos == 2 && res.val.len == 9
res = Parsers.xparse(String, "{hey there } a,"; openquotechar='{', closequotechar='}', delim=',', stripquoted=true)
@test res.val.pos == 2 && res.val.len == 9 && Parsers.invaliddelimiter(res.code)
res = Parsers.xparse(String, "{hey there } a "; openquotechar='{', closequotechar='}', delim=nothing, stripquoted=true)
@test res.val.pos == 2 && res.val.len == 9 && res.tlen == 13
res = Parsers.xparse(String, "hey there ,"; delim=',', stripquoted=true)
@test res.val.pos == 1 && res.val.len == 9
res = Parsers.xparse(String, " hey there "; stripquoted=true)
@test res.val.pos == 2 && res.val.len == 9
res = Parsers.xparse(String, " hey there "; delim=nothing, stripquoted=true)
@test res.val.pos == 2 && res.val.len == 9
# `stripquoted=true` should always override `stripwhitespace` to `true`
res = Parsers.xparse(String, " hey there "; delim=nothing, stripquoted=true, stripwhitespace=false)
@test res.val.pos == 2 && res.val.len == 9

end # @testset "Core Parsers.xparse"

@testset "ints" begin
Expand Down

0 comments on commit ea1259a

Please sign in to comment.