From c6398a3d216b1b533e084d2c8597cce1fe4a7af4 Mon Sep 17 00:00:00 2001 From: c42f Date: Sun, 2 Oct 2022 20:05:53 +1000 Subject: [PATCH 1/4] Rework JuliaSyntax.parse() public API `parse()` and `parseall()` were generally pretty inconvenient to use. This change reworks what I had called `parseall()` to be more similar to `Meta.parse()` and adds `parseall()` and `parseatom()` in analogy to the `Base.Meta` versions of these functions. The lower level function `parse!()` is provided to work with `ParseStream` for cases where more control is required. --- README.md | 75 +++++++++-------- src/hooks.jl | 2 +- src/parse_stream.jl | 25 +++++- src/parser_api.jl | 186 +++++++++++++++++-------------------------- test/expr.jl | 72 ++++++++--------- test/parse_stream.jl | 40 +++++++--- test/parser_api.jl | 95 ++++++++++++---------- test/test_utils.jl | 13 ++- 8 files changed, 256 insertions(+), 252 deletions(-) diff --git a/README.md b/README.md index 7ef4dbcd..ecce2b3a 100644 --- a/README.md +++ b/README.md @@ -42,23 +42,24 @@ A talk from JuliaCon 2022 covered some aspects of this package. # Examples Here's what parsing of a small piece of code currently looks like in various -forms. We'll use the `parseall` convenience function to demonstrate, but -there's also a more flexible parsing interface with `JuliaSyntax.parse()`. +forms. We'll use the `JuliaSyntax.parse` function to demonstrate, there's also +`JuliaSyntax.parse!` offering more fine-grained control. 
First, a source-ordered AST with `SyntaxNode` (`call-i` in the dump here means the `call` has the infix `-i` flag): ```julia -julia> parseall(SyntaxNode, "(x + y)*z", filename="foo.jl") +julia> using JuliaSyntax: JuliaSyntax, SyntaxNode, GreenNode + +julia> JuliaSyntax.parse(SyntaxNode, "(x + y)*z", filename="foo.jl") line:col│ byte_range │ tree │ file_name - 1:1 │ 1:9 │[toplevel] │foo.jl - 1:1 │ 1:9 │ [call-i] - 1:2 │ 2:6 │ [call-i] - 1:2 │ 2:2 │ x - 1:4 │ 4:4 │ + - 1:6 │ 6:6 │ y - 1:8 │ 8:8 │ * - 1:9 │ 9:9 │ z + 1:1 │ 1:9 │[call-i] │foo.jl + 1:2 │ 2:6 │ [call-i] + 1:2 │ 2:2 │ x + 1:4 │ 4:4 │ + + 1:6 │ 6:6 │ y + 1:8 │ 8:8 │ * + 1:9 │ 9:9 │ z ``` Internally this has a full representation of all syntax trivia (whitespace and @@ -69,19 +70,18 @@ despite being important for parsing. ```julia julia> text = "(x + y)*z" - greentree = parseall(GreenNode, text) - 1:9 │[toplevel] - 1:9 │ [call] - 1:1 │ ( - 2:6 │ [call] - 2:2 │ Identifier ✔ - 3:3 │ Whitespace - 4:4 │ + ✔ - 5:5 │ Whitespace - 6:6 │ Identifier ✔ - 7:7 │ ) - 8:8 │ * ✔ - 9:9 │ Identifier ✔ + greentree = JuliaSyntax.parse(GreenNode, text) + 1:9 │[call] + 1:1 │ ( + 2:6 │ [call] + 2:2 │ Identifier ✔ + 3:3 │ Whitespace + 4:4 │ + ✔ + 5:5 │ Whitespace + 6:6 │ Identifier ✔ + 7:7 │ ) + 8:8 │ * ✔ + 9:9 │ Identifier ✔ ``` `GreenNode` stores only byte ranges, but the token strings can be shown by @@ -89,25 +89,24 @@ supplying the source text string: ```julia julia> show(stdout, MIME"text/plain"(), greentree, text) - 1:9 │[toplevel] - 1:9 │ [call] - 1:1 │ ( "(" - 2:6 │ [call] - 2:2 │ Identifier ✔ "x" - 3:3 │ Whitespace " " - 4:4 │ + ✔ "+" - 5:5 │ Whitespace " " - 6:6 │ Identifier ✔ "y" - 7:7 │ ) ")" - 8:8 │ * ✔ "*" - 9:9 │ Identifier ✔ "z" + 1:9 │[call] + 1:1 │ ( "(" + 2:6 │ [call] + 2:2 │ Identifier ✔ "x" + 3:3 │ Whitespace " " + 4:4 │ + ✔ "+" + 5:5 │ Whitespace " " + 6:6 │ Identifier ✔ "y" + 7:7 │ ) ")" + 8:8 │ * ✔ "*" + 9:9 │ Identifier ✔ "z" ``` Julia `Expr` can also be produced: ```julia -julia> parseall(Expr, "(x + 
y)*z") -:($(Expr(:toplevel, :((x + y) * z)))) +julia> JuliaSyntax.parse(Expr, "(x + y)*z") +:((x + y) * z) ``` # Using JuliaSyntax as the default parser diff --git a/src/hooks.jl b/src/hooks.jl index 8011f602..9d65983c 100644 --- a/src/hooks.jl +++ b/src/hooks.jl @@ -157,7 +157,7 @@ function _core_parser_hook(code, filename, lineno, offset, options) return Core.svec(nothing, last_byte(stream)) end end - parse(stream; rule=rule) + parse!(stream; rule=rule) if rule === :statement bump_trivia(stream) end diff --git a/src/parse_stream.jl b/src/parse_stream.jl index 6665c3fa..33b614e2 100644 --- a/src/parse_stream.jl +++ b/src/parse_stream.jl @@ -155,10 +155,27 @@ const NO_POSITION = ParseStreamPosition(0, 0) #------------------------------------------------------------------------------- """ -ParseStream provides an IO interface for the parser. It -- Wraps the lexer with a lookahead buffer -- Removes insignificant whitespace and comment tokens, shifting them into the - output implicitly (newlines may be significant depending on `skip_newlines`) + ParseStream(text::AbstractString, index::Integer=1; version=VERSION) + ParseStream(text::IO; version=VERSION) + ParseStream(text::Vector{UInt8}, index::Integer=1; version=VERSION) + ParseStream(ptr::Ptr{UInt8}, len::Integer, index::Integer=1; version=VERSION) + +Construct a `ParseStream` from source `text` which may come in various forms - +a string, an `IO` object, or a buffer of bytes. In the case that the buffer is +passed as `ptr,len`, the caller is responsible for preserving the buffer during +parsing. + +A byte `index` may provided as the position to start parsing. + +ParseStream provides an IO interface for the parser which provides lexing of +the source text input into tokens, manages insignificant whitespace tokens on +behalf of the parser, and stores output tokens and tree nodes in a pair of +output arrays. + +`version` (default `VERSION`) may be used to set the syntax version to +any Julia version `>= v"1.0"`. 
We aim to parse all Julia syntax which has been +added after v"1.0", emitting an error if it's not compatible with the requested +`version`. """ mutable struct ParseStream # `textbuf` is a buffer of UTF-8 encoded text of the source code. This is a diff --git a/src/parser_api.jl b/src/parser_api.jl index c05122be..8745aaa5 100644 --- a/src/parser_api.jl +++ b/src/parser_api.jl @@ -3,43 +3,6 @@ # This is defined separately from parser.jl so that: # * parser.jl doesn't need to refer to any tree data structures # * It's clear which parts are the public API -# -# What should the general parsing API look like? Some points to consider: -# -# * After parsing atoms or statements or most other internal rules, it's -# usual to start in the middle of the input text and end somewhere else in -# the middle of the input text. So we should taken an index for the start of -# parsing and supply an index back to the caller after parsing. -# -# * `parseall` is a special case where we expect to consume all the input. -# Perhaps this is the API which throws an error if we don't consume it all, -# and doesn't accept an index as input? -# -# * The ParseStream is the fundamental interface which wraps the code string -# and index up together for input and contains the output events, diagnostics -# and current stream position after parsing. The user should potentially be -# able to use this directly. It does, however assume a Julia-compatible token -# stream. -# -# * It could be useful to support an IO-based interface so that users can parse -# Julia code intermixed with other DSLs. Documenter.jl and string macros come -# to mind as examples which could use this. A tricky part is deciding where -# the input ends: For string macros this is done by the parser, but for -# Documenter it's probably just done beforehand according to the Markdown -# code block rules. -# -# * The API should have an interface where a simple string is passed in. How -# does SourceFile relate to this? 
-# -# * It's neat for `parse` to be overloadable to produce various output data -# structures; GreenNode, SyntaxNode, Expr, (etc?) in the same way that -# Base.parse can be used for non-Julia code. (Heh... though -# `Base.parse(Expr, "...")` would also make a certain amount of sense.) -# -# * What's the no-copy API look like? A String can be put into an IOBuffer via -# unsafe_wrap(Vector{UInt8}, str) ... A SubString likewise. Also there's the -# `codeunits` function to hold a GC-safe view of string data as an array (but -# we can't use a Vector{UInt8}) struct ParseError <: Exception source::SourceFile @@ -65,39 +28,19 @@ Base.display_error(io::IO, err::ParseError, bt) = Base.showerror(io, err, bt) """ - # Input and output: - stream = parse(stream::ParseStream; kws...) - (tree, diagnostics) = parse(TreeType, io::IOBuffer; kws...) - (tree, diagnostics, index) = parse(TreeType, str::AbstractString, [index::Integer]; kws...) - # Keywords - parse(...; rule=:toplevel, version=VERSION, ignore_trivia=true) - -Parse Julia source code from `input`, returning the output in a format -compatible with `input`: - -* When `input` is a `ParseStream`, the stream itself is returned and the - `ParseStream` interface can be used to process the output. -* When `input` is a seekable `IO` subtype, the output is `(tree, diagnostics)`. - The buffer `position` will be set to the next byte of input. -* When `input` is an `AbstractString, Integer`, or `Vector{UInt8}, Integer` the - output is `(tree, diagnostics, index)`, where `index` (default 1) is the next - byte of input. + parse!(stream::ParseStream; rule=:toplevel) + +Parse Julia source code from a [`ParseStream`](@ref) object. Output tree data +structures may be extracted from `stream` with the `build_tree()` function. `rule` may be any of -* `toplevel` (default) — parse a whole "file" of top level statements. In this +* `:toplevel` (default) — parse a whole "file" of top level statements. 
In this mode, the parser expects to fully consume the input. -* `statement` — parse a single statement, or statements separated by semicolons. -* `atom` — parse a single syntax "atom": a literal, identifier, or +* `:statement` — parse a single statement, or statements separated by semicolons. +* `:atom` — parse a single syntax "atom": a literal, identifier, or parenthesized expression. - -`version` (default `VERSION`) may be used to set the syntax version to -any Julia version `>= v"1.0"`. We aim to parse all Julia syntax which has been -added after v"1.0", emitting an error if it's not compatible with the requested -`version`. - -See also [`parseall`](@ref) for a simpler but less powerful interface. """ -function parse(stream::ParseStream; rule::Symbol=:toplevel) +function parse!(stream::ParseStream; rule::Symbol=:toplevel) ps = ParseState(stream) if rule === :toplevel parse_toplevel(ps) @@ -111,56 +54,37 @@ function parse(stream::ParseStream; rule::Symbol=:toplevel) stream end -function parse(::Type{T}, io::IO; - rule::Symbol=:toplevel, version=VERSION, kws...) where {T} +""" + parse!(io::IO; rule=:toplevel, version=VERSION) + +Parse Julia source code from a seekable `IO` object. The output is a tuple +`(tree, diagnostics)`. The `io` position will be set to the next byte of input +after parsing. +""" +function parse!(::Type{T}, io::IO; + rule::Symbol=:toplevel, version=VERSION, kws...) where {T} stream = ParseStream(io; version=version) - parse(stream; rule=rule) + parse!(stream; rule=rule) tree = build_tree(T, stream; kws...) seek(io, last_byte(stream)) tree, stream.diagnostics end -# Generic version of parse for all other cases where an index must be passed -# back - ie strings and buffers -function parse(::Type{T}, input...; - rule::Symbol=:toplevel, version=VERSION, kws...) where {T} - stream = ParseStream(input...; version=version) - parse(stream; rule=rule) - tree = build_tree(T, stream; kws...) 
- tree, stream.diagnostics, last_byte(stream) + 1 -end - - -""" - parseall(TreeType, input...; - rule=:toplevel, - version=VERSION, - ignore_trivia=true) - -Experimental convenience interface to parse `input` as Julia code, emitting an -error if the entire input is not consumed. `input` can be a string or any other -valid input to the `ParseStream` constructor. By default `parseall` will ignore -whitespace and comments before and after valid code but you can turn this off -by setting `ignore_trivia=false`. - -A `ParseError` will be thrown if any errors occurred during parsing. - -See [`parse`](@ref) for a more complete and powerful interface to the parser, -as well as a description of the `version` and `rule` keywords. -""" -function parseall(::Type{T}, input...; rule=:toplevel, version=VERSION, - ignore_trivia=true, filename=nothing) where {T} - stream = ParseStream(input...; version=version) +function _parse(rule::Symbol, need_eof::Bool, ::Type{T}, text, index=1; version=VERSION, + ignore_trivia=true, filename=nothing, ignore_warnings=false) where {T} + stream = ParseStream(text, index; version=version) if ignore_trivia && rule != :toplevel bump_trivia(stream, skip_newlines=true) empty!(stream) end - parse(stream; rule=rule) - if (ignore_trivia && peek(stream, skip_newlines=true) != K"EndMarker") || - (!ignore_trivia && (peek(stream, skip_newlines=false, skip_whitespace=false) != K"EndMarker")) - emit_diagnostic(stream, error="unexpected text after parsing $rule") + parse!(stream; rule=rule) + if need_eof + if (ignore_trivia && peek(stream, skip_newlines=true) != K"EndMarker") || + (!ignore_trivia && (peek(stream, skip_newlines=false, skip_whitespace=false) != K"EndMarker")) + emit_diagnostic(stream, error="unexpected text after parsing $rule") + end end - if any_error(stream.diagnostics) + if any_error(stream.diagnostics) || (!ignore_warnings && !isempty(stream.diagnostics)) throw(ParseError(stream, filename=filename)) end # TODO: Figure out a more satisfying 
solution to the wrap_toplevel_as_kind @@ -169,13 +93,51 @@ function parseall(::Type{T}, input...; rule=:toplevel, version=VERSION, # not absolute positions. # * Dropping it would be ok for SyntaxNode and Expr... tree = build_tree(T, stream; wrap_toplevel_as_kind=K"toplevel", filename=filename) - if !isempty(stream.diagnostics) - # Crudely format any warnings to the current logger. - buf = IOBuffer() - show_diagnostics(IOContext(buf, stdout), stream, - SourceFile(sourcetext(stream, steal_textbuf=true), filename=filename)) - @warn Text(String(take!(buf))) - end - tree + tree, last_byte(stream) + 1 end +""" + parse(TreeType, text, [index]; + version=VERSION, + ignore_trivia=true, + filename=nothing, + ignore_warnings=false) + + # Or, with the same arguments + parseall(...) + parseatom(...) + +Parse Julia source code string `text` into a data structure of type `TreeType`. +`parse` parses a single Julia statement, `parseall` parses top level statements +at file scope and `parseatom` parses a single Julia identifier or other "syntax +atom". + +If `text` is passed without `index`, all the input text must be consumed and a +tree data structure is returned. When an integer byte `index` is passed, a +tuple `(tree, next_index)` will be returned containing the next index in `text` +to resume parsing. By default whitespace and comments before and after valid +code are ignored but you can turn this off by setting `ignore_trivia=false`. + +`version` (default `VERSION`) may be used to set the syntax version to +any Julia version `>= v"1.0"`. We aim to parse all Julia syntax which has been +added after v"1.0", emitting an error if it's not compatible with the requested +`version`. + +Pass `filename` to set any file name information embedded within the output +tree, if applicable. This will also annotate errors and warnings with the +source file name. + +A `ParseError` will be thrown if any errors or warnings occurred during +parsing. 
To avoid exceptions due to warnings, use `ignore_warnings=true`. +""" +parse(T, text::AbstractString; kws...) = _parse(:statement, true, T, text; kws...)[1] +parseall(T, text::AbstractString; kws...) = _parse(:toplevel, true, T, text; kws...)[1] +parseatom(T, text::AbstractString; kws...) = _parse(:atom, true, T, text; kws...)[1] + +@eval @doc $(@doc parse) parseall +@eval @doc $(@doc parse) parseatom + +parse(T, text::AbstractString, index::Integer; kws...) = _parse(:statement, false, T, text, index; kws...) +parseall(T, text::AbstractString, index::Integer; kws...) = _parse(:toplevel, false, T, text, index; kws...) +parseatom(T, text::AbstractString, index::Integer; kws...) = _parse(:atom, false, T, text, index; kws...) + diff --git a/test/expr.jl b/test/expr.jl index 35d84de5..850853f5 100644 --- a/test/expr.jl +++ b/test/expr.jl @@ -1,20 +1,16 @@ -function parse_Expr(str) - parseall(Expr, str, rule=:statement) -end - @testset "Expr conversion" begin @testset "Quote nodes" begin - @test parseall(Expr, ":(a)", rule=:atom) == QuoteNode(:a) - @test parseall(Expr, ":(:a)", rule=:atom) == Expr(:quote, QuoteNode(:a)) - @test parseall(Expr, ":(1+2)", rule=:atom) == Expr(:quote, Expr(:call, :+, 1, 2)) + @test parseatom(Expr, ":(a)") == QuoteNode(:a) + @test parseatom(Expr, ":(:a)") == Expr(:quote, QuoteNode(:a)) + @test parseatom(Expr, ":(1+2)") == Expr(:quote, Expr(:call, :+, 1, 2)) # Compatibility hack for VERSION >= v"1.4" # https://github.com/JuliaLang/julia/pull/34077 - @test parseall(Expr, ":true", rule=:atom) == Expr(:quote, true) + @test parseatom(Expr, ":true") == Expr(:quote, true) end @testset "Line numbers" begin @testset "Blocks" begin - @test parse_Expr("begin a\nb\n\nc\nend") == + @test parse(Expr, "begin a\nb\n\nc\nend") == Expr(:block, LineNumberNode(1), :a, @@ -23,7 +19,7 @@ end LineNumberNode(4), :c, ) - @test parse_Expr("begin end") == + @test parse(Expr, "begin end") == Expr(:block, LineNumberNode(1) ) @@ -36,7 +32,7 @@ end :b, ) - @test 
parse_Expr("module A\n\nbody\nend") == + @test parse(Expr, "module A\n\nbody\nend") == Expr(:module, true, :A, @@ -49,7 +45,7 @@ end end @testset "Function definition lines" begin - @test parse_Expr("function f()\na\n\nb\nend") == + @test parse(Expr, "function f()\na\n\nb\nend") == Expr(:function, Expr(:call, :f), Expr(:block, @@ -60,7 +56,7 @@ end :b, ) ) - @test parse_Expr("f() = 1") == + @test parse(Expr, "f() = 1") == Expr(:(=), Expr(:call, :f), Expr(:block, @@ -70,14 +66,14 @@ end ) # function/macro without methods - @test parse_Expr("function f end") == + @test parse(Expr, "function f end") == Expr(:function, :f) - @test parse_Expr("macro f end") == + @test parse(Expr, "macro f end") == Expr(:macro, :f) end @testset "elseif" begin - @test parse_Expr("if a\nb\nelseif c\n d\nend") == + @test parse(Expr, "if a\nb\nelseif c\n d\nend") == Expr(:if, :a, Expr(:block, @@ -95,7 +91,7 @@ end end @testset "No line numbers in for/let bindings" begin - @test parse_Expr("for i=is, j=js\nbody\nend") == + @test parse(Expr, "for i=is, j=js\nbody\nend") == Expr(:for, Expr(:block, Expr(:(=), :i, :is), @@ -106,7 +102,7 @@ end :body ) ) - @test parse_Expr("let i=is, j=js\nbody\nend") == + @test parse(Expr, "let i=is, j=js\nbody\nend") == Expr(:let, Expr(:block, Expr(:(=), :i, :is), @@ -122,7 +118,7 @@ end @testset "Short form function line numbers" begin # A block is added to hold the line number node - @test parse_Expr("f() = xs") == + @test parse(Expr, "f() = xs") == Expr(:(=), Expr(:call, :f), Expr(:block, @@ -130,7 +126,7 @@ end :xs)) # flisp parser quirk: In a for loop the block is not added, despite # this defining a short-form function. 
- @test parse_Expr("for f() = xs\nend") == + @test parse(Expr, "for f() = xs\nend") == Expr(:for, Expr(:(=), Expr(:call, :f), :xs), Expr(:block, @@ -139,7 +135,7 @@ end end @testset "Long form anonymous functions" begin - @test parse_Expr("function (xs...)\nbody end") == + @test parse(Expr, "function (xs...)\nbody end") == Expr(:function, Expr(:..., :xs), Expr(:block, @@ -150,19 +146,19 @@ end @testset "String conversions" begin # String unwrapping / wrapping - @test parse_Expr("\"str\"") == "str" - @test parse_Expr("\"\$(\"str\")\"") == + @test parse(Expr, "\"str\"") == "str" + @test parse(Expr, "\"\$(\"str\")\"") == Expr(:string, Expr(:string, "str")) # Concatenation of string chunks in triple quoted cases - @test parse_Expr("```\n a\n b```") == + @test parse(Expr, "```\n a\n b```") == Expr(:macrocall, GlobalRef(Core, Symbol("@cmd")), LineNumberNode(1), "a\nb") - @test parse_Expr("\"\"\"\n a\n \$x\n b\n c\"\"\"") == + @test parse(Expr, "\"\"\"\n a\n \$x\n b\n c\"\"\"") == Expr(:string, "a\n", :x, "\nb\nc") end @testset "do block conversion" begin - @test parse_Expr("f(x) do y\n body end") == + @test parse(Expr, "f(x) do y\n body end") == Expr(:do, Expr(:call, :f, :x), Expr(:->, Expr(:tuple, :y), Expr(:block, @@ -172,29 +168,29 @@ end @testset "= to Expr(:kw) conversion" begin # Call - @test parse_Expr("f(a=1)") == + @test parse(Expr, "f(a=1)") == Expr(:call, :f, Expr(:kw, :a, 1)) - @test parse_Expr("f(; b=2)") == + @test parse(Expr, "f(; b=2)") == Expr(:call, :f, Expr(:parameters, Expr(:kw, :b, 2))) - @test parse_Expr("f(a=1; b=2)") == + @test parse(Expr, "f(a=1; b=2)") == Expr(:call, :f, Expr(:parameters, Expr(:kw, :b, 2)), Expr(:kw, :a, 1)) # Infix call = is not :kw - @test parse_Expr("(x=1) != 2") == + @test parse(Expr, "(x=1) != 2") == Expr(:call, :!=, Expr(:(=), :x, 1), 2) # Dotcall - @test parse_Expr("f.(a=1; b=2)") == + @test parse(Expr, "f.(a=1; b=2)") == Expr(:., :f, Expr(:tuple, Expr(:parameters, Expr(:kw, :b, 2)), Expr(:kw, :a, 1))) # Named tuples - 
@test parse_Expr("(a=1,)") == + @test parse(Expr, "(a=1,)") == Expr(:tuple, Expr(:(=), :a, 1)) - @test parse_Expr("(a=1,; b=2)") == + @test parse(Expr, "(a=1,; b=2)") == Expr(:tuple, Expr(:parameters, Expr(:kw, :b, 2)), Expr(:(=), :a, 1)) - @test parse_Expr("(a=1,; b=2; c=3)") == + @test parse(Expr, "(a=1,; b=2; c=3)") == Expr(:tuple, Expr(:parameters, Expr(:parameters, Expr(:kw, :c, 3)), @@ -202,21 +198,21 @@ end Expr(:(=), :a, 1)) # ref - @test parse_Expr("x[i=j]") == + @test parse(Expr, "x[i=j]") == Expr(:ref, :x, Expr(:kw, :i, :j)) # vect/braces - @test parse_Expr("[a=1,; b=2]") == + @test parse(Expr, "[a=1,; b=2]") == Expr(:vect, Expr(:parameters, Expr(:(=), :b, 2)), Expr(:(=), :a, 1)) - @test parse_Expr("{a=1,; b=2}") == + @test parse(Expr, "{a=1,; b=2}") == Expr(:braces, Expr(:parameters, Expr(:(=), :b, 2)), Expr(:(=), :a, 1)) # dotted = is not :kw - @test parse_Expr("f(a .= 1)") == + @test parse(Expr, "f(a .= 1)") == Expr(:call, :f, Expr(:.=, :a, 1)) end end diff --git a/test/parse_stream.jl b/test/parse_stream.jl index 10c021f7..315b59c8 100644 --- a/test/parse_stream.jl +++ b/test/parse_stream.jl @@ -71,16 +71,34 @@ st = ParseStream(code) @test peek(st) == K"NewlineWs" bump(st, TRIVIA_FLAG) emit(st, p1, K"toplevel") -end - -@test JuliaSyntax.build_tree(GreenNode, st) isa JuliaSyntax.GreenNode -# ## Input code -#= -println("-----------------------") -print(code) -println() + @test build_tree(GreenNode, st) isa JuliaSyntax.GreenNode +end -# ## Output tree -show(stdout, MIME"text/plain"(), t, code, show_trivia=true) -=# +@testset "ParseStream constructors" begin + @testset "Byte buffer inputs" begin + # Vector{UInt8} + let + st = ParseStream(Vector{UInt8}("x+y")) + bump(st) + @test build_tree(Expr, st) == :x + @test JuliaSyntax.last_byte(st) == 1 + end + let + st = ParseStream(Vector{UInt8}("x+y"), 3) + bump(st) + @test build_tree(Expr, st) == :y + @test JuliaSyntax.last_byte(st) == 3 + end + # Ptr{UInt8}, len + code = "x+y" + GC.@preserve code begin + let + 
st = ParseStream(pointer(code), 3) + bump(st) + @test build_tree(Expr, st) == :x + @test JuliaSyntax.last_byte(st) == 1 + end + end + end +end diff --git a/test/parser_api.jl b/test/parser_api.jl index 0a3c49ec..1af46fba 100644 --- a/test/parser_api.jl +++ b/test/parser_api.jl @@ -1,52 +1,56 @@ @testset "parser API" begin - @testset "String and buffer input" begin - # String - let - ex,diag,pos = parse(Expr, "x+y\nz") - @test JuliaSyntax.remove_linenums!(ex) == Expr(:toplevel, :(x+y), :z) - @test diag == [] - @test pos == 6 - end - @test parse(Expr, "x+y\nz", rule=:statement) == (:(x+y), [], 4) - @test parse(Expr, "x+y\nz", rule=:atom) == (:x, [], 2) - @test parse(Expr, "x+y\nz", 5, rule=:atom) == (:z, [], 6) - - # Vector{UInt8} - @test parse(Expr, Vector{UInt8}("x+y"), rule=:statement) == (:(x+y), [], 4) - @test parse(Expr, Vector{UInt8}("x+y"), 3, rule=:statement) == (:y, [], 4) - # Ptr{UInt8}, len - code = "x+y" - GC.@preserve code begin - stream = ParseStream(pointer(code), 3) - parse(stream, rule=:statement) - @test JuliaSyntax.build_tree(Expr, stream) == :(x+y) - @test JuliaSyntax.last_byte(stream) == 3 - end + @testset "parse with String input" begin + @test parse(Expr, " x ") == :x + @test JuliaSyntax.remove_linenums!(parseall(Expr, " x ")) == Expr(:toplevel, :x) + @test parseatom(Expr, " x ") == :x + # TODO: Fix this situation with trivia here; the brackets are trivia, but + # must be parsed to discover the atom inside. But in GreenTree we only + # place trivia as siblings of the leaf node with identifier `x`, not as + # children. 
+ @test_broken parseatom(Expr, "(x)") == :x # SubString - @test parse(Expr, SubString("x+y"), rule=:statement) == (:(x+y), [], 4) - @test parse(Expr, SubString("x+y"), 1, rule=:atom) == (:x, [], 2) - @test parse(Expr, SubString("x+y"), 3, rule=:atom) == (:y, [], 4) - @test parse(Expr, SubString("x+y",3,3), 1, rule=:atom) == (:y, [], 2) - @test parse(Expr, SubString("α+x"), rule=:statement) == (:(α+x), [], 5) + @test parse(Expr, SubString("x+y")) == :(x+y) + @test parse(Expr, SubString("α+x")) == :(α+x) + @test parseatom(Expr, SubString("x+y",3,3)) == :y + + # Exceptions due to extra trailing syntax + @test_throws JuliaSyntax.ParseError parseatom(Expr, "x+y") + @test_throws JuliaSyntax.ParseError parse(Expr, "x+y\nz") + + # ignore_warnings flag + @test_throws JuliaSyntax.ParseError parse(Expr, "import . .A") + @test parse(Expr, "import . .A", ignore_warnings=true) == :(import ..A) + + # version selection + @test_throws JuliaSyntax.ParseError parse(Expr, "[a ;; b]", version=v"1.6") + @test parse(Expr, "[a ;; b]", version=v"1.7") == Expr(:ncat, 2, :a, :b) + + # filename + @test JuliaSyntax.parse(Expr, "begin\na\nend", filename="foo.jl") == + Expr(:block, LineNumberNode(2, Symbol("foo.jl")), :a) + + # ignore_trivia + @test parseatom(Expr, " x ", ignore_trivia=true) == :x + @test_throws JuliaSyntax.ParseError parseatom(Expr, " x ", ignore_trivia=false) end @testset "IO input" begin # IOBuffer io = IOBuffer("x+y") - @test parse(Expr, io, rule=:statement) == (:(x+y), []) + @test parse!(Expr, io, rule=:statement) == (:(x+y), []) @test position(io) == 3 io = IOBuffer("x+y") seek(io, 2) - @test parse(Expr, io, rule=:atom) == (:y, []) + @test parse!(Expr, io, rule=:atom) == (:y, []) @test position(io) == 3 # A GenericIOBuffer, not actually IOBuffer io = IOBuffer(SubString("x+y")) - @test parse(Expr, io, rule=:statement) == (:(x+y), []) + @test parse!(Expr, io, rule=:statement) == (:(x+y), []) @test position(io) == 3 # Another type of GenericIOBuffer io = 
IOBuffer(codeunits("x+y")) - @test parse(Expr, io, rule=:statement) == (:(x+y), []) + @test parse!(Expr, io, rule=:statement) == (:(x+y), []) @test position(io) == 3 # IOStream mktemp() do path, io @@ -54,23 +58,26 @@ close(io) open(path, "r") do io - @test parse(Expr, io, rule=:statement) == (:(x+y), []) + @test parse!(Expr, io, rule=:statement) == (:(x+y), []) @test position(io) == 3 end end end - @testset "parseall" begin - @test JuliaSyntax.remove_linenums!(parseall(Expr, " x ")) == Expr(:toplevel, :x) - @test parseall(Expr, " x ", rule=:statement) == :x - @test parseall(Expr, " x ", rule=:atom) == :x - # TODO: Fix this situation with trivia here; the brackets are trivia, but - # must be parsed to discover the atom inside. But in GreenTree we only - # place trivia as siblings of the leaf node with identifier `x`, not as - # children. - @test_broken parseall(Expr, "(x)", rule=:atom) == :x + @testset "parse with String and index input" begin + # String + let + ex,pos = parseall(Expr, "x+y\nz", 1) + @test JuliaSyntax.remove_linenums!(ex) == Expr(:toplevel, :(x+y), :z) + @test pos == 6 + end + @test parse(Expr, "x+y\nz", 1) == (:(x+y), 4) + @test parseatom(Expr, "x+y\nz", 1) == (:x, 2) + @test parseatom(Expr, "x+y\nz", 5) == (:z, 6) - @test_throws JuliaSyntax.ParseError parseall(Expr, "x+y", rule=:atom) - @test_throws JuliaSyntax.ParseError parseall(Expr, "x+y\nz", rule=:statement) + # SubString + @test parse(Expr, SubString("α+x\ny"), 1) == (:(α+x), 5) + @test parseatom(Expr, SubString("x+y"), 1) == (:x, 2) + @test parseatom(Expr, SubString("x+y"), 3) == (:y, 4) end end diff --git a/test/test_utils.jl b/test/test_utils.jl index ced5912c..d8b52436 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -7,8 +7,11 @@ using JuliaSyntax: # Parsing ParseStream, SourceFile, + parse!, parse, parseall, + parseatom, + build_tree, @K_str, # Nodes GreenNode, @@ -72,11 +75,13 @@ function parsers_agree_on_file(filename; show_diff=false) return true end try - ex, 
diagnostics, _ = parse(Expr, text, filename=filename) + stream = ParseStream(text) + parse!(stream) + ex = build_tree(Expr, stream, filename=filename) if show_diff && ex != fl_ex show_expr_text_diff(show, ex, fl_ex) end - return !JuliaSyntax.any_error(diagnostics) && + return !JuliaSyntax.any_error(stream) && JuliaSyntax.remove_linenums!(ex) == JuliaSyntax.remove_linenums!(fl_ex) catch exc @@ -111,7 +116,7 @@ function equals_flisp_parse(tree) node_text = sourcetext(tree) # Reparse with JuliaSyntax. This is a crude way to ensure we're not missing # some context from the parent node. - ex,_,_ = parse(Expr, node_text) + ex = parseall(Expr, node_text) fl_ex = fl_parseall(node_text) if Meta.isexpr(fl_ex, :error) return true # Something went wrong in reduction; ignore these cases 😬 @@ -156,7 +161,7 @@ function reduce_test(tree::SyntaxNode) end function reduce_test(text::AbstractString) - tree, _, _ = parse(SyntaxNode, text) + tree, = parseall(SyntaxNode, text) reduce_test(tree) end From 0ad7fd5c8ac226020532a022a990b12f2a8319cc Mon Sep 17 00:00:00 2001 From: c42f Date: Mon, 3 Oct 2022 08:00:07 +1000 Subject: [PATCH 2/4] Update src/parser_api.jl Co-authored-by: Sebastian Pfitzner --- src/parser_api.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/parser_api.jl b/src/parser_api.jl index 8745aaa5..5bd91708 100644 --- a/src/parser_api.jl +++ b/src/parser_api.jl @@ -31,7 +31,7 @@ Base.display_error(io::IO, err::ParseError, bt) = Base.showerror(io, err, bt) parse!(stream::ParseStream; rule=:toplevel) Parse Julia source code from a [`ParseStream`](@ref) object. Output tree data -structures may be extracted from `stream` with the `build_tree()` function. +structures may be extracted from `stream` with the [`build_tree`](@ref) function. `rule` may be any of * `:toplevel` (default) — parse a whole "file" of top level statements. 
In this From af4c0abe4fc1b841c132ba65dbbf4c728db38505 Mon Sep 17 00:00:00 2001 From: c42f Date: Mon, 3 Oct 2022 11:18:34 +1000 Subject: [PATCH 3/4] Improve some docs, specialize parse shims on tree type --- src/parse_stream.jl | 9 +++++---- src/parser_api.jl | 24 ++++++++++++------------ 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/src/parse_stream.jl b/src/parse_stream.jl index 33b614e2..0d249423 100644 --- a/src/parse_stream.jl +++ b/src/parse_stream.jl @@ -160,10 +160,11 @@ const NO_POSITION = ParseStreamPosition(0, 0) ParseStream(text::Vector{UInt8}, index::Integer=1; version=VERSION) ParseStream(ptr::Ptr{UInt8}, len::Integer, index::Integer=1; version=VERSION) -Construct a `ParseStream` from source `text` which may come in various forms - -a string, an `IO` object, or a buffer of bytes. In the case that the buffer is -passed as `ptr,len`, the caller is responsible for preserving the buffer during -parsing. +Construct a `ParseStream` from input which may come in various forms: +* A string (zero copy for `String` and `SubString`) +* An `IO` object (zero copy for `IOBuffer`). The `IO` object must be seekable. +* A buffer of bytes (zero copy). The caller is responsible for preserving + buffers passed as `(ptr,len)`. A byte `index` may provided as the position to start parsing. diff --git a/src/parser_api.jl b/src/parser_api.jl index 5bd91708..9904e84a 100644 --- a/src/parser_api.jl +++ b/src/parser_api.jl @@ -55,17 +55,17 @@ function parse!(stream::ParseStream; rule::Symbol=:toplevel) end """ - parse!(io::IO; rule=:toplevel, version=VERSION) + parse!(TreeType, io::IO; rule=:toplevel, version=VERSION) Parse Julia source code from a seekable `IO` object. The output is a tuple -`(tree, diagnostics)`. The `io` position will be set to the next byte of input -after parsing. +`(tree, diagnostics)`. When `parse!` returns, the stream `io` is positioned +directly after the last byte which was consumed during parsing.
""" -function parse!(::Type{T}, io::IO; - rule::Symbol=:toplevel, version=VERSION, kws...) where {T} +function parse!(::Type{TreeType}, io::IO; + rule::Symbol=:toplevel, version=VERSION, kws...) where {TreeType} stream = ParseStream(io; version=version) parse!(stream; rule=rule) - tree = build_tree(T, stream; kws...) + tree = build_tree(TreeType, stream; kws...) seek(io, last_byte(stream)) tree, stream.diagnostics end @@ -130,14 +130,14 @@ source file name. A `ParseError` will be thrown if any errors or warnings occurred during parsing. To avoid exceptions due to warnings, use `ignore_warnings=true`. """ -parse(T, text::AbstractString; kws...) = _parse(:statement, true, T, text; kws...)[1] -parseall(T, text::AbstractString; kws...) = _parse(:toplevel, true, T, text; kws...)[1] -parseatom(T, text::AbstractString; kws...) = _parse(:atom, true, T, text; kws...)[1] +parse(::Type{T}, text::AbstractString; kws...) where {T} = _parse(:statement, true, T, text; kws...)[1] +parseall(::Type{T}, text::AbstractString; kws...) where {T} = _parse(:toplevel, true, T, text; kws...)[1] +parseatom(::Type{T}, text::AbstractString; kws...) where {T} = _parse(:atom, true, T, text; kws...)[1] @eval @doc $(@doc parse) parseall @eval @doc $(@doc parse) parseatom -parse(T, text::AbstractString, index::Integer; kws...) = _parse(:statement, false, T, text, index; kws...) -parseall(T, text::AbstractString, index::Integer; kws...) = _parse(:toplevel, false, T, text, index; kws...) -parseatom(T, text::AbstractString, index::Integer; kws...) = _parse(:atom, false, T, text, index; kws...) +parse(::Type{T}, text::AbstractString, index::Integer; kws...) where {T} = _parse(:statement, false, T, text, index; kws...) +parseall(::Type{T}, text::AbstractString, index::Integer; kws...) where {T} = _parse(:toplevel, false, T, text, index; kws...) +parseatom(::Type{T}, text::AbstractString, index::Integer; kws...) where {T} = _parse(:atom, false, T, text, index; kws...) 
From 4d8c601571dfb8927151e752eb9d6edc10b17f40 Mon Sep 17 00:00:00 2001 From: c42f Date: Mon, 3 Oct 2022 11:19:57 +1000 Subject: [PATCH 4/4] Update src/parse_stream.jl Co-authored-by: Sebastian Pfitzner --- src/parse_stream.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/parse_stream.jl b/src/parse_stream.jl index 0d249423..c62e64b2 100644 --- a/src/parse_stream.jl +++ b/src/parse_stream.jl @@ -166,7 +166,7 @@ Construct a `ParseStream` from input which may come in various forms: * A buffer of bytes (zero copy). The caller is responsible for preserving buffers passed as `(ptr,len)`. -A byte `index` may provided as the position to start parsing. +A byte `index` may be provided as the position to start parsing. ParseStream provides an IO interface for the parser which provides lexing of the source text input into tokens, manages insignificant whitespace tokens on