diff --git a/base/exports.jl b/base/exports.jl index 71bb92e49249e..b87d274b15889 100644 --- a/base/exports.jl +++ b/base/exports.jl @@ -1340,6 +1340,7 @@ export # notation for certain types @b_str, # byte vector @r_str, # regex + @s_str, # regex substitution string @v_str, # version number # documentation diff --git a/base/pcre.jl b/base/pcre.jl index edf4530cb52de..5326bdb21551b 100644 --- a/base/pcre.jl +++ b/base/pcre.jl @@ -140,6 +140,23 @@ function substring_number_from_name(re, name) (Ptr{Void}, Cstring), re, name) end +function substring_length_bynumber(match_data, number) + s = Ref{Csize_t}() + rc = ccall((:pcre2_substring_length_bynumber_8, PCRE_LIB), Cint, + (Ptr{Void}, UInt32, Ref{Csize_t}), match_data, number, s) + rc < 0 && error("PCRE error: $(err_message(rc))") + convert(Int, s[]) +end + +function substring_copy_bynumber(match_data, number, buf, buf_size) + s = Ref{Csize_t}(buf_size) + rc = ccall((:pcre2_substring_copy_bynumber_8, PCRE_LIB), Cint, + (Ptr{Void}, UInt32, Ptr{UInt8}, Ref{Csize_t}), + match_data, number, buf, s) + rc < 0 && error("PCRE error: $(err_message(rc))") + convert(Int, s[]) +end + function capture_names(re) name_count = info(re, INFO_NAMECOUNT, UInt32) name_entry_size = info(re, INFO_NAMEENTRYSIZE, UInt32) diff --git a/base/regex.jl b/base/regex.jl index 2699c563ec349..66612f9908e0d 100644 --- a/base/regex.jl +++ b/base/regex.jl @@ -209,6 +209,89 @@ search(s::AbstractString, r::Regex, idx::Integer) = throw(ArgumentError("regex search is only available for bytestrings; use bytestring(s) to convert")) search(s::AbstractString, r::Regex) = search(s,r,start(s)) +immutable SubstitutionString{T<:AbstractString} <: AbstractString + string::T +end + +endof(s::SubstitutionString) = endof(s.string) +next(s::SubstitutionString, idx::Int) = next(s.string, idx) +function show(io::IO, s::SubstitutionString) + print(io, "s") + show(io, s.string) +end + +macro s_str(string) SubstitutionString(string) end + +replace_err(repl) = error("Bad 
replacement string: $repl") + +function _write_capture(io, re, group) + len = PCRE.substring_length_bynumber(re.match_data, group) + ensureroom(io, len+1) + PCRE.substring_copy_bynumber(re.match_data, group, + pointer(io.data, io.ptr), len+1) + io.ptr += len + io.size = max(io.size, io.ptr - 1) +end + +function _replace(io, repl_s::SubstitutionString, str, r, re) + const SUB_CHAR = '\\' + const GROUP_CHAR = 'g' + const LBRACKET = '<' + const RBRACKET = '>' + repl = repl_s.string + i = start(repl) + e = endof(repl) + while i <= e + if repl[i] == SUB_CHAR + next_i = nextind(repl, i) + next_i > e && replace_err(repl) + if repl[next_i] == SUB_CHAR + write(io, SUB_CHAR, repl[next_i]) + i = nextind(repl, next_i) + elseif isnumber(repl[next_i]) + group = parse(Int, repl[next_i]) + i = nextind(repl, next_i) + while i <= e + if isnumber(repl[i]) + group = 10group + parse(Int, repl[i]) + i = nextind(repl, i) + else + break + end + end + _write_capture(io, re, group) + elseif repl[next_i] == GROUP_CHAR + i = nextind(repl, next_i) + if i > e || repl[i] != LBRACKET + replace_err(repl) + end + i = nextind(repl, i) + i > e && replace_err(repl) + groupstart = i + while repl[i] != RBRACKET + i = nextind(repl, i) + i > e && replace_err(repl) + end + # TODO: avoid this allocation + groupname = SubString(repl, groupstart, prevind(repl, i)) + if isnumber(groupname) + _write_capture(io, re, parse(Int, groupname)) + else + group = PCRE.substring_number_from_name(re.regex, groupname) + group < 0 && replace_err("Group $groupname not found in regex $re") + _write_capture(io, re, group) + end + i = nextind(repl, i) + else + replace_err(repl) + end + else + write(io, repl[i]) + i = nextind(repl, i) + end + end +end + immutable RegexMatchIterator regex::Regex string::UTF8String diff --git a/base/strings/util.jl b/base/strings/util.jl index c22586c432fb9..1e56ee1fb1515 100644 --- a/base/strings/util.jl +++ b/base/strings/util.jl @@ -173,8 +173,9 @@ function 
_rsplit{T<:AbstractString,U<:Array}(str::T, splitter, limit::Integer, k end #rsplit(str::AbstractString) = rsplit(str, _default_delims, 0, false) -_replacement(repl, str, j, k) = repl -_replacement(repl::Function, str, j, k) = repl(SubString(str, j, k)) +_replace(io, repl, str, r, pattern) = write(io, repl) +_replace(io, repl::Function, str, r, pattern) = + write(io, repl(SubString(str, first(r), last(r)))) function replace(str::ByteString, pattern, repl, limit::Integer) n = 1 @@ -183,10 +184,11 @@ function replace(str::ByteString, pattern, repl, limit::Integer) r = search(str,pattern,i) j, k = first(r), last(r) out = IOBuffer() + ensureroom(out, floor(Int, 1.2sizeof(str))) while j != 0 if i == a || i <= k write_sub(out, str.data, i, j-i) - write(out, _replacement(repl, str, j, k)) + _replace(out, repl, str, r, pattern) end if k m[2] "45" +Captures can be referenced in a substitution string when using :func:`replace` +by using ``\n`` to refer to the `n`th capture group and prefixing the +substitution string with ``s``. Capture group 0 refers to the entire match object. +Named capture groups can be referenced in the substitution with ``\g<groupname>``. +For example:: + + julia> replace("first second", r"(\w+) (?P<agroup>\w+)", s"\g<agroup> \1") + julia> "second first" + +Numbered capture groups can also be referenced as ``\g<n>`` for disambiguation, +as in:: + julia> replace("a", r".", s"\g<0>1") + julia> "a1" + You can modify the behavior of regular expressions by some combination of the flags ``i``, ``m``, ``s``, and ``x`` after the closing double quote mark. These flags have the same meaning as they do in Perl, as diff --git a/doc/stdlib/strings.rst b/doc/stdlib/strings.rst index 4cba2c7179790..3649350f8435b 100644 --- a/doc/stdlib/strings.rst +++ b/doc/stdlib/strings.rst @@ -177,7 +177,7 @@ .. function:: replace(string, pat, r[, n]) - Search for the given pattern ``pat``, and replace each occurrence with ``r``. If ``n`` is provided, replace at most ``n`` occurrences. 
As with search, the second argument may be a single character, a vector or a set of characters, a string, or a regular expression. If ``r`` is a function, each occurrence is replaced with ``r(s)`` where ``s`` is the matched substring. + Search for the given pattern ``pat``, and replace each occurrence with ``r``. If ``n`` is provided, replace at most ``n`` occurrences. As with search, the second argument may be a single character, a vector or a set of characters, a string, or a regular expression. If ``r`` is a function, each occurrence is replaced with ``r(s)`` where ``s`` is the matched substring. If ``pat`` is a regular expression and ``r`` is a ``SubstitutionString``, then capture group references in ``r`` are replaced with the corresponding matched text. .. function:: split(string, [chars]; limit=0, keep=true) diff --git a/test/regex.jl b/test/regex.jl index aaf8eafa72a39..df648c71ebc4b 100644 --- a/test/regex.jl +++ b/test/regex.jl @@ -39,6 +39,11 @@ show(buf, r"") @test_throws ArgumentError search(utf32("this is a test"), r"test") # Named subpatterns -m = match(r"(?.)(.)(?.)", "xyz") -@test (m[:a], m[2], m["b"]) == ("x", "y", "z") -@test sprint(show, m) == "RegexMatch(\"xyz\", a=\"x\", 2=\"y\", b=\"z\")" +let m = match(r"(?.)(.)(?.)", "xyz") + @test (m[:a], m[2], m["b"]) == ("x", "y", "z") + @test sprint(show, m) == "RegexMatch(\"xyz\", a=\"x\", 2=\"y\", b=\"z\")" +end + +# Backcapture reference in substitution string +@test replace("abcde", r"(..)(?Pd)", s"\gxy\1") == "adxybce" +@test_throws ErrorException replace("a", r"(?P)", s"\g")