Skip to content

Commit

Permalink
Support backcapture references in replacement strings
Browse files Browse the repository at this point in the history
  • Loading branch information
malmaud committed Jul 21, 2015
1 parent eb636a0 commit 098039a
Show file tree
Hide file tree
Showing 7 changed files with 129 additions and 7 deletions.
1 change: 1 addition & 0 deletions base/exports.jl
Expand Up @@ -1340,6 +1340,7 @@ export
# notation for certain types
@b_str, # byte vector
@r_str, # regex
@s_str, # regex substitution string
@v_str, # version number

# documentation
Expand Down
17 changes: 17 additions & 0 deletions base/pcre.jl
Expand Up @@ -140,6 +140,23 @@ function substring_number_from_name(re, name)
(Ptr{Void}, Cstring), re, name)
end

# Return, as an `Int`, the length that `pcre2_substring_length_bynumber_8`
# reports for capture group `number` of `match_data`.
# A negative return code from PCRE is turned into an error carrying the
# corresponding PCRE message.
function substring_length_bynumber(match_data, number)
    len_ref = Ref{Csize_t}()
    status = ccall((:pcre2_substring_length_bynumber_8, PCRE_LIB), Cint,
                   (Ptr{Void}, UInt32, Ref{Csize_t}),
                   match_data, number, len_ref)
    if status < 0
        error("PCRE error: $(err_message(status))")
    end
    return convert(Int, len_ref[])
end

# Ask PCRE (`pcre2_substring_copy_bynumber_8`) to copy capture group `number`
# of `match_data` into `buf`, telling it that `buf_size` bytes are available.
# Returns, as an `Int`, the size value PCRE wrote back through the reference
# (presumably the copied length without the trailing NUL -- see the PCRE2 docs).
# A negative return code from PCRE is turned into an error carrying the
# corresponding PCRE message.
function substring_copy_bynumber(match_data, number, buf, buf_size)
    size_ref = Ref{Csize_t}(buf_size)
    status = ccall((:pcre2_substring_copy_bynumber_8, PCRE_LIB), Cint,
                   (Ptr{Void}, UInt32, Ptr{UInt8}, Ref{Csize_t}),
                   match_data, number, buf, size_ref)
    if status < 0
        error("PCRE error: $(err_message(status))")
    end
    return convert(Int, size_ref[])
end

function capture_names(re)
name_count = info(re, INFO_NAMECOUNT, UInt32)
name_entry_size = info(re, INFO_NAMEENTRYSIZE, UInt32)
Expand Down
83 changes: 83 additions & 0 deletions base/regex.jl
Expand Up @@ -209,6 +209,89 @@ search(s::AbstractString, r::Regex, idx::Integer) =
throw(ArgumentError("regex search is only available for bytestrings; use bytestring(s) to convert"))
search(s::AbstractString, r::Regex) = search(s,r,start(s))

# Wrapper around a string that marks it as a regex substitution template.
# Instances are created by the `s"..."` string macro, and `_replace` has a
# method specialized on this type that expands capture-group references.
immutable SubstitutionString{T<:AbstractString} <: AbstractString
string::T  # the template text
end

# String-interface methods: indexing bounds and iteration are forwarded to the
# wrapped string.
function endof(s::SubstitutionString)
    return endof(s.string)
end

function next(s::SubstitutionString, idx::Int)
    return next(s.string, idx)
end

# Display as the letter `s` followed by `show` of the wrapped string.
show(io::IO, s::SubstitutionString) = (print(io, "s"); show(io, s.string))

# String macro behind `s"..."` literals: wraps the literal text in a
# `SubstitutionString`.
macro s_str(string)
    SubstitutionString(string)
end

# Raise an error that reports `repl` as an invalid replacement string.
function replace_err(repl)
    error("Bad replacement string: $repl")
end

# Append the text of capture `group`, taken from `re.match_data`, to the
# buffer `io` by letting PCRE copy it straight into `io.data` at the current
# write position, then advancing the buffer's position and size by hand.
function _write_capture(io, re, group)
    nbytes = PCRE.substring_length_bynumber(re.match_data, group)
    # One byte more than the capture is reserved and offered to PCRE
    # (presumably for the NUL terminator it appends -- confirm in PCRE2 docs).
    room = nbytes + 1
    ensureroom(io, room)
    # Take the pointer only after `ensureroom`, which runs before it in the
    # original as well.
    dest = pointer(io.data, io.ptr)
    PCRE.substring_copy_bynumber(re.match_data, group, dest, room)
    io.ptr += nbytes
    io.size = max(io.size, io.ptr - 1)
end

# Write the expansion of the substitution template `repl_s` to `io`, resolving
# capture-group references through `_write_capture(io, re, group)`.
#
# Escapes recognized in the template:
#   \\          a single literal backslash
#   \N          capture group number N (one or more decimal digits)
#   \g<N>       capture group number N (unambiguous when digits follow)
#   \g<name>    the capture group named `name`
# Every other character is copied to `io` unchanged.
#
# Raises an error through `replace_err` for a trailing backslash, an unknown
# escape, a missing `<`, an unterminated or empty `\g<...>`, or a group name
# that `re` does not define.
#
# `str` and `r` are not used by this method; they are part of the signature
# shared with the other `_replace` methods.
function _replace(io, repl_s::SubstitutionString, str, r, re)
    SUB_CHAR = '\\'
    GROUP_CHAR = 'g'
    LBRACKET = '<'
    RBRACKET = '>'
    repl = repl_s.string
    i = start(repl)
    e = endof(repl)
    while i <= e
        if repl[i] == SUB_CHAR
            next_i = nextind(repl, i)
            next_i > e && replace_err(repl)
            if repl[next_i] == SUB_CHAR
                # An escaped backslash stands for ONE literal backslash.
                write(io, SUB_CHAR)
                i = nextind(repl, next_i)
            elseif isdigit(repl[next_i])
                # `isdigit` (0-9 only) rather than `isnumber`: other Unicode
                # numeric characters cannot be parsed as a group number.
                group = parse(Int, repl[next_i])
                i = nextind(repl, next_i)
                # Greedily consume any further digits of the group number.
                while i <= e && isdigit(repl[i])
                    group = 10group + parse(Int, repl[i])
                    i = nextind(repl, i)
                end
                _write_capture(io, re, group)
            elseif repl[next_i] == GROUP_CHAR
                i = nextind(repl, next_i)
                if i > e || repl[i] != LBRACKET
                    replace_err(repl)
                end
                i = nextind(repl, i)
                i > e && replace_err(repl)
                groupstart = i
                # Scan to the closing bracket; running off the end is an error.
                while repl[i] != RBRACKET
                    i = nextind(repl, i)
                    i > e && replace_err(repl)
                end
                # `\g<>` names no group at all.
                i == groupstart && replace_err(repl)
                # TODO: avoid this allocation
                groupname = SubString(repl, groupstart, prevind(repl, i))
                if all(isdigit, groupname)
                    _write_capture(io, re, parse(Int, groupname))
                else
                    group = PCRE.substring_number_from_name(re.regex, groupname)
                    group < 0 && replace_err("Group $groupname not found in regex $re")
                    _write_capture(io, re, group)
                end
                i = nextind(repl, i)
            else
                replace_err(repl)
            end
        else
            write(io, repl[i])
            i = nextind(repl, i)
        end
    end
end

immutable RegexMatchIterator
regex::Regex
string::UTF8String
Expand Down
8 changes: 5 additions & 3 deletions base/strings/util.jl
Expand Up @@ -173,8 +173,9 @@ function _rsplit{T<:AbstractString,U<:Array}(str::T, splitter, limit::Integer, k
end
#rsplit(str::AbstractString) = rsplit(str, _default_delims, 0, false)

_replacement(repl, str, j, k) = repl
_replacement(repl::Function, str, j, k) = repl(SubString(str, j, k))
# Default replacement: write `repl` to `io` as-is.
function _replace(io, repl, str, r, pattern)
    return write(io, repl)
end

# Function replacement: call `repl` on the part of `str` spanning
# `first(r)` to `last(r)` and write whatever it returns.
function _replace(io, repl::Function, str, r, pattern)
    matched = SubString(str, first(r), last(r))
    return write(io, repl(matched))
end

function replace(str::ByteString, pattern, repl, limit::Integer)
n = 1
Expand All @@ -183,10 +184,11 @@ function replace(str::ByteString, pattern, repl, limit::Integer)
r = search(str,pattern,i)
j, k = first(r), last(r)
out = IOBuffer()
ensureroom(out, floor(Int, 1.2sizeof(str)))
while j != 0
if i == a || i <= k
write_sub(out, str.data, i, j-i)
write(out, _replacement(repl, str, j, k))
_replace(out, repl, str, r, pattern)
end
if k<j
i = j
Expand Down
14 changes: 14 additions & 0 deletions doc/manual/strings.rst
Expand Up @@ -707,6 +707,20 @@ with the number or name of the capture group::
julia> m[2]
"45"

Captures can be referenced in a substitution string when using :func:`replace`
by using ``\n`` to refer to the `n`th capture group and prefixing the
substitution string with ``s``. Capture group 0 refers to the entire match object.
Named capture groups can be referenced in the substitution with ``\g<groupname>``.
For example::

julia> replace("first second", r"(\w+) (?P<agroup>\w+)", s"\g<agroup> \1")
"second first"

Numbered capture groups can also be referenced as ``\g<n>`` for disambiguation,
as in::

julia> replace("a", r".", s"\g<0>1")
"a1"

You can modify the behavior of regular expressions by some combination
of the flags ``i``, ``m``, ``s``, and ``x`` after the closing double
quote mark. These flags have the same meaning as they do in Perl, as
Expand Down
2 changes: 1 addition & 1 deletion doc/stdlib/strings.rst
Expand Up @@ -177,7 +177,7 @@

.. function:: replace(string, pat, r[, n])

Search for the given pattern ``pat``, and replace each occurrence with ``r``. If ``n`` is provided, replace at most ``n`` occurrences. As with search, the second argument may be a single character, a vector or a set of characters, a string, or a regular expression. If ``r`` is a function, each occurrence is replaced with ``r(s)`` where ``s`` is the matched substring.
Search for the given pattern ``pat``, and replace each occurrence with ``r``. If ``n`` is provided, replace at most ``n`` occurrences. As with search, the second argument may be a single character, a vector or a set of characters, a string, or a regular expression. If ``r`` is a function, each occurrence is replaced with ``r(s)`` where ``s`` is the matched substring. If ``pat`` is a regular expression and ``r`` is a ``SubstitutionString``, then capture group references in ``r`` are replaced with the corresponding matched text.

.. function:: split(string, [chars]; limit=0, keep=true)

Expand Down
11 changes: 8 additions & 3 deletions test/regex.jl
Expand Up @@ -39,6 +39,11 @@ show(buf, r"")
@test_throws ArgumentError search(utf32("this is a test"), r"test")

# Named subpatterns
m = match(r"(?<a>.)(.)(?<b>.)", "xyz")
@test (m[:a], m[2], m["b"]) == ("x", "y", "z")
@test sprint(show, m) == "RegexMatch(\"xyz\", a=\"x\", 2=\"y\", b=\"z\")"
# `let` keeps `m` local to this block instead of leaving it defined at top level.
let m = match(r"(?<a>.)(.)(?<b>.)", "xyz")
# Captures are reachable by Symbol name, by index, and by String name.
@test (m[:a], m[2], m["b"]) == ("x", "y", "z")
@test sprint(show, m) == "RegexMatch(\"xyz\", a=\"x\", 2=\"y\", b=\"z\")"
end

# Backcapture reference in substitution string
# A named reference (`\g<byname>`) and a numbered one (`\1`) are both expanded.
@test replace("abcde", r"(..)(?P<byname>d)", s"\g<byname>xy\1") == "adxybce"
# Referring to a group name the regex does not define (`y`) raises an error.
@test_throws ErrorException replace("a", r"(?P<x>)", s"\g<y>")

0 comments on commit 098039a

Please sign in to comment.