From 84bde3fbb0d978e4d7122b725c9d7503f2dab200 Mon Sep 17 00:00:00 2001 From: TEC Date: Sat, 27 Apr 2024 18:23:32 +0800 Subject: [PATCH] Remove strong ordering of annotation ranges After a long chat with Lilith Halfner, we've come to the conclusion that the range-based ordering applied to annotations, while nice for making some otherwise O(n) code O(log n) and O(n^2) code O(n), is actually assuming too much about how annotations are used and interact with each other. Removing all assumptions about ordering, and giving annotation order primacy seems like the most sensible thing to do, even if it makes a few bits of the code less algorithmically "nice". As a consequence, we also get rid of annotatedstring_optimize!. Specific producers/consumers of annotated text will know what assumptions can be made to compress/optimise the annotations used, and are thus best suited to do so themselves. The one exception to this is probably when writing to an AnnotatedIOBuffer, here adding a specific optimisation to _insert_annotations! probably makes sense, and will be explored soon. --- base/strings/annotated.jl | 82 +++++++++------------------------------ test/strings/annotated.jl | 29 ++++++-------- 2 files changed, 30 insertions(+), 81 deletions(-) diff --git a/base/strings/annotated.jl b/base/strings/annotated.jl index bf510cab82052..5baf6f41d2d9c 100644 --- a/base/strings/annotated.jl +++ b/base/strings/annotated.jl @@ -255,36 +255,6 @@ annotatedstring(c::AnnotatedChar) = AnnotatedString(s::SubString{<:AnnotatedString}) = annotatedstring(s) -""" - annotatedstring_optimize!(str::AnnotatedString) - -Merge contiguous identical annotations in `str`. -""" -function annotatedstring_optimize!(s::AnnotatedString) - last_seen = Dict{Pair{Symbol, Any}, Int}() - i = 1 - while i <= length(s.annotations) - region, keyval = s.annotations[i] - prev = get(last_seen, keyval, 0) - if prev > 0 - lregion, _ = s.annotations[prev] - if last(lregion) + 1 == first(region) - s.annotations[prev] = - setindex(s.annotations[prev], - first(lregion):last(region), - 1) - deleteat!(s.annotations, i) - else - delete!(last_seen, keyval) - end - else - last_seen[keyval] = i - i += 1 - end - end - s -end - function repeat(str::AnnotatedString, r::Integer) r == 0 && return one(AnnotatedString) r == 1 && return str @@ -292,19 +262,19 @@ function repeat(str::AnnotatedString, r::Integer) annotations = Vector{Tuple{UnitRange{Int}, Pair{Symbol, Any}}}() len = ncodeunits(str) fullregion = firstindex(str):lastindex(str) - for (region, annot) in str.annotations - if region == fullregion - push!(annotations, (firstindex(unannot):lastindex(unannot), annot)) + if allequal(first, str.annotations) && first(first(str.annotations)) == fullregion + newfullregion = firstindex(unannot):lastindex(unannot) + for (_, annot) in str.annotations + push!(annotations, (newfullregion, annot)) end - end - for offset in 0:len:(r-1)*len - for (region, annot) in str.annotations - if region != fullregion + else + for offset in 0:len:(r-1)*len + for (region, annot) in str.annotations push!(annotations, (region .+ offset, annot)) end end end - AnnotatedString(unannot, annotations) |> annotatedstring_optimize! + AnnotatedString(unannot, annotations) end repeat(str::SubString{<:AnnotatedString}, r::Integer) = @@ -335,14 +305,9 @@ reverse(s::SubString{<:AnnotatedString}) = reverse(AnnotatedString(s)) function _annotate!(annlist::Vector{Tuple{UnitRange{Int}, Pair{Symbol, Any}}}, range::UnitRange{Int}, @nospecialize(labelval::Pair{Symbol, <:Any})) label, val = labelval if val === nothing - indices = searchsorted(annlist, (range,), by=first) - labelindex = filter(i -> first(annlist[i][2]) === label, indices) - for index in Iterators.reverse(labelindex) - deleteat!(annlist, index) - end + deleteat!(annlist, findall(ann -> ann[1] == range && first(ann[2]) === label, annlist)) else - sortedindex = searchsortedlast(annlist, (range,), by=first) + 1 - insert!(annlist, sortedindex, (range, Pair{Symbol, Any}(label, val))) + push!(annlist, (range, Pair{Symbol, Any}(label, val))) end end @@ -521,7 +486,7 @@ end function _clear_annotations_in_region!(annotations::Vector{Tuple{UnitRange{Int}, Pair{Symbol, Any}}}, span::UnitRange{Int}) # Clear out any overlapping pre-existing annotations. filter!(((region, _),) -> first(region) < first(span) || last(region) > last(span), annotations) - extras = Tuple{UnitRange{Int}, Pair{Symbol, Any}}[] + extras = Tuple{Int, Tuple{UnitRange{Int}, Pair{Symbol, Any}}}[] for i in eachindex(annotations) region, annot = annotations[i] # Test for partial overlap @@ -532,30 +497,21 @@ function _clear_annotations_in_region!(annotations::Vector{Tuple{UnitRange{Int}, # If `span` fits exactly within `region`, then we've only copied over # the beginning overhang, but also need to conserve the end overhang. if first(region) < first(span) && last(span) < last(region) - push!(extras, (last(span)+1:last(region), annot)) + push!(extras, (i, (last(span)+1:last(region), annot))) end end - # Insert any extra entries in the appropriate position - for entry in extras - sortedindex = searchsortedlast(annotations, (first(entry),), by=first) + 1 - insert!(annotations, sortedindex, entry) - end + end + # Insert any extra entries in the appropriate position + for (offset, (i, entry)) in enumerate(extras) + insert!(annotations, i + offset, entry) end annotations end function _insert_annotations!(io::AnnotatedIOBuffer, annotations::Vector{Tuple{UnitRange{Int}, Pair{Symbol, Any}}}, offset::Int = position(io)) - if !eof(io) - for (region, annot) in annotations - region = first(region)+offset:last(region)+offset - sortedindex = searchsortedlast(io.annotations, (region,), by=first) + 1 - insert!(io.annotations, sortedindex, (region, annot)) - end - else - for (region, annot) in annotations - region = first(region)+offset:last(region)+offset - push!(io.annotations, (region, annot)) - end + for (region, annot) in annotations + region = first(region)+offset:last(region)+offset + push!(io.annotations, (region, annot)) end end diff --git a/test/strings/annotated.jl b/test/strings/annotated.jl index f348639e31feb..a1bf9f1b3ee24 100644 --- a/test/strings/annotated.jl +++ b/test/strings/annotated.jl @@ -24,10 +24,10 @@ @test Base.AnnotatedString(str[3:4]) == Base.AnnotatedString("me", [(1:2, :thing => 0x01), (1:2, :all => 0x03)]) @test Base.AnnotatedString(str[3:6]) == - Base.AnnotatedString("me s", [(1:2, :thing => 0x01), (1:4, :all => 0x03), (4:4, :other => 0x02)]) - @test str == Base.AnnotatedString("some string", [(1:4, :thing => 0x01), (1:11, :all => 0x03), (6:11, :other => 0x02)]) + Base.AnnotatedString("me s", [(1:2, :thing => 0x01), (4:4, :other => 0x02), (1:4, :all => 0x03)]) + @test str == Base.AnnotatedString("some string", [(1:4, :thing => 0x01), (6:11, :other => 0x02), (1:11, :all => 0x03)]) @test str != Base.AnnotatedString("some string") - @test str != Base.AnnotatedString("some string", [(1:1, :thing => 0x01), (6:6, :other => 0x02), (11:11, :all => 0x03)]) + @test str != Base.AnnotatedString("some string", [(1:1, :thing => 0x01), (1:11, :all => 0x03), (6:6, :other => 0x02)]) @test str != Base.AnnotatedString("some string", [(1:4, :thing => 0x11), (1:11, :all => 0x13), (6:11, :other => 0x12)]) @test str != Base.AnnotatedString("some thingg", [(1:4, :thing => 0x01), (1:11, :all => 0x03), (6:11, :other => 0x02)]) @test Base.AnnotatedString([Base.AnnotatedChar('a', [:a => 1]), Base.AnnotatedChar('b', [:b => 2])]) == @@ -51,15 +51,8 @@ # @test collect(Base.eachstyle(str)) == # [("some", [:thing => 0x01, :all => 0x03]), # (" string", [:all => 0x03, :other => 0x02])] - @test ==(Base.annotatedstring_optimize!( - Base.AnnotatedString("abc", [(1:1, :val => 1), - (2:2, :val => 2), - (2:2, :val => 1), - (3:3, :val => 2)])), - Base.AnnotatedString("abc", [(1:2, :val => 1), - (2:3, :val => 2)])) @test chopprefix(sprint(show, str), "Base.") == - "AnnotatedString{String}(\"some string\", [(1:4, :thing => 0x01), (1:11, :all => 0x03), (6:11, :other => 0x02)])" + "AnnotatedString{String}(\"some string\", [(1:4, :thing => 0x01), (6:11, :other => 0x02), (1:11, :all => 0x03)])" @test eval(Meta.parse(repr(str))) == str @test sprint(show, MIME("text/plain"), str) == "\"some string\"" end @@ -149,8 +142,8 @@ end # Check `annotate!`, including region sorting @test truncate(aio, 0).io.size == 0 @test write(aio, "hello world") == ncodeunits("hello world") - @test Base.annotate!(aio, 7:11, :tag => 2) === aio @test Base.annotate!(aio, 1:5, :tag => 1) === aio + @test Base.annotate!(aio, 7:11, :tag => 2) === aio @test Base.annotations(aio) == [(1:5, :tag => 1), (7:11, :tag => 2)] # Reading @test read(seekstart(deepcopy(aio.io)), String) == "hello world" @@ -178,24 +171,24 @@ end @test Base.annotations(aio) == [(1:5, :tag => 1), (7:11, :tag => 2)] # Should be unchanged @test write(seek(aio, 0), Base.AnnotatedString("hey-o", [(1:5, :hey => 'o')])) == 5 @test read(seekstart(aio), String) == "hey-o alice" - @test Base.annotations(aio) == [(1:5, :hey => 'o'), (7:11, :tag => 2)] # First annotation should have been entirely replaced + @test Base.annotations(aio) == [(7:11, :tag => 2), (1:5, :hey => 'o')] # First annotation should have been entirely replaced @test write(seek(aio, 7), Base.AnnotatedString("bbi", [(1:3, :hey => 'a')])) == 3 # a[lic => bbi]e ('alice' => 'abbie') @test read(seekstart(aio), String) == "hey-o abbie" - @test Base.annotations(aio) == [(1:5, :hey => 'o'), (7:7, :tag => 2), (8:10, :hey => 'a'), (11:11, :tag => 2)] + @test Base.annotations(aio) == [(7:7, :tag => 2), (11:11, :tag => 2), (1:5, :hey => 'o'), (8:10, :hey => 'a')] @test write(seek(aio, 0), Base.AnnotatedString("ab")) == 2 # Check first annotation's region is adjusted correctly @test read(seekstart(aio), String) == "aby-o abbie" - @test Base.annotations(aio) == [(3:5, :hey => 'o'), (7:7, :tag => 2), (8:10, :hey => 'a'), (11:11, :tag => 2)] + @test Base.annotations(aio) == [(7:7, :tag => 2), (11:11, :tag => 2), (3:5, :hey => 'o'), (8:10, :hey => 'a')] @test write(seek(aio, 3), Base.AnnotatedString("ss")) == 2 @test read(seekstart(aio), String) == "abyss abbie" - @test Base.annotations(aio) == [(3:3, :hey => 'o'), (7:7, :tag => 2), (8:10, :hey => 'a'), (11:11, :tag => 2)] + @test Base.annotations(aio) == [(7:7, :tag => 2), (11:11, :tag => 2), (3:3, :hey => 'o'), (8:10, :hey => 'a')] # Writing one buffer to another newaio = Base.AnnotatedIOBuffer() @test write(newaio, seekstart(aio)) == 11 @test read(seekstart(newaio), String) == "abyss abbie" @test Base.annotations(newaio) == Base.annotations(aio) @test write(seek(newaio, 5), seek(aio, 5)) == 6 - @test Base.annotations(newaio) == Base.annotations(aio) + @test sort(Base.annotations(newaio)) == sort(Base.annotations(aio)) @test write(newaio, seek(aio, 5)) == 6 @test read(seekstart(newaio), String) == "abyss abbie abbie" - @test Base.annotations(newaio) == vcat(Base.annotations(aio), [(13:13, :tag => 2), (14:16, :hey => 'a'), (17:17, :tag => 2)]) + @test sort(Base.annotations(newaio)) == sort(vcat(Base.annotations(aio), [(13:13, :tag => 2), (14:16, :hey => 'a'), (17:17, :tag => 2)])) end