Skip to content

Commit

Permalink
Update REPL.fuzzyscore to use string distance
Browse files Browse the repository at this point in the history
The old heuristics were not particularly helpful. These new heuristics should
be easier to reason about, since the score is between 0 and 1, and also yield
much more intuitive results.

Co-authored-by: TEC <git@tecosaur.net>
Co-authored-by: matthieugomez <gomez.matthieu@gmail.com>
  • Loading branch information
3 people committed Aug 11, 2023
1 parent cda570e commit 80c46a1
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 13 deletions.
65 changes: 52 additions & 13 deletions stdlib/REPL/src/docview.jl
Original file line number Diff line number Diff line change
Expand Up @@ -624,22 +624,61 @@ bestmatch(needle, haystack) =
longer(matchinds(needle, haystack, acronym = true),
matchinds(needle, haystack))

# Average gap per entry of `xs`: the span from the first to the last entry that is not
# covered by the `length(xs)` entries themselves, divided by `length(xs)`.
# Presumably `xs` holds ascending match indices -- confirm against callers.
# The empty case returns `0.0` (not `0`) so the result is a float on every path.
avgdistance(xs) =
    isempty(xs) ? 0.0 :
    (xs[end] - xs[1] - length(xs) + 1) / length(xs)
# Optimal string alignment distance: counts the minimum number of insertions,
# deletions, substitutions or transpositions of adjacent characters needed to go
# from one string to the other.
#
# `lena` and `lenb` are the character counts of `a` and `b` (i.e. `length(a)` and
# `length(b)`), supplied by the caller so they are not recomputed here.
function string_distance(a::AbstractString, lena::Integer, b::AbstractString, lenb::Integer)
    # Make `a` the shorter string; the work vectors below are sized from `lenb`.
    if lena > lenb
        a, b = b, a
        lena, lenb = lenb, lena
    end
    # Count the leading characters the two strings share. The comparison has to be
    # per character pair: comparing the whole strings here would only ever skip the
    # prefix when `a == b`.
    start = 0
    for (ca, cb) in zip(a, b)
        ca == cb || break
        start += 1
    end
    # `a` is entirely a prefix of `b`: what is left of `b` must be inserted.
    start == lena && return lenb - start
    # `vzero[k]` holds the previous row of the edit-distance table (initially the cost
    # of building the first `k` post-prefix characters of `b` from nothing).
    vzero = collect(1:(lenb - start))
    # `vone[k]` remembers the diagonal value seen one row earlier, which is the base
    # cost of a transposition. Zero-filled so that it is never read uninitialised.
    vone = zero(vzero)
    prev_a, prev_b = first(a), first(b)
    current = 0
    for (i, ai) in enumerate(a)
        # Skip the shared prefix, but keep track of the last character seen.
        i > start || (prev_a = ai; continue)
        left = i - start - 1
        current = i - start
        transition_next = 0
        for (j, bj) in enumerate(b)
            j > start || (prev_b = bj; continue)
            above = current
            this_transition = transition_next
            transition_next = vone[j - start]
            # Start from the diagonal value: the cost when `ai == bj`.
            vone[j - start] = current = left
            left = vzero[j - start]
            if ai != bj
                # Minimum between substitution, deletion and insertion
                current = min(current + 1, above + 1, left + 1)
                # Adjacent characters swapped: a transposition may be cheaper. Only
                # considered once both positions are past the first post-prefix char.
                if i > start + 1 && j > start + 1 && ai == prev_b && prev_a == bj
                    current = min(current, (this_transition += 1))
                end
            end
            vzero[j - start] = current
            prev_b = bj
        end
        prev_a = ai
    end
    return current
end

function fuzzyscore(needle, haystack)
score = 0.
is, acro = bestmatch(needle, haystack)
score += (acro ? 2 : 1)*length(is) # Matched characters
score -= 2(length(needle)-length(is)) # Missing characters
!acro && (score -= avgdistance(is)/10) # Contiguous
!isempty(is) && (score -= sum(is)/length(is)/100) # Closer to beginning
return score
# Similarity of `needle` and `haystack`: one minus their string distance divided by
# the longer of the two lengths, so identical strings score 1.
function fuzzyscore(needle::AbstractString, haystack::AbstractString)
    lena, lenb = length(needle), length(haystack)
    maxlen = max(lena, lenb)
    # Two empty strings are identical; without this guard the division below is 0/0
    # and the score would be NaN, which compares false against any threshold.
    maxlen == 0 && return 1.0
    return 1 - (string_distance(needle, lena, haystack, lenb) / maxlen)
end

# Return `candidates` ordered from highest to lowest `fuzzyscore` against `search`.
function fuzzysort(search::String, candidates::Vector{String})
    # One score per candidate. The earlier (score, -levenshtein) tuple pass was
    # overwritten immediately by this assignment, so it is no longer computed.
    scores = map(cand -> fuzzyscore(search, cand), candidates)
    # Kept as ascending sort followed by `reverse`: `sortperm(...; rev=true)` may
    # order equally-scored candidates differently.
    return reverse(candidates[sortperm(scores)])
end

Expand Down Expand Up @@ -690,7 +729,7 @@ function printmatches(io::IO, word, matches; cols::Int = _displaysize(io)[2])
total = 0
for match in matches
total + length(match) + 1 > cols && break
fuzzyscore(word, match) < 0 && break
fuzzyscore(word, match) < 0.5 && break
print(io, " ")
printmatch(io, word, match)
total += length(match) + 1
Expand Down
9 changes: 9 additions & 0 deletions stdlib/REPL/test/docview.jl
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,15 @@ end
# https://github.com/JunoLab/FuzzyCompletions.jl/issues/7
# shouldn't throw when there is a space in a middle of query
@test (REPL.matchinds("a ", "a file.txt"); true)
@test isapprox(REPL.fuzzyscore("abcdef", ""), 0.0; atol=0.001)
@test 0.8 < REPL.fuzzyscore(
"supercalifragilisticexpialidocious",
"bupercalifragilisticexpialidocious"
) < 1.0

# Unicode
@test 1.0 > REPL.fuzzyscore("αkδψm", "αkδm") > 0.0
@test 1.0 > REPL.fuzzyscore("αkδψm", "α") > 0.0
end

@testset "Unicode doc lookup (#41589)" begin
Expand Down

0 comments on commit 80c46a1

Please sign in to comment.