Skip to content

Commit

Permalink
Fix multi-language detection exceptions for multi-script text
Browse files Browse the repository at this point in the history
There were multiple issues:
- `end` was previously not set correctly after a new section was started.
  This could erroneously lead to `end <= start` if there was a trailing
  single character in a different script.
- `lettersCount` was one too high after a new section was started: it was reset to 1 and then incremented again for the same letter.
  • Loading branch information
Marcono1234 committed Oct 31, 2023
1 parent 42a5231 commit adfb0f7
Showing 1 changed file with 18 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,13 @@ private open class PotentialSection(
// Cache text to reduce number of created substrings
private var cachedText: String? = null,
) {
init {
check(start < end)
check(lettersCount > 0)
// Should have at most as many letters as there are chars in section
check(lettersCount <= end - start)
}

fun getStart() = start
fun getEnd() = end
fun getLettersCount() = lettersCount
Expand Down Expand Up @@ -182,24 +189,26 @@ private fun splitPotentialSections(text: String): MutableList<PotentialSection>
if (char.isLetter()) {
val script = UnicodeScript.of(char.code)

if (start != -1 && (hasLogograms || lettersCount >= minSectionLength) &&
if (start == -1) {
// Start a new section
start = index
}
// Or check if current section should end
else if ((hasLogograms || lettersCount >= minSectionLength) &&
lastScript != null && !lastScript!!.belongsToSameLanguageAs(script)
) {
sections.add(PotentialSection(start, index, lettersCount, text))

// Current letter is start of new section
start = index
lettersCount = 1
// Set to 0 instead of 1 because it is directly incremented below
lettersCount = 0
hasLogograms = false
} else {
if (start == -1) {
start = index
}

// Mark current letter as potential last letter
end = index + 1
}

// Mark current letter as potential last letter
end = index + 1

lastScript = script
lettersCount++
hasLogograms = hasLogograms || char.isLogogram()
Expand Down

0 comments on commit adfb0f7

Please sign in to comment.