Skip to content

Commit

Permalink
TextNormalizer: improvements (#358)
Browse files Browse the repository at this point in the history
  • Loading branch information
eolivelli committed Sep 6, 2023
1 parent b372b45 commit 80cdb56
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,10 @@ static String trimSpaces(String stream) {
return stream.replaceAll("\t+", " ") // convert tabs to spaces
.replaceAll(" +", " ") // remove multiple spaces
.replaceAll("\n\n\n", "\n\n") // shrink each run of three consecutive newlines to two (single pass, so longer runs are shortened but not fully collapsed)
.replaceAll("( \n)+", " \n") // remove repeated sequences of space + newlines
.replaceAll(
"( \n\n)+", " \n\n") // remove repeated sequences of space + double newlines
.replaceAll("( \n)+", "\n") // collapse each run of repeated space + newline sequences into a single newline
.replaceAll("\n ", "\n") // drop the space that immediately follows a newline
.trim();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,18 @@ class TextNormaliserAgentTest {
@Test
void testTrimSpaces() {
String text =
" some text with \t \tspaces \n\n\n this is a new line. \n \n \n \n then two new lines. \n\n end";
" some \n\n text with \t \tspaces \n\n\n this is a new line. \n \n \n \n then two new lines. \n\n \n\n \n\n \n\n end";
String result = TextNormaliserAgent.trimSpaces(text);
assertEquals(
"""
some text with spaces\s
some
this is a new line.\s
then two new lines.\s
text with spaces
end""",
this is a new line.
then two new lines.
end""",
result);
}
}

0 comments on commit 80cdb56

Please sign in to comment.