Skip to content

Commit

Permalink
[agents] TextNormalizer: remove repeated sequences of space + new line
Browse files Browse the repository at this point in the history
  • Loading branch information
eolivelli committed Sep 5, 2023
1 parent 2ac92f1 commit 483f358
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ static String trimSpaces(String stream) {
return stream.replaceAll("\t+", " ") // convert tabs to spaces
.replaceAll(" +", " ") // remove multiple spaces
.replaceAll("\n\n\n", "\n\n") // remove repeated newlines (3 or more)
.replaceAll("( \n)+", " \n") // remove repeated sequences of space + newlines
.trim();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,16 @@ class TextNormaliserAgentTest {

@Test
void testTrimSpaces() {
String text = " some text with \t \tspaces \n\n\n this is a new line";
String text = " some text with \t \tspaces \n\n\n this is a new line. \n \n \n \n then two new lines. \n\n end";
String result = TextNormaliserAgent.trimSpaces(text);
assertEquals(
"""
some text with spaces\s
this is a new line""",
some text with spaces\s
this is a new line.\s
then two new lines.\s
end""",
result);
}
}

0 comments on commit 483f358

Please sign in to comment.