diff --git a/langstream-agents/langstream-agents-text-processing/src/main/java/ai/langstream/agents/text/TextNormaliserAgent.java b/langstream-agents/langstream-agents-text-processing/src/main/java/ai/langstream/agents/text/TextNormaliserAgent.java index e49b55f9c..7ad80fe25 100644 --- a/langstream-agents/langstream-agents-text-processing/src/main/java/ai/langstream/agents/text/TextNormaliserAgent.java +++ b/langstream-agents/langstream-agents-text-processing/src/main/java/ai/langstream/agents/text/TextNormaliserAgent.java @@ -59,7 +59,10 @@ static String trimSpaces(String stream) { return stream.replaceAll("\t+", " ") // convert tabs to spaces .replaceAll(" +", " ") // remove multiple spaces .replaceAll("\n\n\n", "\n\n") // remove repeated newlines (3 or more) - .replaceAll("( \n)+", " \n") // remove repeated sequences of space + newlines + .replaceAll( + "( \n\n)+", " \n\n") // remove repeated sequences of space + double newlines + .replaceAll("( \n)+", "\n") // remove repeated sequences of space + newlines + .replaceAll("\n ", "\n") // remove repeated sequences of newlines + space .trim(); } } diff --git a/langstream-agents/langstream-agents-text-processing/src/test/java/ai/langstream/agents/text/TextNormaliserAgentTest.java b/langstream-agents/langstream-agents-text-processing/src/test/java/ai/langstream/agents/text/TextNormaliserAgentTest.java index 2d7a8b167..cd714f383 100644 --- a/langstream-agents/langstream-agents-text-processing/src/test/java/ai/langstream/agents/text/TextNormaliserAgentTest.java +++ b/langstream-agents/langstream-agents-text-processing/src/test/java/ai/langstream/agents/text/TextNormaliserAgentTest.java @@ -24,16 +24,18 @@ class TextNormaliserAgentTest { @Test void testTrimSpaces() { String text = - " some text with \t \tspaces \n\n\n this is a new line. \n \n \n \n then two new lines. \n\n end"; + " some \n\n text with \t \tspaces \n\n\n this is a new line. \n \n \n \n then two new lines. \n\n \n\n \n\n \n\n end"; String result = TextNormaliserAgent.trimSpaces(text); assertEquals( """ - some text with spaces\s + some - this is a new line.\s - then two new lines.\s + text with spaces - end""", + this is a new line. + then two new lines. + + end""", result); } }