diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/query/QuerySearchTermsAccumulator.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/query/QuerySearchTermsAccumulator.java index 6fcb62c20..da309362f 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/query/QuerySearchTermsAccumulator.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/query/QuerySearchTermsAccumulator.java @@ -1,6 +1,7 @@ package nu.marginalia.search.query; import nu.marginalia.index.client.model.query.SearchSubquery; +import nu.marginalia.language.WordPatterns; import nu.marginalia.query_parser.token.Token; import nu.marginalia.query_parser.token.TokenVisitor; import nu.marginalia.search.model.SearchProfile; @@ -46,7 +47,15 @@ public void onLiteralTerm(Token token) { @Override public void onQuotTerm(Token token) { String[] parts = token.str.split("_"); - if (parts.length > 1) { + + // HACK (2023-05-02 vlofgren) + // + // Checking for stop words here is a bit of a stop-gap to fix the issue of stop words being + // required in the query (which is a problem because they are not indexed). How to do this + // in a clean way is a bit of an open problem that may not get resolved until query-parsing is + // improved. 
+ + if (parts.length > 1 && !anyPartIsStopWord(parts)) { // Prefer that the actual n-gram is present searchTermsAdvice.add(token.str); @@ -63,6 +72,15 @@ public void onQuotTerm(Token token) { } } + private boolean anyPartIsStopWord(String[] parts) { + for (String part : parts) { + if (WordPatterns.isStopWord(part)) { + return true; + } + } + return false; + } + @Override public void onExcludeTerm(Token token) { searchTermsExclude.add(token.str); diff --git a/code/services-core/search-service/src/test/java/nu/marginalia/search/query/QueryFactoryTest.java b/code/services-core/search-service/src/test/java/nu/marginalia/search/query/QueryFactoryTest.java index 843771548..d2d453375 100644 --- a/code/services-core/search-service/src/test/java/nu/marginalia/search/query/QueryFactoryTest.java +++ b/code/services-core/search-service/src/test/java/nu/marginalia/search/query/QueryFactoryTest.java @@ -95,6 +95,25 @@ public void testParseSizeGt() { assertEquals(2000, size.value()); } + @Test + public void testQuotedStopwords() { + { + // "the" is a stop word, so the quoted phrase should be searched only as the n-gram term "the_shining" + var specs = parseAndGetSpecs("\"the shining\""); + assertEquals(List.of("the_shining"), specs.subqueries.iterator().next().searchTermsInclude); + assertEquals(List.of(), specs.subqueries.iterator().next().searchTermsAdvice); + assertEquals(List.of(), specs.subqueries.iterator().next().searchTermCoherences); + } + + { + // "tde" isn't a stop word, so we should get the normal behavior: both words required, with the n-gram as advice + var specs = parseAndGetSpecs("\"tde shining\""); + assertEquals(List.of("tde", "shining"), specs.subqueries.iterator().next().searchTermsInclude); + assertEquals(List.of("tde_shining"), specs.subqueries.iterator().next().searchTermsAdvice); + assertEquals(List.of(List.of("tde", "shining")), specs.subqueries.iterator().next().searchTermCoherences); + } + } + @Test public void testParseQualityEq() {