Skip to content

Commit

Permalink
Stopgap fix for a bug in dealing with quote terms containing stop words.
Browse files Browse the repository at this point in the history
  • Loading branch information
vlofgren committed May 2, 2023
1 parent a9f7b4c commit 6fae51a
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 1 deletion.
@@ -1,6 +1,7 @@
package nu.marginalia.search.query;

import nu.marginalia.index.client.model.query.SearchSubquery;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.query_parser.token.Token;
import nu.marginalia.query_parser.token.TokenVisitor;
import nu.marginalia.search.model.SearchProfile;
Expand Down Expand Up @@ -46,7 +47,15 @@ public void onLiteralTerm(Token token) {
@Override
public void onQuotTerm(Token token) {
String[] parts = token.str.split("_");
if (parts.length > 1) {

// HACK (2023-05-02 vlofgren)
//
// Checking for stop words here is a bit of a stop-gap to fix the issue of stop words being
// required in the query (which is a problem because they are not indexed). How to do this
// in a clean way is a bit of an open problem that may not get resolved until query-parsing is
// improved.

if (parts.length > 1 && !anyPartIsStopWord(parts)) {
// Prefer that the actual n-gram is present
searchTermsAdvice.add(token.str);

Expand All @@ -63,6 +72,15 @@ public void onQuotTerm(Token token) {
}
}

/**
 * Tells whether at least one of the given parts is classified as a stop word
 * by {@code WordPatterns.isStopWord}.
 *
 * @param parts the individual words to inspect
 * @return {@code true} as soon as one part is reported as a stop word,
 *         {@code false} when none is (including for an empty array)
 */
private boolean anyPartIsStopWord(String[] parts) {
    int idx = 0;
    while (idx < parts.length) {
        // Bail out on the first hit; the remaining parts need not be checked.
        if (WordPatterns.isStopWord(parts[idx])) {
            return true;
        }
        idx++;
    }
    return false;
}

@Override
public void onExcludeTerm(Token token) {
searchTermsExclude.add(token.str);
Expand Down
Expand Up @@ -95,6 +95,25 @@ public void testParseSizeGt() {
assertEquals(2000, size.value());
}

/**
 * Verifies how quoted phrases are turned into search terms depending on
 * whether the phrase contains a stop word.
 */
@Test
public void testQuotedStopwords() {
    // "the" is a stop word: the phrase is expected only as a single n-gram
    // include term, with no advice terms and no coherence constraints.
    var withStopword = parseAndGetSpecs("\"the shining\"").subqueries.iterator().next();
    assertEquals(List.of("the_shining"), withStopword.searchTermsInclude);
    assertEquals(List.of(), withStopword.searchTermsAdvice);
    assertEquals(List.of(), withStopword.searchTermCoherences);

    // "tde" is not a stop word: the usual behavior applies, i.e. each word is
    // included, the n-gram is advisory, and the words form one coherence group.
    var withoutStopword = parseAndGetSpecs("\"tde shining\"").subqueries.iterator().next();
    assertEquals(List.of("tde", "shining"), withoutStopword.searchTermsInclude);
    assertEquals(List.of("tde_shining"), withoutStopword.searchTermsAdvice);
    assertEquals(List.of(List.of("tde", "shining")), withoutStopword.searchTermCoherences);
}


@Test
public void testParseQualityEq() {
Expand Down

0 comments on commit 6fae51a

Please sign in to comment.