Skip to content

Commit

Permalink
Fix IEEE test (#7852)
Browse files Browse the repository at this point in the history
  • Loading branch information
koppor committed Jun 29, 2021
1 parent bcd41ce commit c168fbd
Show file tree
Hide file tree
Showing 15 changed files with 192 additions and 61 deletions.
33 changes: 33 additions & 0 deletions docs/adr/0022-remove-stop-words-during-query-transformation.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Remove stop words during query transformation

## Context and Problem Statement

When querying for the title of a paper, the title might contain stop words such as "a", "for", "and". Some data providers return 0 results when querying for a stop word. When transforming a query to the Lucene syntax, the default Boolean operator `and` is used. When using IEEE, this often leads to zero search results.

## Decision Drivers

* Consistent with the Google search engine
* Allow reproducible searches
* Avoid WTFs on the user's side

## Considered Options

* Remove stop words from the query
* Automatically enclose in quotes if no Boolean operator is contained

## Decision Outcome

Chosen option: "Remove stop words from the query", because it comes out best (see below).

## Pros and Cons of the Options

### Remove stop words from the query

* Good, because it yields good search results if no Boolean operators are used
* Bad, because stop words that are used alone in complex queries are silently removed

### Automatically enclose in quotes if no Boolean operator is contained

* Good, because it yields good search results if no Boolean operators are used
* Bad, because it silently leads to different results
* Bad, because it is inconsistent with Google's behavior
28 changes: 24 additions & 4 deletions src/main/java/org/jabref/logic/importer/fetcher/IEEE.java
Original file line number Diff line number Diff line change
Expand Up @@ -58,14 +58,16 @@ public class IEEE implements FulltextFetcher, PagedSearchBasedParserFetcher {

private final ImportFormatPreferences preferences;

private IEEEQueryTransformer transformer;

/**
 * Creates the fetcher and stores the given import format preferences.
 *
 * @param preferences the import format preferences to use
 * @throws NullPointerException if {@code preferences} is {@code null}
 */
public IEEE(ImportFormatPreferences preferences) {
    Objects.requireNonNull(preferences);
    this.preferences = preferences;
}

/**
* @implNote <a href="https://developer.ieee.org/docs/read/Metadata_API_responses">documentation</a>
*/
private static BibEntry parseJsonRespone(JSONObject jsonEntry, Character keywordSeparator) {
private static BibEntry parseJsonResponse(JSONObject jsonEntry, Character keywordSeparator) {
BibEntry entry = new BibEntry();

switch (jsonEntry.optString("content_type")) {
Expand Down Expand Up @@ -205,8 +207,24 @@ public Parser getParser() {
JSONArray results = jsonObject.getJSONArray("articles");
for (int i = 0; i < results.length(); i++) {
JSONObject jsonEntry = results.getJSONObject(i);
BibEntry entry = parseJsonRespone(jsonEntry, preferences.getKeywordSeparator());
entries.add(entry);
BibEntry entry = parseJsonResponse(jsonEntry, preferences.getKeywordSeparator());
boolean addEntry;
// In case entry has no year, add it
// In case an entry has a year, check if its in the year range
// The implementation uses some Java 8 Optional magic to implement that
if (entry.hasField(StandardField.YEAR)) {
addEntry = entry.getField(StandardField.YEAR).filter(year -> {
Integer yearAsInteger = Integer.valueOf(year);
return
transformer.getStartYear().map(startYear -> yearAsInteger >= startYear).orElse(true) &&
transformer.getEndYear().map(endYear -> yearAsInteger <= endYear).orElse(true);
}).map(x -> true).orElse(false);
} else {
addEntry = true;
}
if (addEntry) {
entries.add(entry);
}
}
}

Expand All @@ -226,7 +244,9 @@ public Optional<HelpFile> getHelpPage() {

@Override
public URL getURLForQuery(QueryNode luceneQuery, int pageNumber) throws URISyntaxException, MalformedURLException, FetcherException {
IEEEQueryTransformer transformer = new IEEEQueryTransformer();
// transformer is stored globally, because we need to filter out the bib entries by the year manually
// the transformer stores the min and max year
transformer = new IEEEQueryTransformer();
String transformedQuery = transformer.transformLuceneQuery(luceneQuery).orElse("");
URIBuilder uriBuilder = new URIBuilder("https://ieeexploreapi.ieee.org/api/v1/search/articles");
uriBuilder.addParameter("apikey", API_KEY);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import java.util.StringJoiner;
import java.util.stream.Collectors;

import org.jabref.model.strings.StringUtil;

import org.apache.lucene.queryparser.flexible.core.nodes.BooleanQueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.FieldQueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.GroupQueryNode;
Expand Down Expand Up @@ -96,7 +98,7 @@ private Optional<String> transform(FieldQueryNode query) {
return s.isEmpty() ? Optional.empty() : Optional.of(s);
}
case NO_EXPLICIT_FIELD -> {
return Optional.of(handleUnFieldedTerm(term));
return handleUnFieldedTerm(term);
}
default -> {
// Just add unknown fields as default
Expand Down Expand Up @@ -184,29 +186,16 @@ protected String handleYearRange(String yearRange) {
*
* Default implementation: just return the term (in quotes if a space is contained)
*/
protected String handleUnFieldedTerm(String term) {
return quoteStringIfSpaceIsContained(term);
}

/**
* Encloses the given string with " if there is a space contained
*
* @return Returns a string
*/
protected String quoteStringIfSpaceIsContained(String string) {
if (string.contains(" ")) {
return "\"" + string + "\"";
} else {
return string;
}
protected Optional<String> handleUnFieldedTerm(String term) {
    // The default never drops a term: it is always present, quoted only if it contains a space
    String quotedTerm = StringUtil.quoteStringIfSpaceIsContained(term);
    return Optional.of(quotedTerm);
}

/**
 * Convenience overload of {@code createKeyValuePair(String, String, String)} that uses
 * {@code ":"} as the separator between field and term.
 *
 * @param fieldAsString the field name
 * @param term          the term belonging to the field
 * @return the result of the three-argument overload called with {@code ":"} as separator
 */
protected String createKeyValuePair(String fieldAsString, String term) {
    final String defaultSeparator = ":";
    return createKeyValuePair(fieldAsString, term, defaultSeparator);
}

protected String createKeyValuePair(String fieldAsString, String term, String separator) {
return String.format("%s%s%s", fieldAsString, separator, quoteStringIfSpaceIsContained(term));
return String.format("%s%s%s", fieldAsString, separator, StringUtil.quoteStringIfSpaceIsContained(term));
}

/**
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package org.jabref.logic.importer.fetcher.transformers;

import java.util.Optional;

public class ArXivQueryTransformer extends YearRangeByFilteringQueryTransformer {
@Override
protected String getLogicalAndOperator() {
Expand Down Expand Up @@ -42,8 +44,8 @@ protected String handleYear(String year) {
}

@Override
protected String handleUnFieldedTerm(String term) {
return createKeyValuePair("all", term);
protected Optional<String> handleUnFieldedTerm(String term) {
return Optional.of(createKeyValuePair("all", term));
}

}
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package org.jabref.logic.importer.fetcher.transformers;

import org.jabref.model.strings.StringUtil;

public class CollectionOfComputerScienceBibliographiesQueryTransformer extends AbstractQueryTransformer {

@Override
Expand Down Expand Up @@ -29,7 +31,7 @@ protected String handleTitle(String title) {

@Override
protected String handleJournal(String journalTitle) {
return quoteStringIfSpaceIsContained(journalTitle);
return StringUtil.quoteStringIfSpaceIsContained(journalTitle);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package org.jabref.logic.importer.fetcher.transformers;

import org.jabref.model.strings.StringUtil;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand Down Expand Up @@ -28,18 +30,18 @@ protected String getLogicalNotOperator() {
@Override
protected String handleAuthor(String author) {
// DBLP does not support explicit author field search
return quoteStringIfSpaceIsContained(author);
return StringUtil.quoteStringIfSpaceIsContained(author);
}

@Override
protected String handleTitle(String title) {
// DBLP does not support explicit title field search
return quoteStringIfSpaceIsContained(title);
return StringUtil.quoteStringIfSpaceIsContained(title);
}

@Override
protected String handleJournal(String journalTitle) {
// DBLP does not support explicit journal field search
return quoteStringIfSpaceIsContained(journalTitle);
return StringUtil.quoteStringIfSpaceIsContained(journalTitle);
}
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package org.jabref.logic.importer.fetcher.transformers;

import org.jabref.model.strings.StringUtil;

/**
* Default query transformer without any boolean operators
*/
Expand All @@ -22,16 +24,16 @@ protected String getLogicalNotOperator() {

@Override
protected String handleAuthor(String author) {
return quoteStringIfSpaceIsContained(author);
return StringUtil.quoteStringIfSpaceIsContained(author);
}

@Override
protected String handleTitle(String title) {
return quoteStringIfSpaceIsContained(title);
return StringUtil.quoteStringIfSpaceIsContained(title);
}

@Override
protected String handleJournal(String journalTitle) {
return quoteStringIfSpaceIsContained(journalTitle);
return StringUtil.quoteStringIfSpaceIsContained(journalTitle);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,10 @@ protected String handleYear(String year) {
}

@Override
protected String handleUnFieldedTerm(String term) {
protected Optional<String> handleUnFieldedTerm(String term) {
// all does not search in full-text
// Other option is txt: but this does not search in meta data
return createKeyValuePair("pica.all", term, "=");
return Optional.of(createKeyValuePair("pica.all", term, "="));
}

@Override
Expand Down
Original file line number Diff line number Diff line change
@@ -1,18 +1,25 @@
package org.jabref.logic.importer.fetcher.transformers;

import java.util.List;
import java.util.Objects;
import java.util.Optional;

import org.jabref.model.strings.StringUtil;

/**
* Needs to be instantiated for each new query
*/
public class IEEEQueryTransformer extends YearRangeByFilteringQueryTransformer {
/**
 * Words ignored by the engine. They need to be removed from the query when querying for them.
 * See ADR-0022 for the reasoning behind removing them.
 */
private static final List<String> STOP_WORDS = List.of("a", "and", "for", "or", "with");

// These have to be integrated into the IEEE query URL as these are just supported as query parameters
// Journal is wrapped in quotes by the transformer
private String journal;
private String articleNumber;
private int startYear = Integer.MAX_VALUE;
private int endYear = Integer.MIN_VALUE;

@Override
protected String getLogicalAndOperator() {
Expand Down Expand Up @@ -40,8 +47,9 @@ protected String handleTitle(String title) {
}

@Override
protected String handleJournal(String journalTitle) {
return handleUnFieldedTerm(journalTitle);
protected String handleJournal(String journal) {
this.journal = journal;
return StringUtil.quoteStringIfSpaceIsContained(journal);
}

@Override
Expand All @@ -59,6 +67,14 @@ protected Optional<String> handleOtherField(String fieldAsString, String term) {
};
}

/**
 * Handles a term that was given without an explicit field.
 * <p>
 * Terms listed in {@code STOP_WORDS} are dropped from the query (see ADR-0022);
 * every other term is delegated to the superclass implementation.
 *
 * @param term the unfielded term of the query
 * @return {@link Optional#empty()} if the term is a stop word, otherwise the result of the superclass
 */
@Override
protected Optional<String> handleUnFieldedTerm(String term) {
    // Match case-insensitively so that capitalized stop words
    // (e.g., the leading "A" or "For" of a title) are removed as well
    boolean isStopWord = STOP_WORDS.stream().anyMatch(term::equalsIgnoreCase);
    if (isStopWord) {
        return Optional.empty();
    }
    return super.handleUnFieldedTerm(term);
}

private Optional<String> handleArticleNumber(String term) {
articleNumber = term;
return Optional.empty();
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package org.jabref.logic.importer.fetcher.transformers;

import java.util.Optional;

public class ZbMathQueryTransformer extends AbstractQueryTransformer {

@Override
Expand Down Expand Up @@ -43,7 +45,7 @@ protected String handleYearRange(String yearRange) {
}

@Override
protected String handleUnFieldedTerm(String term) {
return createKeyValuePair("any", term);
protected Optional<String> handleUnFieldedTerm(String term) {
return Optional.of(createKeyValuePair("any", term));
}
}
13 changes: 13 additions & 0 deletions src/main/java/org/jabref/model/strings/StringUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -738,4 +738,17 @@ public static String substringBetween(String str, String open, String close) {
public static String ignoreCurlyBracket(String title) {
    // Titles failing the isNotBlank check are handed back untouched
    if (!isNotBlank(title)) {
        return title;
    }
    // Strip every opening and closing curly bracket
    String withoutOpening = title.replace("{", "");
    return withoutOpening.replace("}", "");
}

/**
 * Wraps the given string in double quotes if it contains at least one space character;
 * otherwise the string is returned unchanged.
 *
 * @param string the text to inspect; passing {@code null} results in a {@link NullPointerException}
 * @return the text enclosed in {@code "} if a space is contained, else the text itself
 */
public static String quoteStringIfSpaceIsContained(String string) {
    boolean hasSpace = string.indexOf(' ') >= 0;
    return hasSpace ? '"' + string + '"' : string;
}
}

0 comments on commit c168fbd

Please sign in to comment.