Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Extract PDF References #10437

Merged
merged 14 commits on Mar 12, 2024
19 changes: 19 additions & 0 deletions src/main/java/org/jabref/logic/importer/util/GrobidService.java
Expand Up @@ -84,6 +84,25 @@ public List<BibEntry> processPDF(Path filePath, ImportFormatPreferences importFo

String httpResponse = response.body();

return getBibEntries(importFormatPreferences, httpResponse);
}

public List<BibEntry> processReferences(Path filePath, ImportFormatPreferences importFormatPreferences) throws IOException, ParseException {
Connection.Response response = Jsoup.connect(grobidPreferences.getGrobidURL() + "/api/processReferences")
tobiasdiez marked this conversation as resolved.
Show resolved Hide resolved
.header("Accept", MediaTypes.APPLICATION_BIBTEX)
.data("input", filePath.toString(), Files.newInputStream(filePath))
.data("consolidateCitations", String.valueOf(ConsolidateCitations.WITH_METADATA))
.method(Connection.Method.POST)
.ignoreContentType(true)
.timeout(20000)
.execute();

String httpResponse = response.body();

return getBibEntries(importFormatPreferences, httpResponse);
}

private static List<BibEntry> getBibEntries(ImportFormatPreferences importFormatPreferences, String httpResponse) throws IOException, ParseException {
if (httpResponse == null || "@misc{-1,\n author = {}\n}\n".equals(httpResponse)) { // This filters empty BibTeX entries
throw new IOException("The GROBID server response does not contain anything.");
}
Expand Down
Expand Up @@ -4,6 +4,7 @@
import java.net.URISyntaxException;
import java.nio.file.Path;
import java.util.List;
import java.util.Objects;
import java.util.Optional;

import org.jabref.logic.importer.ImportFormatPreferences;
Expand Down Expand Up @@ -96,4 +97,22 @@ public void processPdfTest() throws IOException, ParseException, URISyntaxExcept
assertEquals(Optional.of("Paper Title"), be0.getField(StandardField.TITLE));
assertEquals(Optional.of("2014-10-05"), be0.getField(StandardField.DATE));
}

@Test
public void extractsReferencesFromPdf() throws IOException, ParseException, URISyntaxException {
    // Arrange: locate the sample PDF, resolved relative to PdfGrobidImporterTest.
    Path samplePdf = Path.of(
            Objects.requireNonNull(PdfGrobidImporterTest.class.getResource("LNCS-minimal.pdf")).toURI());

    // The one reference the service is expected to return for the sample PDF.
    BibEntry expectedReference = new BibEntry(StandardEntryType.Article)
            .withField(StandardField.AUTHOR, "Kopp, O")
            .withField(StandardField.ADDRESS, "Berlin; Heidelberg")
            .withField(StandardField.DATE, "2013")
            .withField(StandardField.JOURNAL, "LNCS")
            .withField(StandardField.PAGES, "700--704")
            .withField(StandardField.PUBLISHER, "Springer")
            .withField(StandardField.TITLE, "Winery -A Modeling Tool for TOSCA-based Cloud Applications")
            .withField(StandardField.VOLUME, "8274")
            .withField(StandardField.YEAR, "2013");
    List<BibEntry> expectedReferences = List.of(expectedReference);

    // Act
    List<BibEntry> actualReferences = grobidService.processReferences(samplePdf, importFormatPreferences);

    // Assert
    assertEquals(expectedReferences, actualReferences);
}
}