JabRef · Siedlerchr · Aug 18, 2021 · Jul 20, 2021 · Jul 20, 2021 · Jul 28, 2021
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -13,6 +13,7 @@ Note that this project **does not** adhere to [Semantic Versioning](http://semve
 
 - We added the option to copy the DOI of an entry directly from the context menu copy submenu. [#7826](https://github.com/JabRef/jabref/issues/7826)
 - We added a fulltext search feature. [#2838](https://github.com/JabRef/jabref/pull/2838)
+- We improved the deduction of bib-entries from imported fulltext PDFs. [#7947](https://github.com/JabRef/jabref/pull/7947)
 - We added unprotect_terms to the list of bracketed pattern modifiers [#7826](https://github.com/JabRef/jabref/pull/7960)
 - We added an icon picker in group edit dialog. [#6142](https://github.com/JabRef/jabref/issues/6142)
 

diff --git a/src/main/java/org/jabref/gui/entryeditor/EntryEditor.java b/src/main/java/org/jabref/gui/entryeditor/EntryEditor.java
@@ -355,7 +355,7 @@ private void setupToolBar() {
 
         // Add menu for fetching bibliographic information
         ContextMenu fetcherMenu = new ContextMenu();
-        for (EntryBasedFetcher fetcher : WebFetchers.getEntryBasedFetchers(preferencesService.getImportFormatPreferences())) {
+        for (EntryBasedFetcher fetcher : WebFetchers.getEntryBasedFetchers(preferencesService.getImportFormatPreferences(), preferencesService.getFilePreferences(), databaseContext, preferencesService.getDefaultEncoding())) {
             MenuItem fetcherMenuItem = new MenuItem(fetcher.getName());
             fetcherMenuItem.setOnAction(event -> fetchAndMerge(fetcher));
             fetcherMenu.getItems().add(fetcherMenuItem);

diff --git a/src/main/java/org/jabref/logic/externalfiles/ExternalFilesContentImporter.java b/src/main/java/org/jabref/logic/externalfiles/ExternalFilesContentImporter.java
@@ -7,7 +7,7 @@
 import org.jabref.logic.importer.ImportFormatPreferences;
 import org.jabref.logic.importer.OpenDatabase;
 import org.jabref.logic.importer.ParserResult;
-import org.jabref.logic.importer.fileformat.PdfContentImporter;
+import org.jabref.logic.importer.fileformat.PdfMergeMetadataImporter;
 import org.jabref.logic.importer.fileformat.PdfXmpImporter;
 import org.jabref.logic.preferences.TimestampPreferences;
 import org.jabref.model.util.FileUpdateMonitor;
@@ -23,7 +23,11 @@ public ExternalFilesContentImporter(ImportFormatPreferences importFormatPreferen
     }
 
     public ParserResult importPDFContent(Path file) {
-        return new PdfContentImporter(importFormatPreferences).importDatabase(file, StandardCharsets.UTF_8);
+        try {
+            return new PdfMergeMetadataImporter(importFormatPreferences).importDatabase(file, StandardCharsets.UTF_8);
+        } catch (IOException e) {
+           return ParserResult.fromError(e);
+        }
     }
 
     public ParserResult importXMPContent(Path file) {

diff --git a/src/main/java/org/jabref/logic/importer/ImportFormatReader.java b/src/main/java/org/jabref/logic/importer/ImportFormatReader.java
@@ -2,12 +2,14 @@
 
 import java.io.IOException;
 import java.nio.file.Path;
+import java.util.ArrayList;
 import java.util.List;
 import java.util.Objects;
 import java.util.Optional;
 import java.util.SortedSet;
 import java.util.TreeSet;
 
+import org.jabref.logic.importer.fetcher.GrobidCitationFetcher;
 import org.jabref.logic.importer.fileformat.BibTeXMLImporter;
 import org.jabref.logic.importer.fileformat.BiblioscapeImporter;
 import org.jabref.logic.importer.fileformat.BibtexImporter;
@@ -22,6 +24,10 @@
 import org.jabref.logic.importer.fileformat.MsBibImporter;
 import org.jabref.logic.importer.fileformat.OvidImporter;
 import org.jabref.logic.importer.fileformat.PdfContentImporter;
+import org.jabref.logic.importer.fileformat.PdfEmbeddedBibFileImporter;
+import org.jabref.logic.importer.fileformat.PdfGrobidImporter;
+import org.jabref.logic.importer.fileformat.PdfMergeMetadataImporter;
+import org.jabref.logic.importer.fileformat.PdfVerbatimBibTextImporter;
 import org.jabref.logic.importer.fileformat.PdfXmpImporter;
 import org.jabref.logic.importer.fileformat.RepecNepImporter;
 import org.jabref.logic.importer.fileformat.RisImporter;
@@ -42,7 +48,7 @@ public class ImportFormatReader {
      * All import formats.
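-     * Sorted accordingly to {@link Importer#compareTo}, which defaults to alphabetically by the name
+     * Kept in insertion order, which is the order importers are tried in for unknown formats; {@link #getImportFormats()} returns them sorted.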
      */
-    private final SortedSet<Importer> formats = new TreeSet<>();
+    private final List<Importer> formats = new ArrayList<>();
 
     private ImportFormatPreferences importFormatPreferences;
 
@@ -51,8 +57,6 @@ public void resetImportFormats(ImportFormatPreferences newImportFormatPreference
 
         formats.clear();
 
-        formats.add(new BiblioscapeImporter());
-        formats.add(new BibtexImporter(importFormatPreferences, fileMonitor));
         formats.add(new BibTeXMLImporter());
         formats.add(new CopacImporter());
         formats.add(new EndnoteImporter(importFormatPreferences));
@@ -64,11 +68,17 @@ public void resetImportFormats(ImportFormatPreferences newImportFormatPreference
         formats.add(new ModsImporter(importFormatPreferences));
         formats.add(new MsBibImporter());
         formats.add(new OvidImporter());
+        formats.add(new PdfMergeMetadataImporter(importFormatPreferences));
+        formats.add(new PdfVerbatimBibTextImporter(importFormatPreferences));
         formats.add(new PdfContentImporter(importFormatPreferences));
+        formats.add(new PdfEmbeddedBibFileImporter(importFormatPreferences));
+        formats.add(new PdfGrobidImporter(GrobidCitationFetcher.GROBID_URL, importFormatPreferences));
         formats.add(new PdfXmpImporter(xmpPreferences));
         formats.add(new RepecNepImporter(importFormatPreferences));
         formats.add(new RisImporter());
         formats.add(new SilverPlatterImporter());
+        formats.add(new BiblioscapeImporter());
+        formats.add(new BibtexImporter(importFormatPreferences, fileMonitor));
 
         // Get custom import formats
         formats.addAll(importFormatPreferences.getCustomImportList());
@@ -110,26 +120,26 @@ public ParserResult importFromFile(String format, Path file) throws ImportExcept
      * All importers.
      * <p>
      * <p>
-     * Elements are in default order.
+     * Elements are sorted by name.
      * </p>
      *
      * @return all custom importers, elements are of type InputFormat
      */
     public SortedSet<Importer> getImportFormats() {
-        return this.formats;
+        return new TreeSet<>(this.formats);
     }
 
     /**
      * Human readable list of all known import formats (name and CLI Id).
      * <p>
-     * <p>List is in default-order.</p>
+     * <p>List is sorted by importer name.</p>
      *
      * @return human readable list of all known import formats
      */
     public String getImportFormatList() {
         StringBuilder sb = new StringBuilder();
 
-        for (Importer imFo : formats) {
+        for (Importer imFo : getImportFormats()) {
             int pad = Math.max(0, 14 - imFo.getName().length());
             sb.append("  ");
             sb.append(imFo.getName());
@@ -166,20 +176,25 @@ public UnknownFormatImport(String format, ParserResult parserResult) {
     public UnknownFormatImport importUnknownFormat(Path filePath, TimestampPreferences timestampPreferences, FileUpdateMonitor fileMonitor) throws ImportException {
         Objects.requireNonNull(filePath);
 
-        // First, see if it is a BibTeX file:
         try {
-            ParserResult parserResult = OpenDatabase.loadDatabase(filePath, importFormatPreferences, timestampPreferences, fileMonitor);
-            if (parserResult.getDatabase().hasEntries() || !parserResult.getDatabase().hasNoStrings()) {
-                parserResult.setFile(filePath.toFile());
-                return new UnknownFormatImport(ImportFormatReader.BIBTEX_FORMAT, parserResult);
+            UnknownFormatImport unknownFormatImport = importUnknownFormat(importer -> importer.importDatabase(filePath, importFormatPreferences.getEncoding()), importer -> importer.isRecognizedFormat(filePath, importFormatPreferences.getEncoding()));
+            unknownFormatImport.parserResult.setFile(filePath.toFile());
+            return unknownFormatImport;
+        } catch (ImportException e) {
+            // If all importers fail, try to read the file as BibTeX
+            try {
+                ParserResult parserResult = OpenDatabase.loadDatabase(filePath, importFormatPreferences, timestampPreferences, fileMonitor);
+                if (parserResult.getDatabase().hasEntries() || !parserResult.getDatabase().hasNoStrings()) {
+                    parserResult.setFile(filePath.toFile());
+                    return new UnknownFormatImport(ImportFormatReader.BIBTEX_FORMAT, parserResult);
+                } else {
+                    throw new ImportException(Localization.lang("Could not find a suitable import format."));
+                }
+            } catch (IOException ignore) {
+                // Reading the file as BibTeX failed as well, so no importer could handle it
+                throw new ImportException(Localization.lang("Could not find a suitable import format."));
             }
-        } catch (IOException ignore) {
-            // Ignored
         }
-
-        UnknownFormatImport unknownFormatImport = importUnknownFormat(importer -> importer.importDatabase(filePath, importFormatPreferences.getEncoding()), importer -> importer.isRecognizedFormat(filePath, importFormatPreferences.getEncoding()));
-        unknownFormatImport.parserResult.setFile(filePath.toFile());
-        return unknownFormatImport;
     }
 
     /**
@@ -198,7 +213,7 @@ private UnknownFormatImport importUnknownFormat(CheckedFunction<Importer, Parser
         String bestFormatName = null;
 
         // Cycle through all importers:
-        for (Importer imFo : getImportFormats()) {
+        for (Importer imFo : formats) {
             try {
                 if (!isRecognizedFormat.apply(imFo)) {
                     continue;

diff --git a/src/main/java/org/jabref/logic/importer/WebFetchers.java b/src/main/java/org/jabref/logic/importer/WebFetchers.java
@@ -1,5 +1,6 @@
 package org.jabref.logic.importer;
 
+import java.nio.charset.Charset;
 import java.util.Comparator;
 import java.util.HashSet;
 import java.util.Optional;
@@ -37,10 +38,13 @@
 import org.jabref.logic.importer.fetcher.SpringerLink;
 import org.jabref.logic.importer.fetcher.TitleFetcher;
 import org.jabref.logic.importer.fetcher.ZbMATH;
+import org.jabref.logic.importer.fileformat.PdfMergeMetadataImporter;
+import org.jabref.model.database.BibDatabaseContext;
 import org.jabref.model.entry.field.Field;
 import org.jabref.model.entry.field.StandardField;
 import org.jabref.model.entry.identifier.DOI;
 import org.jabref.model.entry.identifier.Identifier;
+import org.jabref.preferences.FilePreferences;
 
 import static org.jabref.model.entry.field.StandardField.EPRINT;
 import static org.jabref.model.entry.field.StandardField.ISBN;
@@ -133,14 +137,15 @@ public static SortedSet<IdBasedFetcher> getIdBasedFetchers(ImportFormatPreferenc
     /**
      * @return sorted set containing entry based fetchers
      */
-    public static SortedSet<EntryBasedFetcher> getEntryBasedFetchers(ImportFormatPreferences importFormatPreferences) {
+    public static SortedSet<EntryBasedFetcher> getEntryBasedFetchers(ImportFormatPreferences importFormatPreferences, FilePreferences filePreferences, BibDatabaseContext databaseContext, Charset defaultEncoding) {
         SortedSet<EntryBasedFetcher> set = new TreeSet<>(Comparator.comparing(WebFetcher::getName));
         set.add(new AstrophysicsDataSystem(importFormatPreferences));
         set.add(new DoiFetcher(importFormatPreferences));
         set.add(new IsbnFetcher(importFormatPreferences));
         set.add(new MathSciNet(importFormatPreferences));
         set.add(new CrossRef());
         set.add(new ZbMATH(importFormatPreferences));
+        set.add(new PdfMergeMetadataImporter.EntryBasedFetcherWrapper(importFormatPreferences, filePreferences, databaseContext, defaultEncoding));
         return set;
     }
 

diff --git a/src/main/java/org/jabref/logic/importer/fetcher/GrobidCitationFetcher.java b/src/main/java/org/jabref/logic/importer/fetcher/GrobidCitationFetcher.java
@@ -23,9 +23,10 @@
 
 public class GrobidCitationFetcher implements SearchBasedFetcher {
 
+    public static final String GROBID_URL = "http://grobid.jabref.org:8070";
+
     private static final Logger LOGGER = LoggerFactory.getLogger(GrobidCitationFetcher.class);
 
-    private static final String GROBID_URL = "http://grobid.jabref.org:8070";
     private ImportFormatPreferences importFormatPreferences;
     private GrobidService grobidService;
 

diff --git a/src/main/java/org/jabref/logic/importer/fileformat/PdfEmbeddedBibFileImporter.java b/src/main/java/org/jabref/logic/importer/fileformat/PdfEmbeddedBibFileImporter.java
@@ -0,0 +1,222 @@
+package org.jabref.logic.importer.fileformat;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+
+import org.jabref.logic.importer.ImportFormatPreferences;
+import org.jabref.logic.importer.Importer;
+import org.jabref.logic.importer.ParseException;
+import org.jabref.logic.importer.ParserResult;
+import org.jabref.logic.l10n.Localization;
+import org.jabref.logic.util.StandardFileType;
+import org.jabref.logic.util.io.FileUtil;
+import org.jabref.logic.xmp.EncryptedPdfsNotSupportedException;
+import org.jabref.logic.xmp.XmpUtilReader;
+import org.jabref.model.entry.BibEntry;
+import org.jabref.model.util.DummyFileUpdateMonitor;
+
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
+import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.common.PDNameTreeNode;
+import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
+import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
+import org.apache.pdfbox.pdmodel.common.filespecification.PDFileSpecification;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationFileAttachment;
+
+/**
+ * Imports the BibTeX entries contained in bib files that are embedded in a PDF.
+ * <p>
+ * Both the embedded files name tree of the document and the file attachment annotations of its pages are searched.
+ */
+public class PdfEmbeddedBibFileImporter extends Importer {
+
+    private final ImportFormatPreferences importFormatPreferences;
+    private final BibtexParser bibtexParser;
+
+    public PdfEmbeddedBibFileImporter(ImportFormatPreferences importFormatPreferences) {
+        this.importFormatPreferences = importFormatPreferences;
+        bibtexParser = new BibtexParser(importFormatPreferences, new DummyFileUpdateMonitor());
+    }
+
+    /**
+     * Checks whether the input starts with the PDF header.
+     *
+     * @return true if the first line starts with "%PDF", false otherwise (also for empty input)
+     */
+    @Override
+    public boolean isRecognizedFormat(BufferedReader input) throws IOException {
+        // readLine() yields null for empty input
+        String firstLine = input.readLine();
+        return (firstLine != null) && firstLine.startsWith("%PDF");
+    }
+
+    /**
+     * Not supported by this importer; use {@link #importDatabase(Path, Charset)} instead.
+     *
+     * @throws UnsupportedOperationException for every non-null reader
+     */
+    @Override
+    public ParserResult importDatabase(BufferedReader reader) throws IOException {
+        Objects.requireNonNull(reader);
+        throw new UnsupportedOperationException("PdfEmbeddedBibFileImporter does not support importDatabase(BufferedReader reader). "
+                + "Instead use importDatabase(Path filePath, Charset defaultEncoding).");
+    }
+
+    /**
+     * Not supported by this importer; use {@link #importDatabase(Path, Charset)} instead.
+     *
+     * @throws UnsupportedOperationException for every non-null data string
+     */
+    @Override
+    public ParserResult importDatabase(String data) throws IOException {
+        Objects.requireNonNull(data);
+        throw new UnsupportedOperationException("PdfEmbeddedBibFileImporter does not support importDatabase(String data). "
+                + "Instead use importDatabase(Path filePath, Charset defaultEncoding).");
+    }
+
+    /**
+     * Loads the PDF at the given path and parses all bib files embedded in it.
+     *
+     * @param filePath        path of the PDF file
+     * @param defaultEncoding not used by this importer
+     * @return the parsed entries, or an error result if decryption of the PDF is not supported,
+     *         the PDF cannot be read, or an embedded file cannot be parsed
+     */
+    @Override
+    public ParserResult importDatabase(Path filePath, Charset defaultEncoding) {
+        try (PDDocument document = XmpUtilReader.loadWithAutomaticDecryption(filePath)) {
+            return new ParserResult(getEmbeddedBibFileEntries(document));
+        } catch (EncryptedPdfsNotSupportedException e) {
+            return ParserResult.fromErrorMessage(Localization.lang("Decryption not supported."));
+        } catch (IOException | ParseException e) {
+            return ParserResult.fromError(e);
+        }
+    }
+
+    /**
+     * Collects the entries of all bib files embedded in the document.
+     * <p>
+     * Adapted from the Apache PDFBox example
+     * https://svn.apache.org/repos/asf/pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java
+     */
+    private List<BibEntry> getEmbeddedBibFileEntries(PDDocument document) throws IOException, ParseException {
+        List<BibEntry> allParsedEntries = new ArrayList<>();
+        PDDocumentNameDictionary nameDictionary = document.getDocumentCatalog().getNames();
+        if (nameDictionary != null) {
+            PDEmbeddedFilesNameTreeNode efTree = nameDictionary.getEmbeddedFiles();
+            if (efTree != null) {
+                allParsedEntries.addAll(extractAndParseFilesFromTree(efTree));
+            }
+        }
+        // extract files from annotations
+        for (PDPage page : document.getPages()) {
+            for (PDAnnotation annotation : page.getAnnotations()) {
+                if (annotation instanceof PDAnnotationFileAttachment) {
+                    PDFileSpecification fileSpec = ((PDAnnotationFileAttachment) annotation).getFile();
+                    // skip missing or non-complex file specifications instead of failing on the cast
+                    if (fileSpec instanceof PDComplexFileSpecification) {
+                        allParsedEntries.addAll(extractAndParseFile(getEmbeddedFile((PDComplexFileSpecification) fileSpec)));
+                    }
+                }
+            }
+        }
+        return allParsedEntries;
+    }
+
+    /**
+     * Walks the given name tree node and parses every embedded bib file found in it or in its descendants.
+     */
+    private List<BibEntry> extractAndParseFilesFromTree(PDNameTreeNode<PDComplexFileSpecification> node) throws IOException, ParseException {
+        Map<String, PDComplexFileSpecification> names = node.getNames();
+        if (names != null) {
+            return extractAndParseFiles(names);
+        }
+        List<BibEntry> allParsedEntries = new ArrayList<>();
+        List<PDNameTreeNode<PDComplexFileSpecification>> kids = node.getKids();
+        if (kids != null) {
+            // this node has no names of its own, so descend into its children
+            for (PDNameTreeNode<PDComplexFileSpecification> kid : kids) {
+                allParsedEntries.addAll(extractAndParseFilesFromTree(kid));
+            }
+        }
+        return allParsedEntries;
+    }
+
+    /**
+     * Parses those of the given embedded files whose name identifies them as bib files.
+     */
+    private List<BibEntry> extractAndParseFiles(Map<String, PDComplexFileSpecification> names) throws IOException, ParseException {
+        List<BibEntry> allParsedEntries = new ArrayList<>();
+        for (Map.Entry<String, PDComplexFileSpecification> entry : names.entrySet()) {
+            if (FileUtil.isBibFile(Path.of(entry.getKey()))) {
+                allParsedEntries.addAll(extractAndParseFile(getEmbeddedFile(entry.getValue())));
+            }
+        }
+        return allParsedEntries;
+    }
+
+    /**
+     * Parses the content of a single embedded file as BibTeX.
+     *
+     * @param embeddedFile the embedded file; null if the file specification provides no embedded file
+     * @return the parsed entries; empty if there is no embedded file
+     */
+    private List<BibEntry> extractAndParseFile(PDEmbeddedFile embeddedFile) throws IOException, ParseException {
+        if (embeddedFile == null) {
+            return List.of();
+        }
+        // make sure the stream of the embedded file is closed once parsing is done
+        try (InputStream inputStream = embeddedFile.createInputStream()) {
+            return bibtexParser.parseEntries(inputStream);
+        }
+    }
+
+    /**
+     * Returns the first available alternative of the embedded file, or null if there is none.
+     */
+    private static PDEmbeddedFile getEmbeddedFile(PDComplexFileSpecification fileSpec) {
+        PDEmbeddedFile embeddedFile = null;
+        if (fileSpec != null) {
+            embeddedFile = fileSpec.getEmbeddedFileUnicode();
+            if (embeddedFile == null) {
+                embeddedFile = fileSpec.getEmbeddedFileDos();
+            }
+            if (embeddedFile == null) {
+                embeddedFile = fileSpec.getEmbeddedFileMac();
+            }
+            if (embeddedFile == null) {
+                embeddedFile = fileSpec.getEmbeddedFileUnix();
+            }
+            if (embeddedFile == null) {
+                embeddedFile = fileSpec.getEmbeddedFile();
+            }
+        }
+        return embeddedFile;
+    }
+
+    @Override
+    public String getName() {
+        return "PDFembeddedbibfile";
+    }
+
+    @Override
+    public StandardFileType getFileType() {
+        return StandardFileType.PDF;
+    }
+
+    @Override
+    public String getDescription() {
+        return "PdfEmbeddedBibFileImporter imports an embedded Bib-File from the PDF.";
+    }
+
+}