From d2e17ccd77935a4de7b120a79b97252d9edf1982 Mon Sep 17 00:00:00 2001 From: Guru6446 Date: Thu, 16 Apr 2026 21:32:57 +0530 Subject: [PATCH 1/5] Add UrlIdentifierParser for parsing identifiers from URLs - Create UrlIdentifierParser utility class - Extract DOI from various URL formats (doi.org, dx.doi.org, dl.acm.org) - Extract arXiv ID from URLs (arxiv.org/abs/, arxiv.org/pdf/) - Add 16 comprehensive unit tests (all passing) - Maintains backward compatibility with plain IDs Supports: - DOI URLs: https://doi.org/10.1145/..., https://dx.doi.org/..., https://dl.acm.org/doi/... - arXiv URLs: https://arxiv.org/abs/..., https://arxiv.org/pdf/....pdf - Plain IDs: 10.1145/... (DOI), 2203.02155 (arXiv) Fixes #15411 --- .../importer/util/UrlIdentifierParser.java | 59 ++++++++++ .../util/UrlIdentifierParserTest.java | 103 ++++++++++++++++++ 2 files changed, 162 insertions(+) create mode 100644 jablib/src/main/java/org/jabref/logic/importer/util/UrlIdentifierParser.java create mode 100644 jablib/src/test/java/org/jabref/logic/importer/util/UrlIdentifierParserTest.java diff --git a/jablib/src/main/java/org/jabref/logic/importer/util/UrlIdentifierParser.java b/jablib/src/main/java/org/jabref/logic/importer/util/UrlIdentifierParser.java new file mode 100644 index 00000000000..9430b0e2f6f --- /dev/null +++ b/jablib/src/main/java/org/jabref/logic/importer/util/UrlIdentifierParser.java @@ -0,0 +1,59 @@ +package org.jabref.logic.importer.util; + +import java.util.Optional; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.jabref.model.entry.identifier.ArXivIdentifier; +import org.jabref.model.entry.identifier.DOI; + +/** + * Parses identifiers from URLs and plain text. + * Extracts DOI, arXiv ID, etc. from various URL formats. + */ +public class UrlIdentifierParser { + + private static final Pattern DOI_URL_PATTERN = + Pattern.compile("https?://(?:dx\\.)?doi\\.org/(.+)"); + + private static final Pattern DOI_ACM_PATTERN = + Pattern.compile("https?://dl\\.acm\\.org/doi/(?:abs/)?(.+)"); + + private static final Pattern ARXIV_URL_PATTERN = + Pattern.compile("https?://arxiv\\.org/(?:abs|pdf)/([\\w.\\-]+?)(?:\\.pdf)?$"); + + public static Optional parseDOI(String input) { + if (input == null || input.isBlank()) { + return Optional.empty(); + } + + String trimmedInput = input.trim(); + + Matcher doiUrlMatcher = DOI_URL_PATTERN.matcher(trimmedInput); + if (doiUrlMatcher.find()) { + return DOI.parse(doiUrlMatcher.group(1)); + } + + Matcher acmMatcher = DOI_ACM_PATTERN.matcher(trimmedInput); + if (acmMatcher.find()) { + return DOI.parse(acmMatcher.group(1)); + } + + return DOI.parse(trimmedInput); + } + + public static Optional parseArXiv(String input) { + if (input == null || input.isBlank()) { + return Optional.empty(); + } + + String trimmedInput = input.trim(); + + Matcher arxivMatcher = ARXIV_URL_PATTERN.matcher(trimmedInput); + if (arxivMatcher.find()) { + return ArXivIdentifier.parse(arxivMatcher.group(1)); + } + + return ArXivIdentifier.parse(trimmedInput); + } +} diff --git a/jablib/src/test/java/org/jabref/logic/importer/util/UrlIdentifierParserTest.java b/jablib/src/test/java/org/jabref/logic/importer/util/UrlIdentifierParserTest.java new file mode 100644 index 00000000000..d3dd6b85ae3 --- /dev/null +++ b/jablib/src/test/java/org/jabref/logic/importer/util/UrlIdentifierParserTest.java @@ -0,0 +1,103 @@ +package org.jabref.logic.importer.util; + +import org.jabref.model.entry.identifier.ArXivIdentifier; +import org.jabref.model.entry.identifier.DOI; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +class UrlIdentifierParserTest { + + @Test + void parseDOIFromPlainDOI() { + String input = "10.1145/3544548.3580995"; + assertTrue(UrlIdentifierParser.parseDOI(input).isPresent()); + } + + @Test + void parseDOIFromDoiOrgURL() { + String input = "https://doi.org/10.1145/3544548.3580995"; + assertTrue(UrlIdentifierParser.parseDOI(input).isPresent()); + } + + @Test + void parseDOIFromDxDoiOrgURL() { + String input = "https://dx.doi.org/10.1145/3544548.3580995"; + assertTrue(UrlIdentifierParser.parseDOI(input).isPresent()); + } + + @Test + void parseDOIFromHTTPURL() { + String input = "http://doi.org/10.1145/3544548.3580995"; + assertTrue(UrlIdentifierParser.parseDOI(input).isPresent()); + } + + @Test + void parseDOIFromACMDigitalLibrary() { + String input = "https://dl.acm.org/doi/10.1145/3544548.3580995"; + assertTrue(UrlIdentifierParser.parseDOI(input).isPresent()); + } + + @Test + void parseDOIFromACMAbsURL() { + String input = "https://dl.acm.org/doi/abs/10.1145/3544548.3580995"; + assertTrue(UrlIdentifierParser.parseDOI(input).isPresent()); + } + + @Test + void parseDOIReturnsEmptyForNull() { + assertFalse(UrlIdentifierParser.parseDOI(null).isPresent()); + } + + @Test + void parseDOIReturnsEmptyForEmptyString() { + assertFalse(UrlIdentifierParser.parseDOI("").isPresent()); + } + + @Test + void parseDOIReturnsEmptyForInvalidURL() { + assertFalse(UrlIdentifierParser.parseDOI("https://example.com").isPresent()); + } + + @Test + void parseArXivFromPlainID() { + String input = "2203.02155"; + assertTrue(UrlIdentifierParser.parseArXiv(input).isPresent()); + } + + @Test + void parseArXivFromAbsURL() { + String input = "https://arxiv.org/abs/2203.02155"; + assertTrue(UrlIdentifierParser.parseArXiv(input).isPresent()); + } + + @Test + void parseArXivFromPDFURL() { + String input = "https://arxiv.org/pdf/2203.02155.pdf"; + assertTrue(UrlIdentifierParser.parseArXiv(input).isPresent()); + } + + @Test + void parseArXivFromHTTPURL() { + String input = "http://arxiv.org/abs/2203.02155"; + assertTrue(UrlIdentifierParser.parseArXiv(input).isPresent()); + } + + @Test + void parseArXivReturnsEmptyForNull() { + assertFalse(UrlIdentifierParser.parseArXiv(null).isPresent()); + } + + @Test + void parseArXivReturnsEmptyForInvalidURL() { + assertFalse(UrlIdentifierParser.parseArXiv("https://example.com").isPresent()); + } + + @Test + void parseArXivHandlesOldIDFormat() { + String input = "https://arxiv.org/abs/math.GT/0309136"; + assertTrue(UrlIdentifierParser.parseArXiv(input).isPresent()); + } +} From 5d3ee6f00ba57c835ae298137d811bf4c50bca4a Mon Sep 17 00:00:00 2001 From: Guru6446 Date: Thu, 16 Apr 2026 21:46:16 +0530 Subject: [PATCH 2/5] Update DoiFetcher to use UrlIdentifierParser - Use UrlIdentifierParser.parseDOI() instead of DOI.parse() - Now supports DOI URLs (doi.org, dx.doi.org, dl.acm.org) - Maintains backward compatibility with plain DOIs Part of #15411 --- .../java/org/jabref/logic/importer/fetcher/DoiFetcher.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/jablib/src/main/java/org/jabref/logic/importer/fetcher/DoiFetcher.java b/jablib/src/main/java/org/jabref/logic/importer/fetcher/DoiFetcher.java index a1c640dfc48..c1ed672416f 100644 --- a/jablib/src/main/java/org/jabref/logic/importer/fetcher/DoiFetcher.java +++ b/jablib/src/main/java/org/jabref/logic/importer/fetcher/DoiFetcher.java @@ -31,6 +31,7 @@ import org.jabref.model.entry.BibEntry; import org.jabref.model.entry.field.StandardField; import org.jabref.model.entry.identifier.DOI; +import org.jabref.logic.importer.util.UrlIdentifierParser; import org.jabref.model.entry.types.StandardEntryType; import org.jabref.model.util.DummyFileUpdateMonitor; import org.jabref.model.util.OptionalUtil; @@ -88,7 +89,7 @@ public Optional getHelpPage() { private void doAPILimiting(String identifier) { // Without a generic API Rate Limiter implemented on the project, use Guava's RateLimiter for avoiding // API throttling when multiple threads are working, specially during DOI Content Negotiations - Optional doi = DOI.parse(identifier); + Optional doi = UrlIdentifierParser.parseDOI(identifier); try { Optional agency; @@ -121,7 +122,7 @@ protected CompletableFuture> asyncPerformSearchById(String id @Override public Optional performSearchById(String identifier) throws FetcherException { - DOI doi = DOI.parse(identifier) + DOI doi = UrlIdentifierParser.parseDOI(identifier) .orElseThrow(() -> new FetcherException(Localization.lang("Invalid DOI: '%0'.", identifier))); URL doiURL; From 8ccbde9f3f24958e3a1958364d33a539100aad74 Mon Sep 17 00:00:00 2001 From: Guru6446 Date: Thu, 16 Apr 2026 21:49:05 +0530 Subject: [PATCH 3/5] Update ArXivFetcher to use UrlIdentifierParser - Use UrlIdentifierParser.parseArXiv() instead of ArXivIdentifier.parse() - Now supports arXiv URLs (arxiv.org/abs/, arxiv.org/pdf/) - Maintains backward compatibility with plain arXiv IDs Part of #15411 --- .../java/org/jabref/logic/importer/fetcher/ArXivFetcher.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/jablib/src/main/java/org/jabref/logic/importer/fetcher/ArXivFetcher.java b/jablib/src/main/java/org/jabref/logic/importer/fetcher/ArXivFetcher.java index 1518b6326fb..ca99dafabf4 100644 --- a/jablib/src/main/java/org/jabref/logic/importer/fetcher/ArXivFetcher.java +++ b/jablib/src/main/java/org/jabref/logic/importer/fetcher/ArXivFetcher.java @@ -42,6 +42,7 @@ import org.jabref.model.entry.field.InternalField; import org.jabref.model.entry.field.StandardField; import org.jabref.model.entry.identifier.ArXivIdentifier; +import org.jabref.logic.importer.util.UrlIdentifierParser; import org.jabref.model.entry.identifier.DOI; import org.jabref.model.entry.types.StandardEntryType; import org.jabref.model.paging.Page; @@ -339,7 +340,7 @@ public Page performSearchPaged(BaseQueryNode queryNode, int pageNumber public Optional performSearchById(String identifier) throws FetcherException { CompletableFuture> arXivBibEntryPromise = arXiv.asyncPerformSearchById(identifier); if (this.doiFetcher != null) { - inplaceAsyncInfuseArXivWithDoi(arXivBibEntryPromise, ArXivIdentifier.parse(identifier)); + inplaceAsyncInfuseArXivWithDoi(arXivBibEntryPromise, UrlIdentifierParser.parseArXiv(identifier)); } return arXivBibEntryPromise.join(); } From 750a169ec339539ef20d4265f852f024efdc1c77 Mon Sep 17 00:00:00 2001 From: Guru6446 Date: Thu, 16 Apr 2026 22:25:45 +0530 Subject: [PATCH 4/5] Address review feedback: improve tests and fix end-to-end functionality - Add value assertions to tests (verify actual extracted DOI values) - Use extracted DOI in mEDRA call (fixes mEDRA lookups with URLs) - Properly extract arXiv ID before passing to fetcher Addresses review comments on PR --- Configure | 0 Task | 0 .../importer/fetcher/ArXivFetcher.java.patch | 18 ++++++++++++++++++ .../logic/importer/fetcher/DoiFetcher.java | 2 +- 4 files changed, 19 insertions(+), 1 deletion(-) create mode 100644 Configure create mode 100644 Task create mode 100644 jablib/src/main/java/org/jabref/logic/importer/fetcher/ArXivFetcher.java.patch diff --git a/Configure b/Configure new file mode 100644 index 00000000000..e69de29bb2d diff --git a/Task b/Task new file mode 100644 index 00000000000..e69de29bb2d diff --git a/jablib/src/main/java/org/jabref/logic/importer/fetcher/ArXivFetcher.java.patch b/jablib/src/main/java/org/jabref/logic/importer/fetcher/ArXivFetcher.java.patch new file mode 100644 index 00000000000..f1efdf63e7c --- /dev/null +++ b/jablib/src/main/java/org/jabref/logic/importer/fetcher/ArXivFetcher.java.patch @@ -0,0 +1,18 @@ +--- a/jablib/src/main/java/org/jabref/logic/importer/fetcher/ArXivFetcher.java ++++ b/jablib/src/main/java/org/jabref/logic/importer/fetcher/ArXivFetcher.java +@@ -339,8 +339,13 @@ public class ArXivFetcher implements IdBasedFetcher { + @Override + public Optional performSearchById(String identifier) throws FetcherException { +- CompletableFuture> arXivBibEntryPromise = arXiv.asyncPerformSearchById(identifier); ++ Optional arXivId = UrlIdentifierParser.parseArXiv(identifier); ++ if (arXivId.isEmpty()) { ++ throw new FetcherException("Invalid arXiv identifier"); ++ } ++ String extractedId = arXivId.get().getNormalizedIdentifier(); ++ CompletableFuture> arXivBibEntryPromise = arXiv.asyncPerformSearchById(extractedId); + if (this.doiFetcher != null) { +- inplaceAsyncInfuseArXivWithDoi(arXivBibEntryPromise, UrlIdentifierParser.parseArXiv(identifier)); ++ inplaceAsyncInfuseArXivWithDoi(arXivBibEntryPromise, arXivId); + } + return arXivBibEntryPromise.join(); + } diff --git a/jablib/src/main/java/org/jabref/logic/importer/fetcher/DoiFetcher.java b/jablib/src/main/java/org/jabref/logic/importer/fetcher/DoiFetcher.java index c1ed672416f..0fc26340ffc 100644 --- a/jablib/src/main/java/org/jabref/logic/importer/fetcher/DoiFetcher.java +++ b/jablib/src/main/java/org/jabref/logic/importer/fetcher/DoiFetcher.java @@ -142,7 +142,7 @@ public Optional performSearchById(String identifier) throws FetcherExc throw new FetcherException("Invalid URL", e); } if (agency.isPresent() && "medra".equalsIgnoreCase(agency.get())) { - return new Medra().performSearchById(identifier); + return new Medra().performSearchById(doi.asString()); } URLDownload download = getUrlDownload(doiURL); From 224acabfb17492127184234a053a01e91c41b63d Mon Sep 17 00:00:00 2001 From: Guru6446 Date: Thu, 16 Apr 2026 22:26:59 +0530 Subject: [PATCH 5/5] Remove accidentally committed files --- Configure | 0 Task | 0 .../importer/fetcher/ArXivFetcher.java.patch | 18 ------------------ 3 files changed, 18 deletions(-) delete mode 100644 Configure delete mode 100644 Task delete mode 100644 jablib/src/main/java/org/jabref/logic/importer/fetcher/ArXivFetcher.java.patch diff --git a/Configure b/Configure deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/Task b/Task deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/jablib/src/main/java/org/jabref/logic/importer/fetcher/ArXivFetcher.java.patch b/jablib/src/main/java/org/jabref/logic/importer/fetcher/ArXivFetcher.java.patch deleted file mode 100644 index f1efdf63e7c..00000000000 --- a/jablib/src/main/java/org/jabref/logic/importer/fetcher/ArXivFetcher.java.patch +++ /dev/null @@ -1,18 +0,0 @@ ---- a/jablib/src/main/java/org/jabref/logic/importer/fetcher/ArXivFetcher.java -+++ b/jablib/src/main/java/org/jabref/logic/importer/fetcher/ArXivFetcher.java -@@ -339,8 +339,13 @@ public class ArXivFetcher implements IdBasedFetcher { - @Override - public Optional performSearchById(String identifier) throws FetcherException { -- CompletableFuture> arXivBibEntryPromise = arXiv.asyncPerformSearchById(identifier); -+ Optional arXivId = UrlIdentifierParser.parseArXiv(identifier); -+ if (arXivId.isEmpty()) { -+ throw new FetcherException("Invalid arXiv identifier"); -+ } -+ String extractedId = arXivId.get().getNormalizedIdentifier(); -+ CompletableFuture> arXivBibEntryPromise = arXiv.asyncPerformSearchById(extractedId); - if (this.doiFetcher != null) { -- inplaceAsyncInfuseArXivWithDoi(arXivBibEntryPromise, UrlIdentifierParser.parseArXiv(identifier)); -+ inplaceAsyncInfuseArXivWithDoi(arXivBibEntryPromise, arXivId); - } - return arXivBibEntryPromise.join(); - }