-
-
Notifications
You must be signed in to change notification settings - Fork 3.4k
JabRef/jabref #15565
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
JabRef/jabref #15565
Changes from all commits
d2e17cc
5d3ee6f
8ccbde9
750a169
224acab
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,59 @@ | ||
| package org.jabref.logic.importer.util; | ||
|
|
||
| import java.util.Optional; | ||
| import java.util.regex.Matcher; | ||
| import java.util.regex.Pattern; | ||
|
|
||
| import org.jabref.model.entry.identifier.ArXivIdentifier; | ||
| import org.jabref.model.entry.identifier.DOI; | ||
|
|
||
| /** | ||
| * Parses identifiers from URLs and plain text. | ||
| * Extracts DOI, arXiv ID, etc. from various URL formats. | ||
| */ | ||
| public class UrlIdentifierParser { | ||
|
|
||
| private static final Pattern DOI_URL_PATTERN = | ||
| Pattern.compile("https?://(?:dx\\.)?doi\\.org/(.+)"); | ||
|
|
||
| private static final Pattern DOI_ACM_PATTERN = | ||
| Pattern.compile("https?://dl\\.acm\\.org/doi/(?:abs/)?(.+)"); | ||
|
|
||
| private static final Pattern ARXIV_URL_PATTERN = | ||
| Pattern.compile("https?://arxiv\\.org/(?:abs|pdf)/([\\w.\\-]+?)(?:\\.pdf)?$"); | ||
|
|
||
| public static Optional<DOI> parseDOI(String input) { | ||
| if (input == null || input.isBlank()) { | ||
| return Optional.empty(); | ||
| } | ||
|
|
||
| String trimmedInput = input.trim(); | ||
|
|
||
| Matcher doiUrlMatcher = DOI_URL_PATTERN.matcher(trimmedInput); | ||
| if (doiUrlMatcher.find()) { | ||
| return DOI.parse(doiUrlMatcher.group(1)); | ||
| } | ||
|
|
||
| Matcher acmMatcher = DOI_ACM_PATTERN.matcher(trimmedInput); | ||
| if (acmMatcher.find()) { | ||
| return DOI.parse(acmMatcher.group(1)); | ||
| } | ||
|
Comment on lines
+19
to
+40
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 4. Acm doi regex rejects pdf UrlIdentifierParser.parseDOI short-circuits on DOI_ACM_PATTERN and captures everything after /doi/, so URLs like https://dl.acm.org/doi/pdf/10.... are turned into pdf/10.... and then rejected by DOI.parse. This is a regression because DOI.parse is already able to extract a DOI embedded later in an arbitrary https URL. Agent Prompt
|
||
|
|
||
| return DOI.parse(trimmedInput); | ||
| } | ||
|
|
||
| public static Optional<ArXivIdentifier> parseArXiv(String input) { | ||
| if (input == null || input.isBlank()) { | ||
| return Optional.empty(); | ||
| } | ||
|
|
||
| String trimmedInput = input.trim(); | ||
|
|
||
| Matcher arxivMatcher = ARXIV_URL_PATTERN.matcher(trimmedInput); | ||
| if (arxivMatcher.find()) { | ||
| return ArXivIdentifier.parse(arxivMatcher.group(1)); | ||
| } | ||
|
|
||
| return ArXivIdentifier.parse(trimmedInput); | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,103 @@ | ||
| package org.jabref.logic.importer.util; | ||
|
|
||
import java.util.Optional;

import org.jabref.model.entry.identifier.ArXivIdentifier;
import org.jabref.model.entry.identifier.DOI;

import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
|
|
||
| class UrlIdentifierParserTest { | ||
|
|
||
| @Test | ||
| void parseDOIFromPlainDOI() { | ||
| String input = "10.1145/3544548.3580995"; | ||
| assertTrue(UrlIdentifierParser.parseDOI(input).isPresent()); | ||
| } | ||
|
|
||
| @Test | ||
| void parseDOIFromDoiOrgURL() { | ||
| String input = "https://doi.org/10.1145/3544548.3580995"; | ||
| assertTrue(UrlIdentifierParser.parseDOI(input).isPresent()); | ||
| } | ||
|
|
||
| @Test | ||
| void parseDOIFromDxDoiOrgURL() { | ||
| String input = "https://dx.doi.org/10.1145/3544548.3580995"; | ||
| assertTrue(UrlIdentifierParser.parseDOI(input).isPresent()); | ||
| } | ||
|
|
||
| @Test | ||
| void parseDOIFromHTTPURL() { | ||
| String input = "http://doi.org/10.1145/3544548.3580995"; | ||
| assertTrue(UrlIdentifierParser.parseDOI(input).isPresent()); | ||
| } | ||
|
|
||
| @Test | ||
| void parseDOIFromACMDigitalLibrary() { | ||
| String input = "https://dl.acm.org/doi/10.1145/3544548.3580995"; | ||
| assertTrue(UrlIdentifierParser.parseDOI(input).isPresent()); | ||
| } | ||
|
|
||
| @Test | ||
| void parseDOIFromACMAbsURL() { | ||
| String input = "https://dl.acm.org/doi/abs/10.1145/3544548.3580995"; | ||
| assertTrue(UrlIdentifierParser.parseDOI(input).isPresent()); | ||
| } | ||
|
|
||
| @Test | ||
| void parseDOIReturnsEmptyForNull() { | ||
| assertFalse(UrlIdentifierParser.parseDOI(null).isPresent()); | ||
| } | ||
|
|
||
| @Test | ||
| void parseDOIReturnsEmptyForEmptyString() { | ||
| assertFalse(UrlIdentifierParser.parseDOI("").isPresent()); | ||
| } | ||
|
|
||
| @Test | ||
| void parseDOIReturnsEmptyForInvalidURL() { | ||
| assertFalse(UrlIdentifierParser.parseDOI("https://example.com").isPresent()); | ||
| } | ||
|
|
||
| @Test | ||
| void parseArXivFromPlainID() { | ||
| String input = "2203.02155"; | ||
| assertTrue(UrlIdentifierParser.parseArXiv(input).isPresent()); | ||
| } | ||
|
|
||
| @Test | ||
| void parseArXivFromAbsURL() { | ||
| String input = "https://arxiv.org/abs/2203.02155"; | ||
| assertTrue(UrlIdentifierParser.parseArXiv(input).isPresent()); | ||
| } | ||
|
|
||
| @Test | ||
| void parseArXivFromPDFURL() { | ||
| String input = "https://arxiv.org/pdf/2203.02155.pdf"; | ||
| assertTrue(UrlIdentifierParser.parseArXiv(input).isPresent()); | ||
| } | ||
|
|
||
| @Test | ||
| void parseArXivFromHTTPURL() { | ||
| String input = "http://arxiv.org/abs/2203.02155"; | ||
| assertTrue(UrlIdentifierParser.parseArXiv(input).isPresent()); | ||
| } | ||
|
|
||
| @Test | ||
| void parseArXivReturnsEmptyForNull() { | ||
| assertFalse(UrlIdentifierParser.parseArXiv(null).isPresent()); | ||
| } | ||
|
|
||
| @Test | ||
| void parseArXivReturnsEmptyForInvalidURL() { | ||
| assertFalse(UrlIdentifierParser.parseArXiv("https://example.com").isPresent()); | ||
| } | ||
|
|
||
| @Test | ||
| void parseArXivHandlesOldIDFormat() { | ||
| String input = "https://arxiv.org/abs/math.GT/0309136"; | ||
| assertTrue(UrlIdentifierParser.parseArXiv(input).isPresent()); | ||
| } | ||
|
Comment on lines
+13
to
+102
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 1. Tests only assert presence UrlIdentifierParserTest uses assertTrue(optional.isPresent())/assertFalse(optional.isPresent()) instead of asserting the exact parsed DOI/arXiv value. This weakens test precision and can allow incorrect-but-present parsing results to pass. Agent Prompt
|
||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
2. arXiv PDF URL fails
🐞 Bug ≡ Correctness · Agent Prompt
ⓘ Copy this prompt and use it to remediate the issue with your preferred AI generation tools