From d333a38a525eda0279250d0b0306be2ef960de6c Mon Sep 17 00:00:00 2001 From: jsochava <144294623+jsochava@users.noreply.github.com> Date: Fri, 21 Nov 2025 16:43:30 -0800 Subject: [PATCH 1/2] Addressing requested changes(#14085) Removing all accidentally changed files and only commiting related work files --- .../logic/importer/RelatedWorkAnnotator.java | 33 +++ .../HeuristicExtractorAdapter.java | 41 +++ .../HeuristicRelatedWorkExtractor.java | 262 ++++++++++++++++++ .../LangChainRelatedWorkSummarizer.java | 122 ++++++++ .../PdfRelatedWorkTextExtractor.java | 56 ++++ .../importer/relatedwork/PdfTextProvider.java | 19 ++ .../relatedwork/RelatedWorkAiModule.java | 63 +++++ .../relatedwork/RelatedWorkAiPreferences.java | 35 +++ .../RelatedWorkEvaluationRunner.java | 180 ++++++++++++ .../relatedwork/RelatedWorkExtractor.java | 20 ++ .../relatedwork/RelatedWorkFixture.java | 63 +++++ .../relatedwork/RelatedWorkHarvester.java | 81 ++++++ .../relatedwork/RelatedWorkMetrics.java | 121 ++++++++ .../relatedwork/RelatedWorkPdfPipeline.java | 65 +++++ .../relatedwork/RelatedWorkPipeline.java | 51 ++++ .../relatedwork/RelatedWorkPluginConfig.java | 65 +++++ .../RelatedWorkSectionLocator.java | 171 ++++++++++++ .../relatedwork/RelatedWorkSummarizer.java | 33 +++ .../importer/RelatedWorkAnnotatorTest.java | 50 ++++ .../HeuristicRelatedWorkExtractorTest.java | 72 +++++ .../LangChainRelatedWorkSummarizerTest.java | 90 ++++++ .../PdfRelatedWorkTextExtratorTest.java | 113 ++++++++ .../relatedwork/RelatedWorkHarvesterTest.java | 67 +++++ .../relatedwork/RelatedWorkMetricsTest.java | 62 +++++ .../RelatedWorkSectionLocatorTest.java | 86 ++++++ 25 files changed, 2021 insertions(+) create mode 100644 jablib/src/main/java/org/jabref/logic/importer/RelatedWorkAnnotator.java create mode 100644 jablib/src/main/java/org/jabref/logic/importer/relatedwork/HeuristicExtractorAdapter.java create mode 100644 
jablib/src/main/java/org/jabref/logic/importer/relatedwork/HeuristicRelatedWorkExtractor.java create mode 100644 jablib/src/main/java/org/jabref/logic/importer/relatedwork/LangChainRelatedWorkSummarizer.java create mode 100644 jablib/src/main/java/org/jabref/logic/importer/relatedwork/PdfRelatedWorkTextExtractor.java create mode 100644 jablib/src/main/java/org/jabref/logic/importer/relatedwork/PdfTextProvider.java create mode 100644 jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkAiModule.java create mode 100644 jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkAiPreferences.java create mode 100644 jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkEvaluationRunner.java create mode 100644 jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkExtractor.java create mode 100644 jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkFixture.java create mode 100644 jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkHarvester.java create mode 100644 jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkMetrics.java create mode 100644 jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkPdfPipeline.java create mode 100644 jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkPipeline.java create mode 100644 jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkPluginConfig.java create mode 100644 jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkSectionLocator.java create mode 100644 jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkSummarizer.java create mode 100644 jablib/src/test/java/org/jabref/logic/importer/RelatedWorkAnnotatorTest.java create mode 100644 jablib/src/test/java/org/jabref/logic/importer/relatedwork/HeuristicRelatedWorkExtractorTest.java create mode 100644 
jablib/src/test/java/org/jabref/logic/importer/relatedwork/LangChainRelatedWorkSummarizerTest.java create mode 100644 jablib/src/test/java/org/jabref/logic/importer/relatedwork/PdfRelatedWorkTextExtratorTest.java create mode 100644 jablib/src/test/java/org/jabref/logic/importer/relatedwork/RelatedWorkHarvesterTest.java create mode 100644 jablib/src/test/java/org/jabref/logic/importer/relatedwork/RelatedWorkMetricsTest.java create mode 100644 jablib/src/test/java/org/jabref/logic/importer/relatedwork/RelatedWorkSectionLocatorTest.java diff --git a/jablib/src/main/java/org/jabref/logic/importer/RelatedWorkAnnotator.java b/jablib/src/main/java/org/jabref/logic/importer/RelatedWorkAnnotator.java new file mode 100644 index 00000000000..0bdfded18c5 --- /dev/null +++ b/jablib/src/main/java/org/jabref/logic/importer/RelatedWorkAnnotator.java @@ -0,0 +1,33 @@ +package org.jabref.logic.importer; + +import java.util.Optional; + +import org.jabref.model.entry.BibEntry; +import org.jabref.model.entry.field.Field; +import org.jabref.model.entry.field.FieldFactory; + +public class RelatedWorkAnnotator { + + public static void appendSummaryToEntry( + BibEntry entry, + String username, + String citingPaperKey, + String summarySentence + ) { + String fieldName = "comment-" + username; + Field commentField = FieldFactory.parseField(fieldName); + + String cleaned = summarySentence.strip(); + if (!cleaned.endsWith(".")) { + cleaned = cleaned + "."; + } + String formattedBlock = "[" + citingPaperKey + "]: " + cleaned; + + Optional existing = entry.getField(commentField); + String newValue = existing + .map(old -> old.strip() + "\n\n" + formattedBlock) + .orElse(formattedBlock); + + entry.setField(commentField, newValue); + } +} diff --git a/jablib/src/main/java/org/jabref/logic/importer/relatedwork/HeuristicExtractorAdapter.java b/jablib/src/main/java/org/jabref/logic/importer/relatedwork/HeuristicExtractorAdapter.java new file mode 100644 index 00000000000..e6046339310 --- /dev/null 
+++ b/jablib/src/main/java/org/jabref/logic/importer/relatedwork/HeuristicExtractorAdapter.java @@ -0,0 +1,41 @@ +package org.jabref.logic.importer.relatedwork; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.jabref.model.entry.BibEntry; + +/** + * Adapts HeuristicRelatedWorkExtractor (citationKey -> snippet) to the + * RelatedWorkEvaluationRunner.Extractor interface (BibEntry -> snippets). + */ +public final class HeuristicExtractorAdapter implements RelatedWorkEvaluationRunner.Extractor { + + private final HeuristicRelatedWorkExtractor delegate; + + public HeuristicExtractorAdapter(HeuristicRelatedWorkExtractor delegate) { + this.delegate = delegate; + } + + @Override + public Map> apply(String relatedWorkText, List candidates) { + Map byKey = delegate.extract(relatedWorkText, candidates); + + Map entryByKey = new HashMap<>(); + for (BibEntry be : candidates) { + be.getCitationKey().ifPresent(k -> entryByKey.put(k, be)); + } + + Map> out = new HashMap<>(); + for (Map.Entry e : byKey.entrySet()) { + BibEntry be = entryByKey.get(e.getKey()); + if (be == null) { + continue; // no match for that citation key among candidates + } + out.computeIfAbsent(be, k -> new ArrayList<>()).add(e.getValue()); + } + return out; + } +} diff --git a/jablib/src/main/java/org/jabref/logic/importer/relatedwork/HeuristicRelatedWorkExtractor.java b/jablib/src/main/java/org/jabref/logic/importer/relatedwork/HeuristicRelatedWorkExtractor.java new file mode 100644 index 00000000000..20426401482 --- /dev/null +++ b/jablib/src/main/java/org/jabref/logic/importer/relatedwork/HeuristicRelatedWorkExtractor.java @@ -0,0 +1,262 @@ +package org.jabref.logic.importer.relatedwork; + +import java.text.Normalizer; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Optional; +import java.util.regex.Matcher; +import 
java.util.regex.Pattern; + +import org.jabref.model.entry.BibEntry; +import org.jabref.model.entry.field.StandardField; + +/** + * Deterministic extractor for author–year style citations in "Related Work" sections. + * Handles single and multi-citation parentheticals, including diacritics and all-caps acronyms (e.g., CIA, Šimić). + */ +public class HeuristicRelatedWorkExtractor implements RelatedWorkExtractor { + + // Headings like "1.4 Related work", "RELATED WORK", etc. (case-insensitive) + private static final Pattern RELATED_WORK_HEADING = + Pattern.compile("(?im)^(\\d+(?:\\.\\d+)*)?\\s*related\\s+work[s]?\\s*[:\\-]?$"); + + // Any parenthetical block; author-year pairs are mined inside it. + private static final Pattern PAREN_BLOCK = Pattern.compile("\\(([^)]+)\\)"); + + // Unicode-aware author–year inside a parenthetical. + // Allows all-caps acronyms like "CIA" and Unicode surnames like "Šimić". + // \p{Lu} = uppercase letter, \p{L} = any letter, \p{M} = combining mark. + private static final Pattern AUTHOR_YEAR_INNER = Pattern.compile( + "(?U)" // enable Unicode character classes + + "(\\p{Lu}[\\p{L}\\p{M}'\\-]*)" // 1: first author token (can be acronym or surname) + + "(?:\\s+(?:et\\s+al\\.)|\\s*(?:&|and)\\s+\\p{Lu}[\\p{L}\\p{M}'\\-]+)?" + + "\\s*,?\\s*" + + "(\\d{4})([a-z]?)" // 2: year, 3: optional trailing letter + ); + + /** + * Extract a mapping from cited entry key to a short contextual snippet. + * + *

The returned map uses the cited entry's citation key (for example, {@code Smith2021}) + * as the key, and a sentence-like snippet taken from around the in-text citation as the value.

+ * + * @param fullText the full (plain) text of the paper or section to scan + * @param bibliography candidate entries that may be cited; used to resolve author/year to a citation key + * @return a {@code Map} from citation key to snippet; never {@code null}, possibly empty + */ + @Override + public Map extract(String fullText, List bibliography) { + String related = sliceRelatedWorkSection(fullText); + Map index = buildIndex(bibliography); + Map out = new LinkedHashMap<>(); + + Matcher paren = PAREN_BLOCK.matcher(related); + while (paren.find()) { + String inner = paren.group(1); + Matcher cite = AUTHOR_YEAR_INNER.matcher(inner); + + while (cite.find()) { + String citedToken = normalizeSurname(cite.group(1)); // e.g., "cia" or "nash" + String yearDigits = cite.group(2); // ignore group(3) letter + String citedKey = findKeyFor(citedToken, yearDigits, index); + if (citedKey == null || out.containsKey(citedKey)) { + continue; + } + + String snippet = expandToSentenceLikeSpan(related, paren.start(), paren.end()); + snippet = pruneTrailingCitationTail(snippet).trim(); + + if (!snippet.endsWith(".")) { + snippet = snippet + "."; + } + if (snippet.length() > 300) { + snippet = snippet.substring(0, 300) + "..."; + } + + out.put(citedKey, snippet); + } + } + + return out; + } + + /** + * Try to isolate the "Related work" section; fallback to full text. 
+ */ + private String sliceRelatedWorkSection(String text) { + Matcher start = RELATED_WORK_HEADING.matcher(text); + int begin = -1; + while (start.find()) { + begin = start.end(); + break; + } + if (begin < 0) { + return text; // fallback: whole text + } + + // Next likely section heading AFTER begin (numbered or ALL-CAPS) + Pattern nextSection = Pattern.compile( + "(?m)^(?:\\d+(?:\\.\\d+)*)\\s+[A-Z][A-Z\\s\\-]{3,}$|^[A-Z][A-Z\\s\\-]{3,}$"); + Matcher end = nextSection.matcher(text); + int stop = text.length(); + while (end.find()) { + if (end.start() > begin) { + stop = end.start(); + break; + } + } + return text.substring(begin, stop); + } + + private Map buildIndex(List bibs) { + Map idx = new HashMap<>(); + for (BibEntry b : bibs) { + Optional y = b.getField(StandardField.YEAR); + if (y.isEmpty()) { + Optional date = b.getField(StandardField.DATE); + if (date.isPresent()) { + Matcher m = Pattern.compile("(\\d{4})").matcher(date.get()); + if (m.find()) { + y = Optional.of(m.group(1)); + } + } + } + Optional a = b.getField(StandardField.AUTHOR); + if (y.isEmpty() || a.isEmpty()) { + continue; + } + String yearDigits = y.get().replaceAll("[^0-9]", ""); + if (yearDigits.isEmpty()) { + continue; + } + + String firstAuthor = firstAuthorRaw(a.get()); + String firstSurname = extractFirstSurnameFromRaw(firstAuthor); + if (!firstSurname.isEmpty()) { + idx.put(normalizeSurname(firstSurname) + yearDigits, b); + } + + // Also index acronym for corporate/multi-word first author without comma. + String acronym = maybeAcronym(firstAuthor); + if (!acronym.isEmpty()) { + idx.put(acronym + yearDigits, b); + } + } + return idx; + } + + /** + * Get the raw first author string (before surname extraction). + */ + private String firstAuthorRaw(String authorField) { + return authorField.split("\\s+and\\s+")[0].trim(); + } + + /** + * Extract the first author surname from a raw first-author token. 
+ */ + private String extractFirstSurnameFromRaw(String firstAuthor) { + if (firstAuthor.contains(",")) { + return firstAuthor.substring(0, firstAuthor.indexOf(',')).trim(); + } + if (firstAuthor.startsWith("{") && firstAuthor.endsWith("}")) { + String inner = firstAuthor.substring(1, firstAuthor.length() - 1).trim(); + String[] parts = inner.split("\\s+"); + return parts.length == 0 ? "" : parts[parts.length - 1]; + } + String[] parts = firstAuthor.split("\\s+"); + return parts.length == 0 ? "" : parts[parts.length - 1]; + } + + private String maybeAcronym(String firstAuthor) { + if (firstAuthor.contains(",")) { + return ""; // likely "Surname, Given" → skip acronym + } + String unbraced = firstAuthor; + if (unbraced.startsWith("{") && unbraced.endsWith("}")) { + unbraced = unbraced.substring(1, unbraced.length() - 1); + } + String[] parts = unbraced.trim().split("\\s+"); + if (parts.length < 2) { + return ""; // single token → not helpful + } + StringBuilder sb = new StringBuilder(); + for (String p : parts) { + if (p.isEmpty()) { + continue; + } + char c = p.charAt(0); + if (Character.isLetter(c)) { + sb.append(Character.toLowerCase(c)); + } + } + return sb.toString(); + } + + /** + * Normalize token: remove braces, strip diacritics, lowercase. + */ + private String normalizeSurname(String s) { + String noBraces = s.replace("{", "").replace("}", ""); + String normalized = Normalizer.normalize(noBraces, Normalizer.Form.NFD) + .replaceAll("\\p{M}+", ""); + return normalized.toLowerCase(Locale.ROOT); + } + + /** + * Lookup by normalized token (surname or acronym) + 4-digit year. + */ + private String findKeyFor(String lowerToken, String yearDigits, Map index) { + BibEntry entry = index.get(lowerToken + yearDigits); + return (entry != null) ? entry.getCitationKey().orElse(null) : null; // null signals "not found" + } + + /** + * Expand to a sentence-like span around the parenthetical match. 
+ */ + private String expandToSentenceLikeSpan(String text, int matchStart, int matchEnd) { + int left = matchStart; + while (left > 0) { + char c = text.charAt(left - 1); + if (c == '.' || c == '!' || c == '?' || c == '\n') { + break; + } + left--; + } + int right = matchEnd; + int len = text.length(); + while (right < len) { + char c = text.charAt(right); + if (c == '.' || c == '!' || c == '?' || c == '\n') { + right++; // include the boundary char + break; + } + right++; + } + if (right > len) { + right = len; + } + return text.substring(left, right); + } + + /** + * Heuristically remove trailing citation trains at the end of a snippet + */ + private String pruneTrailingCitationTail(String s) { + int lastParen = s.lastIndexOf(')'); + if (lastParen > -1 && lastParen >= s.length() - 3) { + String head = s.substring(0, lastParen + 1).trim(); + if (head.endsWith(").")) { + return head; + } + if (head.endsWith(")")) { + return head + "."; + } + return head; + } + return s; + } +} diff --git a/jablib/src/main/java/org/jabref/logic/importer/relatedwork/LangChainRelatedWorkSummarizer.java b/jablib/src/main/java/org/jabref/logic/importer/relatedwork/LangChainRelatedWorkSummarizer.java new file mode 100644 index 00000000000..27730f5d799 --- /dev/null +++ b/jablib/src/main/java/org/jabref/logic/importer/relatedwork/LangChainRelatedWorkSummarizer.java @@ -0,0 +1,122 @@ +package org.jabref.logic.importer.relatedwork; + +import java.util.List; +import java.util.Objects; +import java.util.Optional; +import java.util.StringJoiner; + +import org.jabref.model.entry.BibEntry; +import org.jabref.model.entry.field.StandardField; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * RelatedWorkSummarizer implementation that delegates summarization to an LLM client. + * + *

To keep this class easy to test and free from hard dependencies, the actual + * model is abstracted behind the {@link Client} functional interface. In production, + * callers can wrap a LangChain4j ChatLanguageModel:

+ * + *
{@code
+ * ChatLanguageModel model = OpenAiChatModel.builder()
+ *     .apiKey(apiKey)
+ *     .modelName(modelName)
+ *     .build();
+ *
+ * LangChainRelatedWorkSummarizer summarizer =
+ *     new LangChainRelatedWorkSummarizer(model::generate);
+ * }
+ */ +public final class LangChainRelatedWorkSummarizer implements RelatedWorkSummarizer { + + /** + * Minimal abstraction over an LLM-like client. + * Implementations are expected to synchronously return a string for a prompt. + */ + @FunctionalInterface + public interface Client { + String generate(String prompt); + } + + private static final Logger LOGGER = LoggerFactory.getLogger(LangChainRelatedWorkSummarizer.class); + + private static final int DEFAULT_MAX_LEN = 350; + + private final Client client; + + public LangChainRelatedWorkSummarizer(Client client) { + this.client = Objects.requireNonNull(client); + } + + @Override + public Optional summarize(List snippets, BibEntry entry, int maxLen) { + if (snippets == null || snippets.isEmpty()) { + return Optional.empty(); + } + + int effectiveMaxLen = maxLen > 0 ? maxLen : DEFAULT_MAX_LEN; + + String prompt = buildPrompt(snippets, entry, effectiveMaxLen); + + try { + String raw = client.generate(prompt); + if (raw == null) { + return Optional.empty(); + } + String cleaned = normalize(raw); + if (cleaned.isBlank()) { + return Optional.empty(); + } + if (cleaned.length() > effectiveMaxLen) { + cleaned = cleaned.substring(0, effectiveMaxLen).trim(); + } + return Optional.of(cleaned); + } catch (Exception e) { + // Fail-safe: never break the harvester due to AI issues. 
+ LOGGER.warn("LangChain summarizer failed for entry {}", + entry.getCitationKey().orElse(""), e); + return Optional.empty(); + } + } + + private String buildPrompt(List snippets, BibEntry entry, int maxLen) { + StringJoiner joiner = new StringJoiner("\n- ", "- ", ""); + + for (String s : snippets) { + if (s != null && !s.isBlank()) { + joiner.add(s.trim()); + } + } + + String citationKey = entry.getCitationKey().orElse(""); + String title = entry.getField(StandardField.TITLE).orElse("this paper"); + + return "You are helping to summarize how one academic paper describes another in its " + + "\"Related Work\" section.\n\n" + + "Cited paper bibtex key (if known): " + citationKey + "\n" + + "Cited paper title (if known): " + title + "\n\n" + + "Task:\n" + + "Using only the fragments below, write a concise, neutral 1–2 sentence description " + + "of the cited paper's contribution, as characterized by the citing paper.\n" + + "Do not invent new facts. Do not mention citation keys or authors by name.\n" + + "Maximum length: approximately " + maxLen + " characters.\n\n" + + "Fragments:\n" + + joiner.toString(); + } + + private String normalize(String text) { + String trimmed = text.trim(); + + // Strip a leading "Summary:" prefix if the model adds one + if (trimmed.toLowerCase().startsWith("summary:")) { + trimmed = trimmed.substring("summary:".length()).trim(); + } + // Strip surrounding quotes + if (trimmed.length() > 1 && trimmed.startsWith("\"") && trimmed.endsWith("\"")) { + trimmed = trimmed.substring(1, trimmed.length() - 1).trim(); + } + + return trimmed; + } +} diff --git a/jablib/src/main/java/org/jabref/logic/importer/relatedwork/PdfRelatedWorkTextExtractor.java b/jablib/src/main/java/org/jabref/logic/importer/relatedwork/PdfRelatedWorkTextExtractor.java new file mode 100644 index 00000000000..2f2ad490ea0 --- /dev/null +++ b/jablib/src/main/java/org/jabref/logic/importer/relatedwork/PdfRelatedWorkTextExtractor.java @@ -0,0 +1,56 @@ +package 
org.jabref.logic.importer.relatedwork; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Objects; +import java.util.Optional; + +/** + * Adapter that: PDF -> full plain text (via PdfTextProvider) -> "Related Work" block (via RelatedWorkSectionLocator). + * No PDF dependencies here; all PDF specifics live behind PdfTextProvider. + */ +public final class PdfRelatedWorkTextExtractor { + + private final PdfTextProvider pdfTextProvider; + private final RelatedWorkSectionLocator sectionLocator; + + public PdfRelatedWorkTextExtractor(PdfTextProvider pdfTextProvider, + RelatedWorkSectionLocator sectionLocator) { + this.pdfTextProvider = Objects.requireNonNull(pdfTextProvider); + this.sectionLocator = Objects.requireNonNull(sectionLocator); + } + + /** + * Extracts the "Related Work"/"Literature Review" section from the given PDF, if present. + * + * @param pdf path to the PDF file + * @return Optional with the related-work block (no header), or empty if not found / empty text + * @throws IOException if reading the PDF fails + * @throws IllegalArgumentException if the path is invalid + */ + public Optional extractRelatedWorkSection(Path pdf) throws IOException { + Objects.requireNonNull(pdf, "pdf"); + if (!Files.isRegularFile(pdf)) { + throw new IllegalArgumentException("Not a regular file: " + pdf); + } + + Optional plain = pdfTextProvider.extractPlainText(pdf); + if (plain.isEmpty() || plain.get().isBlank()) { + return Optional.empty(); + } + + String text = plain.get(); + return sectionLocator.locate(text) + .map(span -> { + int start = Math.max(0, span.startOffset); // body start + int end = Math.min(text.length(), span.endOffset); // body end + if (start >= end) { + return ""; + } + // Header already excluded by startOffset, so no need to strip it again. 
+ return text.substring(start, end).trim(); + }) + .filter(s -> !s.isBlank()); + } +} diff --git a/jablib/src/main/java/org/jabref/logic/importer/relatedwork/PdfTextProvider.java b/jablib/src/main/java/org/jabref/logic/importer/relatedwork/PdfTextProvider.java new file mode 100644 index 00000000000..a66648f4e16 --- /dev/null +++ b/jablib/src/main/java/org/jabref/logic/importer/relatedwork/PdfTextProvider.java @@ -0,0 +1,19 @@ +package org.jabref.logic.importer.relatedwork; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.Optional; + +/** + * SPI for supplying plain text from a PDF. Implementations may use PDFBox, + * existing JabRef utilities, or any other backend. + */ +public interface PdfTextProvider { + + /** + * @param pdf Path to a readable PDF on disk. + * @return Plain text if extraction succeeds and yields non-empty text; otherwise Optional.empty(). + * @throws IOException for I/O or backend-related failures + */ + Optional extractPlainText(Path pdf) throws IOException; +} diff --git a/jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkAiModule.java b/jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkAiModule.java new file mode 100644 index 00000000000..1f2b6d451d5 --- /dev/null +++ b/jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkAiModule.java @@ -0,0 +1,63 @@ +package org.jabref.logic.importer.relatedwork; + +import java.util.Objects; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Factory / wiring helper that builds a {@link RelatedWorkPluginConfig} + * from {@link RelatedWorkAiPreferences} and an optional + * {@link RelatedWorkSummarizer} implementation. + *

+ * This class deliberately does NOT depend on LangChain4j or any concrete + * LLM client. Higher-level modules are responsible for + * constructing a {@link RelatedWorkSummarizer} (such as + * {@code LangChainRelatedWorkSummarizer}) and passing it in. + */ +public final class RelatedWorkAiModule { + + private static final Logger LOGGER = LoggerFactory.getLogger(RelatedWorkAiModule.class); + + private RelatedWorkAiModule() { + // utility + } + + /** + * Build a plugin config from preferences and an optional summarizer. + *

+ * If AI is disabled in {@code aiPrefs} or {@code summarizer} is {@code null}, + * this returns a config with summarization effectively turned off. + * + * @param aiPrefs user / application preferences for related-work AI + * @param summarizer an optional summarizer implementation (may be null) + * @return a {@link RelatedWorkPluginConfig} representing the effective setup + */ + public static RelatedWorkPluginConfig fromPreferences( + RelatedWorkAiPreferences aiPrefs, + RelatedWorkSummarizer summarizer + ) { + Objects.requireNonNull(aiPrefs); + + RelatedWorkPluginConfig.Builder builder = RelatedWorkPluginConfig.builder(); + + if (!aiPrefs.isEnabled()) { + LOGGER.info("Related Work AI is disabled via preferences"); + return builder.build(); + } + + if (summarizer == null) { + LOGGER.info("Related Work AI is enabled in preferences, but no summarizer implementation was provided; " + "falling back to no-op configuration."); + return builder.build(); + } + + LOGGER.info("Related Work AI enabled (model='{}', apiKeyEnvVar='{}')", + aiPrefs.getModelName(), + aiPrefs.getApiKeyEnvVar()); + + return builder + .enableSummarization(true) + .withSummarizer(summarizer) + .build(); + } +} diff --git a/jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkAiPreferences.java b/jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkAiPreferences.java new file mode 100644 index 00000000000..2b8ab86c499 --- /dev/null +++ b/jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkAiPreferences.java @@ -0,0 +1,35 @@ +package org.jabref.logic.importer.relatedwork; + +import java.util.Objects; + +/** + * Small value object for Related Work AI configuration. + *

+ * This class deliberately does NOT know about JabRefPreferences to keep jablib + * independent of the GUI/application layer. Construction from preferences + * should be done in a higher-level module. + */ +public final class RelatedWorkAiPreferences { + + private final boolean enabled; + private final String modelName; + private final String apiKeyEnvVar; + + public RelatedWorkAiPreferences(boolean enabled, String modelName, String apiKeyEnvVar) { + this.enabled = enabled; + this.modelName = Objects.requireNonNullElse(modelName, "").trim(); + this.apiKeyEnvVar = Objects.requireNonNullElse(apiKeyEnvVar, "").trim(); + } + + public boolean isEnabled() { + return enabled; + } + + public String getModelName() { + return modelName; + } + + public String getApiKeyEnvVar() { + return apiKeyEnvVar; + } +} diff --git a/jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkEvaluationRunner.java b/jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkEvaluationRunner.java new file mode 100644 index 00000000000..88695f8aa92 --- /dev/null +++ b/jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkEvaluationRunner.java @@ -0,0 +1,180 @@ +package org.jabref.logic.importer.relatedwork; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.function.BiFunction; +import java.util.stream.Collectors; + +import org.jabref.model.entry.BibEntry; +import org.jabref.model.entry.field.StandardField; + +/** + * Deterministic evaluator for citation-context extraction. + * Compares extractor output against a gold fixture and computes metrics. + */ +public final class RelatedWorkEvaluationRunner { + + /** + * Minimal adapter: sectionText + candidate entries -> mapping(entryKey -> list of snippets). 
+ */ + @FunctionalInterface + public interface Extractor extends BiFunction, Map>> { + } + + private final Extractor extractor; + + public RelatedWorkEvaluationRunner(Extractor extractor) { + this.extractor = Objects.requireNonNull(extractor); + } + + /** + * Canonical key = firstAuthorSurnameLower + "-" + year (missing parts become "unknown"). + */ + public static String canonicalKey(BibEntry entry) { + String author = entry.getField(StandardField.AUTHOR).orElse("").trim(); + String year = entry.getField(StandardField.YEAR).orElse("").trim(); + String last = firstAuthorSurname(author); + if (last.isBlank()) { + last = "unknown"; + } + if (year.isBlank()) { + year = "unknown"; + } + return last.toLowerCase(Locale.ROOT) + "-" + year; + } + + /** + * Very small surname parser: takes first “word” before comma or the last token. + */ + static String firstAuthorSurname(String authorField) { + if (authorField == null || authorField.isBlank()) { + return ""; + } + // Examples: "Vesce, E.; Olivieri, G.; ..." or "Bianchi, F. R." or "Luisa Marcela Luna Ostos and ..." + String primary = authorField.split("(?i)\\band\\b|;")[0].trim(); + if (primary.contains(",")) { + return primary.substring(0, primary.indexOf(',')).trim(); + } + String[] tokens = primary.trim().split("\\s+"); + return tokens.length == 0 ? "" : tokens[tokens.length - 1]; + } + + /** + * Runs one evaluation against a single fixture. 
+ */ + public RelatedWorkMetrics run(RelatedWorkFixture fixture, List candidates) { + Map> extractedRaw = extractor.apply(fixture.relatedWorkText, candidates); + + // Canonicalize extractor output -> key -> snippets + Map> extractedByKey = new HashMap<>(); + extractedRaw.forEach((entry, snippets) -> { + String key = canonicalKey(entry); + extractedByKey.computeIfAbsent(key, k -> new ArrayList<>()) + .addAll(normalizeSnippets(snippets)); + }); + + // Expected: already in canonical key space + Map> expectedByKey = new HashMap<>(); + for (RelatedWorkFixture.Expectation exp : fixture.expectations) { + expectedByKey.computeIfAbsent(exp.canonicalKey(), k -> new ArrayList<>()) + .add(normalize(exp.snippetContains)); + } + + // For matching snippets we do "expected substring contained in extracted" + // (case-insensitive, simple fuzz). Track matched extracted indices per key. + int truePositives = 0; + Map> matchedExtractedIdxByKey = new HashMap<>(); + + for (Map.Entry> kv : expectedByKey.entrySet()) { + String key = kv.getKey(); + List expectedSnippets = kv.getValue(); + List extractedSnippets = extractedByKey.getOrDefault(key, List.of()); + + boolean[] taken = new boolean[extractedSnippets.size()]; + for (String expNeedle : expectedSnippets) { + int matchIdx = indexOfContaining(extractedSnippets, expNeedle, taken); + if (matchIdx >= 0) { + taken[matchIdx] = true; + matchedExtractedIdxByKey + .computeIfAbsent(key, k -> new HashSet<>()) + .add(matchIdx); + truePositives++; + } + } + } + + int extractedPairs = extractedByKey.values().stream().mapToInt(List::size).sum(); + int expectedPairs = expectedByKey.values().stream().mapToInt(List::size).sum(); + int falsePositives = extractedPairs - truePositives; + int falseNegatives = expectedPairs - truePositives; + + int distinctExpected = expectedByKey.size(); + int distinctAnnotated = (int) extractedByKey.entrySet() + .stream() + .filter(e -> !e.getValue().isEmpty()) + .count(); + double avgSnippetsPerAnnotated = + 
distinctAnnotated == 0 ? 0.0 : (double) extractedPairs / distinctAnnotated; + + return new RelatedWorkMetrics( + expectedPairs, + extractedPairs, + truePositives, + falsePositives, + falseNegatives, + distinctExpected, + distinctAnnotated, + avgSnippetsPerAnnotated, + RelatedWorkMetrics.perEntryFrom(expectedByKey, + extractedByKey, + matchedExtractedIdxByKey) + ); + } + + // ---------- helpers ---------- + + private static List normalizeSnippets(List raw) { + if (raw == null) { + return List.of(); + } + return raw.stream() + .map(RelatedWorkEvaluationRunner::normalize) + .collect(Collectors.toList()); + } + + /** + * Lowercases, collapses whitespace, strips trailing punctuation for robust substring checks. + */ + static String normalize(String s) { + if (s == null) { + return ""; + } + String t = s.toLowerCase(Locale.ROOT); + t = t.replaceAll("\\s+", " ").trim(); + t = t.replaceAll("[\\p{Punct}]+$", ""); + return t; + } + + /** + * Returns first index where haystack[i] contains needle (both normalized), + * honoring "taken" to avoid double-counting. + */ + static int indexOfContaining(List haystack, String needle, boolean[] taken) { + for (int i = 0; i < haystack.size(); i++) { + if (taken[i]) { + continue; + } + String h = haystack.get(i); + if (h.contains(needle)) { + return i; + } + } + return -1; + } +} diff --git a/jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkExtractor.java b/jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkExtractor.java new file mode 100644 index 00000000000..efa031f8e63 --- /dev/null +++ b/jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkExtractor.java @@ -0,0 +1,20 @@ +package org.jabref.logic.importer.relatedwork; + +import java.util.List; +import java.util.Map; + +import org.jabref.model.entry.BibEntry; + +/** + * Interface for components that extract citation summaries from a paper's text. 
+ */ +public interface RelatedWorkExtractor { + /** + * Extracts a mapping from cited paper keys to short summary sentences from the "Related Work" section. + * + * @param fullText full text of the citing paper + * @param bibliography list of BibEntries referenced by the citing paper + * @return map from citation key (of cited paper) to its descriptive summary text + */ + Map extract(String fullText, List bibliography); +} diff --git a/jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkFixture.java b/jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkFixture.java new file mode 100644 index 00000000000..2e587767bcb --- /dev/null +++ b/jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkFixture.java @@ -0,0 +1,63 @@ +package org.jabref.logic.importer.relatedwork; + +import java.io.IOException; +import java.io.Reader; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; +import java.util.Locale; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.fasterxml.jackson.databind.ObjectMapper; + +/** + * Gold fixture model used for evaluating citation-context extraction. + */ +public final class RelatedWorkFixture { + + public static final class Expectation { + public final String firstAuthorSurname; + public final String year; + public final String snippetContains; + + @JsonCreator + public Expectation(@JsonProperty("firstAuthorSurname") String firstAuthorSurname, + @JsonProperty("year") String year, + @JsonProperty("snippetContains") String snippetContains) { + this.firstAuthorSurname = firstAuthorSurname; + this.year = year; + this.snippetContains = snippetContains; + } + + public String canonicalKey() { + String author = (firstAuthorSurname == null) + ? "unknown" + : firstAuthorSurname.toLowerCase(Locale.ROOT); + String yr = (year == null) ? 
"unknown" : year.trim(); + return author + "-" + yr; + } + } + + public final String id; + public final String relatedWorkText; + public final List expectations; + + @JsonCreator + public RelatedWorkFixture(@JsonProperty("id") String id, + @JsonProperty("relatedWorkText") String relatedWorkText, + @JsonProperty("expectations") List expectations) { + this.id = id; + this.relatedWorkText = relatedWorkText; + this.expectations = expectations; + } + + /** + * Load a fixture from a JSON file. + */ + public static RelatedWorkFixture load(Path jsonPath) throws IOException { + try (Reader r = Files.newBufferedReader(jsonPath)) { + return new ObjectMapper().readValue(r, RelatedWorkFixture.class); + } + } +} diff --git a/jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkHarvester.java b/jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkHarvester.java new file mode 100644 index 00000000000..fab79cc509a --- /dev/null +++ b/jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkHarvester.java @@ -0,0 +1,81 @@ +package org.jabref.logic.importer.relatedwork; + +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.function.Consumer; + +import org.jabref.logic.importer.RelatedWorkAnnotator; +import org.jabref.model.entry.BibEntry; + +public class RelatedWorkHarvester { + + private final RelatedWorkExtractor extractor; + + public RelatedWorkHarvester(RelatedWorkExtractor extractor) { + this.extractor = extractor; + } + + /** + * Existing API + */ + public void harvestAndAnnotate( + String username, + String citingPaperKey, + String fullText, + List bibliography, + Consumer addOrUpdateFn + ) { + annotateInternal(username, citingPaperKey, fullText, bibliography, addOrUpdateFn); + } + + /** + * Test-friendly: returns number of entries annotated. 
+ */ + public int harvestAndAnnotateCount( + String username, + String citingPaperKey, + String fullText, + List bibliography, + Consumer addOrUpdateFn + ) { + return annotateInternal(username, citingPaperKey, fullText, bibliography, addOrUpdateFn); + } + + private int annotateInternal( + String username, + String citingPaperKey, + String fullText, + List bibliography, + Consumer addOrUpdateFn + ) { + Map summaries = extractor.extract(fullText, bibliography); + int updated = 0; + + for (Map.Entry e : summaries.entrySet()) { + String citedKey = e.getKey(); + String summary = e.getValue(); + + BibEntry entry = findOrCreateEntry(citedKey, bibliography, addOrUpdateFn); + RelatedWorkAnnotator.appendSummaryToEntry(entry, username, citingPaperKey, summary); + addOrUpdateFn.accept(entry); + updated++; + } + return updated; + } + + private BibEntry findOrCreateEntry(String key, List bibs, Consumer addOrUpdateFn) { + Optional found = bibs.stream() + .filter(b -> b.getCitationKey().orElse("").equals(key)) + .findFirst(); + + if (found.isPresent()) { + return found.get(); + } + + BibEntry newEntry = new BibEntry(); + newEntry.setCitationKey(key); + addOrUpdateFn.accept(newEntry); + return newEntry; + } +} diff --git a/jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkMetrics.java b/jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkMetrics.java new file mode 100644 index 00000000000..f35d846a05f --- /dev/null +++ b/jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkMetrics.java @@ -0,0 +1,121 @@ +package org.jabref.logic.importer.relatedwork; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +/** + * Immutable metrics summary for one evaluation run. + */ +public final class RelatedWorkMetrics { + + /** + * Per-entry result summary. 
+ */ + public static final class PerEntry { + public final String key; // canonical key: firstAuthor-lastname-lower + "-" + year + public final int expectedSnippets; + public final int extractedSnippets; + public final int truePositiveSnippets; + + public PerEntry(String key, + int expectedSnippets, + int extractedSnippets, + int truePositiveSnippets) { + this.key = key; + this.expectedSnippets = expectedSnippets; + this.extractedSnippets = extractedSnippets; + this.truePositiveSnippets = truePositiveSnippets; + } + } + + public final int expectedPairs; + public final int extractedPairs; + public final int truePositives; + public final int falsePositives; + public final int falseNegatives; + + public final double precision; + public final double recall; + public final double f1; + + public final int distinctEntriesExpected; + public final int distinctEntriesAnnotated; + public final double avgSnippetsPerAnnotatedEntry; + + public final List perEntry; + + public RelatedWorkMetrics(int expectedPairs, + int extractedPairs, + int truePositives, + int falsePositives, + int falseNegatives, + int distinctEntriesExpected, + int distinctEntriesAnnotated, + double avgSnippetsPerAnnotatedEntry, + List perEntry) { + + this.expectedPairs = expectedPairs; + this.extractedPairs = extractedPairs; + this.truePositives = truePositives; + this.falsePositives = falsePositives; + this.falseNegatives = falseNegatives; + this.distinctEntriesExpected = distinctEntriesExpected; + this.distinctEntriesAnnotated = distinctEntriesAnnotated; + this.avgSnippetsPerAnnotatedEntry = avgSnippetsPerAnnotatedEntry; + + this.precision = (truePositives + falsePositives) == 0 + ? 0.0 + : (double) truePositives / (truePositives + falsePositives); + this.recall = (truePositives + falseNegatives) == 0 + ? 0.0 + : (double) truePositives / (truePositives + falseNegatives); + this.f1 = (precision + recall) == 0 + ? 
0.0 + : (2.0 * precision * recall) / (precision + recall); + + this.perEntry = Collections.unmodifiableList(new ArrayList<>(perEntry)); + } + + public String pretty() { + double coverage = distinctEntriesExpected == 0 + ? 0.0 + : 100.0 * distinctEntriesAnnotated / distinctEntriesExpected; + return String.format(Locale.ROOT, + "RelatedWork Metrics:%n" + + " Pairs — expected=%d, extracted=%d, TP=%d, FP=%d, FN=%d%n" + + " Scores — precision=%.3f, recall=%.3f, F1=%.3f%n" + + " Coverage — entries_expected=%d, entries_annotated=%d (%.1f%%), " + + "avg_snippets/annotated_entry=%.2f", + expectedPairs, extractedPairs, truePositives, falsePositives, falseNegatives, + precision, recall, f1, + distinctEntriesExpected, distinctEntriesAnnotated, coverage, + avgSnippetsPerAnnotatedEntry); + } + + /** + * Builds per-entry summaries from confusion sets. + */ + static List perEntryFrom(Map> expectedByKey, + Map> extractedByKey, + Map> matchedIdxByKey) { + + Set allKeys = new HashSet<>(); + allKeys.addAll(expectedByKey.keySet()); + allKeys.addAll(extractedByKey.keySet()); + + return allKeys.stream() + .sorted() + .map(k -> new PerEntry( + k, + expectedByKey.getOrDefault(k, List.of()).size(), + extractedByKey.getOrDefault(k, List.of()).size(), + matchedIdxByKey.getOrDefault(k, Set.of()).size())) + .collect(Collectors.toList()); + } +} diff --git a/jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkPdfPipeline.java b/jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkPdfPipeline.java new file mode 100644 index 00000000000..1ef915a6507 --- /dev/null +++ b/jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkPdfPipeline.java @@ -0,0 +1,65 @@ +package org.jabref.logic.importer.relatedwork; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; + +import org.jabref.logic.importer.RelatedWorkAnnotator; +import 
org.jabref.model.entry.BibEntry; + +/** + * End-to-end helper for callers that already have candidate entries: + * PDF -> section -> extractor -> annotator. + */ +public final class RelatedWorkPdfPipeline { + + private final PdfRelatedWorkTextExtractor pdfSectionExtractor; + private final HeuristicRelatedWorkExtractor extractor; + private final RelatedWorkAnnotator annotator; + + public RelatedWorkPdfPipeline(PdfRelatedWorkTextExtractor pdfSectionExtractor, + HeuristicRelatedWorkExtractor extractor, + RelatedWorkAnnotator annotator) { + this.pdfSectionExtractor = Objects.requireNonNull(pdfSectionExtractor); + this.extractor = Objects.requireNonNull(extractor); + this.annotator = Objects.requireNonNull(annotator); + } + + /** + * @return number of annotations appended across all matched entries + */ + public int run(Path citingPdf, + List candidateEntries, + String citingKey, + String username) throws IOException { + + return pdfSectionExtractor.extractRelatedWorkSection(citingPdf) + .map(section -> { + // 1) Extract citationKey -> snippet + Map snippets = extractor.extract(section, candidateEntries); + + // 2) Index candidates by citation key + Map byKey = new HashMap<>(); + for (BibEntry be : candidateEntries) { + be.getCitationKey().ifPresent(k -> byKey.put(k, be)); + } + + // 3) Append to matching entries + int appended = 0; + for (Map.Entry e : snippets.entrySet()) { + BibEntry target = byKey.get(e.getKey()); + if (target != null) { + // Adjust arg order to match your actual method signature if needed: + // appendSummaryToEntry(target, citingKey, summary, username) + annotator.appendSummaryToEntry(target, citingKey, e.getValue(), username); + appended++; + } + } + return appended; + }) + .orElse(0); + } +} diff --git a/jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkPipeline.java b/jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkPipeline.java new file mode 100644 index 00000000000..6c02f2e3d9f --- /dev/null +++ 
b/jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkPipeline.java @@ -0,0 +1,51 @@ +package org.jabref.logic.importer.relatedwork; + +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Optional; + +import org.jabref.model.entry.BibEntry; + +/** + * Wires the section locator + extractor: + * - finds the Related Work body in the full text + * - extracts per-citation snippets from that section + */ +public final class RelatedWorkPipeline { + + private final RelatedWorkSectionLocator locator; + private final HeuristicRelatedWorkExtractor extractor; + + public RelatedWorkPipeline(HeuristicRelatedWorkExtractor extractor) { + this.locator = new RelatedWorkSectionLocator(); + this.extractor = extractor; + } + + /** + * Full end-to-end step: locate section, then extract snippets. + * + * @param fullText entire plain-text of the paper + * @param candidateEntries entries we might cite + * @return map: citationKey -> extracted snippet + */ + public Map run(String fullText, List candidateEntries) { + if (fullText == null || fullText.isEmpty()) { + return Collections.emptyMap(); + } + + // Use the static helper to find the section span + Optional opt = + RelatedWorkSectionLocator.locateStatic(fullText); + + if (opt.isEmpty()) { + return Collections.emptyMap(); + } + + RelatedWorkSectionLocator.SectionSpan span = opt.get(); + String sectionText = fullText.substring(span.startOffset, span.endOffset); + + // The extractor expects a String for the section body + return extractor.extract(sectionText, candidateEntries); + } +} diff --git a/jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkPluginConfig.java b/jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkPluginConfig.java new file mode 100644 index 00000000000..0889b3528da --- /dev/null +++ b/jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkPluginConfig.java @@ -0,0 +1,65 @@ +package 
org.jabref.logic.importer.relatedwork;
+
+import java.util.Objects;
+
+/**
+ * Configuration holder for the Related Work plug-in.
+ * <p>
+ * For now it carries:
+ * - a flag indicating whether AI summarization is enabled
+ * - an optional {@link RelatedWorkSummarizer} implementation
+ * <p>
+ * If summarizationEnabled is false or the summarizer is null, the pipeline
+ * should behave as if AI summarization is off.
+ */
+public final class RelatedWorkPluginConfig {
+
+    private final boolean summarizationEnabled;
+    private final RelatedWorkSummarizer summarizer;
+
+    // Instances are created only through the Builder; both values are copied from it unchanged.
+    private RelatedWorkPluginConfig(Builder builder) {
+        this.summarizationEnabled = builder.summarizationEnabled;
+        this.summarizer = builder.summarizer;
+    }
+
+    /**
+     * @return the flag set through the builder; {@code false} if it was never set
+     */
+    public boolean isSummarizationEnabled() {
+        return summarizationEnabled;
+    }
+
+    /**
+     * @return the configured summarizer, or {@code null} if none was configured. The value is
+     *         independent of {@link #isSummarizationEnabled()}; callers have to check both.
+     */
+    public RelatedWorkSummarizer getSummarizer() {
+        return summarizer;
+    }
+
+    /**
+     * @return a new builder with summarization disabled and no summarizer
+     */
+    public static Builder builder() {
+        return new Builder();
+    }
+
+    public static final class Builder {
+
+        private boolean summarizationEnabled;
+        private RelatedWorkSummarizer summarizer;
+
+        /**
+         * Enable or disable AI summarization at the configuration level.
+         */
+        public Builder enableSummarization(boolean enabled) {
+            this.summarizationEnabled = enabled;
+            return this;
+        }
+
+        /**
+         * Configure the summarizer implementation to use when summarization is enabled.
+ */ + public Builder withSummarizer(RelatedWorkSummarizer summarizer) { + this.summarizer = Objects.requireNonNull(summarizer); + return this; + } + + public RelatedWorkPluginConfig build() { + return new RelatedWorkPluginConfig(this); + } + } +} diff --git a/jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkSectionLocator.java b/jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkSectionLocator.java new file mode 100644 index 00000000000..17565f3340c --- /dev/null +++ b/jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkSectionLocator.java @@ -0,0 +1,171 @@ +package org.jabref.logic.importer.relatedwork; + +import java.util.Optional; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Finds the "Related Work" (or variants) section and returns its text span. + */ +public final class RelatedWorkSectionLocator { + + /** + * Immutable span describing the located section. + */ + public static final class SectionSpan { + public final int headerStart; + public final int headerEnd; + public final int startOffset; // body start (after header line break) + public final int endOffset; // body end (before next header or EOF) + public final String headerText; + + public SectionSpan(int headerStart, int headerEnd, int startOffset, int endOffset, String headerText) { + this.headerStart = headerStart; + this.headerEnd = headerEnd; + this.startOffset = startOffset; + this.endOffset = endOffset; + this.headerText = headerText; + } + } + + // Header patterns (case-insensitive), allowing optional numbering like "2", "2.1", or Roman numerals. + private static final Pattern HEADER_PATTERN = Pattern.compile( + "^(?:\\s*(?:\\d+(?:\\.\\d+)*|[IVXLCDM]+)\\.?\\s+)?" + + "(?:RELATED\\s+WORKS?" 
+ + "|BACKGROUND\\s+AND\\s+RELATED\\s+WORK" + + "|LITERATURE\\s+REVIEW" + + "|STATE\\s+OF\\s+THE\\s+ART)" + + "\\s*$", + Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE | Pattern.MULTILINE + ); + + // Fragments to help detect a *generic* next header in a line-by-line scan. + private static final Pattern OPT_NUMBERING = Pattern.compile("^\\s*(?:\\d+(?:\\.\\d+)*|[IVXLCDM]+)\\.?\\s+"); + private static final Pattern ALL_CAPS_BODY = Pattern.compile("^[A-Z][A-Z \\-]{2,}$"); + private static final Pattern TITLE_CASE_WORD = Pattern.compile("[A-Z][\\p{L}\\p{M}\\-]+"); + + public RelatedWorkSectionLocator() { + } + + /** + * Instance entry point (delegates to static). + */ + public Optional locate(String text) { + return locateStatic(text); + } + + /** + * Static entry point for convenience in callers/tests. + */ + public static Optional locateStatic(String text) { + if (text == null || text.isEmpty()) { + return Optional.empty(); + } + + Matcher headerMatcher = HEADER_PATTERN.matcher(text); + if (!headerMatcher.find()) { + return Optional.empty(); + } + + int headerStart = headerMatcher.start(); + int headerEnd = headerMatcher.end(); + + // Body starts right after the header line break(s) + int startOffset = headerEnd; + if (startOffset < text.length() && text.charAt(startOffset) == '\r') { + startOffset++; + } + if (startOffset < text.length() && text.charAt(startOffset) == '\n') { + startOffset++; + } + + // Determine end by scanning subsequent lines until a "generic header" is found. + int endOffset = findNextHeaderBoundary(text, startOffset); + + // Trim trailing whitespace from the section body. 
+ while (endOffset > startOffset && Character.isWhitespace(text.charAt(endOffset - 1))) { + endOffset--; + } + + String headerText = extractHeaderLine(text, headerStart, headerEnd); + return Optional.of(new SectionSpan(headerStart, headerEnd, startOffset, endOffset, headerText)); + } + + private static int findNextHeaderBoundary(String text, int startFrom) { + int pos = startFrom; + final int n = text.length(); + + while (pos < n) { + int lineEnd = indexOfNewline(text, pos); + String line = text.substring(pos, lineEnd); + + if (looksLikeHeader(line)) { + return pos; // cut section before this header + } + pos = (lineEnd < n) ? lineEnd + 1 : n; // move to next line (skip '\n') + } + return n; // no later header; section runs to EOF + } + + private static boolean looksLikeHeader(String rawLine) { + String line = rawLine.strip(); + if (line.isEmpty()) { + return false; + } + + // Remove optional numbering prefix for header-shape checks + String core = stripNumbering(line); + + // 1) ALL CAPS headers (e.g., "3 METHODS", "RESULTS") + if (ALL_CAPS_BODY.matcher(core).matches()) { + return true; + } + + // 2) Title-Case short headers: 1–6 words, each capitalized + // Avoid sentences by requiring few words and no trailing punctuation. + if (core.length() <= 80 && !endsWithPunctuation(core)) { + String[] parts = core.split("\\s+"); + if (parts.length >= 1 && parts.length <= 6) { + int titleLike = 0; + for (String p : parts) { + if (TITLE_CASE_WORD.matcher(p).matches()) { + titleLike++; + } + } + if (titleLike == parts.length) { + return true; + } + } + } + + return false; + } + + private static String stripNumbering(String s) { + Matcher m = OPT_NUMBERING.matcher(s); + return m.find() ? s.substring(m.end()).stripLeading() : s; + } + + private static boolean endsWithPunctuation(String s) { + if (s.isEmpty()) { + return false; + } + char c = s.charAt(s.length() - 1); + return c == '.' || c == ':' || c == ';' || c == '!' 
|| c == '?'; + } + + private static int indexOfNewline(String text, int from) { + int idx = text.indexOf('\n', from); + return (idx == -1) ? text.length() : idx; + } + + private static String extractHeaderLine(String text, int headerStart, int headerEnd) { + int lineStart = text.lastIndexOf('\n', Math.max(0, headerStart - 1)); + lineStart = (lineStart == -1) ? 0 : lineStart + 1; + int lineEnd = text.indexOf('\n', headerEnd); + if (lineEnd == -1) { + lineEnd = text.length(); + } + return text.substring(lineStart, lineEnd).trim(); + } +} diff --git a/jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkSummarizer.java b/jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkSummarizer.java new file mode 100644 index 00000000000..7f6b807e09a --- /dev/null +++ b/jablib/src/main/java/org/jabref/logic/importer/relatedwork/RelatedWorkSummarizer.java @@ -0,0 +1,33 @@ +package org.jabref.logic.importer.relatedwork; + +import java.util.List; +import java.util.Optional; + +import org.jabref.model.entry.BibEntry; + +/** + * SPI for optional AI-assisted summarization of related-work snippets. + *

+ * Implementations receive: + * - a list of sentence-like snippets that all describe the same cited paper + * - the target BibEntry (for context: key, title, etc.) + * - a soft maximum length for the resulting summary (in characters) + *

+ * They may return: + * - Optional.of(summary) if a useful, concise summary can be produced + * - Optional.empty() to signal "no summary / fall back to raw snippets" + */ +@FunctionalInterface +public interface RelatedWorkSummarizer { + + /** + * Produce a concise description of a cited paper based on related-work snippets. + * + * @param snippets sentence-like fragments taken from the citing paper's related-work section + * @param entry the BibEntry corresponding to the cited paper + * @param maxLen soft limit on the resulting description length (in characters); implementations are free + * to interpret this as a guideline rather than a strict cutoff + * @return optional summary text + */ + Optional summarize(List snippets, BibEntry entry, int maxLen); +} diff --git a/jablib/src/test/java/org/jabref/logic/importer/RelatedWorkAnnotatorTest.java b/jablib/src/test/java/org/jabref/logic/importer/RelatedWorkAnnotatorTest.java new file mode 100644 index 00000000000..edf6949b0c8 --- /dev/null +++ b/jablib/src/test/java/org/jabref/logic/importer/RelatedWorkAnnotatorTest.java @@ -0,0 +1,50 @@ +package org.jabref.logic.importer; + +import org.jabref.model.entry.BibEntry; +import org.jabref.model.entry.field.Field; +import org.jabref.model.entry.field.FieldFactory; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class RelatedWorkAnnotatorTest { + + @Test + void appendsToEmpty() { + BibEntry e = new BibEntry(); + e.setCitationKey("X"); + + RelatedWorkAnnotator.appendSummaryToEntry( + e, + "koppor", + "LunaOstos_2024", + "Colombia is a middle-income country" + ); + + Field commentField = FieldFactory.parseField("comment-koppor"); + String v = e.getField(commentField).orElse(""); + + assertTrue(v.startsWith("[LunaOstos_2024]: Colombia is a middle-income country")); + assertTrue(v.endsWith(".")); + } + + @Test + void appendsWithBlankLine() { + BibEntry e = new BibEntry(); + e.setCitationKey("X"); + + Field 
commentField = FieldFactory.parseField("comment-koppor"); + e.setField(commentField, "Existing text."); + + RelatedWorkAnnotator.appendSummaryToEntry( + e, + "koppor", + "LunaOstos_2024", + "New sentence" + ); + + String v = e.getField(commentField).orElse(""); + assertTrue(v.contains("Existing text.\n\n[LunaOstos_2024]: New sentence.")); + } +} diff --git a/jablib/src/test/java/org/jabref/logic/importer/relatedwork/HeuristicRelatedWorkExtractorTest.java b/jablib/src/test/java/org/jabref/logic/importer/relatedwork/HeuristicRelatedWorkExtractorTest.java new file mode 100644 index 00000000000..cb8d718d7ac --- /dev/null +++ b/jablib/src/test/java/org/jabref/logic/importer/relatedwork/HeuristicRelatedWorkExtractorTest.java @@ -0,0 +1,72 @@ +package org.jabref.logic.importer.relatedwork; + +import java.util.List; +import java.util.Map; + +import org.jabref.model.entry.BibEntry; +import org.jabref.model.entry.field.StandardField; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class HeuristicRelatedWorkExtractorTest { + + private BibEntry entry(String key, String author, String year) { + BibEntry b = new BibEntry(); + b.setCitationKey(key); + b.setField(StandardField.AUTHOR, author); + b.setField(StandardField.YEAR, year); + return b; + } + + @Test + void singleCitationExtracts() { + String text = """ + 1.4 Related work + Prior literature reports similar findings (Vesce et al., 2016). Additional details follow. 
+ """; + + BibEntry vesce = entry("Vesce2016Key", "Vesce, A.", "2016"); + HeuristicRelatedWorkExtractor ex = new HeuristicRelatedWorkExtractor(); + + Map out = ex.extract(text, List.of(vesce)); + assertEquals(1, out.size()); + assertTrue(out.containsKey("Vesce2016Key")); + assertTrue(out.get("Vesce2016Key").contains("similar findings")); + } + + @Test + void multiCitationBlockExtractsAll() { + String text = """ + RELATED WORK + Approaches vary by context (Bianchi, 2021; López & Perez 2020; Doe et al. 2019a), yet converge later. + """; + + BibEntry bianchi = entry("Bianchi2021", "Bianchi, M.", "2021"); + BibEntry lopez = entry("Lopez2020", "López and Perez", "2020"); + BibEntry doe = entry("Doe2019", "Doe and Others", "2019"); + + HeuristicRelatedWorkExtractor ex = new HeuristicRelatedWorkExtractor(); + Map out = ex.extract(text, List.of(bianchi, lopez, doe)); + + assertEquals(3, out.size()); + assertTrue(out.get("Bianchi2021").endsWith(".")); + assertTrue(out.get("Lopez2020").contains("Approaches vary")); + assertTrue(out.get("Doe2019").contains("Approaches vary")); + } + + @Test + void diacriticsAreNormalized() { + String text = """ + Related work + See also prior synthesis (Šimić, 2022). 
+ """; + + BibEntry simic = entry("Simic2022", "Šimić, Ana", "2022"); + HeuristicRelatedWorkExtractor ex = new HeuristicRelatedWorkExtractor(); + Map out = ex.extract(text, List.of(simic)); + assertTrue(out.containsKey("Simic2022")); + } +} diff --git a/jablib/src/test/java/org/jabref/logic/importer/relatedwork/LangChainRelatedWorkSummarizerTest.java b/jablib/src/test/java/org/jabref/logic/importer/relatedwork/LangChainRelatedWorkSummarizerTest.java new file mode 100644 index 00000000000..1c41ed3dd45 --- /dev/null +++ b/jablib/src/test/java/org/jabref/logic/importer/relatedwork/LangChainRelatedWorkSummarizerTest.java @@ -0,0 +1,90 @@ +package org.jabref.logic.importer.relatedwork; + +import java.util.List; +import java.util.Optional; +import java.util.concurrent.atomic.AtomicInteger; + +import org.jabref.model.entry.BibEntry; +import org.jabref.model.entry.field.StandardField; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.fail; + +public class LangChainRelatedWorkSummarizerTest { + + private BibEntry sampleEntry() { + BibEntry e = new BibEntry(); + e.setCitationKey("LunaOstos_2024"); + e.setField(StandardField.TITLE, "Social Life Cycle Assessment in the Chocolate Industry"); + return e; + } + + @Test + public void delegatesToClientAndBuildsPromptFromSnippets() { + AtomicInteger calls = new AtomicInteger(0); + + LangChainRelatedWorkSummarizer.Client fakeClient = prompt -> { + calls.incrementAndGet(); + + // Very lightweight checks: prompt should contain key / title / snippets. 
+ assertTrue(prompt.contains("LunaOstos_2024")); + assertTrue(prompt.contains("Social Life Cycle Assessment in the Chocolate Industry")); + assertTrue(prompt.contains("first snippet")); + assertTrue(prompt.contains("second snippet")); + + return "compressed summary"; + }; + + LangChainRelatedWorkSummarizer summarizer = + new LangChainRelatedWorkSummarizer(fakeClient); + + Optional result = summarizer.summarize( + List.of("first snippet", "second snippet"), + sampleEntry(), + 200 + ); + + assertTrue(result.isPresent()); + assertEquals("compressed summary", result.get()); + assertEquals(1, calls.get()); + } + + @Test + public void returnsEmptyWhenNoSnippets() { + LangChainRelatedWorkSummarizer.Client neverCalledClient = prompt -> { + fail("Client should not be called when there are no snippets"); + return ""; + }; + + LangChainRelatedWorkSummarizer summarizer = + new LangChainRelatedWorkSummarizer(neverCalledClient); + + Optional result = summarizer.summarize( + List.of(), + sampleEntry(), + 200 + ); + + assertFalse(result.isPresent()); + } + + @Test + public void trimsEmptyModelOutputToEmptyOptional() { + LangChainRelatedWorkSummarizer.Client emptyClient = prompt -> " "; + + LangChainRelatedWorkSummarizer summarizer = + new LangChainRelatedWorkSummarizer(emptyClient); + + Optional result = summarizer.summarize( + List.of("some snippet"), + sampleEntry(), + 200 + ); + + assertFalse(result.isPresent()); + } +} diff --git a/jablib/src/test/java/org/jabref/logic/importer/relatedwork/PdfRelatedWorkTextExtratorTest.java b/jablib/src/test/java/org/jabref/logic/importer/relatedwork/PdfRelatedWorkTextExtratorTest.java new file mode 100644 index 00000000000..d8c32ed6e47 --- /dev/null +++ b/jablib/src/test/java/org/jabref/logic/importer/relatedwork/PdfRelatedWorkTextExtratorTest.java @@ -0,0 +1,113 @@ +package org.jabref.logic.importer.relatedwork; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Optional; + +import 
org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +class PdfRelatedWorkTextExtractorTest { + + // Fake provider for tests (no real PDF parsing) + static class FakePdfTextProvider implements PdfTextProvider { + private final Optional toReturn; + private final boolean throwIo; + + FakePdfTextProvider(Optional toReturn, boolean throwIo) { + this.toReturn = toReturn; + this.throwIo = throwIo; + } + + @Override + public Optional extractPlainText(Path pdf) throws IOException { + if (throwIo) { + throw new IOException("boom"); + } + return toReturn; + } + } + + @TempDir + Path tmp; + + @Test + void returnsSectionWhenHeaderPresent() throws Exception { + String full = String.join("\n", + "1 Introduction", + "blah", + "2 Related Work", + "Prior studies (Smith, 2020).", + "More (Doe, 2022).", + "3 Methods", + "…" + ); + + Path fakePdf = Files.createFile(tmp.resolve("paper.pdf")); + PdfTextProvider provider = new FakePdfTextProvider(Optional.of(full), false); + PdfRelatedWorkTextExtractor adapter = new PdfRelatedWorkTextExtractor( + provider, + new RelatedWorkSectionLocator() + ); + + Optional section = adapter.extractRelatedWorkSection(fakePdf); + assertTrue(section.isPresent()); + assertTrue(section.get().contains("(Smith, 2020)")); + assertFalse(section.get().contains("3 Methods")); + } + + @Test + void emptyWhenNoHeader() throws Exception { + String full = String.join("\n", + "1 Introduction", + "No related work header here.", + "2 Methods" + ); + + Path fakePdf = Files.createFile(tmp.resolve("no-related.pdf")); + PdfRelatedWorkTextExtractor adapter = new PdfRelatedWorkTextExtractor( + new FakePdfTextProvider(Optional.of(full), false), + new RelatedWorkSectionLocator() + ); + + assertTrue(adapter.extractRelatedWorkSection(fakePdf).isEmpty()); + } + + @Test + void 
emptyWhenProviderReturnsEmpty() throws Exception { + Path fakePdf = Files.createFile(tmp.resolve("empty.pdf")); + PdfRelatedWorkTextExtractor adapter = new PdfRelatedWorkTextExtractor( + new FakePdfTextProvider(Optional.empty(), false), + new RelatedWorkSectionLocator() + ); + + assertTrue(adapter.extractRelatedWorkSection(fakePdf).isEmpty()); + } + + @Test + void throwsOnIoError() throws Exception { + Path fakePdf = Files.createFile(tmp.resolve("ioerr.pdf")); + PdfRelatedWorkTextExtractor adapter = new PdfRelatedWorkTextExtractor( + new FakePdfTextProvider(Optional.empty(), true), + new RelatedWorkSectionLocator() + ); + + assertThrows(IOException.class, () -> adapter.extractRelatedWorkSection(fakePdf)); + } + + @Test + void throwsOnNonFilePath() { + PdfRelatedWorkTextExtractor adapter = new PdfRelatedWorkTextExtractor( + new FakePdfTextProvider(Optional.empty(), false), + new RelatedWorkSectionLocator() + ); + + assertThrows(IllegalArgumentException.class, + () -> adapter.extractRelatedWorkSection(tmp.resolve("missing.pdf"))); + } +} diff --git a/jablib/src/test/java/org/jabref/logic/importer/relatedwork/RelatedWorkHarvesterTest.java b/jablib/src/test/java/org/jabref/logic/importer/relatedwork/RelatedWorkHarvesterTest.java new file mode 100644 index 00000000000..6ba05c0385a --- /dev/null +++ b/jablib/src/test/java/org/jabref/logic/importer/relatedwork/RelatedWorkHarvesterTest.java @@ -0,0 +1,67 @@ +package org.jabref.logic.importer.relatedwork; + +import java.util.ArrayList; +import java.util.List; + +import org.jabref.model.entry.BibEntry; +import org.jabref.model.entry.field.Field; +import org.jabref.model.entry.field.FieldFactory; +import org.jabref.model.entry.field.StandardField; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class RelatedWorkHarvesterTest { + private BibEntry entry(String key, String author, String year) { + BibEntry 
b = new BibEntry(); + b.setCitationKey(key); + b.setField(StandardField.AUTHOR, author); + b.setField(StandardField.YEAR, year); + return b; + } + + @Test + void harvestEndToEnd() { + String text = """ + 1.4 Related work + Population estimates vary across sources (CIA, 2021). See also (Nash 2022). + """; + + List<BibEntry> lib = new ArrayList<>(); + lib.add(entry("Agency2021", "Central Intelligence Agency", "2021")); + lib.add(entry("Nash2022", "Nash, T.", "2022")); + + HeuristicRelatedWorkExtractor ex = new HeuristicRelatedWorkExtractor(); + RelatedWorkHarvester harvester = new RelatedWorkHarvester(ex); + + // Add/update function with explicit braces (Checkstyle NeedBraces) + int updated = harvester.harvestAndAnnotateCount( + "koppor", + "LunaOstos_2024", + text, + lib, + b -> { + if (!lib.contains(b)) { + lib.add(b); + } + } + ); + + assertEquals(2, updated); + + Field commentField = FieldFactory.parseField("comment-koppor"); + boolean agencyAnnotated = lib.stream().anyMatch(b -> + "Agency2021".equals(b.getCitationKey().orElse("")) + && b.getField(commentField).orElse("").contains("[LunaOstos_2024]:") + ); + boolean nashAnnotated = lib.stream().anyMatch(b -> + "Nash2022".equals(b.getCitationKey().orElse("")) + && b.getField(commentField).orElse("").contains("[LunaOstos_2024]:") + ); + + assertTrue(agencyAnnotated); + assertTrue(nashAnnotated); + } +} diff --git a/jablib/src/test/java/org/jabref/logic/importer/relatedwork/RelatedWorkMetricsTest.java b/jablib/src/test/java/org/jabref/logic/importer/relatedwork/RelatedWorkMetricsTest.java new file mode 100644 index 00000000000..a28800c256b --- /dev/null +++ b/jablib/src/test/java/org/jabref/logic/importer/relatedwork/RelatedWorkMetricsTest.java @@ -0,0 +1,62 @@ +package org.jabref.logic.importer.relatedwork; + +import java.util.List; + +import org.jabref.model.entry.BibEntry; +import org.jabref.model.entry.field.StandardField; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertTrue; 
+ +public class RelatedWorkMetricsTest { + + private static BibEntry be(String author, String year) { + BibEntry e = new BibEntry(); + e.setField(StandardField.AUTHOR, author); + e.setField(StandardField.YEAR, year); + // Add a citation key so the adapter can match it + e.setCitationKey(author.split(",")[0].replaceAll("\\s+", "") + year); + return e; + } + + @Test + public void evaluateFixtureInline() { + // 1. Inline Related Work text (shortened example) + String relatedWorkText = """ + Existing environmental LCAs include Italian chocolate production (Vesce et al., 2016), + a comparison of milk and white chocolate (Bianchi et al., 2021), + chocolate production and consumption in the UK (Konstantas et al., 2018), + and dark chocolate cradle-to-grave (Recanati et al., 2018). + """; + + // 2. Inline expected matches + List<RelatedWorkFixture.Expectation> expectations = List.of( + new RelatedWorkFixture.Expectation("Vesce", "2016", "Italian chocolate production"), + new RelatedWorkFixture.Expectation("Bianchi", "2021", "milk and white chocolate"), + new RelatedWorkFixture.Expectation("Konstantas", "2018", "production and consumption in the UK"), + new RelatedWorkFixture.Expectation("Recanati", "2018", "dark chocolate cradle-to-grave") + ); + + // 3. Build fixture object directly + RelatedWorkFixture fx = new RelatedWorkFixture("inline-fixture", relatedWorkText, expectations); + + // 4. Candidate BibEntries (with citation keys) + List<BibEntry> candidates = List.of( + be("Vesce, E.; Olivieri, G.; Pairotti, M. B.", "2016"), + be("Bianchi, F. R.; Moreschi, L.; Gallo, M.", "2021"), + be("Recanati, F.; Marveggio, D.; Dotelli, G.", "2018"), + be("Konstantas, A.; Jeswani, H. K.; Stamford, L.; Azapagic, A.", "2018") + ); + + // 5. 
Run extractor via adapter + evaluation runner + HeuristicRelatedWorkExtractor extractor = new HeuristicRelatedWorkExtractor(); + RelatedWorkEvaluationRunner runner = new RelatedWorkEvaluationRunner(new HeuristicExtractorAdapter(extractor)); + RelatedWorkMetrics metrics = runner.run(fx, candidates); + + System.out.println(metrics.pretty()); + + // Loose sanity check + assertTrue(metrics.recall >= 0.5, "recall should be >= 0.5 on the inline snippet set"); + } +} diff --git a/jablib/src/test/java/org/jabref/logic/importer/relatedwork/RelatedWorkSectionLocatorTest.java b/jablib/src/test/java/org/jabref/logic/importer/relatedwork/RelatedWorkSectionLocatorTest.java new file mode 100644 index 00000000000..9f8a22ea006 --- /dev/null +++ b/jablib/src/test/java/org/jabref/logic/importer/relatedwork/RelatedWorkSectionLocatorTest.java @@ -0,0 +1,86 @@ +package org.jabref.logic.importer.relatedwork; + +import java.util.Optional; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +final class RelatedWorkSectionLocatorTest { + + @Test + void findsRelatedWorkHeader_UppercaseAndStopsAtNextHeader() { + String txt = """ + 1 Introduction + Some intro text. + + 2 RELATED WORK + Prior studies showed X and Y. + Even more prior-art discussion. + + 3 Methods + Details here. 
+ """; + + Optional<RelatedWorkSectionLocator.SectionSpan> opt = RelatedWorkSectionLocator.locateStatic(txt); + assertTrue(opt.isPresent()); + + RelatedWorkSectionLocator.SectionSpan span = opt.get(); + String sectionBody = txt.substring(span.startOffset, span.endOffset); + String header = headerLineOf(txt, span.startOffset); + + // Header recognition + assertTrue(header.contains("RELATED WORK")); + + // Body includes expected text + assertTrue(sectionBody.contains("Prior studies showed X and Y.")); + + // Cut off at next header + assertEquals(-1, sectionBody.indexOf("3 Methods")); + assertEquals(-1, sectionBody.indexOf("3 METHODS")); + } + + @Test + void supportsLiteratureReviewVariant_UppercaseAndStopsAtNextHeader() { + String txt = """ + Background + Setup text. + + 2.1 LITERATURE REVIEW + We review A and B. + More discussion. + + 3 RESULTS + Data stuff. + """; + + Optional<RelatedWorkSectionLocator.SectionSpan> opt = RelatedWorkSectionLocator.locateStatic(txt); + assertTrue(opt.isPresent()); + + RelatedWorkSectionLocator.SectionSpan span = opt.get(); + String sectionBody = txt.substring(span.startOffset, span.endOffset); + String header = headerLineOf(txt, span.startOffset); + + // Header recognition (variant) + assertTrue(header.contains("LITERATURE REVIEW")); + + // Body should include first content line after the header + assertTrue(sectionBody.contains("We review A and B.")); + + // Ensure it cut off before the next section header + assertEquals(-1, sectionBody.indexOf("3 RESULTS")); + } + + /** + * Helper: derive the header line immediately preceding the section body start. 
+ */ + private static String headerLineOf(String text, int bodyStartOffset) { + int nlBeforeBody = text.lastIndexOf('\n', Math.max(0, bodyStartOffset - 1)); + if (nlBeforeBody < 0) { + return text.substring(0, bodyStartOffset).trim(); + } + int nlBeforeHeader = text.lastIndexOf('\n', Math.max(0, nlBeforeBody - 1)); + return text.substring(nlBeforeHeader + 1, nlBeforeBody).trim(); + } +} From 398df2bcd0cedd6e851f7e9f996e922184c1b8dc Mon Sep 17 00:00:00 2001 From: jsochava <144294623+jsochava@users.noreply.github.com> Date: Fri, 21 Nov 2025 16:59:55 -0800 Subject: [PATCH 2/2] Just to get CI in the green(#14085) --- .../importer/relatedwork/HeuristicRelatedWorkExtractor.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jablib/src/main/java/org/jabref/logic/importer/relatedwork/HeuristicRelatedWorkExtractor.java b/jablib/src/main/java/org/jabref/logic/importer/relatedwork/HeuristicRelatedWorkExtractor.java index 20426401482..33c54a04359 100644 --- a/jablib/src/main/java/org/jabref/logic/importer/relatedwork/HeuristicRelatedWorkExtractor.java +++ b/jablib/src/main/java/org/jabref/logic/importer/relatedwork/HeuristicRelatedWorkExtractor.java @@ -15,7 +15,7 @@ /** * Deterministic extractor for author–year style citations in "Related Work" sections. - * Handles single and multi-citation parentheticals, including diacritics and all-caps acronyms (e.g., CIA, Šimić). + * Handles single and multi-citation parentheticals, including diacritics and all-caps acronyms. */ public class HeuristicRelatedWorkExtractor implements RelatedWorkExtractor {