diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/dirtree/DirtreeSideloader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/dirtree/DirtreeSideloader.java index a62bbc378..9964c8fab 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/dirtree/DirtreeSideloader.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/dirtree/DirtreeSideloader.java @@ -71,6 +71,12 @@ private ProcessedDocument process(Path path) { String body = Files.readString(path); String url = urlBase + dirBase.relativize(path); + // If the URL ends with "/index.html", we trim the "index.html" part (keeping the trailing slash), + // since this is typically an artifact from document retrieval + if (url.endsWith("/index.html")) { + url = url.substring(0, url.length() - "index.html".length()); + } + return sideloaderProcessing .processDocument(url, body, extraKeywords, 10_000); }