From 704de50a9b36cecd6ab106f6004a43f3dcedb43d Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 18 Aug 2023 11:54:56 +0200 Subject: [PATCH] (forward-index, valuator) HTML features in valuator Put it in the forward index for easy access during index-side valuation. --- .../results/SearchResultKeywordScore.java | 9 ++- .../index/forward/ForwardIndexConverter.java | 3 +- .../index/forward/ForwardIndexParameters.java | 3 +- .../index/forward/ForwardIndexReader.java | 7 ++ .../journal/model/IndexJournalEntry.java | 10 ++- .../model/IndexJournalEntryBuilder.java | 12 ++- .../model/IndexJournalEntryHeader.java | 15 +++- .../journal/reader/IndexJournalReadEntry.java | 1 + .../writer/IndexJournalWriterImpl.java | 2 +- .../ReverseIndexFullConverterTest2.java | 2 +- .../ReverseIndexPriorityConverterTest2.java | 2 +- .../nu/marginalia/ranking/ResultValuator.java | 73 ++++++++++++++++--- .../ranking/ResultValuatorTest.java | 6 +- .../factors/TermCoherenceFactorTest.java | 2 +- .../converting/instruction/Interpreter.java | 2 +- .../instructions/LoadKeywords.java | 4 +- .../converting/InstructionWriterFactory.java | 2 +- .../compiler/DocumentsCompiler.java | 9 ++- .../converting/ConvertingIntegrationTest.java | 3 +- .../loading/loader/IndexLoadKeywords.java | 19 ++++- .../nu/marginalia/loading/loader/Loader.java | 4 +- .../loader/LoaderIndexJournalWriter.java | 8 +- .../marginalia/index/index/SearchIndex.java | 3 + .../index/index/SearchIndexReader.java | 4 + .../index/results/IndexMetadataService.java | 4 + .../index/results/IndexResultValuator.java | 2 + .../svc/IndexQueryServiceIntegrationTest.java | 4 +- 27 files changed, 167 insertions(+), 48 deletions(-) diff --git a/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultKeywordScore.java b/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultKeywordScore.java index 6b8af8071..77266112b 100644 --- a/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultKeywordScore.java +++ b/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultKeywordScore.java @@ -2,7 +2,6 @@ import nu.marginalia.model.idx.WordFlags; import nu.marginalia.model.idx.WordMetadata; -import nu.marginalia.model.idx.DocumentFlags; import nu.marginalia.model.idx.DocumentMetadata; import java.util.Objects; @@ -14,15 +13,19 @@ public final class SearchResultKeywordScore { private final long encodedDocMetadata; private final boolean hasPriorityTerms; + private final int htmlFeatures; + public SearchResultKeywordScore(int subquery, String keyword, long encodedWordMetadata, long encodedDocMetadata, + int htmlFeatures, boolean hasPriorityTerms) { this.subquery = subquery; this.keyword = keyword; this.encodedWordMetadata = encodedWordMetadata; this.encodedDocMetadata = encodedDocMetadata; + this.htmlFeatures = htmlFeatures; this.hasPriorityTerms = hasPriorityTerms; } @@ -58,6 +61,10 @@ public long encodedDocMetadata() { return encodedDocMetadata; } + public int htmlFeatures() { + return htmlFeatures; + } + public boolean hasPriorityTerms() { return hasPriorityTerms; } diff --git a/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexConverter.java b/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexConverter.java index 07a966f86..4aa083e33 100644 --- a/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexConverter.java +++ b/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexConverter.java @@ -88,8 +88,9 @@ public void convert() throws IOException { int ranking = domainRankings.getRanking(entry.domainId()); long meta = DocumentMetadata.encodeRank(entry.docMeta(), ranking); - docFileData.set(entryOffset + ForwardIndexParameters.METADATA_OFFSET, meta); docFileData.set(entryOffset + ForwardIndexParameters.DOMAIN_OFFSET, entry.domainId()); + docFileData.set(entryOffset + ForwardIndexParameters.METADATA_OFFSET, meta); + docFileData.set(entryOffset + ForwardIndexParameters.FEATURES_OFFSET, entry.header.documentFeatures()); }); progress.progress(TaskSteps.FORCE); diff --git a/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexParameters.java b/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexParameters.java index ca09c4407..f9c17a715 100644 --- a/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexParameters.java +++ b/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexParameters.java @@ -1,8 +1,9 @@ package nu.marginalia.index.forward; class ForwardIndexParameters { - public static final int ENTRY_SIZE = 2; + public static final int ENTRY_SIZE = 3; public static final int DOMAIN_OFFSET = 0; public static final int METADATA_OFFSET = 1; + public static final int FEATURES_OFFSET = 2; } diff --git a/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexReader.java b/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexReader.java index 17c66e079..3bdf14c85 100644 --- a/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexReader.java +++ b/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexReader.java @@ -77,6 +77,13 @@ public long getDocMeta(long docId) { return data.get(ENTRY_SIZE * offset + METADATA_OFFSET); } + public int getHtmlFeatures(long docId) { + long offset = idxForDoc(docId); + if (offset < 0) return 0; + + return (int) data.get(ENTRY_SIZE * offset + FEATURES_OFFSET); + } + public int getDomainId(long docId) { long offset = idxForDoc(docId); if (offset < 0) return 0; diff --git a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntry.java b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntry.java index dd0b8e1b2..c36022660 100644 --- a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntry.java +++ b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntry.java @@ -7,7 +7,7 @@ public record IndexJournalEntry(IndexJournalEntryHeader header, IndexJournalEntryData data) { public static IndexJournalEntryBuilder builder(long documentId, long documentMeta) { - return new IndexJournalEntryBuilder(documentId, documentMeta); + return new IndexJournalEntryBuilder(0, documentId, documentMeta); } public static IndexJournalEntryBuilder builder(int domainId, @@ -15,7 +15,9 @@ public static IndexJournalEntryBuilder builder(int domainId, long documentMeta) { - return builder(new EdgeId<>(domainId), new EdgeId<>(urlId), documentMeta); + return builder(new EdgeId<>(domainId), + new EdgeId<>(urlId), + documentMeta); } public static IndexJournalEntryBuilder builder(EdgeId domainId, @@ -23,6 +25,8 @@ public static IndexJournalEntryBuilder builder(EdgeId domainId, long documentMeta) { - return new IndexJournalEntryBuilder(IndexJournalEntryHeader.combineIds(domainId, urlId), documentMeta); + return new IndexJournalEntryBuilder(0, + IndexJournalEntryHeader.combineIds(domainId, urlId), + documentMeta); } } diff --git a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryBuilder.java b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryBuilder.java index 979af42d0..6bfa19ea4 100644 --- a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryBuilder.java +++ b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryBuilder.java @@ -4,10 +4,15 @@ public class IndexJournalEntryBuilder { private final long documentId; + private final int documentFeatures; private final long documentMeta; private final TLongArrayList items = new TLongArrayList(); - public IndexJournalEntryBuilder(long documentId, long documentMeta) { + public IndexJournalEntryBuilder( + int documentFeatures, + long documentId, + long documentMeta) { + this.documentFeatures = documentFeatures; this.documentId = documentId; this.documentMeta = documentMeta; } @@ -22,7 +27,10 @@ public IndexJournalEntryBuilder add(long wordId, long metadata) { public IndexJournalEntry build() { return new IndexJournalEntry( - new IndexJournalEntryHeader(items.size(), documentId, documentMeta), + new IndexJournalEntryHeader(items.size(), + documentFeatures, + documentId, + documentMeta), new IndexJournalEntryData(items.toArray()) ); } diff --git a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryHeader.java b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryHeader.java index bbc81a17c..657b1685f 100644 --- a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryHeader.java +++ b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryHeader.java @@ -4,10 +4,19 @@ import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.id.EdgeId; -public record IndexJournalEntryHeader(int entrySize, long combinedId, long documentMeta) { +public record IndexJournalEntryHeader(int entrySize, + int documentFeatures, + long combinedId, + long documentMeta) { - public IndexJournalEntryHeader(EdgeId domainId, EdgeId urlId, long documentMeta) { - this(-1, combineIds(domainId, urlId), documentMeta); + public IndexJournalEntryHeader(EdgeId domainId, + int documentFeatures, + EdgeId urlId, + long documentMeta) { + this(-1, + documentFeatures, + combineIds(domainId, urlId), + documentMeta); } static long combineIds(EdgeId domainId, EdgeId urlId) { diff --git a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/reader/IndexJournalReadEntry.java b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/reader/IndexJournalReadEntry.java index 464b14379..00ba3b88d 100644 --- a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/reader/IndexJournalReadEntry.java +++ b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/reader/IndexJournalReadEntry.java @@ -30,6 +30,7 @@ public static IndexJournalReadEntry read(DataInputStream inputStream) throws IOE var header = new IndexJournalEntryHeader( (int) (sizeBlock >>> 32L), + (int) (sizeBlock & 0xFFFF_FFFFL), docId, meta); diff --git a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/writer/IndexJournalWriterImpl.java b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/writer/IndexJournalWriterImpl.java index d39dfb156..c1cec279b 100644 --- a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/writer/IndexJournalWriterImpl.java +++ b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/writer/IndexJournalWriterImpl.java @@ -72,7 +72,7 @@ public synchronized void put(IndexJournalEntryHeader header, IndexJournalEntryDa } dataBuffer.putInt(entry.size()); - dataBuffer.putInt(0); + dataBuffer.putInt(header.documentFeatures()); dataBuffer.putLong(header.combinedId()); dataBuffer.putLong(header.documentMeta()); diff --git a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest2.java b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest2.java index e4c7b7e4b..4da283a0a 100644 --- a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest2.java +++ b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest2.java @@ -106,7 +106,7 @@ long createId(long url, long domain) { } public void createEntry(IndexJournalWriter writer, KeywordLexicon keywordLexicon, int id) { int[] factors = getFactorsI(id); - var header = new IndexJournalEntryHeader(factors.length, createId(id, id/20), id % 5); + var header = new IndexJournalEntryHeader(factors.length, 0, createId(id, id/20), id % 5); long[] data = new long[factors.length*2]; for (int i = 0; i < factors.length; i++) { diff --git a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexPriorityConverterTest2.java b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexPriorityConverterTest2.java index dcd46e22a..a5ad69409 100644 --- a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexPriorityConverterTest2.java +++ b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexPriorityConverterTest2.java @@ -106,7 +106,7 @@ long createId(long url, long domain) { } public void createEntry(IndexJournalWriter writer, KeywordLexicon keywordLexicon, int id) { int[] factors = getFactorsI(id); - var header = new IndexJournalEntryHeader(factors.length, createId(id, id/20), id % 5); + var header = new IndexJournalEntryHeader(factors.length, 0, createId(id, id/20), id % 5); long[] data = new long[factors.length*2]; for (int i = 0; i < factors.length; i++) { diff --git a/code/features-search/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java b/code/features-search/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java index c9bcbab6b..4602044b7 100644 --- a/code/features-search/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java +++ b/code/features-search/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java @@ -5,6 +5,7 @@ import nu.marginalia.index.client.model.results.SearchResultKeywordScore; import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.model.idx.DocumentFlags; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.ranking.factors.*; @@ -48,19 +49,20 @@ public double calculateSearchResultValue(List scores, double bestScore = 10; long documentMetadata = documentMetadata(scores); - + int features = htmlFeatures(scores); var rankingParams = ctx.params; int rank = DocumentMetadata.decodeRank(documentMetadata); int asl = DocumentMetadata.decodeAvgSentenceLength(documentMetadata); int quality = DocumentMetadata.decodeQuality(documentMetadata); - int urlTypePenalty = getUrlTypePenalty(documentMetadata); + int size = DocumentMetadata.decodeSize(documentMetadata); + int flagsPenalty = flagsPenalty(features, documentMetadata & 0xFF, size, quality); int topology = DocumentMetadata.decodeTopology(documentMetadata); int year = DocumentMetadata.decodeYear(documentMetadata); double averageSentenceLengthPenalty = (asl >= rankingParams.shortSentenceThreshold ? 0 : -rankingParams.shortSentencePenalty); - final double qualityPenalty = -quality * rankingParams.qualityPenalty; + final double qualityPenalty = calculateQualityPenalty(size, quality, rankingParams); final double rankingBonus = (255. - rank) * rankingParams.domainRankBonus; final double topologyBonus = Math.log(1 + topology); final double documentLengthPenalty = length > rankingParams.shortDocumentThreshold ? 0 : -rankingParams.shortDocumentPenalty; @@ -80,7 +82,7 @@ public double calculateSearchResultValue(List scores, + rankingBonus + topologyBonus + temporalBias - + urlTypePenalty + + flagsPenalty + priorityTermBonus.calculate(scores); for (int set = 0; set <= sets; set++) { @@ -93,7 +95,8 @@ public double calculateSearchResultValue(List scores, final double bm25 = rankingParams.bm25FullWeight * bm25Factor.calculateBm25(rankingParams.fullParams, keywordSet, length, ctx); final double bm25p = rankingParams.bm25PrioWeight * bm25Factor.calculateBm25Prio(rankingParams.prioParams, keywordSet, ctx); - double score = normalize(bm25 + bm25p + tcf + overallPart, keywordSet.length()); + double nonNormalizedScore = bm25 + bm25p + tcf + overallPart; + double score = normalize(nonNormalizedScore, keywordSet.length()); bestScore = min(bestScore, score); @@ -102,16 +105,55 @@ public double calculateSearchResultValue(List scores, return bestScore; } - private int getUrlTypePenalty(long documentMetadata) { + private double calculateQualityPenalty(int size, int quality, ResultRankingParameters rankingParams) { + if (size < 400) { + if (quality < 5) + return 0; + return -quality * rankingParams.qualityPenalty; + } + else { + return -quality * rankingParams.qualityPenalty * 20; + } + } + + private int flagsPenalty(int featureFlags, long docFlags, int size, double quality) { + + // Short-circuit for index-service, which does not have the feature flags + if (featureFlags == 0) + return 0; - // Long urls-that-look-like-this tend to be poor search results - if (DocumentMetadata.hasFlags(documentMetadata, - HtmlFeature.LONG_URL.getFeatureBit() - | HtmlFeature.KEBAB_CASE_URL.getFeatureBit())) { - return 2; + double penalty = 0; + + boolean isForum = DocumentFlags.GeneratorForum.isPresent(docFlags); + + // Penalize large sites harder for any bullshit as it's a strong signal of a low quality site + double largeSiteFactor = 1.; + + if (!isForum && size > 400) { + // Long urls-that-look-like-this tend to be poor search results + if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.KEBAB_CASE_URL.getFeatureBit())) + penalty += 30.0; + else if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.LONG_URL.getFeatureBit())) + penalty += 30.; + else penalty += 5.; + + largeSiteFactor = 2; } - return 0; + if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING_ADTECH.getFeatureBit())) + penalty += 5.0 * largeSiteFactor; + + if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.AFFILIATE_LINK.getFeatureBit())) + penalty += 5.0 * largeSiteFactor; + + if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING.getFeatureBit())) + penalty += 2.5 * largeSiteFactor; + + if (isForum) { + penalty = Math.min(0, penalty - 2); + } + + return (int) -penalty; } private long documentMetadata(List rawScores) { @@ -121,6 +163,13 @@ private long documentMetadata(List rawScores) { return 0; } + private int htmlFeatures(List rawScores) { + for (var score : rawScores) { + return score.htmlFeatures(); + } + return 0; + } + private ResultKeywordSet createKeywordSet(ValuatorListPool listPool, List rawScores, int thisSet) diff --git a/code/features-search/result-ranking/src/test/java/nu/marginalia/ranking/ResultValuatorTest.java b/code/features-search/result-ranking/src/test/java/nu/marginalia/ranking/ResultValuatorTest.java index 232d59137..a4100e79b 100644 --- a/code/features-search/result-ranking/src/test/java/nu/marginalia/ranking/ResultValuatorTest.java +++ b/code/features-search/result-ranking/src/test/java/nu/marginalia/ranking/ResultValuatorTest.java @@ -40,20 +40,20 @@ public void setUp() { new SearchResultKeywordScore(0, "bob", wordMetadata(Set.of(1), EnumSet.of(WordFlags.Title)), docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)), - false) + 0, false) ); List highCountNoTitleSet = List.of( new SearchResultKeywordScore(0, "bob", wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh)), docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)), - false) + 0, false) ); List highCountSubjectSet = List.of( new SearchResultKeywordScore(0, "bob", wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh, WordFlags.Subjects)), docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)), - false) + 0, false) ); diff --git a/code/features-search/result-ranking/src/test/java/nu/marginalia/ranking/factors/TermCoherenceFactorTest.java b/code/features-search/result-ranking/src/test/java/nu/marginalia/ranking/factors/TermCoherenceFactorTest.java index 66b60d046..a981ba800 100644 --- a/code/features-search/result-ranking/src/test/java/nu/marginalia/ranking/factors/TermCoherenceFactorTest.java +++ b/code/features-search/result-ranking/src/test/java/nu/marginalia/ranking/factors/TermCoherenceFactorTest.java @@ -89,7 +89,7 @@ ResultKeywordSet createSet(long... positionMasks) { for (int i = 0; i < positionMasks.length; i++) { keywords.add(new SearchResultKeywordScore(0, "", - new WordMetadata(positionMasks[i], (byte) 0).encode(), 0, false)); + new WordMetadata(positionMasks[i], (byte) 0).encode(), 0, 0, false)); } return new ResultKeywordSet(keywords); diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/Interpreter.java b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/Interpreter.java index 248ea38de..a7089b9f0 100644 --- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/Interpreter.java +++ b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/Interpreter.java @@ -19,7 +19,7 @@ default void loadProcessedDomain(EdgeDomain domain, DomainIndexingState state, S default void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument) {} default void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError) {} - default void loadKeywords(EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words) {} + default void loadKeywords(EdgeUrl url, int features, DocumentMetadata metadata, DocumentKeywords words) {} default void loadDomainRedirect(DomainLink link) {} diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadKeywords.java b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadKeywords.java index b33103ee1..779ec79c2 100644 --- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadKeywords.java +++ b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadKeywords.java @@ -7,11 +7,11 @@ import nu.marginalia.converting.instruction.Interpreter; import nu.marginalia.model.EdgeUrl; -public record LoadKeywords(EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words) implements Instruction { +public record LoadKeywords(EdgeUrl url, int features, DocumentMetadata metadata, DocumentKeywords words) implements Instruction { @Override public void apply(Interpreter interpreter) { - interpreter.loadKeywords(url, metadata, words); + interpreter.loadKeywords(url, features, metadata, words); } @Override diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/InstructionWriterFactory.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/InstructionWriterFactory.java index 08f842c68..c3683cd00 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/InstructionWriterFactory.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/InstructionWriterFactory.java @@ -130,7 +130,7 @@ public void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument) { } @Override - public void loadKeywords(EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words) { + public void loadKeywords(EdgeUrl url, int features, DocumentMetadata metadata, DocumentKeywords words) { keywords++; } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DocumentsCompiler.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DocumentsCompiler.java index 9bc3f6b35..21a610fb1 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DocumentsCompiler.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DocumentsCompiler.java @@ -31,11 +31,16 @@ public void compileDocumentDetails(Consumer instructionConsumer, Pr } } - public void compileWords(Consumer instructionConsumer, ProcessedDocument doc) { + public void compileWords(Consumer instructionConsumer, + ProcessedDocument doc) { var words = doc.words; if (words != null) { - instructionConsumer.accept(new LoadKeywords(doc.url, doc.details.metadata, words.build())); + instructionConsumer.accept(new LoadKeywords(doc.url, + HtmlFeature.encode(doc.details.features), + doc.details.metadata, + words.build()) + ); } } diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java index 850b6ec26..d43ddecfb 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java @@ -17,7 +17,7 @@ import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; -import java.io.IOException; +import java.io.*; import java.nio.file.Path; import java.time.LocalTime; import java.util.*; @@ -143,4 +143,5 @@ private SerializableCrawlDataStream asSerializableCrawlData(CrawledDomain domain return SerializableCrawlDataStream.fromIterator(data.iterator()); } + } diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/IndexLoadKeywords.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/IndexLoadKeywords.java index 7374c0a38..ae914d3d7 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/IndexLoadKeywords.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/IndexLoadKeywords.java @@ -18,7 +18,11 @@ public class IndexLoadKeywords implements Runnable { private final LinkedBlockingQueue insertQueue = new LinkedBlockingQueue<>(32); private final LoaderIndexJournalWriter journalWriter; - private record InsertTask(int urlId, int domainId, DocumentMetadata metadata, DocumentKeywords wordSet) {} + private record InsertTask(int urlId, + int domainId, + int features, + DocumentMetadata metadata, + DocumentKeywords wordSet) {} private final Thread runThread; @@ -36,7 +40,10 @@ public void run() { while (!canceled) { var data = insertQueue.poll(1, TimeUnit.SECONDS); if (data != null) { - journalWriter.putWords(new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), data.metadata(), data.wordSet); + journalWriter.putWords(new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), + data.features, + data.metadata(), + data.wordSet); } } } @@ -49,7 +56,11 @@ public void close() throws Exception { } } - public void load(LoaderData loaderData, EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words) throws InterruptedException { + public void load(LoaderData loaderData, + EdgeUrl url, + int features, + DocumentMetadata metadata, + DocumentKeywords words) throws InterruptedException { int domainId = loaderData.getDomainId(url.domain); int urlId = loaderData.getUrlId(url); @@ -58,6 +69,6 @@ public void load(LoaderData loaderData, EdgeUrl url, DocumentMetadata metadata, return; } - insertQueue.put(new InsertTask(urlId, domainId, metadata, words)); + insertQueue.put(new InsertTask(urlId, domainId, features, metadata, words)); } } diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/Loader.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/Loader.java index d6f97076f..80b6afec6 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/Loader.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/Loader.java @@ -103,9 +103,9 @@ public void loadProcessedDocumentWithError(LoadProcessedDocumentWithError docume } @Override - public void loadKeywords(EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words) { + public void loadKeywords(EdgeUrl url, int features, DocumentMetadata metadata, DocumentKeywords words) { try { - indexLoadKeywords.load(data, url, metadata, words); + indexLoadKeywords.load(data, url, features, metadata, words); } catch (InterruptedException e) { throw new RuntimeException(e); } diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderIndexJournalWriter.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderIndexJournalWriter.java index 073b5c944..4aabdcea9 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderIndexJournalWriter.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderIndexJournalWriter.java @@ -60,6 +60,7 @@ public LoaderIndexJournalWriter(FileStorageService fileStorageService) throws IO @SneakyThrows public void putWords(EdgeId domain, EdgeId url, + int features, DocumentMetadata metadata, DocumentKeywords wordSet) { if (wordSet.keywords().length == 0) { @@ -76,10 +77,10 @@ public void putWords(EdgeId domain, EdgeId url, // with a chonky work queue is a fairly decent improvement for (var chunk : KeywordListChunker.chopList(wordSet, IndexJournalEntryData.MAX_LENGTH)) { try { - keywordInsertionExecutor.submit(() -> loadWords(domain, url, metadata, chunk)); + keywordInsertionExecutor.submit(() -> loadWords(domain, url, features, metadata, chunk)); } catch (RejectedExecutionException ex) { - loadWords(domain, url, metadata, chunk); + loadWords(domain, url, features, metadata, chunk); } } @@ -87,6 +88,7 @@ public void putWords(EdgeId domain, EdgeId url, private void loadWords(EdgeId domain, EdgeId url, + int features, DocumentMetadata metadata, DocumentKeywords wordSet) { if (null == metadata) { @@ -95,7 +97,7 @@ private void loadWords(EdgeId domain, } var entry = new IndexJournalEntryData(getOrInsertWordIds(wordSet.keywords(), wordSet.metadata())); - var header = new IndexJournalEntryHeader(domain, url, metadata.encode()); + var header = new IndexJournalEntryHeader(domain, features, url, metadata.encode()); indexWriter.put(header, entry); } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java index 397c291c9..d46645315 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java @@ -196,6 +196,9 @@ public long[] getTermMetadata(int termId, long[] docs) { public long getDocumentMetadata(long docId) { return indexReader.getDocumentMetadata(docId); } + public int getHtmlFeatures(long docId) { + return indexReader.getHtmlFeatures(docId); + } public int getDomainId(long docId) { return indexReader.getDomainId(docId); diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexReader.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexReader.java index db7bed61e..8a3e3e6e0 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexReader.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexReader.java @@ -67,4 +67,8 @@ public int getDomainId(long docId) { public int totalDocCount() { return forwardIndexReader.totalDocCount(); } + + public int getHtmlFeatures(long docId) { + return forwardIndexReader.getHtmlFeatures(docId); + } } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexMetadataService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexMetadataService.java index 9fd299e51..996afafa8 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexMetadataService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexMetadataService.java @@ -34,6 +34,10 @@ public long getDocumentMetadata(long urlId) { return index.getDocumentMetadata(urlId); } + public int getHtmlFeatures(long urlId) { + return index.getHtmlFeatures(urlId); + } + public int getDomainId(long urlId) { return index.getDomainId(urlId); } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java index dd13e9428..34ea1826d 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java @@ -59,6 +59,7 @@ public SearchResultItem calculatePreliminaryScore(long id) { searchResult.setDomainId(metadataService.getDomainId(urlIdInt)); long docMetadata = metadataService.getDocumentMetadata(urlIdInt); + int htmlFeatures = metadataService.getHtmlFeatures(urlIdInt); int maxFlagsCount = 0; boolean anyAllSynthetic = false; @@ -85,6 +86,7 @@ public SearchResultItem calculatePreliminaryScore(long id) { searchTerm, metadata, docMetadata, + htmlFeatures, resultsWithPriorityTerms.contains(searchResult.combinedId) ); diff --git a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java index 37030b1ff..01be347b4 100644 --- a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java +++ b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java @@ -177,7 +177,7 @@ public void loadData(int id) { long fullId = id | ((long) (32 - (id % 32)) << 32); - var header = new IndexJournalEntryHeader(factors.length, fullId, new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode()); + var header = new IndexJournalEntryHeader(factors.length, 0, fullId, new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode()); long[] data = new long[factors.length*2]; for (int i = 0; i < factors.length; i++) { @@ -190,7 +190,7 @@ public void loadData(int id) { public void loadDataWithDomain(int domain, int id) { int[] factors = IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray(); - var header = new IndexJournalEntryHeader(factors.length, id | ((long) domain << 32), DocumentMetadata.defaultValue()); + var header = new IndexJournalEntryHeader(factors.length, 0, id | ((long) domain << 32), DocumentMetadata.defaultValue()); long[] data = new long[factors.length*2]; for (int i = 0; i < factors.length; i++) {