(forward-index, valuator) HTML features in valuator
Put it in the forward index for easy access during index-side valuation.
vlofgren committed Aug 18, 2023
1 parent fcfe07f commit 704de50
Showing 27 changed files with 167 additions and 48 deletions.
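In short: each document's HTML features (tracking scripts, affiliate links, URL shape, and so on) travel as a bit mask, and this commit threads that mask from the converter's LoadKeywords instruction through the index journal and the forward index, so the valuator can read it back at ranking time. A minimal sketch of how such a mask is built and tested, assuming only the HtmlFeature bits and the DocumentMetadata.hasFlags helper that appear in the hunks below:

    // Sketch only, not part of the commit: a feature mask is OR-ed feature bits,
    // one bit per HtmlFeature constant.
    int features = HtmlFeature.TRACKING.getFeatureBit()
                 | HtmlFeature.AFFILIATE_LINK.getFeatureBit();

    // Individual bits are tested the same way flagsPenalty() does further down:
    boolean hasAffiliateLinks =
            DocumentMetadata.hasFlags(features, HtmlFeature.AFFILIATE_LINK.getFeatureBit());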
@@ -2,7 +2,6 @@
 
 import nu.marginalia.model.idx.WordFlags;
 import nu.marginalia.model.idx.WordMetadata;
-import nu.marginalia.model.idx.DocumentFlags;
 import nu.marginalia.model.idx.DocumentMetadata;
 
 import java.util.Objects;
@@ -14,15 +13,19 @@ public final class SearchResultKeywordScore {
     private final long encodedDocMetadata;
     private final boolean hasPriorityTerms;
 
+    private final int htmlFeatures;
+
     public SearchResultKeywordScore(int subquery,
                                     String keyword,
                                     long encodedWordMetadata,
                                     long encodedDocMetadata,
+                                    int htmlFeatures,
                                     boolean hasPriorityTerms) {
         this.subquery = subquery;
         this.keyword = keyword;
         this.encodedWordMetadata = encodedWordMetadata;
         this.encodedDocMetadata = encodedDocMetadata;
+        this.htmlFeatures = htmlFeatures;
         this.hasPriorityTerms = hasPriorityTerms;
     }
 
@@ -58,6 +61,10 @@ public long encodedDocMetadata() {
         return encodedDocMetadata;
     }
 
+    public int htmlFeatures() {
+        return htmlFeatures;
+    }
+
     public boolean hasPriorityTerms() {
         return hasPriorityTerms;
     }
@@ -88,8 +88,9 @@ public void convert() throws IOException {
             int ranking = domainRankings.getRanking(entry.domainId());
             long meta = DocumentMetadata.encodeRank(entry.docMeta(), ranking);
 
-            docFileData.set(entryOffset + ForwardIndexParameters.METADATA_OFFSET, meta);
             docFileData.set(entryOffset + ForwardIndexParameters.DOMAIN_OFFSET, entry.domainId());
+            docFileData.set(entryOffset + ForwardIndexParameters.METADATA_OFFSET, meta);
+            docFileData.set(entryOffset + ForwardIndexParameters.FEATURES_OFFSET, entry.header.documentFeatures());
         });
 
         progress.progress(TaskSteps.FORCE);
@@ -1,8 +1,9 @@
 package nu.marginalia.index.forward;
 
 class ForwardIndexParameters {
-    public static final int ENTRY_SIZE = 2;
+    public static final int ENTRY_SIZE = 3;
     public static final int DOMAIN_OFFSET = 0;
     public static final int METADATA_OFFSET = 1;
+    public static final int FEATURES_OFFSET = 2;
 
 }
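For orientation, a sketch (not part of the commit): with ENTRY_SIZE bumped from 2 to 3, each document's slot in the forward index spans three consecutive longs, and field j of slot idx lives at ENTRY_SIZE * idx + j. That is exactly the arithmetic the reader performs in the next hunk:

    // Layout sketch; `data` stands in for the forward index's backing long array.
    static int readFeatures(long[] data, int idx) {
        int base = ForwardIndexParameters.ENTRY_SIZE * idx;                   // 3 * idx
        return (int) data[base + ForwardIndexParameters.FEATURES_OFFSET];     // 3 * idx + 2
    }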
@@ -77,6 +77,13 @@ public long getDocMeta(long docId) {
         return data.get(ENTRY_SIZE * offset + METADATA_OFFSET);
     }
 
+    public int getHtmlFeatures(long docId) {
+        long offset = idxForDoc(docId);
+        if (offset < 0) return 0;
+
+        return (int) data.get(ENTRY_SIZE * offset + FEATURES_OFFSET);
+    }
+
     public int getDomainId(long docId) {
         long offset = idxForDoc(docId);
         if (offset < 0) return 0;
@@ -7,22 +7,26 @@
 public record IndexJournalEntry(IndexJournalEntryHeader header, IndexJournalEntryData data) {
 
     public static IndexJournalEntryBuilder builder(long documentId, long documentMeta) {
-        return new IndexJournalEntryBuilder(documentId, documentMeta);
+        return new IndexJournalEntryBuilder(0, documentId, documentMeta);
     }
 
     public static IndexJournalEntryBuilder builder(int domainId,
                                                    int urlId,
                                                    long documentMeta) {
 
 
-        return builder(new EdgeId<>(domainId), new EdgeId<>(urlId), documentMeta);
+        return builder(new EdgeId<>(domainId),
+                       new EdgeId<>(urlId),
+                       documentMeta);
     }
 
     public static IndexJournalEntryBuilder builder(EdgeId<EdgeDomain> domainId,
                                                    EdgeId<EdgeUrl> urlId,
                                                    long documentMeta) {
 
 
-        return new IndexJournalEntryBuilder(IndexJournalEntryHeader.combineIds(domainId, urlId), documentMeta);
+        return new IndexJournalEntryBuilder(0,
+                IndexJournalEntryHeader.combineIds(domainId, urlId),
+                documentMeta);
     }
 }
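Note that both static builder() overloads default documentFeatures to 0, which matches the sentinel the valuator checks for: flagsPenalty() further down short-circuits on featureFlags == 0, treating it as "no feature data available". A hypothetical usage sketch (the identifier names are placeholders, not from the commit):

    // Journal entry built without feature data; the header carries documentFeatures = 0.
    var entry = IndexJournalEntry.builder(combinedDocId, docMeta)
            .add(wordId, wordMeta)
            .build();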
@@ -4,10 +4,15 @@
 
 public class IndexJournalEntryBuilder {
     private final long documentId;
+    private final int documentFeatures;
     private final long documentMeta;
     private final TLongArrayList items = new TLongArrayList();
 
-    public IndexJournalEntryBuilder(long documentId, long documentMeta) {
+    public IndexJournalEntryBuilder(
+            int documentFeatures,
+            long documentId,
+            long documentMeta) {
+        this.documentFeatures = documentFeatures;
         this.documentId = documentId;
         this.documentMeta = documentMeta;
     }
@@ -22,7 +27,10 @@ public IndexJournalEntryBuilder add(long wordId, long metadata) {
 
     public IndexJournalEntry build() {
         return new IndexJournalEntry(
-                new IndexJournalEntryHeader(items.size(), documentId, documentMeta),
+                new IndexJournalEntryHeader(items.size(),
+                        documentFeatures,
+                        documentId,
+                        documentMeta),
                 new IndexJournalEntryData(items.toArray())
         );
     }
@@ -4,10 +4,19 @@
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.id.EdgeId;
 
-public record IndexJournalEntryHeader(int entrySize, long combinedId, long documentMeta) {
+public record IndexJournalEntryHeader(int entrySize,
+                                      int documentFeatures,
+                                      long combinedId,
+                                      long documentMeta) {
 
-    public IndexJournalEntryHeader(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId, long documentMeta) {
-        this(-1, combineIds(domainId, urlId), documentMeta);
+    public IndexJournalEntryHeader(EdgeId<EdgeDomain> domainId,
+                                   int documentFeatures,
+                                   EdgeId<EdgeUrl> urlId,
+                                   long documentMeta) {
+        this(-1,
+             documentFeatures,
+             combineIds(domainId, urlId),
+             documentMeta);
     }
 
     static long combineIds(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId) {
@@ -30,6 +30,7 @@ public static IndexJournalReadEntry read(DataInputStream inputStream) throws IOException {
 
         var header = new IndexJournalEntryHeader(
                 (int) (sizeBlock >>> 32L),
+                (int) (sizeBlock & 0xFFFF_FFFFL),
                 docId,
                 meta);
 
@@ -72,7 +72,7 @@ public synchronized void put(IndexJournalEntryHeader header, IndexJournalEntryData entry) {
         }
 
         dataBuffer.putInt(entry.size());
-        dataBuffer.putInt(0);
+        dataBuffer.putInt(header.documentFeatures());
         dataBuffer.putLong(header.combinedId());
         dataBuffer.putLong(header.documentMeta());
 
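An observation on the journal format, inferred from this reader/writer pair: the writer emits entry size and documentFeatures as two consecutive big-endian ints, and IndexJournalReadEntry reconstitutes them from a single 64-bit sizeBlock, size from the high 32 bits and features from the low 32. A round-trip sketch under that assumption:

    import java.nio.ByteBuffer;

    int entrySize = 42;
    int documentFeatures = 0b1010;

    // Two big-endian putInt() calls occupy one long: size high, features low.
    ByteBuffer buf = ByteBuffer.allocate(Long.BYTES)
            .putInt(entrySize)
            .putInt(documentFeatures)
            .flip();

    long sizeBlock = buf.getLong();
    assert entrySize == (int) (sizeBlock >>> 32L);
    assert documentFeatures == (int) (sizeBlock & 0xFFFF_FFFFL);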
@@ -106,7 +106,7 @@ long createId(long url, long domain) {
     }
     public void createEntry(IndexJournalWriter writer, KeywordLexicon keywordLexicon, int id) {
         int[] factors = getFactorsI(id);
-        var header = new IndexJournalEntryHeader(factors.length, createId(id, id/20), id % 5);
+        var header = new IndexJournalEntryHeader(factors.length, 0, createId(id, id/20), id % 5);
 
         long[] data = new long[factors.length*2];
         for (int i = 0; i < factors.length; i++) {
@@ -106,7 +106,7 @@ long createId(long url, long domain) {
     }
     public void createEntry(IndexJournalWriter writer, KeywordLexicon keywordLexicon, int id) {
         int[] factors = getFactorsI(id);
-        var header = new IndexJournalEntryHeader(factors.length, createId(id, id/20), id % 5);
+        var header = new IndexJournalEntryHeader(factors.length, 0, createId(id, id/20), id % 5);
 
         long[] data = new long[factors.length*2];
         for (int i = 0; i < factors.length; i++) {
@@ -5,6 +5,7 @@
 import nu.marginalia.index.client.model.results.SearchResultKeywordScore;
 import nu.marginalia.model.crawl.HtmlFeature;
 import nu.marginalia.model.crawl.PubDate;
+import nu.marginalia.model.idx.DocumentFlags;
 import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.ranking.factors.*;
 
@@ -48,19 +49,20 @@ public double calculateSearchResultValue(List<SearchResultKeywordScore> scores,
         double bestScore = 10;
 
         long documentMetadata = documentMetadata(scores);
+        int features = htmlFeatures(scores);
         var rankingParams = ctx.params;
 
         int rank = DocumentMetadata.decodeRank(documentMetadata);
         int asl = DocumentMetadata.decodeAvgSentenceLength(documentMetadata);
         int quality = DocumentMetadata.decodeQuality(documentMetadata);
-        int urlTypePenalty = getUrlTypePenalty(documentMetadata);
         int size = DocumentMetadata.decodeSize(documentMetadata);
+        int flagsPenalty = flagsPenalty(features, documentMetadata & 0xFF, size, quality);
         int topology = DocumentMetadata.decodeTopology(documentMetadata);
         int year = DocumentMetadata.decodeYear(documentMetadata);
 
         double averageSentenceLengthPenalty = (asl >= rankingParams.shortSentenceThreshold ? 0 : -rankingParams.shortSentencePenalty);
 
-        final double qualityPenalty = -quality * rankingParams.qualityPenalty;
+        final double qualityPenalty = calculateQualityPenalty(size, quality, rankingParams);
         final double rankingBonus = (255. - rank) * rankingParams.domainRankBonus;
         final double topologyBonus = Math.log(1 + topology);
         final double documentLengthPenalty = length > rankingParams.shortDocumentThreshold ? 0 : -rankingParams.shortDocumentPenalty;
@@ -80,7 +82,7 @@ public double calculateSearchResultValue(List<SearchResultKeywordScore> scores,
                 + rankingBonus
                 + topologyBonus
                 + temporalBias
-                + urlTypePenalty
+                + flagsPenalty
                 + priorityTermBonus.calculate(scores);
 
         for (int set = 0; set <= sets; set++) {
@@ -93,7 +95,8 @@ public double calculateSearchResultValue(List<SearchResultKeywordScore> scores,
             final double bm25 = rankingParams.bm25FullWeight * bm25Factor.calculateBm25(rankingParams.fullParams, keywordSet, length, ctx);
             final double bm25p = rankingParams.bm25PrioWeight * bm25Factor.calculateBm25Prio(rankingParams.prioParams, keywordSet, ctx);
 
-            double score = normalize(bm25 + bm25p + tcf + overallPart, keywordSet.length());
+            double nonNormalizedScore = bm25 + bm25p + tcf + overallPart;
+            double score = normalize(nonNormalizedScore, keywordSet.length());
 
             bestScore = min(bestScore, score);
 
@@ -102,16 +105,55 @@ public double calculateSearchResultValue(List<SearchResultKeywordScore> scores,
         return bestScore;
     }
 
-    private int getUrlTypePenalty(long documentMetadata) {
-
-        // Long urls-that-look-like-this tend to be poor search results
-        if (DocumentMetadata.hasFlags(documentMetadata,
-                HtmlFeature.LONG_URL.getFeatureBit()
-                | HtmlFeature.KEBAB_CASE_URL.getFeatureBit())) {
-            return 2;
-        }
-
-        return 0;
+    private double calculateQualityPenalty(int size, int quality, ResultRankingParameters rankingParams) {
+        if (size < 400) {
+            if (quality < 5)
+                return 0;
+            return -quality * rankingParams.qualityPenalty;
+        }
+        else {
+            return -quality * rankingParams.qualityPenalty * 20;
+        }
+    }
+
+    private int flagsPenalty(int featureFlags, long docFlags, int size, double quality) {
+
+        // Short-circuit for index-service, which does not have the feature flags
+        if (featureFlags == 0)
+            return 0;
+
+        double penalty = 0;
+
+        boolean isForum = DocumentFlags.GeneratorForum.isPresent(docFlags);
+
+        // Penalize large sites harder for any bullshit as it's a strong signal of a low quality site
+        double largeSiteFactor = 1.;
+
+        if (!isForum && size > 400) {
+            // Long urls-that-look-like-this tend to be poor search results
+            if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.KEBAB_CASE_URL.getFeatureBit()))
+                penalty += 30.0;
+            else if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.LONG_URL.getFeatureBit()))
+                penalty += 30.;
+            else penalty += 5.;
+
+            largeSiteFactor = 2;
+        }
+
+        if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING_ADTECH.getFeatureBit()))
+            penalty += 5.0 * largeSiteFactor;
+
+        if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.AFFILIATE_LINK.getFeatureBit()))
+            penalty += 5.0 * largeSiteFactor;
+
+        if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING.getFeatureBit()))
+            penalty += 2.5 * largeSiteFactor;
+
+        if (isForum) {
+            penalty = Math.min(0, penalty - 2);
+        }
+
+        return (int) -penalty;
     }
 
     private long documentMetadata(List<SearchResultKeywordScore> rawScores) {
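To make the new penalty concrete, a worked example (not from the commit): a large non-forum document (size > 400) with a kebab-case URL and adtech tracking accrues 30.0 for the URL shape and sets largeSiteFactor = 2, then adds 5.0 × 2 for TRACKING_ADTECH, for a total of 40, so flagsPenalty() returns -40. A forum document with plain TRACKING only accrues 2.5 × 1, and the forum rebate Math.min(0, 2.5 - 2) clamps the penalty to 0, so forums are effectively exempt from the smaller penalties.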
@@ -121,6 +163,13 @@ private long documentMetadata(List<SearchResultKeywordScore> rawScores) {
         return 0;
     }
 
+    private int htmlFeatures(List<SearchResultKeywordScore> rawScores) {
+        for (var score : rawScores) {
+            return score.htmlFeatures();
+        }
+        return 0;
+    }
+
     private ResultKeywordSet createKeywordSet(ValuatorListPool<SearchResultKeywordScore> listPool,
                                               List<SearchResultKeywordScore> rawScores,
                                               int thisSet)
@@ -40,20 +40,20 @@ public void setUp() {
                 new SearchResultKeywordScore(0, "bob",
                         wordMetadata(Set.of(1), EnumSet.of(WordFlags.Title)),
                         docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)),
-                        false)
+                        0, false)
         );
         List<SearchResultKeywordScore> highCountNoTitleSet = List.of(
                 new SearchResultKeywordScore(0, "bob",
                         wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh)),
                         docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)),
-                        false)
+                        0, false)
         );
 
         List<SearchResultKeywordScore> highCountSubjectSet = List.of(
                 new SearchResultKeywordScore(0, "bob",
                         wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh, WordFlags.Subjects)),
                         docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)),
-                        false)
+                        0, false)
         );
 
 
@@ -89,7 +89,7 @@ ResultKeywordSet createSet(long... positionMasks) {
 
         for (int i = 0; i < positionMasks.length; i++) {
             keywords.add(new SearchResultKeywordScore(0, "",
-                    new WordMetadata(positionMasks[i], (byte) 0).encode(), 0, false));
+                    new WordMetadata(positionMasks[i], (byte) 0).encode(), 0, 0, false));
         }
 
         return new ResultKeywordSet(keywords);
@@ -19,7 +19,7 @@ default void loadProcessedDomain(EdgeDomain domain, DomainIndexingState state, S
     default void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument) {}
     default void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError) {}
 
-    default void loadKeywords(EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words) {}
+    default void loadKeywords(EdgeUrl url, int features, DocumentMetadata metadata, DocumentKeywords words) {}
 
     default void loadDomainRedirect(DomainLink link) {}
 
@@ -7,11 +7,11 @@
 import nu.marginalia.converting.instruction.Interpreter;
 import nu.marginalia.model.EdgeUrl;
 
-public record LoadKeywords(EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words) implements Instruction {
+public record LoadKeywords(EdgeUrl url, int features, DocumentMetadata metadata, DocumentKeywords words) implements Instruction {
 
     @Override
     public void apply(Interpreter interpreter) {
-        interpreter.loadKeywords(url, metadata, words);
+        interpreter.loadKeywords(url, features, metadata, words);
     }
 
     @Override
@@ -130,7 +130,7 @@ public void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument) {
     }
 
     @Override
-    public void loadKeywords(EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words) {
+    public void loadKeywords(EdgeUrl url, int features, DocumentMetadata metadata, DocumentKeywords words) {
         keywords++;
     }
 
