From 4598c7f40f02b9ce5ba5ebe9577c9251d61f084b Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 16 Aug 2023 13:11:24 +0200 Subject: [PATCH] (valuation) Penalize wordpress style kebab case urls --- .../marginalia/model/crawl/HtmlFeature.java | 5 +- .../nu/marginalia/ranking/ResultValuator.java | 15 ++++++ .../processor/logic/DocumentValuator.java | 53 +++++++++++++++++++ .../processor/logic/FeatureExtractor.java | 18 ++++++- .../plugin/HtmlDocumentProcessorPlugin.java | 6 +-- .../marginalia/search/model/UrlDetails.java | 4 +- .../templates/search/search-result.hdb | 10 ++-- 7 files changed, 101 insertions(+), 10 deletions(-) diff --git a/code/common/model/src/main/java/nu/marginalia/model/crawl/HtmlFeature.java b/code/common/model/src/main/java/nu/marginalia/model/crawl/HtmlFeature.java index 03e5557c1..56232a4bd 100644 --- a/code/common/model/src/main/java/nu/marginalia/model/crawl/HtmlFeature.java +++ b/code/common/model/src/main/java/nu/marginalia/model/crawl/HtmlFeature.java @@ -11,7 +11,10 @@ public enum HtmlFeature { JS("special:scripts"), AFFILIATE_LINK( "special:affiliate"), TRACKING("special:tracking"), - TRACKING_ADTECH("special:ads"), // We'll this as ads for now + TRACKING_ADTECH("special:ads"), // We'll call this ads for now + + KEBAB_CASE_URL("special:kcurl"), // https://www.example.com/urls-that-look-like-this/ + LONG_URL("special:longurl"), VIEWPORT("special:viewport"), diff --git a/code/features-search/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java b/code/features-search/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java index 69a379fba..c9bcbab6b 100644 --- a/code/features-search/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java +++ b/code/features-search/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java @@ -3,6 +3,7 @@ import nu.marginalia.index.client.model.results.ResultRankingContext; import 
nu.marginalia.index.client.model.results.ResultRankingParameters; import nu.marginalia.index.client.model.results.SearchResultKeywordScore; +import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.ranking.factors.*; @@ -53,6 +54,7 @@ public double calculateSearchResultValue(List<SearchResultKeywordScore> scores, int rank = DocumentMetadata.decodeRank(documentMetadata); int asl = DocumentMetadata.decodeAvgSentenceLength(documentMetadata); int quality = DocumentMetadata.decodeQuality(documentMetadata); + int urlTypePenalty = getUrlTypePenalty(documentMetadata); int topology = DocumentMetadata.decodeTopology(documentMetadata); int year = DocumentMetadata.decodeYear(documentMetadata); @@ -78,6 +80,7 @@ public double calculateSearchResultValue(List<SearchResultKeywordScore> scores, + rankingBonus + topologyBonus + temporalBias + + urlTypePenalty + priorityTermBonus.calculate(scores); for (int set = 0; set <= sets; set++) { @@ -99,6 +102,18 @@ public double calculateSearchResultValue(List<SearchResultKeywordScore> scores, return bestScore; } + private int getUrlTypePenalty(long documentMetadata) { + + // Long urls-that-look-like-this tend to be poor search results + if (DocumentMetadata.hasFlags(documentMetadata, + HtmlFeature.LONG_URL.getFeatureBit() + | HtmlFeature.KEBAB_CASE_URL.getFeatureBit())) { + return 2; + } + + return 0; + } + private long documentMetadata(List<SearchResultKeywordScore> rawScores) { for (var score : rawScores) { return score.encodedDocMetadata(); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java index 91003172a..5db3684d1 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java @@
-4,6 +4,7 @@ import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.converting.model.HtmlStandard; import nu.marginalia.converting.model.DisqualifiedException; +import nu.marginalia.model.crawl.HtmlFeature; import org.jetbrains.annotations.NotNull; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; @@ -11,12 +12,15 @@ import org.jsoup.nodes.TextNode; import org.jsoup.select.NodeVisitor; +import java.util.Set; + public class DocumentValuator { public double getQuality(CrawledDocument crawledDocument, HtmlStandard htmlStandard, Document parsedDocument, int textLength) throws DisqualifiedException { + double scriptPenalty = getScriptPenalty(parsedDocument); int rawLength = crawledDocument.documentBody.length(); @@ -46,6 +50,55 @@ private int getScriptPenalty(Document parsed) { return value; } + public double adjustQuality(double quality, Set<HtmlFeature> features) { + double adjustment = 0; + + if (features.contains(HtmlFeature.TRACKING_ADTECH)) { + adjustment -= 2.5; + } + if (features.contains(HtmlFeature.TRACKING)) { + adjustment -= 2.5; + } + if (features.contains(HtmlFeature.AFFILIATE_LINK)) { + adjustment -= 1.5; + } + if (features.contains(HtmlFeature.GA_SPAM)) { + adjustment -= 1; + } + if (features.contains(HtmlFeature.COOKIES)) { + adjustment -= 1; + } + if (features.contains(HtmlFeature.KEBAB_CASE_URL)) { + adjustment -= 2; + } + + if (features.contains(HtmlFeature.COOKIELAW)) { + adjustment -= 1; + } + if (features.contains(HtmlFeature.PARDOT)) { + adjustment -= 1; + } + if (features.contains(HtmlFeature.QUANTCAST)) { + adjustment -= 1; + } + + if (features.contains(HtmlFeature.WEBMENTION)) { + adjustment += 1; + } + if (features.contains(HtmlFeature.INDIEAUTH)) { + adjustment += 1; + } + + if (quality + adjustment > 0) { + return 0; + } + if (quality + adjustment < -15) { + return -15; + } + + return quality + adjustment; + } + private static class ScriptVisitor implements NodeVisitor { boolean hasBadScript = false; int scriptLength =
0; diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java index 6c4ddcf9c..a6de14cc7 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java @@ -5,6 +5,7 @@ import nu.marginalia.adblock.AdblockSimulator; import nu.marginalia.adblock.GoogleAnwersSpamDetector; import nu.marginalia.language.model.DocumentLanguageData; +import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.topic.RecipeDetector; import nu.marginalia.topic.TextileCraftDetector; @@ -48,7 +49,9 @@ public class FeatureExtractor { "linkedin.com", "perfectaudience.com", "marketingautomation.services", - "usefathom"); + "usefathom", + "adthrive" + ); private final AdblockSimulator adblockSimulator; private final RecipeDetector recipeDetector; @@ -70,7 +73,7 @@ public FeatureExtractor(AdblockSimulator adblockSimulator, this.googleAnwersSpamDetector = googleAnwersSpamDetector; } - public Set<HtmlFeature> getFeatures(Document doc, DocumentLanguageData dld) { + public Set<HtmlFeature> getFeatures(EdgeUrl url, Document doc, DocumentLanguageData dld) { final Set<HtmlFeature> features = new HashSet<>(); final Elements scriptTags = doc.getElementsByTag("script"); @@ -79,6 +82,13 @@ public Set<HtmlFeature> getFeatures(Document doc, DocumentLanguageData dld) { features.add(HtmlFeature.GA_SPAM); } + if (isKebabCase(url)) { + features.add(HtmlFeature.KEBAB_CASE_URL); + } + if (url.path.length() > 64) { + features.add(HtmlFeature.LONG_URL); + } + for (var scriptTag : scriptTags) { final String type = scriptTag.attr("type"); @@ -301,6 +311,10 @@ else if (woodworkingDetector.testP(dld) > 0.3 || textileCraftDetector.testP(dld) return features; } + private boolean
isKebabCase(EdgeUrl url) { + return url.path.chars().filter(c -> c=='-').count() > 3; + } + private boolean hasInvasiveTrackingScript(Element scriptTag) { return hasInvasiveTrackingScript(scriptTag.attr("src")); } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java index 8fb2b8016..d08276704 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java @@ -132,7 +132,6 @@ public DetailsWithWords createDetails(CrawledDocument crawledDocument) ret.length = length; ret.standard = standard; ret.title = titleExtractor.getTitleAbbreviated(doc, dld, crawledDocument.url); - ret.quality = quality; // don't move this up! 
it uses title and quality // and is run before the heavy computations below @@ -141,8 +140,9 @@ public DetailsWithWords createDetails(CrawledDocument crawledDocument) throw new DisqualifiedException(DisqualificationReason.QUALITY); } - final Set<HtmlFeature> features = featureExtractor.getFeatures(doc, dld); + final Set<HtmlFeature> features = featureExtractor.getFeatures(url, doc, dld); ret.features = features; + ret.quality = documentValuator.adjustQuality(quality, features); ret.hashCode = dld.localitySensitiveHashCode(); PubDate pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, standard, true); @@ -151,7 +151,7 @@ public DetailsWithWords createDetails(CrawledDocument crawledDocument) ret.metadata = new DocumentMetadata( documentLengthLogic.getEncodedAverageLength(dld), - pubDate.yearByte(), (int) -quality, documentFlags); + pubDate.yearByte(), (int) -ret.quality, documentFlags); DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url); diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java index 2de12536d..09e36b0d0 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java @@ -148,7 +148,9 @@ public String getProblems() { if (isAffiliate()) { sj.add("Affiliate Linking"); } - + if (isAds()) { + sj.add("Ads/Adtech Tracking"); + } return sj.toString(); } diff --git a/code/services-core/search-service/src/main/resources/templates/search/search-result.hdb b/code/services-core/search-service/src/main/resources/templates/search/search-result.hdb index 0648cb0a7..d7393da02 100644 --- a/code/services-core/search-service/src/main/resources/templates/search/search-result.hdb +++ b/code/services-core/search-service/src/main/resources/templates/search/search-result.hdb @@ -1,7
+1,11 @@ - +