Skip to content

Commit

Permalink
(valuation) Penalize wordpress style kebab case urls
Browse files Browse the repository at this point in the history
  • Loading branch information
vlofgren committed Aug 16, 2023
1 parent 1d486bd commit 4598c7f
Show file tree
Hide file tree
Showing 7 changed files with 101 additions and 10 deletions.
Expand Up @@ -11,7 +11,10 @@ public enum HtmlFeature {
JS("special:scripts"),
AFFILIATE_LINK( "special:affiliate"),
TRACKING("special:tracking"),
TRACKING_ADTECH("special:ads"), // We'll this as ads for now
TRACKING_ADTECH("special:ads"), // We'll call this ads for now

KEBAB_CASE_URL("special:kcurl"), // https://www.example.com/urls-that-look-like-this/
LONG_URL("special:longurl"),

VIEWPORT("special:viewport"),

Expand Down
Expand Up @@ -3,6 +3,7 @@
import nu.marginalia.index.client.model.results.ResultRankingContext;
import nu.marginalia.index.client.model.results.ResultRankingParameters;
import nu.marginalia.index.client.model.results.SearchResultKeywordScore;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.ranking.factors.*;
Expand Down Expand Up @@ -53,6 +54,7 @@ public double calculateSearchResultValue(List<SearchResultKeywordScore> scores,
int rank = DocumentMetadata.decodeRank(documentMetadata);
int asl = DocumentMetadata.decodeAvgSentenceLength(documentMetadata);
int quality = DocumentMetadata.decodeQuality(documentMetadata);
int urlTypePenalty = getUrlTypePenalty(documentMetadata);
int topology = DocumentMetadata.decodeTopology(documentMetadata);
int year = DocumentMetadata.decodeYear(documentMetadata);

Expand All @@ -78,6 +80,7 @@ public double calculateSearchResultValue(List<SearchResultKeywordScore> scores,
+ rankingBonus
+ topologyBonus
+ temporalBias
+ urlTypePenalty
+ priorityTermBonus.calculate(scores);

for (int set = 0; set <= sets; set++) {
Expand All @@ -99,6 +102,18 @@ public double calculateSearchResultValue(List<SearchResultKeywordScore> scores,
return bestScore;
}

/**
 * Computes a ranking penalty based on the shape of the document's URL.
 *
 * Long urls-that-look-like-this tend to be poor search results, so documents
 * flagged as such receive a fixed penalty.
 *
 * @param documentMetadata the encoded document metadata word to inspect
 * @return 2 when the flag test for LONG_URL | KEBAB_CASE_URL succeeds, otherwise 0
 */
private int getUrlTypePenalty(long documentMetadata) {
    // Combined mask of the two URL-shape features.
    // NOTE(review): presumably hasFlags requires every bit of the mask to be set
    // (i.e. the URL is both long AND kebab-case) -- confirm against DocumentMetadata.
    final var longKebabUrlMask = HtmlFeature.LONG_URL.getFeatureBit()
            | HtmlFeature.KEBAB_CASE_URL.getFeatureBit();

    final boolean penalized = DocumentMetadata.hasFlags(documentMetadata, longKebabUrlMask);

    return penalized ? 2 : 0;
}

private long documentMetadata(List<SearchResultKeywordScore> rawScores) {
for (var score : rawScores) {
return score.encodedDocMetadata();
Expand Down
Expand Up @@ -4,19 +4,23 @@
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.converting.model.DisqualifiedException;
import nu.marginalia.model.crawl.HtmlFeature;
import org.jetbrains.annotations.NotNull;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.NodeVisitor;

import java.util.Set;

public class DocumentValuator {

public double getQuality(CrawledDocument crawledDocument,
HtmlStandard htmlStandard,
Document parsedDocument,
int textLength) throws DisqualifiedException {

double scriptPenalty = getScriptPenalty(parsedDocument);

int rawLength = crawledDocument.documentBody.length();
Expand Down Expand Up @@ -46,6 +50,55 @@ private int getScriptPenalty(Document parsed) {
return value;
}

/**
 * Adjusts a document quality score according to the features found in the document.
 *
 * Each recognised feature contributes a fixed amount to the score: tracking, ads,
 * affiliate links, cookie banners and similar lower it, while WEBMENTION and
 * INDIEAUTH raise it. The adjusted score is clamped to the range [-15, 0].
 *
 * @param quality  the base quality score
 * @param features the features detected for the document
 * @return the adjusted quality, never above 0 and never below -15
 */
public double adjustQuality(double quality, Set<HtmlFeature> features) {
    // Parallel tables: scoredFeatures[i] contributes weights[i] when present.
    final HtmlFeature[] scoredFeatures = {
            HtmlFeature.TRACKING_ADTECH,
            HtmlFeature.TRACKING,
            HtmlFeature.AFFILIATE_LINK,
            HtmlFeature.GA_SPAM,
            HtmlFeature.COOKIES,
            HtmlFeature.KEBAB_CASE_URL,
            HtmlFeature.COOKIELAW,
            HtmlFeature.PARDOT,
            HtmlFeature.QUANTCAST,
            HtmlFeature.WEBMENTION,
            HtmlFeature.INDIEAUTH
    };
    final double[] weights = {
            -2.5,
            -2.5,
            -1.5,
            -1,
            -1,
            -2,
            -1,
            -1,
            -1,
            1,
            1
    };

    double delta = 0;
    for (int i = 0; i < scoredFeatures.length; i++) {
        if (features.contains(scoredFeatures[i])) {
            delta += weights[i];
        }
    }

    // Clamp the adjusted score to [-15, 0]
    final double adjusted = quality + delta;
    return Math.max(-15, Math.min(0, adjusted));
}

private static class ScriptVisitor implements NodeVisitor {
boolean hasBadScript = false;
int scriptLength = 0;
Expand Down
Expand Up @@ -5,6 +5,7 @@
import nu.marginalia.adblock.AdblockSimulator;
import nu.marginalia.adblock.GoogleAnwersSpamDetector;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.topic.RecipeDetector;
import nu.marginalia.topic.TextileCraftDetector;
Expand Down Expand Up @@ -48,7 +49,9 @@ public class FeatureExtractor {
"linkedin.com",
"perfectaudience.com",
"marketingautomation.services",
"usefathom");
"usefathom",
"adthrive"
);

private final AdblockSimulator adblockSimulator;
private final RecipeDetector recipeDetector;
Expand All @@ -70,7 +73,7 @@ public FeatureExtractor(AdblockSimulator adblockSimulator,
this.googleAnwersSpamDetector = googleAnwersSpamDetector;
}

public Set<HtmlFeature> getFeatures(Document doc, DocumentLanguageData dld) {
public Set<HtmlFeature> getFeatures(EdgeUrl url, Document doc, DocumentLanguageData dld) {
final Set<HtmlFeature> features = new HashSet<>();

final Elements scriptTags = doc.getElementsByTag("script");
Expand All @@ -79,6 +82,13 @@ public Set<HtmlFeature> getFeatures(Document doc, DocumentLanguageData dld) {
features.add(HtmlFeature.GA_SPAM);
}

if (isKebabCase(url)) {
features.add(HtmlFeature.KEBAB_CASE_URL);
}
if (url.path.length() > 64) {
features.add(HtmlFeature.LONG_URL);
}

for (var scriptTag : scriptTags) {
final String type = scriptTag.attr("type");

Expand Down Expand Up @@ -301,6 +311,10 @@ else if (woodworkingDetector.testP(dld) > 0.3 || textileCraftDetector.testP(dld)
return features;
}

/**
 * Tests whether the URL path looks like a kebab-case slug,
 * e.g. /urls-that-look-like-this/
 *
 * @param url the URL whose path is inspected
 * @return true if the path contains more than three '-' characters
 */
private boolean isKebabCase(EdgeUrl url) {
    final String path = url.path;

    long hyphens = 0;
    for (int i = 0; i < path.length(); i++) {
        if (path.charAt(i) == '-') {
            hyphens++;
        }
    }

    return hyphens > 3;
}

private boolean hasInvasiveTrackingScript(Element scriptTag) {
return hasInvasiveTrackingScript(scriptTag.attr("src"));
}
Expand Down
Expand Up @@ -132,7 +132,6 @@ public DetailsWithWords createDetails(CrawledDocument crawledDocument)
ret.length = length;
ret.standard = standard;
ret.title = titleExtractor.getTitleAbbreviated(doc, dld, crawledDocument.url);
ret.quality = quality;

// don't move this up! it uses title and quality
// and is run before the heavy computations below
Expand All @@ -141,8 +140,9 @@ public DetailsWithWords createDetails(CrawledDocument crawledDocument)
throw new DisqualifiedException(DisqualificationReason.QUALITY);
}

final Set<HtmlFeature> features = featureExtractor.getFeatures(doc, dld);
final Set<HtmlFeature> features = featureExtractor.getFeatures(url, doc, dld);
ret.features = features;
ret.quality = documentValuator.adjustQuality(quality, features);
ret.hashCode = dld.localitySensitiveHashCode();

PubDate pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, standard, true);
Expand All @@ -151,7 +151,7 @@ public DetailsWithWords createDetails(CrawledDocument crawledDocument)

ret.metadata = new DocumentMetadata(
documentLengthLogic.getEncodedAverageLength(dld),
pubDate.yearByte(), (int) -quality, documentFlags);
pubDate.yearByte(), (int) -ret.quality, documentFlags);

DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url);

Expand Down
Expand Up @@ -148,7 +148,9 @@ public String getProblems() {
if (isAffiliate()) {
sj.add("Affiliate Linking");
}

if (isAds()) {
sj.add("Ads/Adtech Tracking");
}
return sj.toString();

}
Expand Down
@@ -1,7 +1,11 @@
<!-- {{rankingId}}.{{id}}/{{ranking}}/{{termScore}} -->
<!-- RankingID: {{rankingId}}
ID: {{id}}
Ranking: {{ranking}}
TermScore: {{termScore}}
Quality: {{urlQuality}}
-->
<!--
{{#each keywordScores}} {{{.}}}
{{/each}}
{{#each keywordScores}} {{{.}}} {{/each}}
-->
<section class="card search-result {{#unless hideRanking}}rs-rank-{{logRank}} ms-rank-{{matchRank}}{{/unless}} {{#if specialDomain}}special-domain{{/if}}" >
<div class="url"><a rel="nofollow external" href="{{url}}">{{url}}</a></div>
Expand Down

0 comments on commit 4598c7f

Please sign in to comment.