From e53bb70bef7dc833c88f689d6fbf052f45c9f3cb Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Wed, 3 Jan 2024 16:51:26 +0100
Subject: [PATCH] (converter) Penalize chatgpt content farm spam

---
Reviewer note: line breaks and indentation were reconstructed from a
whitespace-collapsed copy of this patch and should be confirmed against the
target file before applying. Hunk headers and the diffstat were recomputed for
the revised method; the blob ids on the index line still refer to the original
revision of the patch.

 .../processor/logic/DocumentValuator.java | 50 +++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 49 insertions(+), 1 deletion(-)

diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java
index 218f16b89..af080a3a5 100644
--- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java
+++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java
@@ -11,6 +11,8 @@ import org.jsoup.nodes.Node;
 import org.jsoup.select.NodeVisitor;
 
 
+import java.util.List;
+import java.util.Locale;
 import java.util.Set;
 
 public class DocumentValuator {
@@ -21,6 +23,7 @@ public double getQuality(CrawledDocument crawledDocument,
                              int textLength) throws DisqualifiedException {
 
         double scriptPenalty = getScriptPenalty(parsedDocument);
+        double chatGptPenalty = getChatGptContentFarmPenalty(parsedDocument);
 
         int rawLength = crawledDocument.documentBody.length();
 
@@ -30,7 +33,52 @@ public double getQuality(CrawledDocument crawledDocument,
 
         return Math.log(textLength / (double) (1+rawLength))*htmlStandard.scale
                 + htmlStandard.offset
-                - scriptPenalty;
+                - scriptPenalty
+                - chatGptPenalty;
+    }
+
+    /**
+     * Computes the quality penalty for headings that are typical of AI-authored
+     * content farm spam.
+     * <p>
+     * The text of the h1, h2 and h3 elements is lower-cased and tested against
+     * three prefixes: "benefits of" (penalty 10), "key benefits" (penalty 5) and
+     * "key takeaways" (penalty 5). Each prefix is counted at most once, so the
+     * result lies between 0 and 20.
+     *
+     * @param parsedDocument the parsed document whose headings are inspected
+     * @return the sum of the penalties of the prefixes that were found
+     */
+    private double getChatGptContentFarmPenalty(Document parsedDocument) {
+        // easily 90% of modern AI-authored content farm spam has these nonsense headers
+
+        boolean benefitsOf = false, keyBenefits = false, keyTakeaways = false;
+
+        outer:
+        for (String tagName : List.of("h1", "h2", "h3")) {
+            for (var elem : parsedDocument.getElementsByTag(tagName)) {
+                // Locale.ROOT keeps the match independent of the JVM default locale;
+                // with a Turkish locale "KEY BENEFITS" lower-cases to a dotless i
+                // (U+0131) and would never match the ASCII prefixes below.
+                String text = elem.text().toLowerCase(Locale.ROOT);
+
+                benefitsOf = benefitsOf || text.startsWith("benefits of");
+                keyBenefits = keyBenefits || text.startsWith("key benefits");
+                keyTakeaways = keyTakeaways || text.startsWith("key takeaways");
+
+                // Everything has been found: skip the remaining headings and tags
+                if (benefitsOf && keyBenefits && keyTakeaways)
+                    break outer;
+            }
+        }
+
+        double penalty = 0;
+
+        if (benefitsOf) penalty += 10;
+        if (keyBenefits) penalty += 5;
+        if (keyTakeaways) penalty += 5;
+
+        return penalty;
     }