Skip to content

Commit

Permalink
(converter) Penalize chatgpt content farm spam
Browse files Browse the repository at this point in the history
  • Loading branch information
vlofgren committed Jan 5, 2024
1 parent 109bec3 commit e53bb70
Showing 1 changed file with 32 additions and 1 deletion.
Expand Up @@ -11,6 +11,7 @@
import org.jsoup.nodes.Node;
import org.jsoup.select.NodeVisitor;

import java.util.List;
import java.util.Set;

public class DocumentValuator {
Expand All @@ -21,6 +22,7 @@ public double getQuality(CrawledDocument crawledDocument,
int textLength) throws DisqualifiedException {

double scriptPenalty = getScriptPenalty(parsedDocument);
double chatGptPenalty = getChatGptContentFarmPenalty(parsedDocument);

int rawLength = crawledDocument.documentBody.length();

Expand All @@ -30,7 +32,36 @@ public double getQuality(CrawledDocument crawledDocument,

return Math.log(textLength / (double) (1+rawLength))*htmlStandard.scale
+ htmlStandard.offset
- scriptPenalty;
- scriptPenalty
- chatGptPenalty;
}

/**
 * Computes a quality penalty for documents whose headings look like
 * AI-authored content farm spam.
 * <p>
 * The h1, h2 and h3 elements of the document are inspected, and each of
 * three tell-tale heading prefixes contributes to the penalty at most once,
 * no matter how many headings match it:
 * <ul>
 *   <li>"benefits of" adds 10</li>
 *   <li>"key benefits" adds 5</li>
 *   <li>"key takeaways" adds 5</li>
 * </ul>
 *
 * @param parsedDocument the parsed document whose headings are examined
 * @return the accumulated penalty, between 0 (no marker found) and 20 (all three found)
 */
private double getChatGptContentFarmPenalty(Document parsedDocument) {
    // easily 90% of modern AI-authored content farm spam has these nonsense headers

    boolean benefitsOf = false, keyBenefits = false, keyTakeaways = false;

    outer:
    for (String tagName : List.of("h1", "h2", "h3")) {
        for (var elem : parsedDocument.getElementsByTag(tagName)) {
            // Every marker has already been seen; further headings cannot change the result.
            if (benefitsOf && keyBenefits && keyTakeaways)
                break outer;

            // Lowercase with Locale.ROOT so the comparison does not depend on the JVM's
            // default locale; e.g. under a Turkish locale "I" lowercases to a dotless
            // "ı", which would make "BENEFITS OF" fail to match the ASCII prefix below.
            String text = elem.text().toLowerCase(java.util.Locale.ROOT);

            benefitsOf = benefitsOf || text.startsWith("benefits of");
            keyBenefits = keyBenefits || text.startsWith("key benefits");
            keyTakeaways = keyTakeaways || text.startsWith("key takeaways");
        }
    }

    double penalty = 0;

    if (benefitsOf) penalty += 10;
    if (keyBenefits) penalty += 5;
    if (keyTakeaways) penalty += 5;

    return penalty;
}


Expand Down

0 comments on commit e53bb70

Please sign in to comment.