Commit 03bd892
Improve document processing in conversion.
* Add flags for long and short documents.
* Break out common length logic from plugins.
* Clean up related code.
vlofgren committed Mar 28, 2023
1 parent 1e65ac3 commit 03bd892
Showing 7 changed files with 79 additions and 51 deletions.
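
In outline, both processor plugins now delegate length handling to the new DocumentLengthLogic class instead of duplicating it. A condensed sketch of the shared flow, simplified from the diffs below rather than a verbatim excerpt:

```java
// Condensed view of the length handling both plugins share after this commit.
documentLengthLogic.validateLength(dld);   // too few words -> DisqualifiedException(LENGTH)

// ... build ProcessedDocumentDetails, compute quality, extract features ...

documentLengthLogic.setLengthFlags(ret.length, documentFlags); // tag Short-/LongDocument
ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, documentFlags);
```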
@@ -7,8 +7,8 @@ public enum DocumentFlags {
     PlainText,
     Ads,
     Tracking,
-    UnusedBit4,
-    UnusedBit5,
+    ShortDocument,
+    LongDocument,
     UnusedBit6,
     UnusedBit7,
     ;
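
The new flags take over the bit positions previously held by UnusedBit4 and UnusedBit5, so the ordinals of the surrounding flags are unchanged. A minimal round-trip sketch, assuming the usual convention that each flag occupies the bit given by its ordinal; the encoding itself is not part of this diff:

```java
import java.util.EnumSet;

// Pack an EnumSet<DocumentFlags> into a long bitmask and back, assuming
// bit position == ordinal. Illustrative only; DocumentMetadata's actual
// encoding is not shown in this commit.
class FlagCodecSketch {
    static long encode(EnumSet<DocumentFlags> flags) {
        long bits = 0;
        for (DocumentFlags flag : flags) {
            bits |= 1L << flag.ordinal();
        }
        return bits;
    }

    static EnumSet<DocumentFlags> decode(long bits) {
        EnumSet<DocumentFlags> flags = EnumSet.noneOf(DocumentFlags.class);
        for (DocumentFlags flag : DocumentFlags.values()) {
            if ((bits & (1L << flag.ordinal())) != 0)
                flags.add(flag);
        }
        return flags;
    }
}
```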
@@ -0,0 +1,36 @@
+package nu.marginalia.converting.processor.logic;
+
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import com.google.inject.name.Named;
+import nu.marginalia.converting.model.DisqualifiedException;
+import nu.marginalia.language.model.DocumentLanguageData;
+import nu.marginalia.model.idx.DocumentFlags;
+
+import java.util.EnumSet;
+
+@Singleton
+public class DocumentLengthLogic {
+    private final int minDocumentLength;
+    private final int shortDocumentLength = 2500;
+    private final int longDocumentLength = 7500;
+
+    @Inject
+    public DocumentLengthLogic(@Named("min-document-length") Integer minDocumentLength) {
+        this.minDocumentLength = minDocumentLength;
+    }
+
+    public void setLengthFlags(int lengthTextInChars, EnumSet<DocumentFlags> flags) {
+        if (lengthTextInChars < shortDocumentLength)
+            flags.add(DocumentFlags.ShortDocument);
+        else if (lengthTextInChars > longDocumentLength)
+            flags.add(DocumentFlags.LongDocument);
+    }
+
+    public void validateLength(DocumentLanguageData dld) throws DisqualifiedException {
+        if (dld.totalNumWords() < minDocumentLength) {
+            throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LENGTH);
+        }
+    }
+
+}
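
A hand-wired usage sketch of the new class; in the converter the instance comes from Guice, which supplies the @Named("min-document-length") value. The 100 and the dld/textLengthInChars variables are stand-ins:

```java
DocumentLengthLogic lengthLogic = new DocumentLengthLogic(100); // hypothetical minimum word count

// Gate: documents below the word minimum are rejected outright.
lengthLogic.validateLength(dld); // throws DisqualifiedException with reason LENGTH

// Classification: under 2500 chars -> ShortDocument, over 7500 -> LongDocument,
// anything in between gets neither flag.
EnumSet<DocumentFlags> flags = EnumSet.noneOf(DocumentFlags.class);
lengthLogic.setLengthFlags(textLengthInChars, flags);
```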
@@ -11,19 +11,7 @@

 public class DocumentValuator {

-    private static final Set<String> filthTable = Set.of(
-            "xxx", "sex", "anal", "sexy",
-            "bdsm", "fetish", "porn", "camgirls", "dildo",
-            "gangbang", "buttplug", "orgasm", "vibrator",
-            "cameltoe", "download", "iso", "botox", "torrent",
-            "jackpot", "vegas", "casino", "coinbase", "poloniex",
-            "myetherwallet", "ethereum", "binance", "bitcoin",
-            "litecoin", "seo", "serp"
-
-    );
-
-    public double getQuality(CrawledDocument crawledDocument, HtmlStandard htmlStandard, Document parsedDocument, DocumentLanguageData dld) throws DisqualifiedException {
-        double smutCoefficient = dld.streamLowerCase().filter(filthTable::contains).count();
+    public double getQuality(CrawledDocument crawledDocument, HtmlStandard htmlStandard, Document parsedDocument) throws DisqualifiedException {
         double scriptPenalty = getScriptPenalty(parsedDocument);

         int textBodyLength = parsedDocument.text().length();
@@ -35,8 +23,7 @@ public double getQuality(CrawledDocument crawledDocument, HtmlStandard htmlStand

         return Math.log(textBodyLength / (double) (1+rawLength))*htmlStandard.scale
                 + htmlStandard.offset
-                - scriptPenalty
-                - smutCoefficient;
+                - scriptPenalty;
     }

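
With the smut coefficient gone, getQuality reduces to a text-density term plus per-standard constants. A worked example with made-up numbers; scale, offset, and the script penalty are stand-ins, the real values come from HtmlStandard and getScriptPenalty:

```java
int textBodyLength = 8_000;   // visible text, in characters
int rawLength = 40_000;       // raw HTML, in characters
double scale = 1.0, offset = 0.0, scriptPenalty = 1.5; // hypothetical

// log(8000 / 40001) ≈ -1.61: a markup-heavy, text-light page scores
// well below zero before the script penalty is even applied.
double quality = Math.log(textBodyLength / (double) (1 + rawLength)) * scale
        + offset
        - scriptPenalty; // ≈ -3.11
```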
@@ -13,7 +13,6 @@ public class TitleExtractor {
     @Inject
     public TitleExtractor(@Named("max-title-length") Integer maxTitleLength) {
         this.maxTitleLength = maxTitleLength;
-
     }

     public String getTitleAbbreviated(Document doc, DocumentLanguageData dld, String url) {
@@ -32,7 +32,7 @@ protected void checkDocumentLanguage(DocumentLanguageData dld) throws Disqualifi
     protected static class MetaTagsBuilder {
         private final Set<String> tagWords = new HashSet<>();

-        public Set<String> build(DocumentKeywordsBuilder dest) {
+        public Set<String> build() {
             return tagWords;
         }

@@ -5,6 +5,7 @@
 import nu.marginalia.converting.processor.MetaRobotsTag;
 import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
 import nu.marginalia.converting.processor.logic.links.LinkProcessor;
+import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.model.crawl.HtmlFeature;
 import nu.marginalia.summary.SummaryExtractor;
 import nu.marginalia.link_parser.LinkParser;
@@ -16,7 +17,6 @@
 import nu.marginalia.model.idx.DocumentFlags;
 import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
 import nu.marginalia.model.idx.DocumentMetadata;
-import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.converting.processor.logic.*;
 import nu.marginalia.model.crawl.PubDate;
 import nu.marginalia.gregex.GuardedRegex;
@@ -40,7 +40,6 @@

 public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin {

-    private final int minDocumentLength;
     private final double minDocumentQuality;

     private final SentenceExtractor sentenceExtractor;
@@ -50,23 +49,26 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
     private final SummaryExtractor summaryExtractor;
     private final PubDateSniffer pubDateSniffer;

+    private final DocumentLengthLogic documentLengthLogic;
+
     private final MetaRobotsTag metaRobotsTag;
     private static final DocumentValuator documentValuator = new DocumentValuator();

     private static final LinkParser linkParser = new LinkParser();
     private static final FeedExtractor feedExtractor = new FeedExtractor(linkParser);

     @Inject
-    public HtmlDocumentProcessorPlugin(@Named("min-document-length") Integer minDocumentLength,
-                                       @Named("min-document-quality") Double minDocumentQuality,
-                                       SentenceExtractor sentenceExtractor,
-                                       FeatureExtractor featureExtractor,
-                                       TitleExtractor titleExtractor,
-                                       DocumentKeywordExtractor keywordExtractor,
-                                       SummaryExtractor summaryExtractor,
-                                       PubDateSniffer pubDateSniffer,
-                                       MetaRobotsTag metaRobotsTag) {
-        this.minDocumentLength = minDocumentLength;
+    public HtmlDocumentProcessorPlugin(
+            @Named("min-document-quality") Double minDocumentQuality,
+            SentenceExtractor sentenceExtractor,
+            FeatureExtractor featureExtractor,
+            TitleExtractor titleExtractor,
+            DocumentKeywordExtractor keywordExtractor,
+            SummaryExtractor summaryExtractor,
+            PubDateSniffer pubDateSniffer,
+            DocumentLengthLogic documentLengthLogic,
+            MetaRobotsTag metaRobotsTag) {
+        this.documentLengthLogic = documentLengthLogic;
         this.minDocumentQuality = minDocumentQuality;
         this.sentenceExtractor = sentenceExtractor;
         this.featureExtractor = featureExtractor;
@@ -102,9 +104,7 @@ public DetailsWithWords createDetails(CrawledDomain crawledDomain, CrawledDocume

         final EdgeUrl url = new EdgeUrl(crawledDocument.url);

-        Document prunedDoc = prune(doc);
-
-        var dld = sentenceExtractor.extractSentences(prunedDoc);
+        DocumentLanguageData dld = sentenceExtractor.extractSentences(prune(doc));

         checkDocumentLanguage(dld);

@@ -113,11 +113,12 @@ public DetailsWithWords createDetails(CrawledDomain crawledDomain, CrawledDocume
         ret.length = getLength(doc);
         ret.standard = getHtmlStandard(doc);
         ret.title = titleExtractor.getTitleAbbreviated(doc, dld, crawledDocument.url);
-        ret.quality = documentValuator.getQuality(crawledDocument, ret.standard, doc, dld);
+        ret.quality = documentValuator.getQuality(crawledDocument, ret.standard, doc);

         // don't move this up! it uses title and quality
         // and is run before the heavy computations below
-        if (isDisqualified(url, dld, ret)) {
+        documentLengthLogic.validateLength(dld);
+        if (isDisqualified(url, ret)) {
             throw new DisqualifiedException(DisqualificationReason.QUALITY);
         }

@@ -128,6 +129,8 @@ public DetailsWithWords createDetails(CrawledDomain crawledDomain, CrawledDocume
         PubDate pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, ret.standard, true);
         EnumSet<DocumentFlags> documentFlags = htmlFeatures2DocumentFlags(ret.features);

+        documentLengthLogic.setLengthFlags(ret.length, documentFlags);
+
         ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, documentFlags);

         DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url);
@@ -138,7 +141,7 @@ public DetailsWithWords createDetails(CrawledDomain crawledDomain, CrawledDocume
                 .addUrl(url)
                 .addFeatures(ret.features)
                 .addFormat(ret.standard)
-                .build(words);
+                .build();

         words.addAllSyntheticTerms(tagWords);

@@ -179,13 +182,11 @@ private Document prune(Document doc) {

     private static final GuardedRegex mastodonFeedRegex = GuardedRegexFactory.startsWith("/@", "^/@[^/]+/?$");

-    private boolean isDisqualified(EdgeUrl url, DocumentLanguageData dld, ProcessedDocumentDetails ret) {
+    private boolean isDisqualified(EdgeUrl url, ProcessedDocumentDetails ret) {
         if (ret.quality < minDocumentQuality) {
             return true;
         }
-        if (dld.totalNumWords() < minDocumentLength) {
-            return true;
-        }

         // These pages shouldn't be publicly accessible
         if ("phpinfo()".equals(ret.title)) {
             return true;
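
One behavioral consequence of the reordering: length rejection now happens in documentLengthLogic.validateLength(dld) before the quality gate, so short documents surface with reason LENGTH instead of being folded into QUALITY. A hypothetical caller-side sketch; the try/catch shape is an assumption about the surrounding processor, not part of this diff:

```java
try {
    var detailsWithWords = plugin.createDetails(crawledDomain, crawledDocument);
    // ... hand off to keyword and index construction ...
} catch (DisqualifiedException e) {
    // Too-short documents now arrive here with reason LENGTH,
    // quality failures with reason QUALITY.
}
```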
@@ -2,6 +2,7 @@

 import com.google.inject.Inject;
 import com.google.inject.name.Named;
+import nu.marginalia.converting.processor.logic.DocumentLengthLogic;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawledDomain;
 import nu.marginalia.keyword.DocumentKeywordExtractor;
@@ -28,20 +29,21 @@

 public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin {

-    private final int minDocumentLength;
     private final int maxTitleLength;
     private final SentenceExtractor sentenceExtractor;
     private final DocumentKeywordExtractor keywordExtractor;
     private final PlainTextLogic plainTextLogic = new PlainTextLogic();
+    private final DocumentLengthLogic documentLengthLogic;


     @Inject
-    public PlainTextDocumentProcessorPlugin(@Named("min-document-length") Integer minDocumentLength,
-                                            @Named("max-title-length") Integer maxTitleLength,
+    public PlainTextDocumentProcessorPlugin(@Named("max-title-length") Integer maxTitleLength,
                                             SentenceExtractor sentenceExtractor,
-                                            DocumentKeywordExtractor keywordExtractor)
+                                            DocumentKeywordExtractor keywordExtractor,
+                                            DocumentLengthLogic documentLengthLogic
+                                            )
     {
-        this.minDocumentLength = minDocumentLength;
+        this.documentLengthLogic = documentLengthLogic;
         this.maxTitleLength = maxTitleLength;
         this.sentenceExtractor = sentenceExtractor;
         this.keywordExtractor = keywordExtractor;
@@ -68,15 +70,14 @@ public DetailsWithWords createDetails(CrawledDomain crawledDomain, CrawledDocume

         checkDocumentLanguage(dld);

-        if (dld.totalNumWords() < minDocumentLength) {
-            throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LENGTH);
-        }
+        documentLengthLogic.validateLength(dld);

         var ret = new ProcessedDocumentDetails();

         List<String> firstFewLines = LineUtils.firstNLines(documentBody, 40);

         ret.length = documentBody.length();
+
         ret.standard = HtmlStandard.PLAIN;
         ret.title = StringUtils.truncate(plainTextLogic.getTitle(url, firstFewLines), maxTitleLength);

@@ -88,7 +89,11 @@ public DetailsWithWords createDetails(CrawledDomain crawledDomain, CrawledDocume

         final PubDate pubDate = new PubDate(LocalDate.ofYearDay(1993, 1));

-        ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.of(DocumentFlags.PlainText));
+        EnumSet<DocumentFlags> documentFlags = EnumSet.of(DocumentFlags.PlainText);
+
+        documentLengthLogic.setLengthFlags(ret.length, documentFlags);
+
+        ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, documentFlags);

         DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url);
@@ -98,7 +103,7 @@ public DetailsWithWords createDetails(CrawledDomain crawledDomain, CrawledDocume
                 .addUrl(url)
                 .addFeatures(ret.features)
                 .addFormat(ret.standard)
-                .build(words);
+                .build();

         words.addAllSyntheticTerms(tagWords);

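
Tying the two plugins' changes together for plain text: the flag set is no longer a fixed EnumSet.of(PlainText) but also reflects document length. A small check, assuming a hypothetical minimum of 100 words:

```java
EnumSet<DocumentFlags> flags = EnumSet.of(DocumentFlags.PlainText);
new DocumentLengthLogic(100).setLengthFlags(1_200, flags); // 1200 chars < 2500

// The document is now tagged both PlainText and ShortDocument.
assert flags.equals(EnumSet.of(DocumentFlags.PlainText, DocumentFlags.ShortDocument));
```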
