Skip to content

Commit

Permalink
New synthetic keyword for document generator meta tag.
Browse files Browse the repository at this point in the history
  • Loading branch information
vlofgren committed Jun 20, 2023
1 parent 7326ba7 commit a9a2960
Show file tree
Hide file tree
Showing 3 changed files with 105 additions and 1 deletion.
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
package nu.marginalia.converting.processor.logic;

import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Document;

import java.util.Collections;
import java.util.List;
import java.util.Locale;

/** Extract keywords for the document meta generator tag */
public class DocumentGeneratorExtractor {

public List<String> generatorCleaned(Document doc) {

String generator = doc
.select("meta[name=generator]")
.attr("content");

// Remove leading or trailing junk from the generator string, "powered by" etc.
generator = trim(generator);

if (generator.isBlank())
return Collections.emptyList();

String[] parts = StringUtils.split(generator, " ,:!");
if (parts.length == 0)
return Collections.emptyList();

int slashIdx = parts[0].indexOf('/');
if (slashIdx >= 0) {
// mozilla and staroffice has a really weird format
return List.of(parts[0].substring(0, slashIdx));
}

if (parts.length > 3) {
return Collections.emptyList(); // if it's still very long after trim(), it's probably a custom hand written message
}

switch (parts[0]) {
case "joomla!":
return List.of("joomla");
case "plone":
case "claris":
case "one.com":
case "wix.com":
case "wpbakery":
return List.of(parts[0]);
case "adobe":
case "microsoft":
return List.of(parts[1]);
}

if (parts.length > 1) {
return List.of(parts[0], parts[0] + "_" + truncVersion(parts[1]));
}
else {
return List.of(parts[0]);
}
}

private String trim(String generator) {

generator = generator.toLowerCase().trim();
if (generator.startsWith("powered by ")) {
generator = generator.substring("powered by ".length());
}

int dashIdx = generator.indexOf('-'); // Some strings have values like 'foobar 2.3 - the free online generator!'
if (dashIdx >= 0) {
generator = generator.substring(0, dashIdx);
}

if (!StringUtils.isAsciiPrintable(generator))
return "";

return generator;
}

// Censor exact version strings, being able to search by major version is enough
// for any non-blackhat purpose
private String truncVersion(String part) {
int periodIdx = part.indexOf('.', part.startsWith("0.") ? 2 : 0);

if (periodIdx < 0)
return part;

return part.substring(0, periodIdx);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,15 @@ public MetaTagsBuilder addUrl(EdgeUrl url) {
return this;
}

/**
 * Registers every entry of the given list under the "generator" key.
 *
 * @param generators the generator keywords to add
 * @return this builder
 */
public MetaTagsBuilder addGenerator(List<String> generators) {
    generators.forEach(keyword -> add("generator", keyword));
    return this;
}

public MetaTagsBuilder addFormat(HtmlStandard standard) {

add("format", standard);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
private final DocumentLengthLogic documentLengthLogic;

private final MetaRobotsTag metaRobotsTag;
private final DocumentGeneratorExtractor documentGeneratorExtractor;
private static final DocumentValuator documentValuator = new DocumentValuator();

private static final LinkParser linkParser = new LinkParser();
Expand All @@ -69,7 +70,8 @@ public HtmlDocumentProcessorPlugin(
SummaryExtractor summaryExtractor,
PubDateSniffer pubDateSniffer,
DocumentLengthLogic documentLengthLogic,
MetaRobotsTag metaRobotsTag) {
MetaRobotsTag metaRobotsTag,
DocumentGeneratorExtractor documentGeneratorExtractor) {
this.documentLengthLogic = documentLengthLogic;
this.minDocumentQuality = minDocumentQuality;
this.sentenceExtractor = sentenceExtractor;
Expand All @@ -81,6 +83,7 @@ public HtmlDocumentProcessorPlugin(
this.pubDateSniffer = pubDateSniffer;
this.metaRobotsTag = metaRobotsTag;

this.documentGeneratorExtractor = documentGeneratorExtractor;
}

@Override
Expand Down Expand Up @@ -143,12 +146,15 @@ public DetailsWithWords createDetails(CrawledDomain crawledDomain, CrawledDocume

ret.description = getDescription(doc, words.importantWords);

List<String> generatorParts = documentGeneratorExtractor.generatorCleaned(doc);

var tagWords = new MetaTagsBuilder()
.addDomainCrawlData(crawledDomain)
.addPubDate(pubDate)
.addUrl(url)
.addFeatures(features)
.addFormat(standard)
.addGenerator(generatorParts)
.build();

words.addAllSyntheticTerms(tagWords);
Expand Down

0 comments on commit a9a2960

Please sign in to comment.