Skip to content

Commit

Permalink
Tweaks to pub date heuristics to make it mostly get the 'historyofphilosophy.net' case right.
Browse files Browse the repository at this point in the history

Use HTML standard for plausibility checks in the more guesswork-like heuristics. Added more class names to look for date strings.
  • Loading branch information
vlofgren committed Jun 20, 2023
1 parent a9fabba commit 7326ba7
Show file tree
Hide file tree
Showing 8 changed files with 70 additions and 19 deletions.
@@ -0,0 +1,38 @@
package nu.marginalia.pubdate;

import nu.marginalia.converting.model.HtmlStandard;

public class PubDateFromHtmlStandard {
    /**
     * Returns a fixed fallback year for the given HTML standard.
     * Used to bias pub date heuristics.
     *
     * @param standard the HTML standard detected for the document
     * @return the year associated with that standard
     */
    public static int blindGuess(HtmlStandard standard) {
        return switch (standard) {
            case PLAIN -> 1993;
            case HTML123 -> 1997;
            case HTML4, XHTML -> 2006;
            case HTML5 -> 2018;
            case UNKNOWN -> 2000;
        };
    }

    /**
     * Sanity check a publication year based on the HTML standard.
     * It is for example unlikely for a HTML5 document to be published
     * in 1998, since that is well over a decade before the 2014 cutoff
     * this check applies to HTML5.
     * <p>
     * Discovering publication year involves a lot of guesswork, this helps
     * keep the guesses relatively sane.
     *
     * @param standard the HTML standard detected for the document
     * @param year     the candidate publication year
     * @return {@code true} if the year is not ruled out by the standard
     */
    public static boolean isGuessPlausible(HtmlStandard standard, int year) {
        return switch (standard) {
            case HTML123 -> year <= 2000;
            case HTML4, XHTML -> year >= 2000;
            case HTML5 -> year >= 2014;
            // Remaining standards impose no constraint on the year
            default -> true;
        };
    }

}
Expand Up @@ -26,6 +26,17 @@ public static Optional<PubDate> attemptParseDate(String date) {
.filter(PubDateParser::validateDate);
}

/**
 * Attempts to turn a free-form date string into a {@code PubDate}.
 * <p>
 * Strings that are null, shorter than 4 characters, or 32 characters
 * and longer are rejected outright. Otherwise the parsers are tried in
 * order: {@code parse8601}, then {@code parse1123}, and finally
 * {@code dateFromHighestYearLookingSubstringWithGuess} using the given
 * HTML standard. Whatever is found must also pass
 * {@code PubDateParser.validateDate}.
 *
 * @param date     the candidate date string, may be null
 * @param standard the HTML standard, passed on to the last-resort year heuristic
 * @return the parsed and validated date, or empty if nothing usable was found
 */
public static Optional<PubDate> attemptParseDate(String date, HtmlStandard standard) {
    if (date == null) {
        return Optional.empty();
    }

    final int length = date.length();
    if (length < 4 || length >= 32) {
        return Optional.empty();
    }

    // Fall through the parsers from strictest to most speculative
    Optional<PubDate> candidate = parse8601(date);
    if (candidate.isEmpty()) {
        candidate = parse1123(date);
    }
    if (candidate.isEmpty()) {
        candidate = dateFromHighestYearLookingSubstringWithGuess(date, standard);
    }

    return candidate.filter(PubDateParser::validateDate);
}

public static OptionalInt parseYearString(String yearString) {
try {
return OptionalInt.of(Integer.parseInt(yearString));
Expand Down Expand Up @@ -70,7 +81,9 @@ public static Optional<PubDate> dateFromHighestYearLookingSubstring(String maybe
}


public static Optional<PubDate> dateFromHighestYearLookingSubstringWithGuess(String maybe, int guess) {
public static Optional<PubDate> dateFromHighestYearLookingSubstringWithGuess(String maybe, HtmlStandard standard) {
int guess = PubDateFromHtmlStandard.blindGuess(standard);

var matcher = yearPattern.matcher(maybe);

int min = PubDate.MAX_YEAR + 1;
Expand Down Expand Up @@ -126,7 +139,7 @@ public static int guessYear(HtmlStandard standard) {
// Create some jitter to avoid having documents piling up in the same four years
// as this would make searching in those years disproportionately useless

double guess = standard.yearGuess + ThreadLocalRandom.current().nextGaussian();
double guess = PubDateFromHtmlStandard.blindGuess(standard) + ThreadLocalRandom.current().nextGaussian();

if (guess < PubDate.MIN_YEAR) {
return PubDate.MIN_YEAR;
Expand Down
Expand Up @@ -74,6 +74,8 @@ public boolean isCandidatForCopyrightNotice(String text) {
return true;
if (text.contains("opyright"))
return true;
if (text.contains("Posted on"))
return true;
if (text.contains("&copy;"))
return true;
if (text.contains("(c)"))
Expand All @@ -90,6 +92,8 @@ public boolean hasCommonClass(Element el) {
|| classes.contains("byline")
|| classes.contains("author")
|| classes.contains("submitted")
|| classes.contains("date")
|| classes.contains("datey")
|| el.id().contains("footer-info-lastmod"); // mediawiki
}

Expand Down Expand Up @@ -137,7 +141,7 @@ private void parse(String text) {
}
else {
PubDateParser
.dateFromHighestYearLookingSubstringWithGuess(text, htmlStandard.yearGuess)
.attemptParseDate(text)
.ifPresent(this::setPubDate);
}
}
Expand Down
Expand Up @@ -3,6 +3,7 @@
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateEffortLevel;
import nu.marginalia.pubdate.PubDateFromHtmlStandard;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;
import nu.marginalia.model.EdgeUrl;
Expand Down Expand Up @@ -42,7 +43,7 @@ private DateExtractingNodeVisitor(HtmlStandard htmlStandard) {
public FilterResult head(@NotNull Node node, int depth) {
if (node instanceof TextNode tn) onTextNode(tn);

if (hasPubDate()) {
if (hasPubDate() && PubDateFromHtmlStandard.isGuessPlausible(htmlStandard, pubDate.year())) {
return FilterResult.STOP;
}
return FilterResult.CONTINUE;
Expand Down Expand Up @@ -78,7 +79,7 @@ private void parse(String text) {
}
else {
PubDateParser
.dateFromHighestYearLookingSubstringWithGuess(text, htmlStandard.yearGuess)
.dateFromHighestYearLookingSubstringWithGuess(text, htmlStandard)
.ifPresent(this::setPubDate);
}
}
Expand Down
Expand Up @@ -235,7 +235,7 @@ public void testDOM() throws URISyntaxException {

assertFalse(ret.isEmpty());
assertNull(ret.dateIso8601());
assertEquals(2015, ret.year());
assertEquals(2012, ret.year());
}

@Test
Expand Down
Expand Up @@ -2,26 +2,21 @@


public enum HtmlStandard {
PLAIN(0, 1, 1993),
UNKNOWN(0, 1, 2000),
HTML123(0, 1, 1997),
HTML4(-0.1, 1.05, 2006),
XHTML(-0.1, 1.05, 2006),
HTML5(0.5, 1.1, 2018);
PLAIN(0, 1),
UNKNOWN(0, 1),
HTML123(0, 1),
HTML4(-0.1, 1.05),
XHTML(-0.1, 1.05),
HTML5(0.5, 1.1);

/** Used to tune quality score */
public final double offset;
/** Used to tune quality score */
public final double scale;

/** This parameter is used to bias publish date heuristics
* */
public final int yearGuess;

HtmlStandard(double offset, double scale, int yearGuess) {
HtmlStandard(double offset, double scale) {
this.offset = offset;
this.scale = scale;
this.yearGuess = yearGuess;
}

}
Expand Up @@ -16,6 +16,7 @@ public static HtmlStandard parseDocType(DocumentType docType) {
if (null == docType) {
return HtmlStandard.UNKNOWN;
}

String publicId = docType.publicId();
if (Strings.isNullOrEmpty(publicId))
return HtmlStandard.HTML5;
Expand Down
Expand Up @@ -266,7 +266,6 @@ private Set<String> createLinkKeywords(LinkProcessor lp) {

private HtmlStandard getHtmlStandard(Document doc) {
HtmlStandard htmlStandard = HtmlStandardExtractor.parseDocType(doc.documentType());

if (HtmlStandard.UNKNOWN.equals(htmlStandard)) {
return HtmlStandardExtractor.sniffHtmlStandard(doc);
}
Expand Down

0 comments on commit 7326ba7

Please sign in to comment.