-
-
Notifications
You must be signed in to change notification settings - Fork 22
/
PubDateHeuristicDOMParsingPass2.java
124 lines (99 loc) · 3.72 KB
/
PubDateHeuristicDOMParsingPass2.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
package nu.marginalia.pubdate.heuristic;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateEffortLevel;
import nu.marginalia.pubdate.PubDateFromHtmlStandard;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;
import nu.marginalia.model.EdgeUrl;
import org.jetbrains.annotations.NotNull;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.NodeFilter;
import java.util.Optional;
public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic {
/**
 * Runs the second DOM-parsing heuristic: traverses {@code document} with a
 * {@link DateExtractingNodeVisitor} and reports the date the visitor holds
 * when the traversal ends.
 *
 * <p>The {@code headers} and {@code url} arguments are not consulted here.
 *
 * @return the visitor's date, or an empty optional when {@code effortLevel}
 *         is {@code LOW} or the visitor never captured a date
 */
@Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel,
                               String headers,
                               EdgeUrl url,
                               Document document,
                               HtmlStandard htmlStandard) {
    // The DOM walk is skipped entirely at the lowest effort level.
    if (PubDateEffortLevel.LOW == effortLevel) {
        return Optional.empty();
    }

    final DateExtractingNodeVisitor visitor = new DateExtractingNodeVisitor(htmlStandard);
    document.filter(visitor);

    return Optional.ofNullable(visitor.pubDate);
}
/**
 * Node filter that inspects text nodes for year-like content and remembers the
 * most recent date the parser produced from one of them.
 *
 * <p>Traversal is stopped as soon as the remembered date's year is judged
 * plausible for the document's HTML standard.
 */
private static class DateExtractingNodeVisitor implements NodeFilter {
    /** Last date handed to {@link #setPubDate(PubDate)}; {@code null} until one is parsed. */
    public PubDate pubDate;

    /** HTML standard used both to guide parsing and to judge plausibility. */
    private final HtmlStandard htmlStandard;

    private DateExtractingNodeVisitor(HtmlStandard htmlStandard) {
        this.htmlStandard = htmlStandard;
    }

    /**
     * Feeds text nodes to {@link #onTextNode(TextNode)}, then decides whether
     * the traversal can end.
     *
     * @return {@code STOP} when a date is held and its year passes the
     *         plausibility check, otherwise {@code CONTINUE}
     */
    @NotNull
    @Override
    public FilterResult head(@NotNull Node node, int depth) {
        if (node instanceof TextNode textNode) {
            onTextNode(textNode);
        }

        final boolean done = hasPubDate()
                && PubDateFromHtmlStandard.isGuessPlausible(htmlStandard, pubDate.year());

        return done ? FilterResult.STOP : FilterResult.CONTINUE;
    }

    /** Nothing to do when leaving a node; always continues. */
    @NotNull
    @Override
    public FilterResult tail(@NotNull Node node, int depth) {
        return FilterResult.CONTINUE;
    }

    /** Parses the node's whole text, but only if it passes the cheap candidate check. */
    public void onTextNode(TextNode tn) {
        final String wholeText = tn.getWholeText();
        if (!isPossibleCandidate(wholeText)) {
            return;
        }
        parse(wholeText);
    }

    /** @return {@code true} once a date has been stored */
    public boolean hasPubDate() {
        return null != pubDate;
    }

    /** Stores {@code pubDate}, replacing any previously stored date. */
    public void setPubDate(PubDate pubDate) {
        this.pubDate = pubDate;
    }

    /**
     * Asks the date parser for a date in {@code text} and stores it if one is
     * returned. The HTML standard is passed along as a guess unless it is
     * {@code UNKNOWN}.
     */
    private void parse(String text) {
        if (htmlStandard != HtmlStandard.UNKNOWN) {
            PubDateParser
                    .dateFromHighestYearLookingSubstringWithGuess(text, htmlStandard)
                    .ifPresent(this::setPubDate);
            return;
        }

        PubDateParser
                .dateFromHighestYearLookingSubstring(text)
                .ifPresent(this::setPubDate);
    }
}
/**
 * Cheap pre-filter deciding whether a text snippet is worth handing to the
 * date parser.
 *
 * <p>Returns {@code true} when {@code text} is between 4 and 23 characters
 * long and contains a run of exactly four digits that is delimited on each
 * side either by the start/end of the string or by a break character (see
 * {@link #isGoodBreak(char)}). This is roughly the regex
 * {@code (^|[\s./,-])\d{4}([\s./,-]|$)}, written out by hand because
 * evaluating a regex for every text node is too slow.
 *
 * @param text the text to inspect; {@code null} is treated as "not a candidate"
 * @return {@code true} if the text contains a year-like four-digit token
 */
public static boolean isPossibleCandidate(String text) {
    if (text == null) {
        return false;
    }

    final int length = text.length();
    // Shorter than a four-digit year, or 24+ characters: never a candidate.
    if (length < 4 || length >= 24) {
        return false;
    }

    int digitRun = 0;          // length of the digit run currently being scanned
    boolean goodStart = true;  // whether the current run was preceded by a break
    char prev = ' ';           // the start of the string counts as a break

    for (int i = 0; i < length; i++) {
        final char c = text.charAt(i);

        if (Character.isDigit(c)) {
            // Only the first digit of a run decides whether the run started cleanly.
            if (digitRun++ == 0) {
                goodStart = isGoodBreak(prev);
            }
        }
        else {
            if (digitRun == 4 && goodStart && isGoodBreak(c)) {
                return true;
            }
            digitRun = 0;
        }

        prev = c;
    }

    // A four-digit run reaching the end of the string is terminated by the end itself.
    return digitRun == 4 && goodStart;
}

/**
 * Tells whether {@code c} may delimit a year-like digit run: one of
 * {@code . / - ,} or any space or whitespace character.
 *
 * <p>Both {@link Character#isSpaceChar(char)} and
 * {@link Character#isWhitespace(char)} are consulted. The former covers
 * Unicode space separators such as the no-break space (U+00A0); the latter
 * covers tab, line feed and carriage return, which are not space separators
 * but can appear in unnormalized text.
 */
private static boolean isGoodBreak(char c) {
    return "./-,".indexOf(c) >= 0
            || Character.isSpaceChar(c)
            || Character.isWhitespace(c);
}
}