Skip to content

Commit

Permalink
Better logic for summarization.
Browse files Browse the repository at this point in the history
  • Loading branch information
vlofgren committed Jun 19, 2023
1 parent 67c15a3 commit f0b4acb
Showing 1 changed file with 42 additions and 17 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@
import org.jsoup.select.NodeFilter;

import java.util.*;
import java.util.function.Function;

import static nu.marginalia.summary.heuristic.HeuristicTextUtil.countOccurrencesOfAnyWord;
import static org.jsoup.internal.StringUtil.isActuallyWhitespace;
import static org.jsoup.internal.StringUtil.isInvisibleChar;

Expand Down Expand Up @@ -57,9 +59,9 @@ public boolean shouldPruneTag(Element tag) {
public String getSummary(int maxLength, Collection<String> importantWords) {
List<NodeStatistics> ret = new ArrayList<>(statistics.size());
for (var stats : statistics.values()) {
if (stats.textToTagRatio() < 0.85) continue;
if (stats.textToTagRatio() < 0.75) continue;
if (!stats.isElement() || !stats.isAppropriateTagType()) continue;
if (stats.textLength() < 128) continue;
if (stats.textLength() < 64) continue;
if (stats.isLink()) continue;

ret.add(stats);
Expand All @@ -81,38 +83,40 @@ public String getSummary(int maxLength, Collection<String> importantWords) {
return "";
}


// Words we don't want to appear in the summary. Each occurrence found in a
// text node is subtracted from that node's score in sortByWordRelevance.
// The reference is final: List.of already returns an unmodifiable list, and
// nothing visible here rebinds the field.
private static final List<String> badWords = List.of("copyright", "rights", "reserved", "post",
        "posted", "author", "published", "publish", "cookie", "cookies", "©", "terms", "conditions");

private List<NodeStatistics> sortByWordRelevance(List<NodeStatistics> in,
Collection<String> words) {
Collection<String> importantWords) {

if (words.isEmpty())
if (importantWords.isEmpty())
return in;

Map<NodeStatistics, Integer> ret = new HashMap<>(in.size());
int cntTotal = 0;

// This is a relatively small list at this point
// so this function isn't as bad as it looks

for (var stats : in) {
var lcText = stats.text().toLowerCase();
// text() is expensive, we don't mind sifting through superfluous whitespace
int cnt = stats.score(tn ->
countOccurrencesOfAnyWord(tn.getWholeText(), importantWords)
- countOccurrencesOfAnyWord(tn.getWholeText(), badWords));

int cnt = 0;
for (var word : words) {
if (lcText.contains(word)) {
cnt++;
cntTotal++;
}
if (cnt > 0) {
ret.put(stats, -cnt);
}

ret.put(stats, -cnt);
}

// Skip the sorting if we didn't match any words
if (cntTotal == 0) {
// Skip the sorting if we didn't match any importantWords
if (ret.isEmpty()) {
return in;
}

in.sort(Comparator.comparing(ret::get));
in.sort(Comparator.comparing(w -> ret.getOrDefault(w, 0)));

return in;
}

Expand Down Expand Up @@ -216,6 +220,27 @@ else if (node instanceof TextNode tn) {
}
return "";
}
/**
 * Returns the text of the wrapped node as reported by jsoup's "whole text"
 * accessors.
 *
 * @return {@code getWholeText()} for a {@link TextNode}, {@code wholeText()}
 *         for an {@link Element}, and the empty string for any other node
 */
public String wholeText() {
    // TextNode and Element are distinct node classes, so check order does not matter.
    if (node instanceof TextNode textNode) {
        return textNode.getWholeText();
    }
    if (node instanceof Element element) {
        return element.wholeText();
    }
    return "";
}

/**
 * Sums a per-text-node score over the wrapped node's traversal.
 *
 * @param fn scoring function applied to every {@link TextNode} visited
 * @return the sum of {@code fn}'s results; 0 when no text node is visited
 */
public int score(Function<TextNode, Integer> fn) {
    // Single-element array so the lambda can accumulate into a captured local.
    final int[] total = {0};

    node.traverse((visited, depth) -> {
        if (!(visited instanceof TextNode textNode)) {
            return;
        }
        total[0] += fn.apply(textNode);
    });

    return total[0];
}

public boolean isElement() {
return node instanceof Element;
Expand Down

0 comments on commit f0b4acb

Please sign in to comment.