Skip to content

Commit

Permalink
Merge remote-tracking branch 'karussell/master'
Browse files Browse the repository at this point in the history
Conflicts:
	src/main/java/de/jetwick/snacktory/ArticleTextExtractor.java
	src/main/java/de/jetwick/snacktory/OutputFormatter.java
	src/test/java/de/jetwick/snacktory/ArticleTextExtractorTest.java
  • Loading branch information
Herb Jiang committed May 7, 2012
2 parents 0e9269c + 4713c80 commit 586bb7a
Show file tree
Hide file tree
Showing 13 changed files with 225 additions and 130 deletions.
22 changes: 5 additions & 17 deletions README
Original file line number Diff line number Diff line change
Expand Up @@ -2,32 +2,20 @@

This is a small helper utility for people who don't want to write yet another Java clone of Readability.

In most cases, this is applied to articles, although it should work for any website to find its
major area and extract its text and its important picture.
In most cases, this is applied to articles, although it should work for any website to find its major
area, extract its text, keywords, its main picture and more.

Have a look into http://jetsli.de where Snacktory is used. Jetslide is a new way to consume news,
it does not only display the Websites' title but it displays a small preview of the site ('a snack')
and the important image if available.
Have a look at http://jetsli.de where Snacktory is used. Jetslide is a new way to consume news: it displays not only the website's title but also a small preview of the site ('a snack') and the important image if available.

= License

The software is licensed under the Apache 2 License and comes with NO WARRANTY

= Features

Snacktory borrows some ideas from jReadability

https://github.com/ifesdjeen/jReadability

and goose (ideas + a lot test cases):

Snacktory borrows some ideas and a lot of test cases from goose:
https://github.com/jiminoc/goose

The advantages over jReadability are
* better article text detection than jReadability
* only Java deps
* more tests

The advantages over Goose are
* similar article text detection although better detection for non-English sites (German, Japanese, ...)
* snacktory does not depend on the word count in its text detection to support CJK languages
Expand All @@ -48,7 +36,7 @@ The disadvantages to Goose are
<dependency>
<groupId>de.jetwick</groupId>
<artifactId>snacktory</artifactId>
<version>1.0-SNAPSHOT</version>
<version>1.1-SNAPSHOT</version>
</dependency>

Now you can use it as follows:
Expand Down
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.5.2</version>
<version>1.6.2</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
Expand Down
58 changes: 43 additions & 15 deletions src/main/java/de/jetwick/snacktory/ArticleTextExtractor.java
Original file line number Diff line number Diff line change
@@ -1,17 +1,19 @@
package de.jetwick.snacktory;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import org.jsoup.*;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import java.util.regex.Pattern;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand All @@ -38,10 +40,11 @@ public class ArticleTextExtractor {
private static final Pattern NEGATIVE =
Pattern.compile("nav($|igation)|user|com(ment|bx)|(^com-)|contact|"
+ "foot|masthead|(me(dia|ta))|outbrain|promo|related|scroll|(sho(utbox|pping))|"
+ "sidebar|sponsor|tags|tool|widget");
+ "sidebar|sponsor|tags|tool|widget|player");

private static final Pattern IGNORE_IMAGE_PATTERN =
Pattern.compile("ico(/|n|\\.)|spacer|blank|zoom");
private static final Pattern NEGATIVE_STYLE = Pattern.compile("hidden|display: ?none");
private static final Pattern IGNORE_IMAGE_PATTERN =
Pattern.compile("ico(/|n|\\.)|spacer|blank|zoom");
private static final String IMAGE_CAPTION = "caption";
private static final Set<String> IGNORED_TITLE_PARTS = new LinkedHashSet<String>() {

Expand Down Expand Up @@ -71,7 +74,13 @@ public JResult extractContent(JResult res, String html, OutputFormatter formatte
throw new IllegalArgumentException("html string is empty!?");

// http://jsoup.org/cookbook/extracting-data/selector-syntax
Document doc = Jsoup.parse(html);
return extractContent(res, Jsoup.parse(html), formatter);
}

public JResult extractContent(JResult res, Document doc, OutputFormatter formatter) throws Exception {
if (doc == null)
throw new NullPointerException("missing document");

res.setTitle(extractTitle(doc));

res.setDescription(extractDescription(doc));
Expand Down Expand Up @@ -129,6 +138,8 @@ public JResult extractContent(JResult res, String html, OutputFormatter formatte

res.setFaviconUrl(extractFaviconUrl(doc));

res.setKeywords(extractKeywords(doc));

return res;
}

Expand All @@ -147,6 +158,22 @@ protected String extractDescription(Document doc){
return SHelper.innerTrim(doc.select("head meta[name=description]").attr("content"));
}

protected Collection<String> extractKeywords(Document doc){
String content = SHelper.innerTrim(doc.select("head meta[name=keywords]").attr("content"));

if(content != null) {
if(content.startsWith("[") && content.endsWith("]"))
content = content.substring(1, content.length() - 1);

String[] split = content.split("\\s*,\\s*");

if(split.length > 1 || !split[0].equals(""))
return Arrays.asList(split);
}

return Collections.emptyList();
}

/***
* Tries to extract an image url from metadata if determineImageSource failed
* @param doc
Expand Down Expand Up @@ -207,6 +234,10 @@ protected int getWeight(Element e) {

if (NEGATIVE.matcher(e.id()).find())
weight -= 50;

String style = e.attr("style");
if(style != null && !style.isEmpty() && NEGATIVE_STYLE.matcher(style).find())
weight -= 50;

weight += (int) Math.round(e.ownText().length() / 100.0 * 10);

Expand Down Expand Up @@ -479,19 +510,16 @@ private String doTitleSplits(String title, String delimeter) {
* @return a set of all important nodes
*/
public Collection<Element> getNodes(Document doc) {
Map<Integer, Element> nodes = new LinkedHashMap<Integer, Element>(32);
Map<Element, Object> nodes = new LinkedHashMap<Element, Object>(64);
int score = 100;
for (Element el : doc.select("body").select("*")) {
if ("p;div;td;h1;h2".contains(el.tagName())) {
// TODO reduce calculation of hashcode!
nodes.put(el.hashCode(), el);
Element p = el.parent();
nodes.put(p.hashCode(), p);
nodes.put(el, null);
setScore(el, score);
score = score / 2;
}
}
return nodes.values();
return nodes.keySet();
}

public String cleanTitle(String title) {
Expand Down
57 changes: 30 additions & 27 deletions src/main/java/de/jetwick/snacktory/Converter.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,10 @@
*/
package de.jetwick.snacktory;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.io.*;
import java.net.SocketTimeoutException;
import java.nio.charset.Charset;
import org.apache.log4j.Logger;
import org.apache.log4j.Priority;

/**
* This class is not thread safe. Use one new instance every time due to encoding
Expand Down Expand Up @@ -52,8 +48,13 @@ public Converter setMaxBytes(int maxBytes) {
return this;
}

public static String extractEncoding(String contentType) {
String[] values = contentType.split(";");
public static String extractEncoding(String contentType) {
String[] values;
if(contentType != null)
values = contentType.split(";");
else
values = new String[0];

String charset = "";

for (String value : values) {
Expand Down Expand Up @@ -100,18 +101,18 @@ public String streamToString(InputStream is, int maxBytes, String enc) {
BufferedInputStream in = null;
try {
in = new BufferedInputStream(is, K2);
StringBuilder sb = new StringBuilder();
ByteArrayOutputStream output = new ByteArrayOutputStream();

// detect encoding with the help of meta tag
try {
in.mark(K2 * 2);
String tmpEnc = detectCharset("charset=", sb, in);
String tmpEnc = detectCharset("charset=", output, in, encoding);
if (tmpEnc != null)
encoding = tmpEnc;
else {
logger.debug("no charset found in first stage");
// detect with the help of xml beginning ala encoding="charset"
tmpEnc = detectCharset("encoding=", sb, in);
tmpEnc = detectCharset("encoding=", output, in, encoding);
if (tmpEnc != null)
encoding = tmpEnc;
else
Expand All @@ -129,8 +130,8 @@ public String streamToString(InputStream is, int maxBytes, String enc) {
// SocketException: Connection reset
// IOException: missing CR => problem on server (probably some xml character thing?)
// IOException: Premature EOF => socket unexpectly closed from server
int bytesRead = K2;
byte[] arr = new byte[K2];
int bytesRead = output.size();
byte[] arr = new byte[K2];
while (true) {
if (bytesRead >= maxBytes) {
logger.warn("Maxbyte of " + maxBytes + " exceeded! Maybe html is now broken but try it nevertheless. Url: " + url);
Expand All @@ -140,12 +141,11 @@ public String streamToString(InputStream is, int maxBytes, String enc) {
int n = in.read(arr);
if (n < 0)
break;
bytesRead += K2;
sb.append(new String(arr, 0, n, encoding));
bytesRead += n;
output.write(arr, 0, n);
}

return sb.toString();

return output.toString(encoding);
} catch (SocketTimeoutException e) {
logger.info(e.toString() + " url:" + url);
} catch (IOException e) {
Expand All @@ -168,7 +168,9 @@ public String streamToString(InputStream is, int maxBytes, String enc) {
*
* @throws IOException
*/
public String detectCharset(String key, StringBuilder sb, BufferedInputStream in) throws IOException {
protected String detectCharset(String key, ByteArrayOutputStream bos, BufferedInputStream in,
String enc) throws IOException {

// Grab better encoding from stream
byte[] arr = new byte[K2];
int nSum = 0;
Expand All @@ -178,45 +180,46 @@ public String detectCharset(String key, StringBuilder sb, BufferedInputStream in
break;

nSum += n;
sb.append(new String(arr, 0, n, encoding));
bos.write(arr, 0, n);
}

int encIndex = sb.indexOf(key);
String str = bos.toString(enc);
int encIndex = str.indexOf(key);
int clength = key.length();
if (encIndex > 0) {
char startChar = sb.charAt(encIndex + clength);
char startChar = str.charAt(encIndex + clength);
int lastEncIndex;
if (startChar == '\'')
// if we have charset='something'
lastEncIndex = sb.indexOf("'", ++encIndex + clength);
lastEncIndex = str.indexOf("'", ++encIndex + clength);
else if (startChar == '\"')
// if we have charset="something"
lastEncIndex = sb.indexOf("\"", ++encIndex + clength);
lastEncIndex = str.indexOf("\"", ++encIndex + clength);
else {
// if we have "text/html; charset=utf-8"
int first = sb.indexOf("\"", encIndex + clength);
int first = str.indexOf("\"", encIndex + clength);
if (first < 0)
first = Integer.MAX_VALUE;

// or "text/html; charset=utf-8 "
int sec = sb.indexOf(" ", encIndex + clength);
int sec = str.indexOf(" ", encIndex + clength);
if (sec < 0)
sec = Integer.MAX_VALUE;
lastEncIndex = Math.min(first, sec);

// or "text/html; charset=utf-8 '
int third = sb.indexOf("'", encIndex + clength);
int third = str.indexOf("'", encIndex + clength);
if (third > 0)
lastEncIndex = Math.min(lastEncIndex, third);
}

// re-read byte array with different encoding
// assume that the encoding string cannot be greater than 40 chars
if (lastEncIndex > encIndex + clength && lastEncIndex < encIndex + clength + 40) {
String tmpEnc = SHelper.encodingCleanup(sb.substring(encIndex + clength, lastEncIndex));
String tmpEnc = SHelper.encodingCleanup(str.substring(encIndex + clength, lastEncIndex));
try {
in.reset();
sb.setLength(0);
bos.reset();
return tmpEnc;
} catch (IOException ex) {
logger.warn("Couldn't reset stream to re-read with new encoding " + tmpEnc + " "
Expand Down
13 changes: 12 additions & 1 deletion src/main/java/de/jetwick/snacktory/JResult.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
*/
package de.jetwick.snacktory;

import java.util.Collection;

/**
* Parsed result from web page containing important title, text and image.
*
Expand All @@ -32,7 +34,8 @@ public class JResult {
private String faviconUrl;
private String description;
private String dateString;

private Collection<String> keywords;

public JResult() {
}

Expand Down Expand Up @@ -139,6 +142,14 @@ public JResult setDate(String date) {
return this;
}

/**
 * @return the keywords previously stored on this result, or {@code null}
 *         if none have been set
 */
public Collection<String> getKeywords() {
    return this.keywords;
}

/**
 * Stores the keywords for this result.
 *
 * @param keywords the collection to store; the reference is kept as-is,
 *                 no copy is made
 */
public void setKeywords(Collection<String> keywords) {
this.keywords = keywords;
}

/**
* @return get date from url or guessed from text
*/
Expand Down
Loading

0 comments on commit 586bb7a

Please sign in to comment.