Merge remote-tracking branch 'karussell/master'

Conflicts: src/main/java/de/jetwick/snacktory/ArticleTextExtractor.java src/main/java/de/jetwick/snacktory/OutputFormatter.java src/test/java/de/jetwick/snacktory/ArticleTextExtractorTest.java
HerbJiang · May 7, 2012 · 586bb7a · 586bb7a
2 parents 0e9269c + 4713c80
commit 586bb7a
Show file tree

Hide file tree

Showing 13 changed files with 225 additions and 130 deletions.
diff --git a/README b/README
@@ -2,32 +2,20 @@
 
 This is a small helper utility for people who don't want to write yet another Java clone of Readability.
 
-In most cases, this is applied to articles, although it should work for any website to find its 
-major area and extract its text and its important picture.
+In most cases, this is applied to articles, although it should work for any website to find its major
+area, extract its text, keywords, its main picture and more.
 
-Have a look into http://jetsli.de where Snacktory is used. Jetslide is a new way to consume news,
-it does not only display the Websites' title but it displays a small preview of the site ('a snack') 
-and the important image if available.
+Have a look at http://jetsli.de, where Snacktory is used. Jetslide is a new way to consume news: it not only displays the website's title but also a small preview of the site ('a snack') and the important image if available.
 
 = License 
 
 The software stands under Apache 2 License and comes with NO WARRANTY
 
 = Features
 
-Snacktory borrows some ideas from jReadability 
-
-https://github.com/ifesdjeen/jReadability
-
-and goose (ideas + a lot test cases):
-
+Snacktory borrows some ideas and a lot of test cases from goose:
 https://github.com/jiminoc/goose
 
-The advantages over jReadability are
-    * better article text detection than jReadability 
-    * only Java deps
-    * more tests
-
 The advantages over Goose are
     * similar article text detection, although with better detection for non-English sites (German, Japanese, ...)
     * snacktory does not depend on the word count in its text detection to support CJK languages
@@ -48,7 +36,7 @@ The disadvantages to Goose are
  <dependency>
     <groupId>de.jetwick</groupId>
     <artifactId>snacktory</artifactId>
-    <version>1.0-SNAPSHOT</version>
+    <version>1.1-SNAPSHOT</version>
  </dependency>
 
  Now you can use it as follows:

diff --git a/pom.xml b/pom.xml
@@ -26,7 +26,7 @@
         <dependency>
             <groupId>org.jsoup</groupId>
             <artifactId>jsoup</artifactId>
-            <version>1.5.2</version>
+            <version>1.6.2</version>
         </dependency>
         <dependency>
             <groupId>org.slf4j</groupId>

diff --git a/src/main/java/de/jetwick/snacktory/ArticleTextExtractor.java b/src/main/java/de/jetwick/snacktory/ArticleTextExtractor.java
@@ -1,17 +1,19 @@
 package de.jetwick.snacktory;
 
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collection;
+import java.util.Collections;
 import java.util.LinkedHashMap;
 import java.util.LinkedHashSet;
-import org.jsoup.*;
-import org.jsoup.nodes.Document;
-import org.jsoup.nodes.Element;
-
-import java.util.regex.Pattern;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import java.util.regex.Pattern;
+
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
 import org.jsoup.select.Elements;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -38,10 +40,11 @@ public class ArticleTextExtractor {
     private static final Pattern NEGATIVE =
             Pattern.compile("nav($|igation)|user|com(ment|bx)|(^com-)|contact|"
             + "foot|masthead|(me(dia|ta))|outbrain|promo|related|scroll|(sho(utbox|pping))|"
-            + "sidebar|sponsor|tags|tool|widget");
+            + "sidebar|sponsor|tags|tool|widget|player");
 
-    private static final Pattern IGNORE_IMAGE_PATTERN = 
-    		Pattern.compile("ico(/|n|\\.)|spacer|blank|zoom");
+    private static final Pattern NEGATIVE_STYLE = Pattern.compile("hidden|display: ?none");
+	private static final Pattern IGNORE_IMAGE_PATTERN = 
+			Pattern.compile("ico(/|n|\\.)|spacer|blank|zoom");
     private static final String IMAGE_CAPTION = "caption";
     private static final Set<String> IGNORED_TITLE_PARTS = new LinkedHashSet<String>() {
 
@@ -71,7 +74,13 @@ public JResult extractContent(JResult res, String html, OutputFormatter formatte
             throw new IllegalArgumentException("html string is empty!?");
 
         // http://jsoup.org/cookbook/extracting-data/selector-syntax
-        Document doc = Jsoup.parse(html);
+        return extractContent(res, Jsoup.parse(html), formatter);
+    }
+
+    public JResult extractContent(JResult res, Document doc, OutputFormatter formatter) throws Exception {
+        if (doc == null)
+            throw new NullPointerException("missing document");
+
         res.setTitle(extractTitle(doc));
 
         res.setDescription(extractDescription(doc));
@@ -129,6 +138,8 @@ public JResult extractContent(JResult res, String html, OutputFormatter formatte
 
         res.setFaviconUrl(extractFaviconUrl(doc));
 
+        res.setKeywords(extractKeywords(doc));
+
         return res;
     }
 
@@ -147,6 +158,22 @@ protected String extractDescription(Document doc){
         return SHelper.innerTrim(doc.select("head meta[name=description]").attr("content"));
     }
 
+    protected Collection<String> extractKeywords(Document doc){
+        String content = SHelper.innerTrim(doc.select("head meta[name=keywords]").attr("content"));
+
+        if(content != null) {
+            if(content.startsWith("[") && content.endsWith("]"))
+                content = content.substring(1, content.length() - 1);
+
+            String[] split = content.split("\\s*,\\s*");
+
+            if(split.length > 1 || !split[0].equals(""))
+                return Arrays.asList(split);
+        }
+
+        return Collections.emptyList();
+    }
+
     /***
      *  Tries to extract an image url from metadata if determineImageSource failed
      * @param doc
@@ -207,6 +234,10 @@ protected int getWeight(Element e) {
 
         if (NEGATIVE.matcher(e.id()).find())
             weight -= 50;
+
+        String style = e.attr("style");
+        if(style != null && !style.isEmpty() && NEGATIVE_STYLE.matcher(style).find())
+            weight -= 50;
 
         weight += (int) Math.round(e.ownText().length() / 100.0 * 10);
 
@@ -479,19 +510,16 @@ private String doTitleSplits(String title, String delimeter) {
      * @return a set of all important nodes
      */
     public Collection<Element> getNodes(Document doc) {        
-        Map<Integer, Element> nodes = new LinkedHashMap<Integer, Element>(32);
+        Map<Element, Object> nodes = new LinkedHashMap<Element, Object>(64);
         int score = 100;
         for (Element el : doc.select("body").select("*")) {
             if ("p;div;td;h1;h2".contains(el.tagName())) {
-                // TODO reduce calculation of hashcode!
-                nodes.put(el.hashCode(), el);
-                Element p = el.parent();
-                nodes.put(p.hashCode(), p);
+                nodes.put(el, null);
                 setScore(el, score);
                 score = score / 2;
             }
         }
-        return nodes.values();
+        return nodes.keySet();
     }
 
     public String cleanTitle(String title) {

diff --git a/src/main/java/de/jetwick/snacktory/Converter.java b/src/main/java/de/jetwick/snacktory/Converter.java
@@ -15,14 +15,10 @@
  */
 package de.jetwick.snacktory;
 
-import java.io.BufferedInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.UnsupportedEncodingException;
+import java.io.*;
 import java.net.SocketTimeoutException;
 import java.nio.charset.Charset;
 import org.apache.log4j.Logger;
-import org.apache.log4j.Priority;
 
 /**
  * This class is not thread safe. Use one new instance every time due to encoding
@@ -52,8 +48,13 @@ public Converter setMaxBytes(int maxBytes) {
         return this;
     }
 
-    public static String extractEncoding(String contentType) {
-        String[] values = contentType.split(";");
+    public static String extractEncoding(String contentType) {        
+        String[] values;
+        if(contentType != null)
+            values = contentType.split(";");
+        else            
+            values = new String[0];
+
         String charset = "";
 
         for (String value : values) {
@@ -100,18 +101,18 @@ public String streamToString(InputStream is, int maxBytes, String enc) {
         BufferedInputStream in = null;
         try {
             in = new BufferedInputStream(is, K2);
-            StringBuilder sb = new StringBuilder();
+            ByteArrayOutputStream output = new ByteArrayOutputStream();
 
             // detect encoding with the help of meta tag
             try {
                 in.mark(K2 * 2);
-                String tmpEnc = detectCharset("charset=", sb, in);
+                String tmpEnc = detectCharset("charset=", output, in, encoding);
                 if (tmpEnc != null)
                     encoding = tmpEnc;
                 else {
                     logger.debug("no charset found in first stage");
                     // detect with the help of xml beginning ala encoding="charset"
-                    tmpEnc = detectCharset("encoding=", sb, in);
+                    tmpEnc = detectCharset("encoding=", output, in, encoding);
                     if (tmpEnc != null)                        
                         encoding = tmpEnc;
                     else
@@ -129,8 +130,8 @@ public String streamToString(InputStream is, int maxBytes, String enc) {
             // SocketException: Connection reset
             // IOException: missing CR    => problem on server (probably some xml character thing?)
             // IOException: Premature EOF => socket unexpectly closed from server
-            int bytesRead = K2;
-            byte[] arr = new byte[K2];
+            int bytesRead = output.size();
+            byte[] arr = new byte[K2];            
             while (true) {
                 if (bytesRead >= maxBytes) {
                     logger.warn("Maxbyte of " + maxBytes + " exceeded! Maybe html is now broken but try it nevertheless. Url: " + url);
@@ -140,12 +141,11 @@ public String streamToString(InputStream is, int maxBytes, String enc) {
                 int n = in.read(arr);
                 if (n < 0)
                     break;
-                bytesRead += K2;
-                sb.append(new String(arr, 0, n, encoding));
+                bytesRead += n;
+                output.write(arr, 0, n);
             }
 
-            return sb.toString();
-
+            return output.toString(encoding);
         } catch (SocketTimeoutException e) {
             logger.info(e.toString() + " url:" + url);
         } catch (IOException e) {
@@ -168,7 +168,9 @@ public String streamToString(InputStream is, int maxBytes, String enc) {
      * 
      * @throws IOException 
      */
-    public String detectCharset(String key, StringBuilder sb, BufferedInputStream in) throws IOException {
+    protected String detectCharset(String key, ByteArrayOutputStream bos, BufferedInputStream in, 
+            String enc) throws IOException {
+
         // Grab better encoding from stream        
         byte[] arr = new byte[K2];
         int nSum = 0;
@@ -178,45 +180,46 @@ public String detectCharset(String key, StringBuilder sb, BufferedInputStream in
                 break;
 
             nSum += n;
-            sb.append(new String(arr, 0, n, encoding));
+            bos.write(arr, 0, n);
         }
 
-        int encIndex = sb.indexOf(key);
+        String str = bos.toString(enc);
+        int encIndex = str.indexOf(key);
         int clength = key.length();
         if (encIndex > 0) {
-            char startChar = sb.charAt(encIndex + clength);
+            char startChar = str.charAt(encIndex + clength);
             int lastEncIndex;
             if (startChar == '\'')
                 // if we have charset='something'
-                lastEncIndex = sb.indexOf("'", ++encIndex + clength);
+                lastEncIndex = str.indexOf("'", ++encIndex + clength);
             else if (startChar == '\"')
                 // if we have charset="something"
-                lastEncIndex = sb.indexOf("\"", ++encIndex + clength);
+                lastEncIndex = str.indexOf("\"", ++encIndex + clength);
             else {
                 // if we have "text/html; charset=utf-8"                    
-                int first = sb.indexOf("\"", encIndex + clength);
+                int first = str.indexOf("\"", encIndex + clength);
                 if (first < 0)
                     first = Integer.MAX_VALUE;
 
                 // or "text/html; charset=utf-8 "
-                int sec = sb.indexOf(" ", encIndex + clength);
+                int sec = str.indexOf(" ", encIndex + clength);
                 if (sec < 0)
                     sec = Integer.MAX_VALUE;
                 lastEncIndex = Math.min(first, sec);
 
                 // or "text/html; charset=utf-8 '
-                int third = sb.indexOf("'", encIndex + clength);
+                int third = str.indexOf("'", encIndex + clength);
                 if (third > 0)
                     lastEncIndex = Math.min(lastEncIndex, third);
             }
 
             // re-read byte array with different encoding
             // assume that the encoding string cannot be greater than 40 chars
             if (lastEncIndex > encIndex + clength && lastEncIndex < encIndex + clength + 40) {
-                String tmpEnc = SHelper.encodingCleanup(sb.substring(encIndex + clength, lastEncIndex));
+                String tmpEnc = SHelper.encodingCleanup(str.substring(encIndex + clength, lastEncIndex));
                 try {
                     in.reset();
-                    sb.setLength(0);
+                    bos.reset();
                     return tmpEnc;
                 } catch (IOException ex) {
                     logger.warn("Couldn't reset stream to re-read with new encoding " + tmpEnc + " "

diff --git a/src/main/java/de/jetwick/snacktory/JResult.java b/src/main/java/de/jetwick/snacktory/JResult.java
@@ -15,6 +15,8 @@
  */
 package de.jetwick.snacktory;
 
+import java.util.Collection;
+
 /**
  * Parsed result from web page containing important title, text and image.
  * 
@@ -32,7 +34,8 @@ public class JResult {
     private String faviconUrl;
     private String description;
     private String dateString;
-
+    private Collection<String> keywords;
+
     public JResult() {
     }
 
@@ -139,6 +142,14 @@ public JResult setDate(String date) {
         return this;
     }
 
+    public Collection<String> getKeywords() {
+        return keywords;
+    }
+
+    public void setKeywords(Collection<String> keywords) {
+        this.keywords = keywords;
+    }
+
     /**
      * @return get date from url or guessed from text
      */