クロマグロ残り2匹 葛西臨海水族園の大量死は未だに原因不明
+ + + + +このエントリは、期限切れになった
+diff --git a/cleaner.go b/cleaner.go index ff621f3..c367fbe 100644 --- a/cleaner.go +++ b/cleaner.go @@ -51,8 +51,9 @@ var removeNodesRegEx = regexp.MustCompile("" + "^banner|" + "^bar$|" + "blog-pager|" + - "button|" + + "brass\\-rail|" + "breadcrumbs|" + + "button|" + "byline|" + "cabecalho|" + "^caption$|" + @@ -210,7 +211,7 @@ var removeNodesRegEx = regexp.MustCompile("" + // Clean removes HTML elements around the main content and prepares the document for parsing func (c *Cleaner) Clean(docToClean *goquery.Document) *goquery.Document { if c.config.debug { - log.Println("Starting cleaning phase with Cleaner") + log.Println("Starting cleaning phase with Cleaner\n") } docToClean = c.cleanArticleTags(docToClean) docToClean = c.cleanEMTags(docToClean) diff --git a/crawler_test.go b/crawler_test.go index 43ff7bb..3369419 100644 --- a/crawler_test.go +++ b/crawler_test.go @@ -41,7 +41,7 @@ func ValidateArticle(expected Article, removed *[]string) error { } if !strings.Contains(result.CleanedText, expected.CleanedText) { - //fmt.Printf("EXPECTED: %s \n\n\n\nACTUAL: %s\n\n", expected.CleanedText, result.CleanedText) + fmt.Printf("EXPECTED: %s \n\n\n\nACTUAL: %s\n\n", expected.CleanedText, result.CleanedText) return fmt.Errorf("article cleanedText does not contain %q", expected.CleanedText) } @@ -570,6 +570,25 @@ func Test_HuffingtonPostCoUk(t *testing.T) { } } +func Test_HuffingtonPostJp(t *testing.T) { + article := Article{ + Domain: "huffingtonpost.jp", + Title: "クロマグロ残り2匹 葛西臨海水族園の大量死は未だに原因不明", + MetaDescription: "クロマグロやカツオ類が大量死した問題で、葛西臨海水族園(東京都江戸川区)は3日、病理検査の結果、海の養殖魚を大量死させることで知られる2種類のウイルスが原因ではないことが確認されたと発表した。", + CleanedText: "", + MetaKeywords: "クロマグロ残り2匹 葛西臨海水族園の大量死は未だに原因不明, japan", + CanonicalLink: "http://www.huffingtonpost.jp/2015/03/03/tuna-death_n_6796602.html", + TopImage: "http://i.huffpost.com/gen/2678692/images/o-TUNA-DEATH-facebook.jpg", + } + //article.Links = []string{""} + + removed := []string{"~~~REMOVED~~~"} + err := ValidateArticle(article, &removed) + if err != nil { + t.Error(err) + } +} + func Test_IncCom(t *testing.T) { article := Article{ Domain: "inc.com", diff --git a/outputformatter.go b/outputformatter.go index dfc4490..2e4d263 100644 --- a/outputformatter.go +++ b/outputformatter.go @@ -83,7 +83,6 @@ func (formatter *outputFormatter) linksToText() []string { } func (formatter *outputFormatter) getOutputText() string { - out := formatter.topNode.Text() out = normalizeWhitespaceRegexp.ReplaceAllString(out, " ") diff --git a/sites/huffingtonpost.jp.html b/sites/huffingtonpost.jp.html new file mode 100644 index 0000000..ea73f2d --- /dev/null +++ b/sites/huffingtonpost.jp.html @@ -0,0 +1,2351 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+このエントリは、期限切れになった
+
+-
+
+
+
+
+ -
+
+
+
+
+Follow
+
+
+
+
+ -
+
+ メールマガジンを講読
+
+
+
+
+
+
+
+
+ -
+
+ ハフィントンポスト内を検索
+
+
+
+
+
+
+
+
+
+
+