diff --git a/cleaner.go b/cleaner.go index ff621f3..c367fbe 100644 --- a/cleaner.go +++ b/cleaner.go @@ -51,8 +51,9 @@ var removeNodesRegEx = regexp.MustCompile("" + "^banner|" + "^bar$|" + "blog-pager|" + - "button|" + + "brass\\-rail|" + "breadcrumbs|" + + "button|" + "byline|" + "cabecalho|" + "^caption$|" + @@ -210,7 +211,7 @@ var removeNodesRegEx = regexp.MustCompile("" + // Clean removes HTML elements around the main content and prepares the document for parsing func (c *Cleaner) Clean(docToClean *goquery.Document) *goquery.Document { if c.config.debug { - log.Println("Starting cleaning phase with Cleaner") + log.Println("Starting cleaning phase with Cleaner\n") } docToClean = c.cleanArticleTags(docToClean) docToClean = c.cleanEMTags(docToClean) diff --git a/crawler_test.go b/crawler_test.go index 43ff7bb..3369419 100644 --- a/crawler_test.go +++ b/crawler_test.go @@ -41,7 +41,7 @@ func ValidateArticle(expected Article, removed *[]string) error { } if !strings.Contains(result.CleanedText, expected.CleanedText) { - //fmt.Printf("EXPECTED: %s \n\n\n\nACTUAL: %s\n\n", expected.CleanedText, result.CleanedText) + fmt.Printf("EXPECTED: %s \n\n\n\nACTUAL: %s\n\n", expected.CleanedText, result.CleanedText) return fmt.Errorf("article cleanedText does not contain %q", expected.CleanedText) } @@ -570,6 +570,25 @@ func Test_HuffingtonPostCoUk(t *testing.T) { } } +func Test_HuffingtonPostJp(t *testing.T) { + article := Article{ + Domain: "huffingtonpost.jp", + Title: "クロマグロ残り2匹 葛西臨海水族園の大量死は未だに原因不明", + MetaDescription: "クロマグロやカツオ類が大量死した問題で、葛西臨海水族園(東京都江戸川区)は3日、病理検査の結果、海の養殖魚を大量死させることで知られる2種類のウイルスが原因ではないことが確認されたと発表した。", + CleanedText: "", + MetaKeywords: "クロマグロ残り2匹 葛西臨海水族園の大量死は未だに原因不明, japan", + CanonicalLink: "http://www.huffingtonpost.jp/2015/03/03/tuna-death_n_6796602.html", + TopImage: "http://i.huffpost.com/gen/2678692/images/o-TUNA-DEATH-facebook.jpg", + } + //article.Links = []string{""} + + removed := []string{"~~~REMOVED~~~"} + err := ValidateArticle(article, &removed) + if err != nil { + t.Error(err) + } +} + func Test_IncCom(t *testing.T) { article := Article{ Domain: "inc.com", diff --git a/outputformatter.go b/outputformatter.go index dfc4490..2e4d263 100644 --- a/outputformatter.go +++ b/outputformatter.go @@ -83,7 +83,6 @@ func (formatter *outputFormatter) linksToText() []string { } func (formatter *outputFormatter) getOutputText() string { - out := formatter.topNode.Text() out = normalizeWhitespaceRegexp.ReplaceAllString(out, " ") diff --git a/sites/huffingtonpost.jp.html b/sites/huffingtonpost.jp.html new file mode 100644 index 0000000..ea73f2d --- /dev/null +++ b/sites/huffingtonpost.jp.html @@ -0,0 +1,2351 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + クロマグロ残り2匹 葛西臨海水族園の大量死は未だに原因不明 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + +
+ +
+ +
+ + + + +
+
+ + +
+
+ + + + Huffpost Japan + +
+ + + + + + + + + + +
+
+
+ +
+ + + + + + + + + +
+ +
+ + + + + + +
+
+
+ + + + + + +
+ + + + + + +
+ + +
+ + +
+
+ + +

クロマグロ残り2匹 葛西臨海水族園の大量死は未だに原因不明

+ + +
+ +
+ +
+ +
+ + 投稿日: + + + 更新: + +
+
+ + + + +
+ TUNA DEATH +
葛西臨海水族園の水槽の中を悠々と泳ぐ2匹のクロマグロ=3日午後5時24分、東京都江戸川区、小宮路勝撮影 | 朝日新聞社
+ + + + + + + + 印刷 + +
+
+ + +
+
+ + +
+
+ +
+ +
+
+ + + +

このエントリは、期限切れになった

+
+ +
+
+
+ +
+

Also on HuffPost:

+
+ + + +
+
+ + +
+
+ Close +
+ + + +
+
+
+
+ + グロ可愛い?深海生物の世界へようこそ +
+ + +
+ + / + +
+ +
+
+ +
+
+ + +
+ +
+
+
+ + +
+
    +
  • + +
  • +
  • + +
  • +
+
+ +
+ + +
+
+
+
+
シェア
+
+
+
ツイート
+
+
+
+
+
+
+
+
+
+
+ + +
+
+
+
+
+
+ + + +
+
AD
+
+
+
+ +
+
+
+
+
+ + +
+ +
+ + + + + +
+
+
+
この記事をシェア:
+ 閉じる +
+ +
+ 現在のスライド
+
+
+
+ + +
+
+ +
+
+ +
+
+ + + + +
+

訂正箇所を連絡

+
+ + + +
+
+ +
+
+
+ + + + + + + + +
+

他のサイトの関連記事

+

いまだ原因不明 葛西臨海水族園のマグロ大量死

未知ウイルスかストレスか マグロ156匹死亡の異常事態“真相”

「おさかなさん がんばれ」 葛西臨海水族園、生き残りクロマグロに激励の手紙

+ +
+ +
+ +
+
+ +
 
+ +
+ + + + +
+ +
+ +
+
+
+ + + +
+ +
+
+ +
+ +
+ +
+ + + + +
+ +
+
+ +
+
+ + + + + + + + + + +
+
+ +
+
+ + +
+
+ + + + +
+ +
+
+ + + + +
+ +
+ +
+
+ + + + +
+ + + +
+ + + +
+ +
+
+ + + + +
+
+ + +
+ + + +
+ + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/sites/instagram.html b/sites/instagram.html new file mode 100644 index 0000000..afaea5a --- /dev/null +++ b/sites/instagram.html @@ -0,0 +1,270 @@ + + + + + + + + + + + +Instagram + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file