In [1]:
import datetime
import itertools
import os
import re
import requests
import sys
import time
from copy import deepcopy
from collections import namedtuple, defaultdict

from bs4 import BeautifulSoup as BS
from tenacity import retry, stop_after_attempt
import simplejson as json
from selenium import webdriver

In [2]:
# Immutable record describing a single crawled news article.
ArticleMeta = namedtuple(
    'ArticleMeta',
    'url date time category title content',
)

In [3]:
class NewsCrawler(object):
    """Base class holding the fetching machinery shared by site-specific crawlers.

    It owns a ``requests`` session for plain page downloads and a PhantomJS
    driver for pages that only load their full content after scrolling.
    Subclasses are expected to override ``newslink_generator`` and
    ``bad_newspage_checker``.
    """

    def __init__(self, output_dir, total_days, start_date=None):
        """Set up the HTTP session, the headless browser and the crawl window.

        Args:
            output_dir: Directory path stored for subclasses to write into.
            total_days: Number of consecutive days ``date_generator`` yields.
            start_date: First (most recent) day of the window. ``None`` means
                "today", resolved when the instance is created rather than
                when the class is defined, so a long-running kernel does not
                keep using a stale date.
        """
        self.session = requests.Session()
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7,ja;q=0.6",
            "Connection": "keep-alive",
            "Referer": "https://www.google.com.tw/",
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) "\
                          "Chrome/69.0.3497.92 Safari/537.36"
        }
        # Seconds to wait after each scroll before re-measuring the page height.
        self.scroll_pause_time = 0.3
        self.driver = webdriver.PhantomJS(executable_path='../phantomjs-2.1.1-linux-x86_64/bin/phantomjs')
        self.output_dir = output_dir
        self.total_days = total_days
        self.start_date = datetime.date.today() if start_date is None else start_date
        # Links already seen; available to subclasses for de-duplication.
        self.newslinks = set()

    def close(self):
        """Shut down the PhantomJS driver and close the HTTP session."""
        try:
            self.driver.quit()
        finally:
            self.session.close()

    @retry(stop=stop_after_attempt(3))
    def get_bsObj(self, url):
        """Download ``url`` with the shared session and parse it.

        The call is wrapped by tenacity and attempted at most three times
        when an exception is raised.

        Returns:
            A BeautifulSoup object, or ``None`` when the final response URL
            differs from the requested one (i.e. the request was redirected).
        """
        req = self.session.get(url, headers=self.headers)
        if req.url != url:
            return None
        bsObj = BS(req.text, "html.parser")
        return bsObj

    def get_bsObj_scroll_down(self, url):
        """Load ``url`` in the headless browser, scroll to the end, and parse it.

        The page is scrolled to the bottom repeatedly until the document
        height stops changing between two consecutive measurements.

        Returns:
            A BeautifulSoup object built from the driver's final page source.
        """
        self.driver.get(url)
        # Get scroll height
        last_height = self.driver.execute_script("return document.body.scrollHeight")
        while True:
            # Scroll down to bottom
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            # Wait to load page
            time.sleep(self.scroll_pause_time)
            # Calculate new scroll height and compare with last scroll height
            new_height = self.driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        return BS(self.driver.page_source, "html.parser")

    def date_generator(self):
        """Yield ``total_days`` date strings, starting at ``start_date`` and going back one day at a time."""
        date = self.start_date
        for _ in range(self.total_days):
            yield str(date)
            date = date - datetime.timedelta(days=1)

    # the functions below will be different for several news site.
    def newslink_generator(self):
        """Hook for subclasses; the base implementation does nothing."""
        pass

    def bad_newspage_checker(self, bsObj):
        """Hook for subclasses; the base implementation does nothing."""
        pass

In [4]:
class EttodayNewsCrawler(NewsCrawler):
    """Crawler for www.ettoday.net: walks daily news lists and saves each article as JSON."""

    def date_to_newslist_url(self, date):
        """Return the news-list URL for ``date`` (the string is substituted verbatim)."""
        return "https://www.ettoday.net/news/news-list-%s-0.htm" % date

    def unfold_abbr_newslink(self, url):
        """Turn a site-relative link into an absolute ettoday.net URL."""
        return "https://www.ettoday.net" + url

    def newslink_generator(self):
        """Yield the ``href`` of every dated entry on each day's news-list page.

        An entry is a ``<span class="date">`` whose text matches
        ``YYYY/MM/DD HH:MM``; the link is taken from the first ``<a>`` under
        that span's parent element.
        """
        ettoday_date_form = re.compile("[0-9]{4}/[0-9]{2}/[0-9]{2} [0-9]{2}:[0-9]{2}")

        for date in self.date_generator():
            newslist_url = self.date_to_newslist_url(date)
            # The list page is fetched through the scrolling browser helper.
            newslist = self.get_bsObj_scroll_down(newslist_url)
            news_datetimes = newslist.find_all('span', class_='date', text=ettoday_date_form)

            for news_datetime in news_datetimes:
                yield news_datetime.parent.a['href']

    def parse_category(self, newspage):
        """Return the span text of the second ``<a itemprop="item">`` on the page."""
        # NOTE(review): presumably these anchors are breadcrumb entries — confirm.
        return newspage.find_all('a', itemprop='item')[1].span.text

    def parse_title(self, newspage):
        """Return the text of the headline ``<h1 class="title">`` element."""
        return newspage.find('h1', class_='title', itemprop='headline').text

    def parse_article(self, newspage):
        """Collect the article body text.

        Only ``<p>`` elements inside ``<article>`` that have no child tags
        are kept; paragraphs containing any nested element are skipped.

        Returns:
            ``(text, journalist)`` where ``text`` is the kept paragraphs
            joined by newlines and ``journalist`` is always ``None`` (it is
            never extracted here).
        """
        paragraphs = []
        journalist = None
        for paragraph in newspage.find('article').find_all('p'):
            if paragraph.findChild():
                continue
            paragraphs.append(paragraph.text)
        return '\n'.join(paragraphs), journalist

    def parse_date_time(self, newspage):
        """Return ``(date_str, time_str)`` parsed from the page's ``<time datetime=...>``."""
        # workaround: drop the last six characters before parsing
        # (presumably a "+HH:MM" UTC offset — TODO confirm).
        datetime_string = newspage.find('time')['datetime'][:-6]
        dt = datetime.datetime.strptime(datetime_string, "%Y-%m-%dT%H:%M:%S")
        return str(dt.date()), str(dt.time())

    def is_valid_newspage(self, bsObj):
        """Return False for a missing page or a page whose first ``<em>`` reads '404錯誤'.

        A page without any ``<em>`` element is treated as valid.
        """
        if bsObj is None:
            return False

        # Inspect the page that was passed in. The previous code referenced an
        # undefined name, and the resulting NameError was swallowed by a bare
        # except, so every non-None page was reported as valid.
        marker = bsObj.find('em')
        if marker is None:
            return True
        return marker.text != '404錯誤'

    def saved_filename(self, url):
        """Derive the output file name: last URL path segment up to its first dot, plus '.json'."""
        return url.split('/')[-1].split('.')[0] + '.json'

    def crawl_and_save(self):
        """Fetch every not-yet-seen article and write it to ``<output_dir>/<date>/<id>.json``.

        Links already in ``self.newslinks`` are skipped. Pages that were
        redirected or fail the validity check are reported and skipped.
        """
        for newslink in self.newslink_generator():

            unfolded_newslink = self.unfold_abbr_newslink(newslink)
            if unfolded_newslink in self.newslinks:
                continue
            self.newslinks.add(unfolded_newslink)

            page = self.get_bsObj(unfolded_newslink)
            if not self.is_valid_newspage(page):
                print('Invalid Page or Redirected Page:', unfolded_newslink)
                continue

            print('Crawling News:', unfolded_newslink)
            category = self.parse_category(page)
            # parse_article returns (text, journalist); only the text belongs
            # in `content` (storing the whole tuple produced [text, null]).
            text, _journalist = self.parse_article(page)
            date_str, time_str = self.parse_date_time(page)
            title = self.parse_title(page)

            article = ArticleMeta(
                url=unfolded_newslink,
                date=date_str,
                time=time_str,
                category=category,
                title=title,
                content=text
            )

            output_dir_with_date = os.path.join(self.output_dir, date_str)
            os.makedirs(output_dir_with_date, exist_ok=True)
            filename = os.path.join(output_dir_with_date, self.saved_filename(unfolded_newslink))
            # ensure_ascii=False emits raw non-ASCII text, so fix the file
            # encoding instead of relying on the platform default.
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(article._asdict(), f, ensure_ascii=False, indent=4)

In [5]:
# Build a crawler covering a single day (no start date is passed, so the
# class default applies) that writes beneath news/ettoday.
ettoday_crawler = EttodayNewsCrawler(
    output_dir='news/ettoday',
    total_days=1,
)



In [6]:
# Run the crawl; a status line is printed for each new article link visited.
ettoday_crawler.crawl_and_save()

Crawling News: https://www.ettoday.net/news/20191209/1598187.htm
Invalid Page or Redirected Page: https://www.ettoday.net/news/20191209/1598181.htm
Invalid Page or Redirected Page: https://www.ettoday.net/news/20191209/1598184.htm
Crawling News: https://www.ettoday.net/news/20191209/1598183.htm
Crawling News: https://www.ettoday.net/news/20191209/1598180.htm
Crawling News: https://www.ettoday.net/news/20191209/1598172.htm
Crawling News: https://www.ettoday.net/news/20191209/1598085.htm
Invalid Page or Redirected Page: https://www.ettoday.net/news/20191209/1598175.htm
Crawling News: https://www.ettoday.net/news/20191209/1598178.htm
Invalid Page or Redirected Page: https://www.ettoday.net/news/20191209/1591381.htm
Crawling News: https://www.ettoday.net/news/20191209/1598114.htm
Invalid Page or Redirected Page: https://www.ettoday.net/news/20191209/1598177.htm
Invalid Page or Redirected Page: https://www.ettoday.net/news/20191209/1598138.htm
Invalid Page or Redirected Page: https://www.et

Invalid Page or Redirected Page: https://www.ettoday.net/news/20191209/1597607.htm
Crawling News: https://www.ettoday.net/news/20191209/1597783.htm
Crawling News: https://www.ettoday.net/news/20191209/1598061.htm
Crawling News: https://www.ettoday.net/news/20191209/1598072.htm
Invalid Page or Redirected Page: https://www.ettoday.net/news/20191209/1598070.htm
Invalid Page or Redirected Page: https://www.ettoday.net/news/20191209/1598066.htm
Invalid Page or Redirected Page: https://www.ettoday.net/news/20191209/1598050.htm
Crawling News: https://www.ettoday.net/news/20191209/1598054.htm
Crawling News: https://www.ettoday.net/news/20191209/1598034.htm
Crawling News: https://www.ettoday.net/news/20191209/1594143.htm
Crawling News: https://www.ettoday.net/news/20191209/1598063.htm
Crawling News: https://www.ettoday.net/news/20191209/1598009.htm
Invalid Page or Redirected Page: https://www.ettoday.net/news/20191209/1598068.htm
Crawling News: https://www.ettoday.net/news/20191209/1597982.htm


Crawling News: https://www.ettoday.net/news/20191209/1597796.htm
Crawling News: https://www.ettoday.net/news/20191209/1597528.htm
Invalid Page or Redirected Page: https://www.ettoday.net/news/20191209/1596599.htm
Crawling News: https://www.ettoday.net/news/20191209/1597973.htm
Crawling News: https://www.ettoday.net/news/20191209/1597882.htm
Crawling News: https://www.ettoday.net/news/20191209/1597902.htm
Invalid Page or Redirected Page: https://www.ettoday.net/news/20191209/1597470.htm
Crawling News: https://www.ettoday.net/news/20191209/1597978.htm
Crawling News: https://www.ettoday.net/news/20191209/1597960.htm
Crawling News: https://www.ettoday.net/news/20191209/1597897.htm
Invalid Page or Redirected Page: https://www.ettoday.net/news/20191209/1597931.htm
Crawling News: https://www.ettoday.net/news/20191209/1597864.htm
Crawling News: https://www.ettoday.net/news/20191209/1597437.htm
Invalid Page or Redirected Page: https://www.ettoday.net/news/20191209/1597965.htm
Crawling News: htt

Crawling News: https://www.ettoday.net/news/20191209/1597575.htm
Crawling News: https://www.ettoday.net/news/20191209/1597799.htm
Invalid Page or Redirected Page: https://www.ettoday.net/news/20191209/1597850.htm
Invalid Page or Redirected Page: https://www.ettoday.net/news/20191209/1597820.htm
Crawling News: https://www.ettoday.net/news/20191209/1597848.htm
Invalid Page or Redirected Page: https://www.ettoday.net/news/20191209/1597667.htm
Crawling News: https://www.ettoday.net/news/20191209/1597843.htm
Invalid Page or Redirected Page: https://www.ettoday.net/news/20191209/1597829.htm
Crawling News: https://www.ettoday.net/news/20191209/1597805.htm
Crawling News: https://www.ettoday.net/news/20191209/1597818.htm
Crawling News: https://www.ettoday.net/news/20191209/1597698.htm
Invalid Page or Redirected Page: https://www.ettoday.net/news/20191209/1597814.htm
Invalid Page or Redirected Page: https://www.ettoday.net/news/20191209/1595771.htm
Crawling News: https://www.ettoday.net/news/201

Invalid Page or Redirected Page: https://www.ettoday.net/news/20191209/1597559.htm
Invalid Page or Redirected Page: https://www.ettoday.net/news/20191209/1597676.htm
Invalid Page or Redirected Page: https://www.ettoday.net/news/20191209/1595942.htm
Crawling News: https://www.ettoday.net/news/20191209/1597476.htm
Invalid Page or Redirected Page: https://www.ettoday.net/news/20191209/1597697.htm
Crawling News: https://www.ettoday.net/news/20191209/1597646.htm
Crawling News: https://www.ettoday.net/news/20191209/1597693.htm
Crawling News: https://www.ettoday.net/news/20191209/1597664.htm
Crawling News: https://www.ettoday.net/news/20191209/1597699.htm
Invalid Page or Redirected Page: https://www.ettoday.net/news/20191209/1597657.htm
Crawling News: https://www.ettoday.net/news/20191209/1597688.htm
Invalid Page or Redirected Page: https://www.ettoday.net/news/20191209/1597685.htm
Crawling News: https://www.ettoday.net/news/20191209/1597588.htm
Crawling News: https://www.ettoday.net/news/201

Crawling News: https://www.ettoday.net/news/20191209/1597460.htm
Crawling News: https://www.ettoday.net/news/20191209/1597440.htm
Crawling News: https://www.ettoday.net/news/20191209/1597599.htm
Crawling News: https://www.ettoday.net/news/20191209/1597546.htm
Invalid Page or Redirected Page: https://www.ettoday.net/news/20191209/1597582.htm
Crawling News: https://www.ettoday.net/news/20191209/1595616.htm
Crawling News: https://www.ettoday.net/news/20191209/1597577.htm
Crawling News: https://www.ettoday.net/news/20191209/1597498.htm
Crawling News: https://www.ettoday.net/news/20191209/1597590.htm
Invalid Page or Redirected Page: https://www.ettoday.net/news/20191209/1597584.htm
Crawling News: https://www.ettoday.net/news/20191209/1597578.htm
Invalid Page or Redirected Page: https://www.ettoday.net/news/20191209/1597586.htm
Invalid Page or Redirected Page: https://www.ettoday.net/news/20191209/1597472.htm
Crawling News: https://www.ettoday.net/news/20191209/1597514.htm
Crawling News: htt

Crawling News: https://www.ettoday.net/news/20191209/1597377.htm
Crawling News: https://www.ettoday.net/news/20191209/1597334.htm
Invalid Page or Redirected Page: https://www.ettoday.net/news/20191209/1597237.htm
Crawling News: https://www.ettoday.net/news/20191209/1597331.htm
Crawling News: https://www.ettoday.net/news/20191209/1597164.htm
Crawling News: https://www.ettoday.net/news/20191209/1597369.htm
Invalid Page or Redirected Page: https://www.ettoday.net/news/20191209/1597367.htm
Invalid Page or Redirected Page: https://www.ettoday.net/news/20191209/1597361.htm
Crawling News: https://www.ettoday.net/news/20191209/1597243.htm
Invalid Page or Redirected Page: https://www.ettoday.net/news/20191209/1584070.htm
Invalid Page or Redirected Page: https://www.ettoday.net/news/20191209/1597366.htm
Invalid Page or Redirected Page: https://www.ettoday.net/news/20191209/1597365.htm
Crawling News: https://www.ettoday.net/news/20191209/1597363.htm
Invalid Page or Redirected Page: https://www.et