In [None]:
import concurrent.futures
import contextlib
import datetime
import itertools
import os
import re
import requests
import sys
import time
from copy import deepcopy
from collections import namedtuple, defaultdict

from bs4 import BeautifulSoup as BS
from tenacity import retry, stop_after_attempt, wait_random
import simplejson as json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

from utils import ArticleMeta, NewsCrawler

In [None]:
class NownewsCrawler(NewsCrawler):
    """Crawler for nownews.com that enumerates article ids in descending order.

    Articles are addressed through short links of the form
    ``https://www.nownews.com/?p=<id>``. Crawling starts at ``start_id`` and
    walks towards id 1, stopping once enough consecutive articles are dated
    on or before ``end_date``.

    Relies on ``self.session``, ``self.headers``,
    ``get_page_attribute_from_link`` and ``save_article_meta`` being provided
    by ``NewsCrawler`` (defined in ``utils``, not visible here).
    """

    # Seconds to wait for an HTTP response before giving up on one attempt.
    # Without a timeout a single stalled connection would hang the crawl.
    REQUEST_TIMEOUT = 30

    # Number of consecutive "too old" articles that ends the crawl.
    MAX_CONSECUTIVE_OLD_NEWS = 100

    # 3811581 is the id of a piece of news published at 2018/12/10 01:10:00,
    # which is where the original crawl started. The id of an article can be
    # read from its shortlink tag, e.g.
    # <link rel="shortlink" href="https://www.nownews.com/?p=3787634">
    def __init__(self, output_dir, total_days, start_date=None, start_id=3811581):
        """Set up the crawler.

        Args:
            output_dir: Forwarded to ``NewsCrawler.__init__``.
            total_days: Number of days before ``start_date`` to cover.
            start_date: Most recent date of interest. ``None`` (the default)
                means "today", resolved when the crawler is constructed
                rather than when the class is defined.
            start_id: Article id at which the descending enumeration starts.
        """
        if start_date is None:
            start_date = datetime.date.today()
        super(NownewsCrawler, self).__init__(output_dir, total_days, start_date)
        self.start_id = start_id
        self.end_date = start_date - datetime.timedelta(days=total_days)
        # Dots are escaped so that they only match a literal '.'.
        self.redirected_url_checker = re.compile(
            r'^https://www\.nownews\.com/news/[0-9]{8}/[0-9]*/$'
        )

    def newslink_generator(self):
        """Yield short links for ids ``start_id``, ``start_id - 1``, ..., 1."""
        news_prefix = "https://www.nownews.com/?p="
        for news_id in range(self.start_id, 0, -1):
            yield news_prefix + str(news_id)

    @retry(stop=stop_after_attempt(3),
           wait=wait_random(min=1, max=2),
           retry_error_callback=lambda x: None)
    def get_bsObj_check_redirected(self, url, url_check_func):
        """Fetch ``url`` and return the parsed page, or ``None`` if rejected.

        The page is requested twice: first ``url`` itself, then the URL the
        first response ended up at. After each request
        ``url_check_func(final_url, url)`` is called; a truthy result rejects
        the page.

        Any exception (including a request timeout) triggers a retry: up to
        three attempts with a random 1-2 second pause in between. If all
        attempts fail, ``None`` is returned.

        Args:
            url: The short link to fetch.
            url_check_func: Callable ``(final_url, requested_url) -> bool``
                returning True when the response should be discarded.

        Returns:
            A BeautifulSoup object for the page, or ``None``.
        """
        # Assumes self.session is a requests.Session (so ``timeout`` is
        # accepted) -- TODO confirm against utils.NewsCrawler.
        req = self.session.get(url, headers=self.headers,
                               timeout=self.REQUEST_TIMEOUT)
        if url_check_func(req.url, url):
            return None

        second_url = req.url
        # NOTE(review): this request bypasses self.session and self.headers,
        # unlike the first one -- confirm that this is intentional.
        req = requests.get(second_url, timeout=self.REQUEST_TIMEOUT)
        if url_check_func(req.url, url):
            print(req.url, second_url)
            return None

        return BS(req.text, "html.parser")

    # The parse_* methods below run exactly once (stop_after_attempt(1));
    # tenacity is used only to turn any parsing error into a fallback value.

    @retry(stop=stop_after_attempt(1),
           retry_error_callback=lambda x: None)
    def parse_category(self, newspage):
        """Return the text of the last breadcrumb span, or ``None`` on error."""
        return newspage.find('span', class_='td-bred-no-url-last').text

    @retry(stop=stop_after_attempt(1),
           retry_error_callback=lambda x: None)
    def parse_title(self, newspage):
        """Return the text of the ``h1.entry-title``, or ``None`` on error."""
        return newspage.find('h1', class_='entry-title').text

    @retry(stop=stop_after_attempt(1),
           retry_error_callback=lambda x: None)
    def parse_article(self, newspage):
        """Return the article body as newline-joined paragraphs.

        Only plain ``<p>`` elements inside the ``articleBody`` span are kept:
        paragraphs that contain child tags, carry attributes, or have no text
        are skipped. Returns ``None`` if the body cannot be located.
        """
        body = newspage.find('span', itemprop='articleBody')
        paragraphs = [
            paragraph.text
            for paragraph in body.find_all('p')
            # find() without arguments returns the first nested tag, if any.
            if paragraph.text and not paragraph.attrs and not paragraph.find()
        ]
        return '\n'.join(paragraphs)

    @retry(stop=stop_after_attempt(1),
           retry_error_callback=lambda x: (None, None))
    def parse_date_time(self, newspage):
        """Return ``(date, time)`` strings, or ``(None, None)`` on error.

        The timestamp is read from the ``<time>`` element whose class
        attribute is exactly ``entry-date updated td-module-date`` and is
        expected in ``%Y-%m-%d %H:%M:%S`` format.
        """
        # workaround
        datetime_string = newspage.find('time', class_="entry-date updated td-module-date").text
        dt = datetime.datetime.strptime(datetime_string, "%Y-%m-%d %H:%M:%S")
        return str(dt.date()), str(dt.time())

    def is_valid_newspage(self, bsObj):
        """Return False for a missing page or a 404 page, True otherwise."""
        if bsObj is None:
            return False

        try:
            return not bsObj.find_all('body', class_='error404')
        except Exception:
            # Best effort: if the page cannot be inspected, treat it as valid
            # (same fallback as before), but no longer swallow
            # KeyboardInterrupt / SystemExit.
            return True

    def saved_filename(self, url):
        """Return ``<id>.json``, where ``<id>`` follows the last '=' in ``url``."""
        return url.split('=')[-1] + '.json'

    def crawl_and_save(self):
        """Crawl articles in descending id order and save each one.

        The crawl ends once ``MAX_CONSECUTIVE_OLD_NEWS`` articles in a row are
        dated on or before ``self.end_date``. Old articles seen before that
        point are still saved; the one that triggers the stop is not.
        """

        def id_mismatch(final_url, requested_url):
            # The final URL looks like .../<id>/ while the short link ends in
            # =<id>; differing ids mean we were redirected somewhere else.
            return final_url.split('/')[-2] != requested_url.split('=')[-1]

        def fetch(link):
            return self.get_bsObj_check_redirected(link, id_mismatch)

        # ISO formatted dates (YYYY-MM-DD) compare correctly as strings.
        cutoff = str(self.end_date)
        num_old_news = 0

        for newslink in self.newslink_generator():
            article = self.get_page_attribute_from_link(newslink, fetch)
            if article is None:
                continue

            if article.date is None:
                # parse_date_time falls back to (None, None); comparing None
                # with a string would raise TypeError and abort the crawl.
                print('skip article without a parsable date:', newslink)
                continue

            # check whether there are enough consecutive pieces of old news
            if article.date <= cutoff:
                num_old_news += 1
                if num_old_news >= self.MAX_CONSECUTIVE_OLD_NEWS:
                    break
            else:
                num_old_news = 0

            self.save_article_meta(article)
            

In [None]:
# Build a crawler covering 1095 days (about three years), starting the
# descending id walk at article 3767932.
crawler = NownewsCrawler(
    output_dir='../news/nownews',
    total_days=1095,
    start_id=3767932,
)

In [None]:
# Run the crawl. To send printed output to a log file instead of the
# notebook, wrap the call like this:
#     with open('../logs/nownews.txt', 'a') as f:
#         with contextlib.redirect_stdout(f):
#             crawler.crawl_and_save()
crawler.crawl_and_save()