In [None]:
import datetime
import itertools
import os
import re
import requests
import sys
import time
from copy import deepcopy
from collections import namedtuple, defaultdict

from bs4 import BeautifulSoup as BS
from tenacity import retry, stop_after_attempt
import simplejson as json
from selenium import webdriver

from utils import ArticleMeta, NewsCrawler

In [None]:
class SetnNewsCrawler(NewsCrawler):
    """Crawl SETN (www.setn.com) news pages by walking NewsID values downwards.

    Starting at ``start_id`` and counting down to 1, every
    ``https://www.setn.com/News.aspx?NewsID=<id>`` URL is fetched, parsed and
    written as one JSON file to ``<output_dir>/<publish date>/<NewsID>``.

    Every ``parse_*`` method is wrapped with tenacity so that it is attempted
    once and yields ``None`` instead of raising when parsing fails;
    ``crawl_and_save`` relies on those ``None`` values to skip unusable pages.

    ``self.output_dir``, ``self.newslinks`` and ``self.get_bsObj`` are used
    here but not defined in this class -- presumably provided by
    ``NewsCrawler``; confirm against ``utils``.
    """

    # The crawl stops once more than this many articles dated on or before
    # ``end_date`` have been seen.
    MAX_EARLY_NEWS = 100

    # NOTE(review): the original author's note says NewsID 650887 is a piece of
    # news published at 2018/12/10 01:10:00 and was the crawl starting point,
    # but the default ``start_id`` below is 630214 -- confirm which is intended.
    def __init__(self, output_dir, total_days, start_date=None, start_id=630214):
        """Set up the crawler.

        Args:
            output_dir: Root directory; forwarded to ``NewsCrawler.__init__``.
            total_days: Length of the crawl window in days, counted backwards
                from ``start_date``.
            start_date: Newest date of the window. ``None`` (the default) means
                "today", resolved when the constructor is called rather than
                when the class is defined, so a long-running kernel does not
                keep a stale date.
            start_id: NewsID to start from; lower ids are visited afterwards.
        """
        if start_date is None:
            start_date = datetime.date.today()
        super().__init__(output_dir, total_days, start_date)
        self.start_id = start_id
        self.end_date = start_date - datetime.timedelta(days=total_days)

    def newslink_generator(self):
        """Yield article URLs for NewsID ``start_id`` down to 1."""
        news_prefix = "https://www.setn.com/News.aspx?NewsID="
        for news_id in range(self.start_id, 0, -1):
            yield news_prefix + str(news_id)

    # Single attempt; on any exception tenacity invokes retry_error_callback,
    # so the method returns None instead of raising.
    @retry(stop=stop_after_attempt(0),
           retry_error_callback=lambda x: None)
    def parse_category(self, newspage):
        """Return the ``article:section`` meta content, or ``None`` on failure."""
        return newspage.find('meta', property='article:section')['content']

    # Single attempt; returns None instead of raising on failure.
    @retry(stop=stop_after_attempt(0),
           retry_error_callback=lambda x: None)
    def parse_title(self, newspage):
        """Return the ``<title>`` text before the first ``|``, stripped."""
        return newspage.title.text.split('|')[0].strip()

    # Single attempt; returns None instead of raising on failure.
    @retry(stop=stop_after_attempt(0),
           retry_error_callback=lambda x: None)
    def parse_article(self, newspage):
        """Return the article body as newline-joined paragraph texts.

        Only ``<p>`` elements inside ``<article>`` that have neither child
        tags nor attributes are kept; everything else is skipped.
        """
        paragraphs = [
            paragraph.text
            for paragraph in newspage.find('article').find_all('p')
            if not (paragraph.findChild() or paragraph.attrs)
        ]
        return '\n'.join(paragraphs)

    # Single attempt; returns None instead of raising on failure.
    @retry(stop=stop_after_attempt(0),
           retry_error_callback=lambda x: None)
    def parse_date_time(self, newspage):
        """Return ``(date_str, time_str)`` from ``article:published_time``.

        The meta content must match ``%Y-%m-%dT%H:%M:%S`` exactly; any other
        shape makes ``strptime`` raise, which surfaces to the caller as a
        plain ``None`` (not a 2-tuple).
        """
        datetime_string = newspage.find('meta', property='article:published_time')['content']
        dt = datetime.datetime.strptime(datetime_string, "%Y-%m-%dT%H:%M:%S")
        return str(dt.date()), str(dt.time())

    def is_valid_newspage(self, bsObj):
        """Return False for a missing page or one showing the 404 image."""
        if bsObj is None:
            return False

        try:
            return not bsObj.findAll('img', src='/images/404.png')
        except Exception:
            # Best effort: if the lookup itself fails, treat the page as valid
            # and let the parse_* step decide. Narrowed from a bare ``except``
            # so KeyboardInterrupt / SystemExit are no longer swallowed.
            return True

    def saved_filename(self, url):
        """Return the text after the last ``=`` in ``url`` (the NewsID)."""
        return url.split('=')[-1]

    def _save_article(self, article, date_str, newslink):
        """Write ``article`` as JSON to ``<output_dir>/<date_str>/<NewsID>``."""
        output_dir_with_date = os.path.join(self.output_dir, date_str)
        os.makedirs(output_dir_with_date, exist_ok=True)
        filename = os.path.join(output_dir_with_date, self.saved_filename(newslink))
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(article._asdict(), f, ensure_ascii=False, indent=4)

    def crawl_and_save(self):
        """Fetch, parse and store articles until the date window is exhausted.

        Pages that are invalid, or whose article text or publish time cannot
        be parsed, are reported and skipped. Articles dated on or before
        ``end_date`` are still saved, but once more than ``MAX_EARLY_NEWS`` of
        them have been seen the crawl stops. Returns ``None``.
        """
        num_early_news = 0

        for newslink in self.newslink_generator():
            if newslink in self.newslinks:
                continue
            self.newslinks.add(newslink)

            page = self.get_bsObj(newslink)
            if not self.is_valid_newspage(page):
                print('Invalid Page or Redirected Page:', newslink)
                continue

            category = self.parse_category(page)
            text = self.parse_article(page)
            date_time = self.parse_date_time(page)
            title = self.parse_title(page)

            # parse_date_time returns None (not a tuple) on failure; unpacking
            # it directly would raise TypeError and abort the whole crawl, and
            # without a date there is no output directory to write to.
            if text is None or date_time is None:
                print('Invalid Page or Redirected Page:', newslink)
                continue
            date_str, time_str = date_time

            print('Crawling News:', newslink)
            # ISO "YYYY-MM-DD" strings order the same way as the dates do.
            if date_str <= str(self.end_date):
                num_early_news += 1
                if num_early_news > self.MAX_EARLY_NEWS:
                    break

            article = ArticleMeta(
                url=newslink,
                date=date_str,
                time=time_str,
                category=category,
                title=title,
                content=text
            )
            self._save_article(article, date_str, newslink)

In [None]:
# Crawl configuration: walk downwards from NewsID 568884 with a 1095-day
# window; start_date is left at the constructor's default.
setn_crawler = SetnNewsCrawler(
    output_dir='news/setn',
    total_days=1095,
    start_id=568884,
)

In [None]:
# Run the crawl. crawl_and_save() contains no return statement, so `page`
# ends up bound to None; the useful output is the JSON files it writes.
page = setn_crawler.crawl_and_save()