In [1]:
import json
import re
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from time import gmtime, strftime

def now():
    """
    Returns
    -------
    Current time : str
        eg: 2018-11-22 13:35:23
    """
    return strftime("%Y-%m-%d %H:%M:%S", gmtime())

def get_soup(url, headers=None):
    """
    Arguments
    ---------
    url : str
        Web page url
    headers : dict
        Headers for requests. If None, use Mozilla/5.0 as default user-agent
    Returns
    -------
    soup : bs4.BeautifulSoup
        Soup format web page
    """

    if headers is None:
        headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
    r = requests.get(url, headers=headers)
    html = r.text
    page = BeautifulSoup(html, 'lxml')
    return page

doublespace_pattern = re.compile('\s+')
lineseparator_pattern = re.compile('\n+')

def normalize_text(text):
    doublespace_pattern = re.compile('\s+')
    lineseparator_pattern = re.compile('\n+')
    text = text.replace('\t', ' ')
    text = text.replace('\r', ' ')
    text = lineseparator_pattern.sub('\n', text)
    text = doublespace_pattern.sub(' ', text)
    return text.strip()

In [2]:
from nltk.tokenize import sent_tokenize
from dateutil.parser import parse
import re

## this function need to url from urls. use for

def parse_article(url):
    def parse_title(soup):
        title = soup.find('div', class_='view_headline HD')
        if not title:
            return 'title error'
        return title.text
    def parse_date(soup):
        date = soup.find_all('div', class_ = 'view_date')[0].text[9:19] ## need to fix date part by regex 
        if not date:
            return 'date error'
        return parse(date).strftime("%Y-%m-%d")
    
    def parse_content(soup):
        content = soup.find('div', itemprop='articleBody')
        return normalize_text(content.text)

    soup = get_soup(url)

    fist_dic =  {
            'url': url,
            'title': parse_title(soup),
            'subtitle' : "",
            'date': parse_date(soup),
            'content' :parse_content(soup),
            'category' : url.split("/")[4],
            'source': 'koreatimes',
            'scrap_time': now()
    }
    return fist_dic

In [3]:
import json
import time
import requests
import re
from dateutil.parser import parse

## caution: global times limit page 100, you should use date 


def yield_latest_article(begin_date,end_date,  max_num=10, sleep=1.0):
    """
    Artuments
    ---------
    begin_date : str
        eg. 20180701
    end_date :str
        eg. 20190331
    max_num : int
        Maximum number of news to be scraped
    sleep : float
        Sleep time. Default 1.0 sec

    It yields
    ---------
    news : json object
    """

    # prepare parameters

    for page in range(1, 10000):
        # get urls
        page = str(page)
        url = "https://www.koreatimes.co.kr/www2/common/search.asp?kwd=&pageNum={}&pageSize=10&category=total&sort=&startDate={}&endDate={}&date=0&srchFd=&range=&author=all&authorData=&mysrchFd=".format(page, begin_date, end_date)
        soup = get_soup(url)
        sub_links = soup.find('tbody', id = 'divSearchList').find_all('a')
        links = [a['href'] for a in sub_links]
        for a in links:
            print(a)
            try:
                news_json = parse_article(a)
                time.sleep(sleep)
            except:
                print("this "+url+" is not working")
                continue
            if a == None:
                break

In [4]:
def save(json_obj, directory):
    date = json_obj.get("date", "")
    title = json_obj.get("title", "")

    filepath = "{}/{}_{}.json".format(
        directory, date, re.sub('[\/:*?\<>|%]"', "", title[:50])
    )
    print("scraped {}".format(json_obj["title"]))
    with open(filepath, "w", encoding="utf-8") as fp:
        json.dump(json_obj, fp, indent=2, ensure_ascii=False)

In [5]:
directory = "~/output"
for article in yield_latest_article(20210901, 20210901, 10, 0.1):
    n_exceptions = 0
    try:
        save(article, directory)
        print("scraped {}".format(article.get("url"), ""))
    except Exception as e:
        n_exceptions += 1
        print(e)
        continue
    if n_exceptions > 0:
        print("Exist %d article exceptions" % n_exceptions)

/www/sublist_740.html
this https://www.koreatimes.co.kr/www2/common/search.asp?kwd=&pageNum=1&pageSize=10&category=total&sort=&startDate=20210901&endDate=20210901&date=0&srchFd=&range=&author=all&authorData=&mysrchFd= is not working
/www/sublist_745.html
this https://www.koreatimes.co.kr/www2/common/search.asp?kwd=&pageNum=1&pageSize=10&category=total&sort=&startDate=20210901&endDate=20210901&date=0&srchFd=&range=&author=all&authorData=&mysrchFd= is not working
http://www.koreatimes.co.kr/www/nation/2021/09/113_314896.html
http://www.koreatimes.co.kr/www/nation/2021/09/113_314896.html
/www/nation/2022/04/745_314896.html
this https://www.koreatimes.co.kr/www2/common/search.asp?kwd=&pageNum=1&pageSize=10&category=total&sort=&startDate=20210901&endDate=20210901&date=0&srchFd=&range=&author=all&authorData=&mysrchFd= is not working
/www/sublist_740.html
this https://www.koreatimes.co.kr/www2/common/search.asp?kwd=&pageNum=1&pageSize=10&category=total&sort=&startDate=20210901&endDate=2021090

KeyboardInterrupt: 

In [None]:
 yield_latest_article(20210901, 20210901)