In [1]:
import time
import N38_scraper
from N38_scraper import get_allnews_urls
from N38_scraper import parse_page

# Collect the article urls listed on pages 1 through 3 of the site,
# letting the scraper print its per-page progress (verbose=True).
urls = get_allnews_urls(
    begin_page=1,
    end_page=3,
    verbose=True,
)
# Report how many urls were collected.
print('%d urls for all news' % len(urls))

get briefing statement urls 1 / 3
get briefing statement urls 2 / 3
get briefing statement urls 3 / 3
12 urls for all news


In [2]:
urls

['https://www.38north.org/2019/01/yongbyon010919/',
 'https://www.38north.org/2019/01/gford010919/',
 'https://www.38north.org/2019/01/jgilesjsiebens010819/',
 'https://www.38north.org/2019/01/mwilliams010419/',
 'https://www.38north.org/2019/01/rcarlin010319/',
 'https://www.38north.org/2019/01/rfrank010219/',
 'https://www.38north.org/2018/12/afostercarter123018/',
 'https://www.38north.org/2018/12/lsigal122818/',
 'https://www.38north.org/2018/12/mwilliams122718/',
 'https://www.38north.org/2018/12/gtoloraya122618/',
 'https://www.38north.org/2018/12/editor122118/',
 'https://www.38north.org/2018/12/rcarlin122118/']

In [9]:
import re
import requests
from bs4 import BeautifulSoup
from time import gmtime, strftime


def now():
    """
    Format the current UTC time as a timestamp string.

    Returns
    -------
    Current time : str
        UTC time (taken from gmtime) formatted as year-month-day hour:minute:second
        eg: 2018-11-22 13:35:23
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    current_utc = gmtime()
    return strftime(timestamp_format, current_utc)

def get_soup(url, headers=None, timeout=30):
    """
    Download a web page and parse it into a BeautifulSoup tree (lxml parser).

    Arguments
    ---------
    url : str
        Web page url
    headers : dict
        Headers for requests. If None, use Mozilla/5.0 as default user-agent
    timeout : float or tuple
        Seconds to wait for the server, passed straight to requests.get.
        Default is 30. Without a timeout a stalled connection would block
        the caller forever.

    Returns
    -------
    soup : bs4.BeautifulSoup
        Soup format web page

    Raises
    ------
    requests.exceptions.RequestException
        Raised by requests.get on connection problems, including when the
        timeout expires.
    """

    if headers is None:
        headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
    # NOTE: the HTTP status code is not checked, so an error page returned by
    # the server is parsed like any other page.
    r = requests.get(url, headers=headers, timeout=timeout)
    html = r.text
    page = BeautifulSoup(html, 'lxml')
    return page

# Runs of whitespace that are NOT newlines (spaces, tabs, carriage returns, ...).
# Newlines are excluded on purpose: matching them here (as a plain \s+ would)
# erases every line break and makes the line-separator step pointless.
doublespace_pattern = re.compile(r'[^\S\n]+')
# One line break plus any whitespace around it, including further line breaks,
# so blank or whitespace-only lines collapse into a single separator.
lineseparator_pattern = re.compile(r'[^\S\n]*\n\s*')

def normalize_text(text):
    """
    Collapse redundant whitespace while keeping line breaks.

    Arguments
    ---------
    text : str
        Raw text

    Returns
    -------
    text : str
        Text in which every run of non-newline whitespace (spaces, tabs,
        carriage returns) is a single space, every run of line breaks
        (with any whitespace around it) is a single \\n, and leading and
        trailing whitespace is removed.
        eg: 'a \\t b\\r\\n\\n  c  ' becomes 'a b\\nc'
    """
    # Horizontal whitespace first; this also covers the \t and \r characters.
    text = doublespace_pattern.sub(' ', text)
    # Then squeeze line breaks, dropping the spaces left around them.
    text = lineseparator_pattern.sub('\n', text)
    return text.strip()

In [10]:
def parse_page(url):
    """
    Scrape a single article page into a dict.

    Argument
    --------
    url : str
        Web page url

    Returns
    -------
    json_object : dict or None
        On success, a dict with
            title : text of the h1 element with class 'page-header'
            time : text of the li 'meta--date' element found inside the
                   ul with class 'post__meta list-inline'
            content : stripped text of every p element under the div with
                      class 'content', joined with line separator \\n
            url : the url argument, unchanged
            scrap_time : value of now() when the dict is built
        If anything raises while fetching or parsing, the exception and the
        url are printed and None is returned instead.
    """

    try:
        page = get_soup(url)

        # A missing element makes find() return None, so the attribute
        # access that follows raises and lands in the except branch below.
        headline = page.find('h1', class_= 'page-header').text
        meta_list = page.find('ul', class_='post__meta list-inline')
        published = meta_list.find('li', class_='meta--date').text
        body = page.find('div', class_='content')
        paragraphs = body.find_all('p')
        article_text = '\n'.join(p.text.strip() for p in paragraphs)

        return {
            'title' : headline,
            'time' : published,
            'content' : article_text,
            'url' : url,
            'scrap_time' : now()
        }
    except Exception as error:
        # Best effort: report the failure and let the caller skip this url.
        print(error)
        print('Parsing error from {}'.format(url))
        return None


In [12]:
def pprint(json_object):
    """
    Print a short preview of every field of a dict.

    Arguments
    ---------
    json_object : dict
        Each key is printed on its own line together with the first 100
        characters of str(value), followed by ' ..'. A blank-line separator
        is printed after the last field.
    """
    for key in json_object:
        preview = str(json_object[key])[:100]
        print('{} : {} ..'.format(key, preview))
    print('\n')

# Seconds to pause between two page requests, to avoid hammering the site.
SLEEP = 0.5

for url in urls[:3]:
    json_object = parse_page(url)
    # parse_page returns None when fetching or parsing failed (it has already
    # printed the error), so there is nothing to preview for that url.
    if json_object is not None:
        pprint(json_object)
    time.sleep(SLEEP)

title : North Korea’s Yongbyon Nuclear Facilities: Well Maintained but Showing Limited Operations ..
time : January 9, 2019 ..
content : A 38 North exclusive with analysis by Frank V. Pabian and Jack Liu
Commercial satellite imagery of N ..
url : https://www.38north.org/2019/01/yongbyon010919/ ..
scrap_time : 2019-01-15 12:23:30 ..


title : Kim Jong Un’s New Year’s Speech: On the Domestic Front ..
time : January 9, 2019 ..
content : Commentary on Kim Jong Un’s New Year’s speech on January 1 has understandably focused almost exclusi ..
url : https://www.38north.org/2019/01/gford010919/ ..
scrap_time : 2019-01-15 12:23:31 ..


title : Opportunities for Dynamic Force Employment in East Asia ..
time : January 8, 2019 ..
content : The United States is at a crossroads with North Korea. It can continue efforts to reduce tensions, s ..
url : https://www.38north.org/2019/01/jgilesjsiebens010819/ ..
scrap_time : 2019-01-15 12:23:32 ..


