In [3]:
import time
import re
import requests
from datetime import datetime
from bs4 import BeautifulSoup
from time import gmtime, strftime

news_dateformat = '%B %d, %Y'
user_dateformat = '%Y-%m-%d'

def now():
    """
    Returns
    -------
    Current time : str
        eg: 2018-11-22 13:35:23
    """
    return strftime("%Y-%m-%d %H:%M:%S", gmtime())

def get_soup(url, headers=None):
    """
    Arguments
    ---------
    url : str
        Web page url
    headers : dict
        Headers for requests. If None, use Mozilla/5.0 as default user-agent

    Returns
    -------
    soup : bs4.BeautifulSoup
        Soup format web page
    """

    if headers is None:
        headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
    r = requests.get(url, headers=headers)
    html = r.text
    page = BeautifulSoup(html, 'lxml')
    return page

doublespace_pattern = re.compile('\s+')
lineseparator_pattern = re.compile('\n+')

def normalize_text(text):
    text = text.replace('\t', ' ')
    text = text.replace('\r', ' ')
    text = lineseparator_pattern.sub('\n', text)
    text = doublespace_pattern.sub(' ', text)
    return text.strip()


def strf_to_datetime(strf, form):
    return datetime.strptime(strf, form)

In [2]:
from whitehouse_scraper import yield_latest_allnews

begin_date = '2019-01-10'
max_num = 50
sleep = 1.0

for i, json_obj in enumerate(yield_latest_allnews(begin_date, max_num, sleep)):
    title = json_obj['title']
    time = json_obj['time']
    print('[{} / {}] ({}) {}'.format(i+1, max_num, time, title))

In [4]:

import re
from dateutil.parser import parse

def parse_page(url):
    """
    Argument
    --------
    url : str
        Web page url

    Returns
    -------
    json_object : dict
        JSON format web page contents
        It consists with
            title : article title
            time : article written time
            content : text with line separator \\n
            url : web page url
            scrap_time : scrapped time
    """

    try:
        soup = get_soup(url)
        title = soup.find('h1', class_ = 'page-title topper__title news').text 
        title = re.sub('\n', '' , title, 100)
        title = re.sub('\t', '' , title, 100)
        date = soup.find('time', class_='posted-on entry-date published updated').text
        date = parse(date).strftime("%Y-%m-%d")
        phrases = soup.find('section', class_='body-content').find_all('p')
        content = '\n'.join([p.text.strip() for p in phrases])
        content = re.sub('\n', '' , content, 100)
        content = re.sub('\t', '' , content, 100)
        content = re.sub('\xa0', '' , content, 1000)
        
        json_object = {
            'title' : title,
            'date' : date,
            'content' : content,
            'url' : url,
            'scrap_time' : now()
        }
        return json_object
    except Exception as e:
        print(e)
        print('Parsing error from {}'.format(url))
        return None

In [28]:
patterns = [
    re.compile('https://www.whitehouse.gov/briefing-room/[\w]+')]
url_base = 'https://www.whitehouse.gov/briefing-room/page/{}/'

In [29]:

def is_matched(url):
    for pattern in patterns:
        if pattern.match(url):
            return True
    return False


In [30]:

def yield_latest_allnews(begin_date, max_num=10, sleep=1.0):
    """
    Artuments
    ---------
    begin_date : str
        eg. 2018-01-01
    max_num : int
        Maximum number of news to be scraped
    sleep : float
        Sleep time. Default 1.0 sec

    It yields
    ---------
    news : json object
    """

    for page in range(1, 11):

        # check number of scraped news

        # get urls
        url = url_base.format(page)
        soup = get_soup(url)
        links = soup.find_all('a', class_ = 'news-item__title')
        urls = [i['href'] for i in links]
        print(url)
        urls = [url for url in urls if is_matched(url)]

        # scrap
        for url in urls:
            print(url)

            news_json = parse_page(url)
            

            # check date
            return news_json
            time.sleep(sleep)

In [31]:
yield_latest_allnews('2021-06-01', 100)

https://www.whitehouse.gov/briefing-room/page/1/
https://www.whitehouse.gov/briefing-room/statements-releases/2021/06/26/statement-by-president-joe-biden-on-the-bipartisan-infrastructure-framework/


{'title': 'Statement by President Joe Biden on the Bipartisan Infrastructure Framework',
 'date': '2021-06-26',
 'content': 'On Thursday, I reached a historic agreement with a bipartisan group of Senators on a $1.2 trillion plan to transform our physical infrastructure.The plan would make the largest investment in infrastructure in history, the biggest investment in rail since the creation of Amtrak, and the largest investment in transit ever.It would fix roads and bridges, make critical investments in our clean energy future, and help this country compete with China and other economic rivals.It would replace lead water pipes in our schools and houses, and connect every American to high-speed internet.It would create millions of high-paying jobs that could not be outsourced.In the days since, the primary focus in Washington has not been about the Plan’s scope, scale or provisions—but rather, how it relates to other legislation before Congress:my American Families Plan.The American Fami

In [19]:
urls

NameError: name 'urls' is not defined