In [191]:
import calendar
import pickle
import json
import logging
import time

import bs4
import numpy as np
import requests

In [87]:
# Archive listing endpoint for the NPR news section.
url = 'https://www.npr.org/sections/news/archive'

# Query string selecting the day to list.
# `date` is month-day-year: month and day without zero padding, year with 4 digits.
params = dict(date='3-6-2019')

In [88]:
# Fetch the archive listing for the chosen day. The timeout keeps a stalled
# connection from hanging the kernel indefinitely.
day_resp = requests.get(url, params=params, timeout=30)
day_resp.raise_for_status()
# Name the parser explicitly so the parse tree does not depend on which optional
# parsers happen to be installed, and so bs4 does not warn about guessing one.
day_soup = bs4.BeautifulSoup(day_resp.text, 'html.parser')

In [188]:
# Collect every <article> element found in the parsed archive page.
articles = day_soup.find_all('article')
if not articles:
    # Same exception type as the bare index would raise, but with a readable message.
    raise IndexError('no <article> elements found on the archive page')
# First entry; used below as the input to extract_article.
article = articles[0]

In [196]:
def delay(seconds, lam=1.0):
    """Pause execution for a randomised amount of time.

    The pause is ``seconds`` plus two random extras: a uniform draw from
    [0, 1) and a Poisson draw with mean ``lam``.

    Parameters
    ----------
    seconds : float
        Fixed part of the pause, in seconds.
    lam : float
        Mean of the Poisson-distributed extra, should be >= 0.

    Returns
    -------
    None
    """
    # Draw in the same order as the terms appear in the sum: uniform, then Poisson.
    uniform_part = np.random.rand()
    poisson_part = np.random.poisson(lam=lam)
    time.sleep(seconds + uniform_part + poisson_part)

In [183]:
def extract_article(article, *, seconds=3, path=None, timeout=30):
    """Download the story linked from one archive listing entry and extract its fields.

    Parameters
    ----------
    article : bs4.element.Tag
        Listing entry. It must contain an element with class ``teaser`` whose
        text is the date and the summary separated by ``chr(0x95)``, and an
        ``<a>`` tag whose ``href`` is the story URL.
    seconds : float, optional
        Minimum pause handed to ``delay`` after a successful extraction.
    path : str or path-like, optional
        If given, the extracted fields are also written to this file as JSON.
    timeout : float, optional
        Seconds to wait for the story page before ``requests`` gives up.

    Returns
    -------
    info : dict
        Keys -- date, title, author, summary, story

    Raises
    ------
    requests.exceptions.HTTPError
        If the story page answers with an error status (logged, then re-raised).
    ValueError
        If the teaser text does not contain the separator character.
    """

    teaser = article.find(class_='teaser')
    # Split on the first separator only, so a separator character inside the
    # summary no longer breaks the two-way unpacking.
    date, summary = teaser.text.split('\x95', 1)

    link = article.a['href']
    # A timeout keeps one stalled story page from blocking a whole scraping run.
    resp = requests.get(link, timeout=timeout)

    try:
        resp.raise_for_status()
    except requests.exceptions.HTTPError as exc:
        logging.exception(exc)
        raise

    # NOTE(review): no parser is named, so bs4 picks the best one installed.
    # The parent check below depends on how the parser nests tags, so pin a
    # parser only after confirming the extraction still matches.
    soup = bs4.BeautifulSoup(resp.text).article

    author = soup.find(class_='byline__name').text.strip()
    paragraphs = soup.find(id='storytext').find_all('p')
    # Keep only paragraphs whose direct parent is #storytext; <p> tags nested
    # deeper inside it are skipped.
    story = '\n\n'.join(p.text.strip() for p in paragraphs if p.parent.get('id') == 'storytext')
    title = soup.find(class_='storytitle').text.strip()

    info = dict(date=date, title=title, author=author, summary=summary, story=story)

    if path:
        # Open for writing: with the default read mode json.dump cannot write.
        with open(path, 'w', encoding='utf-8') as fp:
            json.dump(info, fp)
        logging.info(f'Saved to: {path}')

    # Throttle before returning so back-to-back calls do not hammer the site.
    delay(seconds)
    return info

In [184]:
# Run the extractor on the first listing entry. This performs a live HTTP request
# for the story page and then sleeps via delay() before returning the dict.
extract = extract_article(article)