In [222]:
import calendar
import pickle
import json
import logging
import pathlib
import time

import bs4
import numpy as np
import pandas as pd
import requests

In [196]:
def delay(seconds, lam=1.0):
    """Pause execution for a randomised interval of at least `seconds`.

    The total pause is `seconds` plus a uniform draw from [0, 1) plus a
    Poisson-distributed whole number of seconds.

    Parameters
    ----------
    seconds : float
        Minimum number of seconds to sleep.
    lam : float
        Expected value of the Poisson component, should be >= 0.

    Returns
    -------
    None
    """

    # Build the pause step by step, drawing the uniform jitter before the
    # Poisson jitter so the random stream is consumed in a fixed order.
    pause = seconds
    pause = pause + np.random.rand()
    pause = pause + np.random.poisson(lam=lam)
    time.sleep(pause)

In [None]:
def get_pages(start, end, *, seconds=3):
    """Yield parsed NPR news-archive pages, one per date in a range.

    For every date produced by ``pd.date_range(start, end)`` the archive
    URL is requested with a ``date`` query parameter formatted as
    ``month-day-year`` (no zero padding). Dates whose request comes back
    with an HTTP error status are logged and skipped.

    Parameters
    ----------
    start : str or datetime-like
        Left bound for generating dates.
    end : str or datetime-like
        Right bound for generating dates.
    seconds : float, optional
        Minimum delay after each request, passed to `delay`.

    Yields
    ------
    page : bs4.BeautifulSoup
        Parsed archive page for a single date.
    """

    url = 'https://www.npr.org/sections/news/archive'
    for d in pd.date_range(start, end):
        params = {'date': f'{d.month}-{d.day}-{d.year}'}
        resp = requests.get(url, params=params)

        # Throttle after *every* request, including failed ones, so that a
        # run of error responses is not fired at the server back-to-back.
        delay(seconds)

        try:
            resp.raise_for_status()
        except requests.exceptions.HTTPError:
            logging.exception(f'HTTPError - {resp.url} - {resp.status_code}')
            continue

        yield bs4.BeautifulSoup(resp.text)

In [None]:
def get_articles(page, *, directory=None):
    """Collect every ``<article>`` element found on a page.

    Parameters
    ----------
    page : bs4.BeautifulSoup
        Parsed page to search.
    directory : path-like object, optional
        Currently unused; accepted for interface compatibility only.

    Returns
    -------
    articles : list[bs4.element.Tag]
        Result of ``page.find_all('article')``.
    """

    # Validation of `directory` (must be an existing directory) is
    # deliberately not done here; it is meant to live in a future `main`.
    return page.find_all('article')

In [220]:
def get_info(article, *, seconds=3, path=None):
    """Fetch the story linked from an archive preview and extract its fields.

    Parameters
    ----------
    article : bs4.element.Tag
        Article preview from archive page.
    seconds : float, optional
        Minimum delay after the story request, passed to `delay`.
    path : path-like object, optional
        If given, the info dictionary is written to this file as JSON.

    Returns
    -------
    info : dict or None
        Dictionary with keys: date, title, author, summary, story.
        None is returned when the preview carries an
        'audio-availability-message' element, when the story request ends
        in an HTTP error, or when the story page has no ``<article>``
        element or no transcript/storytext container.
    """

    if article.find(class_='audio-availability-message'):
        return None

    teaser = article.find(class_='teaser')
    # The teaser text is "<date>\x95<summary>"; split only on the first
    # separator so a summary that itself contains '\x95' stays intact.
    date, summary = teaser.text.split('\x95', 1)

    link = article.a['href']
    resp = requests.get(link)

    # Throttle after the request on every path (success or failure) so
    # failing links are not requested back-to-back.
    delay(seconds)

    try:
        resp.raise_for_status()
    except requests.exceptions.HTTPError:
        logging.exception(f'HTTPError - {resp.url} - {resp.status_code}')
        return None

    soup = bs4.BeautifulSoup(resp.text).article
    if soup is None:
        logging.warning(f'No article element - {resp.url}')
        return None

    # The byline is optional; a missing one is recorded as None.
    try:
        author = soup.find(class_='byline__name').text.strip()
    except AttributeError:
        author = None

    # Prefer the transcript container, falling back to the regular story body.
    text = soup.find(class_='transcript')
    if not text:
        text = soup.find(id='storytext')
    if text is None:
        logging.warning(f'No story text - {resp.url}')
        return None

    # Keep only paragraphs that are direct children of the chosen container
    # (nested ones, e.g. inside embedded blocks, are skipped). Comparing
    # against the container itself, rather than the id 'storytext', makes
    # the transcript case yield its paragraphs too.
    paragraphs = text.find_all('p')
    story = '\n\n'.join(p.text.strip() for p in paragraphs if p.parent is text)
    title = soup.find(class_='storytitle').text.strip()

    info = dict(date=date, title=title, author=author, summary=summary, story=story)

    if path:
        # Open for writing: the default read mode cannot be dumped into.
        with open(path, 'w') as fp:
            json.dump(info, fp)
        logging.info(f'Saved - {resp.url} - {path}')

    return info

In [184]:
# NOTE(review): `article` is not defined in any cell visible here — presumably one
# element of get_articles(page) left over from an earlier run; confirm before Run All.
info = get_info(article)

### audio transcript

In [202]:
# Example story URL kept for manual experiments.
# NOTE(review): presumably an audio-transcript story, given the section heading — confirm.
_url = 'https://www.npr.org/2014/11/30/367544593/in-liberia-ebola-shifts-from-cities-to-villages'
# The fetch/parse steps below are disabled, so running this cell makes no network request.
# _resp = requests.get(_url)
# _soup = bs4.BeautifulSoup(_resp.text)

In [206]:
# Scratch check: a None value is serialised as JSON null (get_info can set author=None
# before dumping the info dict).
json.dumps({'a': None})

'{"a": null}'