In [1]:
import itertools
import json
import logging
import time

import bs4
import numpy as np
import pandas as pd
import requests

In [2]:
def delay(seconds, lam=1.0):
    """Pause execution for a randomised interval of at least ``seconds``.

    The total pause is ``seconds`` plus a uniform draw from [0, 1) plus a
    Poisson-distributed whole number of seconds.

    Parameters
    ----------
    seconds : float
        Lower bound on the pause, in seconds.
    lam : float
        Mean of the Poisson component, should be >= 0.

    Returns
    -------
    None
    """

    # Draw the fractional jitter first, then the Poisson part, so the
    # sequence of random numbers consumed stays the same.
    fraction = np.random.rand()
    whole = np.random.poisson(lam=lam)
    time.sleep(seconds + fraction + whole)

In [3]:
def get_pages(dates, *, seconds=3):
    """Download the NPR news-archive page for each requested date.

    Pages are fetched lazily, one request per date. A date whose request
    comes back with an HTTP error status is logged and skipped, so fewer
    pages than dates may be yielded.

    Parameters
    ----------
    dates : pandas.DatetimeIndex
        Dates to fetch. Each element must expose ``month``, ``day`` and
        ``year`` attributes.
    seconds : float, optional
        Minimum delay after each request, passed to ``delay``.

    Yields
    ------
    page : bs4.BeautifulSoup
        Parsed archive page for one date.
    """

    url = 'https://www.npr.org/sections/news/archive'
    # Iterate the caller's dates directly; the range must not be rebuilt
    # from notebook-level globals.
    for d in dates:
        # The date is sent as unpadded month-day-year, e.g. "1-5-2010".
        params = {'date': f'{d.month}-{d.day}-{d.year}'}
        resp = requests.get(url, params=params)
        # Throttle after every request, including failed ones, so an error
        # response is never followed immediately by another request.
        delay(seconds)

        try:
            resp.raise_for_status()
        except requests.exceptions.HTTPError:
            logging.exception(f'HTTPError - {resp.url} - {resp.status_code}')
            continue

        yield bs4.BeautifulSoup(resp.text)

In [4]:
def get_info(article, *, seconds=3):
    """Fetch the story behind an archive-page preview and extract its fields.

    Parameters
    ----------
    article : bs4.element.Tag
        Article preview from archive page.
    seconds : float, optional
        Minimum delay after the story request, passed to ``delay``.

    Returns
    -------
    info : dict or None
        Dictionary with keys: date, title, author, summary, story.
        ``date``, ``title`` and ``author`` are None when the corresponding
        element is missing. None is returned instead of a dict when the
        preview is audio-only, has no teaser or headline link, the story
        request fails with an HTTP error, or the page has no story body.
    """

    if article.find(class_='audio-availability-message'):
        return None

    teaser = article.find(class_='teaser')
    if not teaser:  # articles with audio have this inside of umbrella article tag
        return None

    # The teaser text is "<date>\x95<summary>". partition() splits on the
    # first separator only, so a missing or repeated separator cannot raise.
    date, bullet, summary = teaser.text.partition('\x95')
    if not bullet:
        # No separator: keep the whole teaser as the summary, date unknown.
        date, summary = None, teaser.text

    try:
        link = article.h2.a['href']
    except (AttributeError, TypeError, KeyError):
        # Missing <h2>, missing <a>, or <a> without href.
        logging.warning('Article preview has no headline link - skipped')
        return None

    resp = requests.get(link)
    # Throttle right after the request so every exit path below, including
    # the failure ones, respects the delay.
    delay(seconds)

    try:
        resp.raise_for_status()
    except requests.exceptions.HTTPError:
        logging.exception(f'HTTPError - {resp.url} - {resp.status_code}')
        return None

    soup = bs4.BeautifulSoup(resp.text)

    byline = soup.find(class_='byline__name')
    author = byline.text.strip() if byline is not None else None

    # Prefer a transcript container; fall back to the regular story body.
    body = soup.find(class_='transcript') or soup.find(id='storytext')
    if body is None:
        return None

    # Keep only paragraphs whose direct parent carries the 'storytext' class.
    story = '\n\n'.join(
        p.text.strip()
        for p in body.find_all('p')
        if 'storytext' in p.parent.get('class', [])
    )

    headline = soup.find(class_='storytitle')
    title = headline.text.strip() if headline is not None else None

    return dict(date=date, title=title, author=author, summary=summary, story=story)

In [5]:
# Scrape window: 2010-01-01 up to, but not including, 2010-01-02.
# Dropping the last element is equivalent to the former ``closed='left'``
# argument for this daily range and works on every pandas version
# (``closed`` was removed from ``date_range`` in pandas 2.0).
dates = pd.date_range('2010-01-01', '2010-01-02')[:-1]
start = dates[0].date()
end = dates[-1].date()

fmt = '{name} - {asctime} - {levelname} : {message}'
logging.basicConfig(filename=f'{start}__{end}.log', level=logging.INFO, style='{', format=fmt)

logging.info(f'STARTED {start} to {end}')

jsons = []
# Pre-seed the loop variable so the output file name below can still be
# built when no page is processed at all (otherwise `date` is undefined).
date = dates[0]
try:
    # NOTE: get_pages skips dates whose request fails with an HTTP error, so
    # after a skipped page the date paired here lags behind the page fetched.
    for date, page in zip(dates, get_pages(dates, seconds=0)):
        logging.info(date.date())
        for article in page.find_all('article'):
            info = get_info(article, seconds=0)
            if info:
                jsons.append(json.dumps(info))
except Exception:
    # Best effort: record the failure and still write what was collected.
    logging.exception('*** MAIN ERROR ***')


# The file name records the last date reached, which may precede `end` if
# the run was interrupted. Output is one JSON document per line.
file_json = f'{start}__{date.date()}.json'
lines = [j + '\n' for j in jsons]
with open(file_json, 'w') as fp:
    fp.writelines(lines)

logging.info(f'FINISHED {start} to {end}')

In [98]: no_story = list(d for d in dicts if not d['story'])  # records whose 'story' value is falsy; `dicts` is only assigned at later prompts, presumably left over from earlier session state

In [99]: dicts = [json.loads(line) for line in set(lines)]  # parse each distinct line as JSON; set() drops duplicate lines and does not keep their order

In [100]: with open(jan_01_12) as fp:  # `jan_01_12` is not defined in the visible cells - presumably a path to a JSON-lines file; TODO confirm
     ...:     dicts = []
     ...:     for line in fp.readlines():
     ...:         dicts.append(json.loads(line))  # one JSON document per line
     ...:

In [101]: jan = ''