In [1]:
import asyncio
import json
import os

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook as tqdm

In [2]:
async def do_request(url, method='get', **kwargs):
    """Run a blocking ``requests`` call in the loop's default executor.

    Args:
        url: Address to request.
        method: Name of the ``requests`` module-level function to call
            (``'get'``, ``'post'``, ...). Defaults to ``'get'``.
        **kwargs: Extra keyword arguments forwarded unchanged to that
            function, e.g. ``timeout=10`` or ``headers={...}``. With no
            extra arguments the call is exactly ``requests.<method>(url)``.

    Returns:
        Whatever the selected ``requests`` function returns.

    Raises:
        AttributeError: If ``requests`` has no attribute named ``method``.
    """
    loop = asyncio.get_event_loop()
    request_fn = getattr(requests, method)
    # run_in_executor only forwards positional arguments, so the keyword
    # arguments are bound through a closure instead.
    return await loop.run_in_executor(None, lambda: request_fn(url, **kwargs))

In [3]:
async def nyt_page(number):
    """Request one page of the NYT ``aponline/news`` collection listing.

    Args:
        number: Value substituted into the ``page`` query parameter.

    Returns:
        The result of ``do_request`` for the formatted page URL.
    """
    template = 'https://www.nytimes.com/svc/collections/v1/publish/www.nytimes.com/section/aponline/news?q=&sort=newest&page={}&dom=www.nytimes.com&dedupe_hl=y'
    page_url = template.format(number)
    response = await do_request(page_url)
    return response

async def main(pages=500):
    """Collect the items of consecutive collection-listing pages.

    Pages ``0 .. pages - 1`` are requested one after another through
    ``nyt_page``; each response body is parsed as JSON and the list found
    under ``['members']['items']`` is appended to the result.

    Args:
        pages: Number of pages to request. Defaults to 500, the value that
            was previously hard-coded.

    Returns:
        A single flat list with the items of every requested page, in page
        order.

    Raises:
        KeyError: If a page's JSON lacks ``members`` or ``items``.
    """
    news = []

    for page in tqdm(range(pages)):
        resp = await nyt_page(page)
        payload = json.loads(resp.content)
        news.extend(payload['members']['items'])

    return news

In [None]:
# Drive the collection-page scrape defined by main() to completion and keep
# the combined item list in `news`.
# NOTE(review): run_until_complete raises RuntimeError when called on a loop
# that is already running; if the kernel runs its own asyncio loop, confirm
# this cell still works in that environment.
loop = asyncio.get_event_loop()
news = loop.run_until_complete(main())

In [4]:
async def api_request(year, month, api_key):
    """Fetch one month of the NYT Archive API and return the parsed JSON.

    The request goes over HTTPS because the API key travels in the query
    string and would otherwise be sent in clear text.

    Args:
        year: Year placed in the archive URL path.
        month: Month placed in the archive URL path.
        api_key: Value sent as the ``api-key`` query parameter.

    Returns:
        The response body decoded with ``json.loads``.

    Raises:
        ValueError: If the response body is not valid JSON.
    """
    url = 'https://api.nytimes.com/svc/archive/v1/{}/{}.json?api-key={}'
    resp = await do_request(url.format(year, month, api_key))
    return json.loads(resp.content)

async def fetch_year(year, api_key, max_retries=None):
    """Download the archive documents for each of the twelve months of ``year``.

    Months 1-12 are requested one after another. A payload that does not
    contain ``['response']['docs']`` is printed and the month is requested
    again.

    Args:
        year: Year passed to ``api_request``.
        api_key: API key passed to ``api_request``.
        max_retries: Maximum number of repeat attempts per month after the
            first one. ``None`` (the default) retries without limit, which
            matches the previous behaviour.

    Returns:
        A list with twelve entries, one per month in calendar order, each
        being that month's ``docs`` value.

    Raises:
        RuntimeError: If ``max_retries`` is set and a month still has no
            ``docs`` after that many retries.
    """
    async def only_content(year, month, api_key):
        # Iterative retry: the previous recursive version added one stack
        # frame per failed attempt and could end in RecursionError.
        retries_used = 0
        while True:
            # One-second pause before every attempt, the first included
            # (presumably to stay under the API rate limit).
            await asyncio.sleep(1)
            data = await api_request(year, month, api_key)
            try:
                return data['response']['docs']
            except (KeyError, TypeError):
                # Payload without response.docs: show it, then retry.
                print(data)
            if max_retries is not None and retries_used >= max_retries:
                raise RuntimeError(
                    'no documents for {}-{:02d} after {} retries'.format(
                        year, month, max_retries))
            retries_used += 1

    news = []
    for month in tqdm(range(1, 13), leave=False):
        news.append(await only_content(year, month, api_key))
    return news

async def fetch_many(api_key, years=range(2010, 2017)):
    """Download several years of archive data and keep only the articles.

    Every month list returned by ``fetch_year`` is scanned. The previous
    version looked only at the first twelve month lists after flattening,
    so every year after the first was silently discarded.

    Args:
        api_key: API key passed to ``fetch_year``.
        years: Iterable of years to download. Defaults to 2010 through 2016,
            the range that was previously hard-coded.

    Returns:
        A flat list of the documents whose ``document_type`` is
        ``'article'``, in year, month and document order. Documents without
        a ``document_type`` key are skipped.
    """
    monthly_docs = []
    for year in tqdm(years):
        # fetch_year returns one list of documents per month.
        monthly_docs.extend(await fetch_year(year, api_key))

    return [
        doc
        for month_docs in monthly_docs
        for doc in month_docs
        if doc.get('document_type') == 'article'
    ]

In [None]:
# Download the archive through fetch_many() and keep the article records in
# `articles`.
# The API key is read from the NYT_API_KEY environment variable so the
# credential is not stored in the notebook; set it before running this cell
# (a missing variable raises KeyError).
loop = asyncio.get_event_loop()
articles = loop.run_until_complete(fetch_many(os.environ['NYT_API_KEY']))

In [5]:
import pickle

In [None]:
# Load `articles` from a local pickle file (presumably saved by an earlier
# session so the archive download can be skipped).
# NOTE(review): nothing visible in this notebook writes 'articles.pkl' (the
# final cell writes 'articles-all.pkl') — confirm where this file comes from.
# pickle.load can execute arbitrary code, so only open files you produced
# yourself.
with open('articles.pkl', 'rb') as fp:
    articles = pickle.load(fp)

In [None]:
async def fetch_p(articles, min_para_count=100):
    """Download each article page and record its first qualifying paragraph.

    For every article the page at ``article['web_url']`` is fetched and
    parsed; the text of the first ``.story-body-text.story-content`` element
    whose ``data-para-count`` attribute is greater than ``min_para_count``
    is stored under ``article['first-para']``. Articles with no such element
    are left without that key.

    Args:
        articles: Iterable of article dicts. A ``None`` entry ends the
            batch; ``grouper`` pads the last batch with ``None``.
        min_para_count: Threshold the ``data-para-count`` value must exceed.
            Defaults to 100, the value that was previously hard-coded.

    Returns:
        The same ``articles`` object, with the dicts updated in place.
    """
    for article in tqdm(articles):
        if article is None:
            break

        print(article['web_url'])

        resp = await do_request(article['web_url'])
        bs = BeautifulSoup(resp.content, 'lxml')

        for p in bs.select('.story-body-text.story-content'):
            # A paragraph with a missing or non-numeric count is skipped
            # rather than raising and aborting the rest of the batch.
            try:
                para_count = int(p.get('data-para-count'))
            except (TypeError, ValueError):
                continue
            if para_count > min_para_count:
                article['first-para'] = p.text
                break

    return articles

import itertools
def grouper(n, iterable, fillvalue=None):
    """Split ``iterable`` into consecutive tuples of length ``n``.

    The last tuple is padded with ``fillvalue`` when the input runs out,
    e.g. ``grouper(3, 'ABCDEFG', 'x')`` yields ABC, DEF, Gxx.
    """
    shared = iter(iterable)
    # The same iterator is handed to zip_longest n times, so each output
    # tuple consumes n consecutive items from it.
    return itertools.zip_longest(*itertools.repeat(shared, n), fillvalue=fillvalue)

async def do_magic():
    """Run ``fetch_p`` over the global ``articles`` in batches of 1000.

    One task per batch is scheduled first, then the tasks are awaited in
    the order they were created.
    """
    pending = []
    for batch in grouper(1000, articles):
        pending.append(asyncio.ensure_future(fetch_p(batch)))

    for job in pending:
        await job
        
# Run the batched paragraph scrape to completion. fetch_p writes
# 'first-para' onto the article dicts in place, so the global `articles`
# is updated as a side effect; the call's return value is discarded.
loop = asyncio.get_event_loop()
loop.run_until_complete(do_magic())

Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"


https://www.nytimes.com/2010/01/01/nyregion/01bloomberg.text.html


Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"
Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"
Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"
Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"
Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"
Widget Javascript not detected.  It may not be installe

https://www.nytimes.com/2010/01/05/nyregion/05nyc.html
https://query.nytimes.com/gst/fullpage.html?res=9A01E7DB123AF93AA35752C0A9669D8B63
https://www.nytimes.com/2010/01/13/theater/reviews/13versus.html
https://www.nytimes.com/2010/01/17/magazine/17FOB-onlanguage-t.html
https://www.nytimes.com/2010/01/21/arts/design/21abroad.html
https://www.nytimes.com/2010/01/25/sports/hockey/25sportsbriefs-nhl.html
https://www.nytimes.com/2010/01/29/education/29brush.html
https://www.nytimes.com/2010/02/02/sports/football/02freeney.html
https://www.nytimes.com/2010/02/06/us/06brfs-DISMISSALOFA_BRF.html
https://www.nytimes.com/2010/02/10/arts/music/10arts-AIRFORCERESE_BRF.html
https://www.nytimes.com/2010/02/14/nyregion/14love.html
https://www.nytimes.com/2010/02/18/arts/television/18arts-IDOLOUTDRAWS_BRF.html
https://www.nytimes.com/2010/02/22/health/research/22trialside.html
https://www.nytimes.com/2010/02/26/nyregion/26call.html
https://www.nytimes.com/2010/03/02/arts/television/02arts-POSTOLYMPIC

In [None]:
# Write the current `articles` object to 'articles-all.pkl' with pickle,
# overwriting any existing file of that name.
with open('articles-all.pkl', 'wb') as fp:
    pickle.dump(articles, fp)