In [20]:
from bs4 import BeautifulSoup
import asyncio
from aiohttp import ClientSession
import numpy as np
import pandas as pd
from functools import partial
from aiohttp import ClientConnectorError
import json
import re
from bs4 import NavigableString
from pprint import pprint
from datetime import datetime, timedelta
import os
from itertools import chain

In [2]:
async def get_html_data(trail, session, max_attempts=3):
    """Download one CNN daily archive page and extract its article links.

    Parameters
    ----------
    trail : str
        URL of the daily archive page to fetch.
    session : aiohttp.ClientSession
        Open client session used to issue the GET request.
    max_attempts : int, optional
        Number of times the request is tried when a ``ClientConnectorError``
        is raised. Defaults to 3.

    Returns
    -------
    tuple
        ``(trail, urls)`` where ``urls`` is the list of ``href`` values of the
        ``.entry-title a`` anchors whose link does not contain ``/video/``.
        ``(trail, None)`` is returned when the body cannot be decoded as text
        or when every attempt ended in a connector error.
    """
    for _ in range(max_attempts):
        try:
            # ``async with`` guarantees the response (and its connection) is
            # released on every exit path, including the early returns below.
            async with session.get(trail) as res:
                try:
                    html = await res.text()
                except UnicodeDecodeError:
                    return trail, None
                soup = BeautifulSoup(html, 'html.parser')
                # ``.get`` avoids a KeyError on anchors that carry no href.
                hrefs = (a.get('href') for a in soup.select(".entry-title a"))
                urls = [
                    href for href in hrefs
                    if href is not None and not re.search(r'/video/', href)
                ]
                return trail, urls
        except ClientConnectorError:
            print("Connector error occurred!")
    # Every attempt failed with a connector error.
    return trail, None

async def gather_results(curr, step, trails):
    """Scrape one batch of archive pages concurrently.

    The batch is ``trails[curr: curr + step]``. A single ``ClientSession`` is
    shared by every request in the batch, and the gathered return values of
    ``get_html_data`` are returned in the same order as the URLs in the batch.
    """
    async with ClientSession() as session:
        pending = [
            asyncio.ensure_future(get_html_data(page_url, session))
            for page_url in trails[curr: curr + step]
        ]
        # gather() waits for every scheduled request of the batch.
        return await asyncio.gather(*pending)


def process_df(future, curr, step):
    """Save the scrape results of one batch to a JSON file.

    Parameters
    ----------
    future : object with a ``result()`` method
        Completed future whose result is an iterable of ``(key, value)``
        pairs, as produced by ``gather_results``.
    curr : int
        Start index of the batch; used in the output file name.
    step : int
        Batch size; ``curr + step`` is used in the output file name.

    Raises
    ------
    RuntimeError
        If the future's result contains no entries.
    """
    cache = dict(future.result())
    if not cache:
        raise RuntimeError("Empty response!")

    # Create the output directory on first use so a fresh checkout does not
    # fail with FileNotFoundError.
    os.makedirs("./data/scraped", exist_ok=True)
    out_path = "./data/scraped/scraped_{0}_{1}.json".format(curr, curr + step)
    # ``with`` closes (and flushes) the file even if serialisation fails.
    with open(out_path, "w") as out_file:
        json.dump(cache, out_file)
    print("got it! ({0}, {1})".format(curr, curr + step))


# df = pd.read_csv("./data/ratings.csv")
# app_ids = df.loc[:, "app_id"].astype(str)

In [3]:
# Base URL that every daily archive path (YYYY/MM/DD/) is appended to.
header = "http://cnnespanol.cnn.com/"

In [4]:
# Scratch check: parse a YYYY/MM/DD string, add one day, and format it back.
(datetime.strptime('2015/05/05', '%Y/%m/%d') + timedelta(days=1)).strftime('%Y/%m/%d')

'2015/05/06'

In [5]:
def date_generator(final, start='2015/05/05'):
    """Yield consecutive dates as ``YYYY/MM/DD`` strings.

    Parameters
    ----------
    final : str
        Exclusive end date in ``%Y/%m/%d`` format.
    start : str, optional
        Inclusive first date in ``%Y/%m/%d`` format. Defaults to
        ``'2015/05/05'``.

    Yields
    ------
    str
        Each day from ``start`` up to, but not including, ``final``.
    """
    fmt = '%Y/%m/%d'
    current = datetime.strptime(start, fmt)
    # Parse the end date once instead of on every loop iteration.
    stop = datetime.strptime(final, fmt)
    one_day = timedelta(days=1)
    while current < stop:
        yield current.strftime(fmt)
        current += one_day

In [6]:
# One archive URL per day: base URL + "YYYY/MM/DD" + "/", up to (not including) 2017/09/08.
trails = [header + date + "/" for date in date_generator('2017/09/08')]

In [7]:
# Number of daily archive URLs that will be scraped.
len(trails)

857

In [8]:
# Scrape the archive pages in batches of `step` URLs, saving one JSON file per batch.
start = 0
end = len(trails)  # derived from the data instead of a hard-coded count
step = 50

# The event loop does not depend on the batch, so fetch it once.
# NOTE(review): run_until_complete() raises if the kernel already runs an
# event loop — confirm against the Jupyter/ipykernel version in use.
loop = asyncio.get_event_loop()

for curr in range(start, end, step):
    print("loading data from {0} to {1}".format(curr, curr + step))
    future = loop.create_task(gather_results(curr, step, trails))
    loop.run_until_complete(future)
    # Save after the batch completes rather than from a done-callback:
    # exceptions raised inside a done-callback are only logged by the loop,
    # so a failed save would otherwise go unnoticed.
    process_df(future, curr=curr, step=step)

loading data from 0 to 50
got it! (0, 50)
loading data from 50 to 100
got it! (50, 100)
loading data from 100 to 150
Connector error occurred!
Connector error occurred!
Connector error occurred!
got it! (100, 150)
loading data from 150 to 200
got it! (150, 200)
loading data from 200 to 250
got it! (200, 250)
loading data from 250 to 300
got it! (250, 300)
loading data from 300 to 350
got it! (300, 350)
loading data from 350 to 400
got it! (350, 400)
loading data from 400 to 450
got it! (400, 450)
loading data from 450 to 500
got it! (450, 500)
loading data from 500 to 550
got it! (500, 550)
loading data from 550 to 600
got it! (550, 600)
loading data from 600 to 650
got it! (600, 650)
loading data from 650 to 700
got it! (650, 700)
loading data from 700 to 750
got it! (700, 750)
loading data from 750 to 800
got it! (750, 800)
loading data from 800 to 850
got it! (800, 850)
loading data from 850 to 900
got it! (850, 900)


In [11]:
# Names of all entries in the scrape output directory (order is whatever os.listdir returns).
file_list = os.listdir("./data/scraped/")

In [15]:
# Merge the JSON mapping stored in each scraped file into a single dict.
articles = {}
for file in file_list:
    scraped_path = os.path.join("./data/scraped/", file)
    with open(scraped_path, "r") as f:
        batch = json.load(f)
    articles.update(batch)

In [21]:
# Flatten the per-page URL lists into one list. Pages whose scrape failed are
# stored as None (JSON null) by get_html_data and would make
# chain.from_iterable raise TypeError, so they are skipped.
all_articles = list(
    chain.from_iterable(urls for urls in articles.values() if urls is not None)
)

In [23]:
# Total number of article URLs collected across all scraped files.
len(all_articles)

3828