In [2]:
from bs4 import BeautifulSoup
import asyncio
from aiohttp import ClientSession
import numpy as np
import pandas as pd
from functools import partial
from aiohttp import ClientConnectorError
import json
import re
from bs4 import NavigableString
from pprint import pprint
from datetime import datetime, timedelta
import os
from itertools import chain

In [28]:
async def get_html_data(trail, session, max_attempts=3):
    """Download one CNN en Español article and extract its main fields.

    Parameters
    ----------
    trail : str
        URL of the article page.
    session : aiohttp.ClientSession
        Open session used to issue the GET request.
    max_attempts : int, optional
        How many times the request is tried when the connection cannot be
        established (default 3).

    Returns
    -------
    tuple
        ``(trail, attrs)``. ``attrs`` is a dict with the keys ``headline``,
        ``keyfacts``, ``content``, ``tags`` and ``time``; it is ``None`` when
        the body cannot be decoded, the page has no headline element, or
        every connection attempt failed.
    """
    for _ in range(max_attempts):
        try:
            # ``async with`` hands the connection back to the pool on every
            # exit path, including the early return below.
            async with session.get(trail) as res:
                try:
                    html = await res.text()
                except UnicodeDecodeError:
                    return trail, None
        except ClientConnectorError:
            print("Connector error occurred!")
            continue
        return trail, _parse_article(trail, html)
    return trail, None


def _parse_article(trail, html):
    """Turn article HTML into the attribute dict, or None if it has no headline."""
    soup = BeautifulSoup(html, 'html.parser')

    headlines = soup.select(".headline-article-main")
    if not headlines:
        # Keep the offending page around for inspection. The list is created
        # on demand so the function also works before ``DEBUG`` is defined.
        globals().setdefault("DEBUG", []).append((trail, soup))
        print("content error!")
        return None

    paragraphs = [sent.text for sent in soup.select(".content-body-text p")]

    # The timestamp is optional: a missing <time> element or an unexpected
    # format yields None instead of raising.
    time_nodes = soup.select("time")
    match = re.search(r"\(.*\d\d\d\d", time_nodes[0].text) if time_nodes else None

    return {
        "headline": headlines[0].text,
        "keyfacts": [fact.text for fact in soup.select(".key-facts p")],
        "content": "\n".join(paragraphs).replace("\n\n", "\n"),
        "tags": [tag.text for tag in soup.select(".tags a")],
        "time": match.group(0) if match else None,
    }
    
async def gather_results(curr, step, trails):
    """Scrape one batch of URLs concurrently over a single shared session.

    The batch is ``trails[curr:curr + step]``. Returns the list of
    ``get_html_data`` results, in the same order as the batch.
    """
    async with ClientSession() as session:
        pending = [
            asyncio.ensure_future(get_html_data(url, session))
            for url in trails[curr: curr + step]
        ]
        # gather() waits for every task and keeps submission order.
        return await asyncio.gather(*pending)


def process_df(future, curr, step, out_dir="./data/contents/"):
    """Save one batch of scrape results as a JSON file.

    Parameters
    ----------
    future : asyncio.Future
        Completed future whose result is an iterable of ``(url, attrs)``
        pairs.
    curr, step : int
        Batch start and size; the file is named
        ``scraped_{curr}_{curr + step}.json``.
    out_dir : str, optional
        Directory the file is written to (created if missing).

    Raises
    ------
    RuntimeError
        If the batch contains no results.
    """
    cache = dict(future.result())
    if not cache:
        raise RuntimeError("Empty response!")

    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, "scraped_{0}_{1}.json".format(curr, curr + step))
    # Context manager so the file is flushed and closed before the next batch.
    with open(out_path, "w") as fh:
        json.dump(cache, fh)
    print("got it! ({0}, {1})".format(curr, curr + step))

In [3]:
# Merge every JSON file in ./data/scraped/ into one dict and flatten its
# values into the list of article URLs to scrape.
# The listing is sorted because os.listdir() returns names in arbitrary order,
# while the scraping batches are addressed by position in `trails`.
file_list = sorted(os.listdir("./data/scraped/"))
articles = dict()
for file in file_list:
    with open(os.path.join("./data/scraped/", file), "r") as f:
        articles.update(json.load(f))

trails = list(chain.from_iterable(articles.values()))
len(trails)

3828

In [34]:
# Scrape trails[start:end] in batches of `step` URLs, one JSON file per batch.
start = 1800
end = 1900  # use len(trails) to scrape every URL
step = 100
DEBUG = []  # get_html_data appends (url, soup) for pages without a headline

# NOTE(review): run_until_complete() fails when the kernel already runs an
# event loop (reported for recent ipykernel versions); in that case
# `await gather_results(...)` directly in the cell instead. Confirm for your setup.
loop = asyncio.get_event_loop()
for curr in range(start, end, step):
    print("loading data from {0} to {1}".format(curr, curr + step))
    future = asyncio.ensure_future(gather_results(curr, step, trails))
    loop.run_until_complete(future)
    # Save after the future completes rather than in a done-callback:
    # exceptions raised inside a callback are only logged by the event loop,
    # so a failed or empty batch would otherwise go unnoticed.
    process_df(future, curr=curr, step=step)

loading data from 1800 to 1900
got it! (1800, 1900)


In [4]:
# Merge every scraped batch in ./data/contents/ into one {url: attrs} dict.
# The listing is sorted because os.listdir() returns names in arbitrary order;
# a fixed order keeps duplicate-key resolution and the downstream row order
# reproducible.
doc_list = sorted(os.listdir("./data/contents/"))
articles = dict()
for file in doc_list:
    with open(os.path.join("./data/contents/", file), "r") as f:
        articles.update(json.load(f))

len(articles)

3828

In [5]:
# Drop the URLs whose scrape produced no attributes (stored as None).
clean = {
    url: attrs
    for url, attrs in articles.items()
    if attrs is not None
}

In [6]:
# Number of entries left after dropping the None values.
len(clean)

3803

In [7]:
# orient='index': each dict key becomes a row label; the per-article
# attribute dicts supply the columns.
docs = pd.DataFrame.from_dict(clean, orient='index')

In [8]:
# Keep only the articles with at least one key fact (an empty list is falsy).
# Builtin `bool` replaces the `np.bool` alias, which NumPy deprecated in 1.20
# and removed in 1.24.
valids = docs.loc[docs['keyfacts'].astype(bool), :]
valids.shape

(3069, 5)

In [9]:
# Preview the first rows of the filtered frame.
valids.head()

Unnamed: 0,headline,keyfacts,content,tags,time
http://cnnespanol.cnn.com/2015/05/05/estudiantes-peruanos-ensenan-historia-en-los-autobuses/,Estudiantes peruanos enseñan historia... en lo...,[Un grupo de estudiantes de la Universidad San...,(CNN Español) - En medio de los tediosos y caó...,[],"(02:22 GMT) 5 mayo, 2015"
http://cnnespanol.cnn.com/2015/05/05/la-dea-sanciona-a-agentes-que-olvidaron-a-estudiante-en-una-celda-por-cinco-dias/,La DEA sanciona a agentes que olvidaron a estu...,[Daniel Chong fue esposado y mantenido dentro ...,(CNN) – La Administración para el Control de D...,"[Daniel Chong, DEA]","(03:09 GMT) 5 mayo, 2015"
http://cnnespanol.cnn.com/2015/05/06/el-papa-francisco-aprende-trucos-de-basquetbol-con-los-harlem-globetrotters/,El papa Francisco aprende trucos de basquetbol...,[Francisco es la novena persona en la historia...,(CNNMéxico) — ¿Un papa basquetbolista? No exac...,"[Harlem Globetrotters, Papa Francisco]","(01:23 GMT) 6 mayo, 2015"
http://cnnespanol.cnn.com/2015/05/06/es-momento-de-un-cambio-michelle-bachelet-le-pide-la-renuncia-a-su-gabinete/,'Es momento de un cambio': Michelle Bachelet l...,[Michelle Bachelet anunció que pidió la renunc...,(CNN Español) - La presidenta Michelle Bachele...,[Michele Bachelet],"(01:33 GMT) 6 mayo, 2015"
http://cnnespanol.cnn.com/2015/05/06/partidos-politicos-de-mexico-adoptan-medidas-para-lidiar-con-la-violencia-contra-candidatos/,Partidos políticos de México adoptan medidas p...,[Una serie de hechos de violencia registrados ...,(CNN Español) - El Partido Acción Nacional de ...,[],"(01:29 GMT) 6 mayo, 2015"


In [10]:
# Persist the filtered frame. Pickle files should only be loaded back from
# trusted locations.
valids.to_pickle('./data/cnn_es.pkl')