In [1]:
from bs4 import BeautifulSoup
import asyncio
from aiohttp import ClientSession
import numpy as np
import pandas as pd
from functools import partial
from aiohttp import ClientConnectorError
import json
import re
from bs4 import NavigableString
from pprint import pprint
from datetime import datetime, timedelta
import os
from itertools import chain
import pickle

## Collect Wayback URL

In [2]:
EM_ROOT = "./data/mondo_scraped/lists_sp/"
file_list = os.listdir(EM_ROOT)

In [3]:
file_list

['Wayback Machine sp 2014 - Wayback Machine.tsv',
 'Wayback Machine sp 2015 - Wayback Machine.tsv',
 'Wayback Machine sp 2016 - Wayback Machine.tsv',
 'Wayback Machine sp 2017 - Wayback Machine.tsv']

In [4]:
links = []
for i in range(4):
    links += [pd.read_csv(os.path.join(EM_ROOT, file_list[i]), delimiter='\t')]

In [5]:
full = pd.concat(links, axis=0, ignore_index=True)

In [6]:
full.head()

Unnamed: 0,Link,URL
0,1,/web/20140101032345/http://www.elmundo.es/depo...
1,2,/web/20140102045259/http://www.elmundo.es/depo...
2,3,/web/20140103051125/http://www.elmundo.es/depo...
3,4,/web/20140104001706/http://www.elmundo.es/depo...
4,5,/web/20140105002555/http://www.elmundo.es/depo...


In [7]:
full.shape

(1080, 2)

In [8]:
full.loc[0, 'URL']

'/web/20140101032345/http://www.elmundo.es/deportes.html'

article h1 a,article h2 a,article h3 a

In [9]:
HEAD = "https://web.archive.org"

In [10]:
trails = full['URL']

In [11]:
len(trails)

1080

## Scrape Article URLs

In [12]:
async def get_html_data(trail, session):
    """Access El Mondo daily news webpage"""
    url = HEAD + trail
    attempts = 0
    while attempts < 3:
        try:
            res = await session.get(url)
            try:
                html = await res.text()
            except UnicodeDecodeError:
                return trail, None
            soup = BeautifulSoup(html, 'html.parser')
            urls = [a['href'] for a in soup.select("article h1 a,article h2 a,article h3 a")
                    if not re.search(r'/video/', a['href'])]
            return trail, urls
        except ClientConnectorError:
            attempts += 1
#             print("Connector error occurred!")
    if attempts == 3:
        print("Connector error occurred! Connection Failed!")
        return trail, None

async def gather_results(curr, step, trails):
    """Launch scrape tasks and collect results"""
    tasks = []
    async with ClientSession() as session:
        for trail in trails[curr: curr + step]:
            task = asyncio.ensure_future(get_html_data(trail, session))
            tasks.append(task)

        responses = await asyncio.gather(*tasks)
        # you now have all response bodies in this variable
        return responses


def process_df(future, curr, step):
    """Save scrape results in json files"""
    cache = {k: v for k, v in future.result()}
#     cache = future.result()
    if len(cache) == 0:
        raise RuntimeError("Empty response!")
    else:
        json.dump(cache, open("./data/mondo_scraped/urls_sp/scraped_{0}_{1}.json".format(curr, curr + step), "w"))
#         print(cache)
        print("got it! ({0}, {1})".format(curr, curr + step))

In [21]:
start = 1070
# end = len(trails)
step = 10
end = start + step
abandoned = []

for curr in range(start, end, step):
    print("loading data from {0} to {1}".format(curr, curr + step))
    loop = asyncio.get_event_loop()
    future = asyncio.ensure_future(gather_results(curr, step, trails))
    future.add_done_callback(partial(process_df, curr=curr, step=step))
    loop.run_until_complete(future)

loading data from 1070 to 1080
got it! (1070, 1080)


70, 180, 410, 520, 660, 690, 870, 1070

## Collect Article URLs

In [24]:
URL_ROOT = "./data/mondo_scraped/urls_sp/"
fn = os.listdir(URL_ROOT)

In [25]:
articles = dict()
for file in fn:
    with open(os.path.join(URL_ROOT, file), "r") as f:
        articles.update(json.load(f))

In [26]:
all_articles = list(chain.from_iterable(x for x in articles.values() if x is not None))

In [27]:
chopped = [re.search(r'http://.*\.html', url).group(0) for url in all_articles 
           if re.search(r'http://.*\.html', url) and not re.search(r'(?:/album/|/blogs/)', url)]

In [28]:
cleaned = list(set(chopped))

In [29]:
len(cleaned)

15290

In [30]:
cleaned[:5]

['http://www.elmundo.es/deportes/2016/02/12/56bd9f7222601dec3d8b4677.html',
 'http://www.elmundo.es/deportes/mas-deporte/2017/08/04/5983777ce5fdea972d8b45d0.html',
 'http://www.elmundo.es/deportes/2015/08/20/55d5ea94ca4741023b8b4594.html',
 'http://www.elmundo.es/deportes/2016/08/21/57b8d2b3e5fdea111c8b458f.html',
 'http://www.elmundo.es/comunidad-valenciana/2014/04/12/53497f89268e3e0c6a8b4575.html']

In [31]:
pickle.dump(cleaned, open("./data/mondo_scraped/cleaned_int_sp.pkl", "wb"))