In [49]:
from bs4 import BeautifulSoup
import asyncio
from aiohttp import ClientSession
import numpy as np
import pandas as pd
from functools import partial
from aiohttp import ClientConnectorError
import json
import re
from bs4 import NavigableString
from pprint import pprint
from datetime import datetime, timedelta
import os
from itertools import chain
import pickle

## Collect Wayback URL

In [2]:
EM_ROOT = "./data/mondo_scraped/lists/"
file_list = os.listdir(EM_ROOT)

In [3]:
file_list

['Wayback Machine 1 - Wayback Machine.tsv',
 'Wayback Machine 2 - Wayback Machine.tsv',
 'Wayback Machine 3 - Wayback Machine.tsv',
 'Wayback Machine 4 - Wayback Machine.tsv']

In [4]:
links = []
for i in range(4):
    links += [pd.read_csv(os.path.join(EM_ROOT, file_list[i]), delimiter='\t')]

In [5]:
full = pd.concat(links, axis=0, ignore_index=True)

In [6]:
full.head()

Unnamed: 0,Link,URL
0,1,/web/20170101125857/http://www.elmundo.es/inte...
1,2,/web/20170102133719/http://www.elmundo.es/inte...
2,3,/web/20170103141433/http://www.elmundo.es/inte...
3,4,/web/20170104145738/http://www.elmundo.es/inte...
4,5,/web/20170105154311/http://www.elmundo.es/inte...


In [7]:
full.shape

(1068, 2)

In [8]:
full.loc[0, 'URL']

'/web/20170101125857/http://www.elmundo.es/internacional.html'

article h1 a,article h2 a,article h3 a

In [9]:
HEAD = "https://web.archive.org"

In [10]:
trails = full['URL']

In [11]:
len(trails)

1068

## Scrape Article URLs

In [12]:
async def get_html_data(trail, session):
    """Access El Mondo daily news webpage"""
    url = HEAD + trail
    attempts = 0
    while attempts < 3:
        try:
            res = await session.get(url)
            try:
                html = await res.text()
            except UnicodeDecodeError:
                return trail, None
            soup = BeautifulSoup(html, 'html.parser')
            urls = [a['href'] for a in soup.select("article h1 a,article h2 a,article h3 a")
                    if not re.search(r'/video/', a['href'])]
            return trail, urls
        except ClientConnectorError:
            attempts += 1
#             print("Connector error occurred!")
    if attempts == 3:
        print("Connector error occurred! Connection Failed!")
        return trail, None

async def gather_results(curr, step, trails):
    """Launch scrape tasks and collect results"""
    tasks = []
    async with ClientSession() as session:
        for trail in trails[curr: curr + step]:
            task = asyncio.ensure_future(get_html_data(trail, session))
            tasks.append(task)

        responses = await asyncio.gather(*tasks)
        # you now have all response bodies in this variable
        return responses


def process_df(future, curr, step):
    """Save scrape results in json files"""
    cache = {k: v for k, v in future.result()}
#     cache = future.result()
    if len(cache) == 0:
        raise RuntimeError("Empty response!")
    else:
        json.dump(cache, open("./data/mondo_scraped/urls/scraped_{0}_{1}.json".format(curr, curr + step), "w"))
#         print(cache)
        print("got it! ({0}, {1})".format(curr, curr + step))

In [13]:
start = 0
end = 1068
step = 10
abandoned = []

for curr in range(start, end, step):
    print("loading data from {0} to {1}".format(curr, curr + step))
    loop = asyncio.get_event_loop()
    future = asyncio.ensure_future(gather_results(curr, step, trails))
    future.add_done_callback(partial(process_df, curr=curr, step=step))
    loop.run_until_complete(future)

loading data from 0 to 10
Connector error occurred! Connection Failed!
got it! (0, 10)
loading data from 10 to 20
Connector error occurred! Connection Failed!
got it! (10, 20)
loading data from 20 to 30
got it! (20, 30)
loading data from 30 to 40
Connector error occurred! Connection Failed!
got it! (30, 40)
loading data from 40 to 50
Connector error occurred! Connection Failed!
got it! (40, 50)
loading data from 50 to 60
Connector error occurred! Connection Failed!
got it! (50, 60)
loading data from 60 to 70
Connector error occurred! Connection Failed!
got it! (60, 70)
loading data from 70 to 80
Connector error occurred! Connection Failed!
got it! (70, 80)
loading data from 80 to 90
Connector error occurred! Connection Failed!
got it! (80, 90)
loading data from 90 to 100
got it! (90, 100)
loading data from 100 to 110
Connector error occurred! Connection Failed!
got it! (100, 110)
loading data from 110 to 120
Connector error occurred! Connection Failed!
got it! (110, 120)
loading data f

## Collect Article URLs

In [14]:
URL_ROOT = "./data/mondo_scraped/urls/"
fn = os.listdir(URL_ROOT)

In [17]:
articles = dict()
for file in fn:
    with open(os.path.join(URL_ROOT, file), "r") as f:
        articles.update(json.load(f))

In [30]:
all_articles = list(chain.from_iterable(x for x in articles.values() if x is not None))

In [43]:
chopped = [re.search(r'http://.*\.html', url).group(0) for url in all_articles 
           if re.search(r'http://.*\.html', url) and not re.search(r'(?:/album/|/blogs/)', url)]

In [46]:
cleaned = list(set(chopped))

In [47]:
len(cleaned)

20639

In [48]:
cleaned[:5]

['http://www.elmundo.es/internacional/2017/05/26/592716a7268e3e39608b45c9.html',
 'http://www.elmundo.es/internacional/2016/09/19/57e03e3746163fe0148b460b.html',
 'http://www.elmundo.es/internacional/2015/09/05/55eadcfa46163f706b8b4578.html',
 'http://www.elmundo.es/internacional/2014/09/21/541eca59ca474105538b457a.html',
 'http://www.elmundo.es/internacional/2015/12/12/566c71c3ca47415a7e8b4677.html']

In [50]:
pickle.dump(cleaned, open("./data/mondo_scraped/cleaned_int.pkl", "wb"))