In [1]:
from collections import defaultdict, deque
from itertools import islice
import logging
import pathlib
import re
import time
from urllib.parse import urljoin

import lxml.html
import requests

In [2]:
# Emit INFO-level records so the 'cached ...' messages logged by CachedPage show up in cell output.
logging.basicConfig(level=logging.INFO)

In [3]:
# Front page of BBC News: first page fetched and the base for resolving relative links.
BASE_URL = 'https://www.bbc.com/news'
# Directory where CachedPage stores local copies of downloaded pages.
DATA_DIR = pathlib.Path('./data')
# NOTE(review): not referenced by any cell visible here; presumably the og:type
# values the crawl expects to meet -- confirm or remove.
KNOWN_TYPES = {'website', 'article'}

In [4]:
def fetch_page(url, timeout=10):
    """Download ``url`` with an HTTP GET and return the response body as text.

    Args:
        url: Absolute URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on a stalled connection;
            pass ``None`` to get that former behaviour back.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not respond within ``timeout``.
    """
    res = requests.get(url, timeout=timeout)
    # turn 4xx/5xx answers into exceptions instead of returning an error page
    res.raise_for_status()
    return res.text

# Download the BBC News front page once; the following cells parse this response.
text = fetch_page(BASE_URL)

In [5]:
# Parse the front page and keep the distinct, sorted site-relative links that
# start with /news/ and end in a digit.
html = lxml.html.fromstring(text)
urls = sorted({
    href
    for href in html.xpath('.//a[@href]/@href')
    if re.match(r'/news/.+?\d$', href)
})
len(urls), urls

(37,
 ['/news/blogs-trending-44495136',
  '/news/blogs-trending-45331730',
  '/news/business-11428889',
  '/news/business-12686570',
  '/news/business-22434141',
  '/news/business-38507481',
  '/news/entertainment-arts-45329060',
  '/news/in-pictures-45209586',
  '/news/newsbeat-45245349',
  '/news/technology-45333960',
  '/news/technology-45341822',
  '/news/world-asia-china-45338985',
  '/news/world-asia-india-45339265',
  '/news/world-australia-45206206',
  '/news/world-australia-45339727',
  '/news/world-europe-44709253',
  '/news/world-europe-45328477',
  '/news/world-europe-45340716',
  '/news/world-europe-45342536',
  '/news/world-europe-45342721',
  '/news/world-europe-45345516',
  '/news/world-europe-45346629',
  '/news/world-europe-45347228',
  '/news/world-latin-america-45341575',
  '/news/world-us-canada-44009916',
  '/news/world-us-canada-45226132',
  '/news/world-us-canada-45229580',
  '/news/world-us-canada-45300969',
  '/news/world-us-canada-45329320',
  '/news/world-us

In [6]:
# Collect the <meta> tags that carry both a `property` and a `content` attribute
# and show them as a property -> content mapping.
meta = html.xpath('.//meta[@property and @content]')
len(meta), {item.attrib['property']: item.attrib['content'] for item in meta}

(10,
 {'article:author': 'https://www.facebook.com/bbcnews',
  'fb:admins': '100004154058350',
  'og:article:section': 'Home',
  'og:description': 'Visit BBC News for up-to-the-minute news, breaking news, video, audio and feature stories. BBC News provides trusted World and UK news as well as local and regional perspectives. Also entertainment, business, science, technology and health news.',
  'og:image': '//m.files.bbci.co.uk/modules/bbc-morph-news-waf-page-meta/2.2.2/bbc_news_logo.png',
  'og:locale': 'en_GB',
  'og:site_name': 'BBC News',
  'og:title': 'Home - BBC News',
  'og:type': 'website',
  'og:url': 'https://www.bbc.co.uk/news'})

In [7]:
class Page:
    """A web page addressed by URL whose content is fetched and parsed lazily.

    The raw text is downloaded with ``fetch_page`` the first time it is needed
    and the lxml tree is built from it the first time an ``extract_*`` method
    is called; both are then kept on the instance.
    """

    def __init__(self, url):
        self.url = url
        # filled in on first use by _require_text / _require_html
        self._text = None
        self._html = None

    @property
    def id(self):
        """Trailing run of seven or more digits of the URL, or None without one."""
        found = re.search(r'\d{7,}$', self.url)
        return found.group(0) if found else None

    @property
    def text(self):
        """Raw text of the page, fetched on first access."""
        self._require_text()
        return self._text

    def extract_urls(self):
        """Return the set of absolute URLs for /news/ links that end in a digit."""
        self._require_html()
        found = set()
        for href in self._html.xpath('.//a[@href]/@href'):
            if re.match(r'/news/.+?\d$', href):
                # resolve the site-relative link against this page's own URL
                found.add(urljoin(self.url, href))
        return found

    def extract_meta_properties(self):
        """Return a dict mapping each <meta property=...> to its content value."""
        self._require_html()
        properties = {}
        for tag in self._html.xpath('.//meta[@property and @content]'):
            name = tag.attrib['property']
            properties[name] = tag.attrib['content']
        return properties

    def _require_text(self):
        """Make sure ``self._text`` holds the page text."""
        if self._text:
            return
        self._text = fetch_page(self.url)

    def _require_html(self):
        """Make sure ``self._html`` holds the parsed document."""
        self._require_text()
        if self._html is not None:
            return
        self._html = lxml.html.fromstring(self._text)
            
        

In [8]:
# Wrap the first and the last link found on the front page to try out Page by hand.
page1 = Page(urljoin(BASE_URL, urls[0]))
page2 = Page(urljoin(BASE_URL, urls[-1]))

In [9]:
# Fetches both pages on first use and shows their <meta property=...> values.
page1.extract_meta_properties(), page2.extract_meta_properties()

({'article:author': 'https://www.facebook.com/bbcnews',
  'article:section': 'BBC Trending',
  'fb:admins': '100004154058350',
  'fb:app_id': '1609039196070050',
  'fb:pages': '1143803202301544,317278538359186,1392506827668140,742734325867560,185246968166196,156060587793370,137920769558355,193435954068976,21263239760,156400551056385,929399697073756,154344434967,228735667216,80758950658,260212261199,294662213128,1086451581439054,283348121682053,295830058648,239931389545417,304314573046,310719525611571,647687225371774,1159932557403143,286567251709437,1731770190373618,125309456546,163571453661989,285361880228,512423982152360,238003846549831,176663550714,260967092113,118450564909230,100978706649892,15286229625,122103087870579,120655094632228,102814153147070,124715648647,153132638110668,150467675018739',
  'og:description': "Moscow is accused of using the World Cup as a 'distraction' as pension age increases are proposed.",
  'og:image': 'https://ichef.bbci.co.uk/news/1024/branded_news/14E4

In [10]:
# Absolute /news/ links found on each of the two pages.
page1.extract_urls(), page2.extract_urls()

({'https://www.bbc.com/news/business-11428889',
  'https://www.bbc.com/news/business-12686570',
  'https://www.bbc.com/news/business-22434141',
  'https://www.bbc.com/news/business-38507481'},
 {'https://www.bbc.com/news/business-11428889',
  'https://www.bbc.com/news/business-12686570',
  'https://www.bbc.com/news/business-22434141',
  'https://www.bbc.com/news/business-38507481',
  'https://www.bbc.com/news/newsbeat-45245349',
  'https://www.bbc.com/news/uk-41405671',
  'https://www.bbc.com/news/world-asia-india-45339265',
  'https://www.bbc.com/news/world-europe-45328477',
  'https://www.bbc.com/news/world-europe-45342536',
  'https://www.bbc.com/news/world-europe-45342721',
  'https://www.bbc.com/news/world-us-canada-17139243',
  'https://www.bbc.com/news/world-us-canada-41377185',
  'https://www.bbc.com/news/world-us-canada-41398263',
  'https://www.bbc.com/news/world-us-canada-41447184',
  'https://www.bbc.com/news/world-us-canada-41452995',
  'https://www.bbc.com/news/world-us-c

In [11]:
class CachedPage(Page):
    """A ``Page`` whose raw text is mirrored to a file under ``data_dir``.

    Only pages whose URL yields an ``id`` are cached; the file is named
    ``<id>.html``. Pages without an id are fetched like a plain ``Page``.

    Args:
        url: Absolute URL of the page.
        data_dir: Directory for the cache files (``pathlib.Path`` or string).
            It is created on the first write if it does not exist yet.
    """

    def __init__(self, url, data_dir):
        super().__init__(url)
        # accept plain strings as well as Path objects
        self._data_dir = pathlib.Path(data_dir)

    def is_cached(self):
        """Return True if the page has an id and its cache file exists."""
        _id = self.id
        return bool(_id) and self._build_path(_id).exists()

    def purge_cache(self):
        """Delete the cache file of this page, if there is one."""
        _id = self.id
        if not _id:
            # pages without an id are never cached; avoid touching 'None.html'
            return
        path = self._build_path(_id)
        if path.exists():
            path.unlink()

    def _require_text(self):
        """Load the text from the cache file, or fetch it and store a copy."""
        if self._text:
            return
        _id = self.id
        path = self._build_path(_id) if _id else None
        if path is not None and path.exists():
            # Explicit UTF-8 keeps the cache independent of the platform's
            # locale encoding.
            # NOTE(review): files cached earlier under a non-UTF-8 locale may
            # fail to decode here and need purging.
            with path.open(encoding='utf-8') as file_obj:
                self._text = file_obj.read()
            return
        super()._require_text()
        if path is not None:
            # save a local copy; make sure the target directory exists first
            path.parent.mkdir(parents=True, exist_ok=True)
            with path.open('w', encoding='utf-8') as file_obj:
                file_obj.write(self._text)
            logging.info('cached %s', path)

    def _build_path(self, id_):
        """Return the cache file path for the page id ``id_``."""
        return self._data_dir.joinpath('{}.html'.format(id_))

In [12]:
# Same URL as page2 above, this time through the disk cache (note: rebinds page1).
page1 = CachedPage(urljoin(BASE_URL, urls[-1]), DATA_DIR)
page1.extract_meta_properties()

INFO:root:cached data/45348356.html


{'article:author': 'https://www.facebook.com/bbcnews',
 'article:section': 'US & Canada',
 'fb:admins': '100004154058350',
 'fb:app_id': '1609039196070050',
 'fb:pages': '1143803202301544,317278538359186,1392506827668140,742734325867560,185246968166196,156060587793370,137920769558355,193435954068976,21263239760,156400551056385,929399697073756,154344434967,228735667216,80758950658,260212261199,294662213128,1086451581439054,283348121682053,295830058648,239931389545417,304314573046,310719525611571,647687225371774,1159932557403143,286567251709437,1731770190373618,125309456546,163571453661989,285361880228,512423982152360,238003846549831,176663550714,260967092113,118450564909230,100978706649892,15286229625,122103087870579,120655094632228,102814153147070,124715648647,153132638110668,150467675018739',
 'og:description': 'The head of Puerto Rico\'s capital calls the response to Hurricane Maria a "stain" on the presidency.',
 'og:image': 'https://ichef.bbci.co.uk/news/1024/branded_news/3319/prod

In [13]:
class BbcCrawler:
    """Iterator that crawls pages reachable from ``start_url``, first in, first out.

    Each ``next()`` visits one not-yet-visited page, queues the links found on
    it and returns the URL of that page. Pages are read through ``CachedPage``,
    and consecutive HTTP requests are kept at least ``min_http_interval``
    seconds apart.

    Args:
        start_url: First URL to visit.
        data_dir: Cache directory handed to ``CachedPage``.
        min_http_interval: Minimum number of seconds between two HTTP
            requests; must be greater than 0.5.

    Attributes:
        stats: Counters keyed by ``'total'``, ``'type:<og:type>'`` and, for
            articles, ``'section:<article:section>'``.
    """

    def __init__(self, start_url=BASE_URL, data_dir=DATA_DIR, min_http_interval=1):
        self._yet_to_visit = deque([start_url])
        self._visited = set()
        assert min_http_interval > 0.5, 'min_http_interval must be greater than 0.5'
        self._min_http_interval = min_http_interval
        # not to slow down the first call; a monotonic clock is immune to
        # wall-clock adjustments
        self._prev_http_time = time.monotonic() - min_http_interval

        self._data_dir = data_dir
        self.stats = defaultdict(int)

    def __iter__(self):
        return self

    def __next__(self):
        """Visit the next unvisited page and return its URL.

        Raises:
            StopIteration: when the queue holds no further unvisited page.
        """
        # Re-check the queue on every round: skipped or failed URLs may
        # exhaust it in the middle of a call.
        while self._yet_to_visit:
            next_url = self._yet_to_visit.popleft()
            page = CachedPage(next_url, self._data_dir)
            page_key = self._visit_key(page)
            # skip visited
            if page_key in self._visited:
                continue

            is_cached = page.is_cached()
            # cached versions of news lists are not used
            if is_cached and page.extract_meta_properties().get('og:type') == 'website':
                page.purge_cache()
                # start over with a fresh object so the stale text already
                # loaded from the cache is not reused
                page = CachedPage(next_url, self._data_dir)
                is_cached = False
            # keep the interval between http calls
            if not is_cached:
                self._throttle()

            # add extracted urls to the list
            try:
                urls = page.extract_urls()
            except requests.HTTPError as err:
                logging.error('failure with the url %s: %s', next_url, err)
                continue

            self._visited.add(page_key)
            for url in urls:
                if self._visit_key(Page(url)) not in self._visited:
                    self._yet_to_visit.append(url)
            self._record_stats(page.extract_meta_properties())
            return next_url
        raise StopIteration()

    @staticmethod
    def _visit_key(page):
        """Key identifying a page in the visited set: its id, else its URL."""
        # falling back to the URL keeps pages without an id from all sharing
        # the single key None
        return page.id or page.url

    def _throttle(self):
        """Sleep until ``min_http_interval`` has passed since the previous request."""
        to_wait = self._prev_http_time + self._min_http_interval - time.monotonic()
        if to_wait > 0:
            time.sleep(to_wait)
        self._prev_http_time = time.monotonic()

    def _record_stats(self, meta):
        """Update the counters from the meta properties of a visited page."""
        meta_type = meta.get('og:type')
        self.stats['type:{}'.format(meta_type)] += 1
        if meta_type == 'article':
            self.stats['section:{}'.format(meta.get('article:section'))] += 1
        self.stats['total'] += 1


In [14]:
# Visit the first ten pages, starting from the crawler's default start URL,
# and print each URL as it is visited (islice is imported in the first cell).
crawler = BbcCrawler()
for url in islice(crawler, 10):
    print(url)

https://www.bbc.com/news


INFO:root:cached data/11428889.html


https://www.bbc.com/news/business-11428889


INFO:root:cached data/45347228.html


https://www.bbc.com/news/world-europe-45347228


INFO:root:cached data/44709253.html


https://www.bbc.com/news/world-europe-44709253


INFO:root:cached data/45343328.html


https://www.bbc.com/news/world-us-canada-45343328


INFO:root:cached data/45339265.html


https://www.bbc.com/news/world-asia-india-45339265


INFO:root:cached data/45339727.html


https://www.bbc.com/news/world-australia-45339727


INFO:root:cached data/45333440.html


https://www.bbc.com/news/world-us-canada-45333440


INFO:root:cached data/44495136.html


https://www.bbc.com/news/blogs-trending-44495136


INFO:root:cached data/45334160.html


https://www.bbc.com/news/stories-45334160


In [15]:
# Counters collected so far: total pages, pages per og:type, articles per section.
crawler.stats

defaultdict(int,
            {'section:BBC Trending': 1,
             'section:Europe': 2,
             'section:India': 1,
             'total': 10,
             'type:article': 4,
             'type:website': 6})

In [16]:
# Optional long run: uncomment the loop below to visit up to 2000 more pages
# (requests are paced by the crawler's min_http_interval, so it can take a while).
# for url in islice(crawler, 0, 2000): pass
# Number of URLs still queued (duplicates included) and the counters so far.
len(crawler._yet_to_visit), crawler.stats

(182,
 defaultdict(int,
             {'section:BBC Trending': 1,
              'section:Europe': 2,
              'section:India': 1,
              'total': 10,
              'type:article': 4,
              'type:website': 6}))