# The Guardian weekly quiz by Thomas Eaton

In [143]:
import datetime as dt
import itertools
import json
import pathlib

import bs4
import requests

# Listing pages of the quiz series; %d is replaced by the page number
# (the scrape loop increments page_no before its first request, so pages start at 1).
url = 'https://www.theguardian.com/theguardian/series/the-quiz-thomas-eaton?page=%d'
# Quizzes saved by earlier runs: ISO date string -> article URL.
# read_text() opens and closes the file itself; passing a bare open() result
# to json.load left the file handle unclosed.
date_urls = json.loads(pathlib.Path('date_urls.json').read_text())
# Loop state for the scrape below: set to True once an already-saved quiz is reached.
done = False
page_no = 0

# Walk the series listing page by page (newest quiz first) and save every quiz
# that is newer than the newest one already recorded in date_urls.
#
# The cutoff is computed ONCE, before the loop. date_urls gains an entry for
# each quiz saved below, so re-evaluating max() over it on every iteration
# would make the second new quiz compare as "already known" and end the run
# after a single quiz. default= also covers an empty date_urls, where a bare
# max() would raise ValueError.
latest_known = max(
    (dt.date.fromisoformat(d) for d in date_urls),
    default=dt.date.min,
)

while not done:
    page_no += 1

    response = requests.get(url % page_no, timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error page.
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    week_sections = soup.find_all('section', class_='fc-container')

    if not week_sections:
        # A page with no quiz sections means we ran past the end of the
        # series; without this check the outer loop would never terminate.
        break

    for ws in week_sections:
        # The section's data-id attribute carries the publication date, e.g. "6 March 2021".
        ws_date = dt.datetime.strptime(ws.get('data-id'), '%d %B %Y').date()

        if ws_date <= latest_known:
            # Reached a quiz saved by a previous run: everything from here on is older.
            done = True
            break

        # First link in the section that points at a lifeandstyle article.
        # href=True skips <a> tags without an href, which would otherwise
        # raise AttributeError on None.startswith.
        ws_link = next(
            link['href']
            for link in ws.find_all('a', href=True)
            if link['href'].startswith('https://www.theguardian.com/lifeandstyle')
        )

        ws_response = requests.get(ws_link, timeout=30)
        ws_response.raise_for_status()
        ws_soup = bs4.BeautifulSoup(ws_response.text, 'html.parser')

        # Questions: the bare text nodes of the <p> following the
        # "the-questions" heading, up to (not including) the "What links:" part.
        ws_questions_h2 = ws_soup.find('h2', id='the-questions')
        ws_questions_p = ws_questions_h2.find_next_sibling('p')
        ws_questions = [
            tag.text.strip()
            for tag in itertools.takewhile(lambda x: 'What links:' not in x, ws_questions_p.children)
            if isinstance(tag, bs4.element.NavigableString)
        ]

        # Answers: the bare text nodes of the <p> following the "the-answers"
        # heading, with any trailing full stop removed.
        ws_answers_h2 = ws_soup.find('h2', id='the-answers')
        ws_answers_p = ws_answers_h2.find_next_sibling('p')
        ws_answers = [
            tag.text.strip().rstrip('.')
            for tag in ws_answers_p.children
            if isinstance(tag, bs4.element.NavigableString)
        ]
        # zip() stops at the shorter list; the questions were cut at
        # "What links:", so any further answers are dropped here on purpose.
        ws_question_answers = dict(zip(ws_questions, ws_answers))

        # One file per quiz at YYYY/MM/DD.json, relative to the working directory.
        output = pathlib.Path(f"{ws_date.strftime('%Y/%m/%d')}.json")
        output.parent.mkdir(parents=True, exist_ok=True)
        output.write_text(json.dumps(ws_question_answers, indent=4))

        # Persist the index after every quiz so an interrupted run keeps what it saved.
        date_urls[ws_date.isoformat()] = ws_link
        pathlib.Path('date_urls.json').write_text(json.dumps(date_urls, indent=4, sort_keys=True))