In [None]:
import bs4
import requests
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service

import time
import json
import re
import os

In [None]:
# Seconds to wait for an image download before giving up, so one stalled
# connection cannot hang the whole scrape.
_IMAGE_REQUEST_TIMEOUT_S = 30

# For each labelled paragraph in the answer explanation: a pattern that detects
# the label at the start of the paragraph, and one that strips the label.
_LABEL_PATTERNS = {
    label: (
        re.compile(rf"^\s*{label}\s*:", re.IGNORECASE),
        re.compile(rf"^\s*{label}\s*:\s*(.*)", re.MULTILINE | re.IGNORECASE),
    )
    for label in ("references", "category", "explanation")
}


def _save_image(img_url, dest_dir):
    """Download ``img_url`` into ``dest_dir`` and return the bare file name."""
    img_name = img_url.split("/")[-1]
    # Fetch before opening the file so a failed request leaves no empty file.
    response = requests.get(img_url, timeout=_IMAGE_REQUEST_TIMEOUT_S)
    with open(os.path.join(dest_dir, img_name), "wb") as img_file:
        img_file.write(response.content)
    return img_name


def parseProblem(url:str, driver, dest_dir)->dict:
    """Load one question page and extract its content into a dict.

    The page is opened in ``driver``, the first answer input is clicked so the
    page reveals the correct answer and explanation, and the rendered HTML is
    parsed with BeautifulSoup. Images referenced by the question and by the
    answer options are downloaded into ``dest_dir``.

    Args:
        url: Address of the question page; surrounding whitespace is ignored.
        driver: Selenium WebDriver used to render the page. It is owned by the
            caller and is never closed here.
        dest_dir: Existing directory that receives the downloaded images.

    Returns:
        A dict with the keys ``entry_title``, ``question_img_paths``,
        ``correct_ans`` and ``options``, plus ``references``, ``category`` and
        ``explanation`` when the page provides them. If parsing fails, the
        diagnostics are printed and whatever was collected so far is returned
        (possibly an empty dict).
    """
    problem = dict()
    # Initialised up front so the failure diagnostics below can always print
    # them, even when parsing breaks on the very first statement.
    entry_title = None
    correct_ans = []
    question_img_paths = []
    options = []

    driver.get(url.strip())

    driver.execute_script('document.querySelector("div.pd-question-content input").click()')
    # Give the page time to render the answer feedback after the click.
    time.sleep(2)
    page_source = driver.page_source
    try:
        soup = bs4.BeautifulSoup(page_source, "lxml")
        correct_ans = [i["value"] for i in soup.select("div.correct.pd-answer-text-finish input")]
        entry_title = soup.select("h1.entry-title")[0].text

        for question_img in soup.select("div.question-content-wrapper img"):
            question_img_paths.append(_save_image(question_img["src"], dest_dir))

        for answer_input in soup.select("div.pd-question-content input"):
            option = answer_input["value"]
            # An option value may itself be HTML containing <img> tags; such an
            # option is recorded as "@<file name>" (the last image wins).
            for option_img in bs4.BeautifulSoup(option, "lxml").select("img"):
                img_name = _save_image(option_img["src"], dest_dir)
                option = f"@{img_name}"
            options.append(option)

        for el in soup.select("div.pd-answ-explanation p"):
            for label, (label_at_start, label_prefix) in _LABEL_PATTERNS.items():
                if label_at_start.match(el.text) is not None:
                    problem[label] = label_prefix.sub(r"\1", el.text)
                    break

        problem["entry_title"] = entry_title
        problem["question_img_paths"] = question_img_paths
        problem["correct_ans"] = correct_ans
        problem["options"] = options
    except Exception as exc:
        # Best effort: report what was gathered and return the partial result.
        # The driver is left open because the caller keeps using it.
        print(f"failed to parse {url.strip()!r}: {exc!r}")
        print("entry_title: ", entry_title)
        print("question_img_paths: ", question_img_paths)
        print("correct_ans: ", correct_ans)
        print("options: ", options)
        print("explanation: ", problem.get("explanation"))
        print("references: ", problem.get("references"))
        print("category: ", problem.get("category"))
    return problem

## Scrape all question URLs

In [None]:
# The site splits its question sitemap into nine numbered XML files (1 to 9).
question_bank_urls = [
    f"https://theorytest.org.uk/question-sitemap{sitemap_no}.xml"
    for sitemap_no in range(1, 10)
]

# Matches text whose last extension is .jpg, .png or .gif (case-sensitive).
_IMG_URL_PATTERN = re.compile(r".*\.(jpg|png|gif)$")


def is_img(text):
    """Return True when ``text`` ends in a .jpg, .png or .gif extension."""
    return _IMG_URL_PATTERN.match(text) is not None

# Collect every question URL listed in the sitemaps first, and write the file
# only once all sitemaps were fetched, so that a network failure cannot leave
# an empty or truncated question_urls.txt behind.
question_urls = []
for question_bank_url in question_bank_urls:
    response = requests.get(question_bank_url, timeout=30)
    # An error page would parse to zero <loc> tags and silently drop URLs.
    response.raise_for_status()
    sitemap_soup = bs4.BeautifulSoup(response.text, "xml")
    for loc in sitemap_soup.find_all("loc"):
        question_url = loc.text.strip()
        # <loc> entries that point at image files are not question pages.
        if is_img(question_url):
            continue
        question_urls.append(question_url)

with open("question_urls.txt", "w", encoding="utf-8") as urls_file:
    urls_file.write("\n".join(question_urls))

## Scrape each question URL and save every question to its own JSON file

In [None]:
dest_dir = "./data"
os.makedirs(dest_dir, exist_ok=True)

with open("question_urls.txt", "r", encoding="utf-8") as urls_file:
    # Strip line endings and drop blank lines so every entry is a clean URL.
    urls_to_scrape = [line.strip() for line in urls_file if line.strip()]

# To run without a visible browser window, build the driver with options:
# driver_options = Options()
# driver_options.add_argument("--headless")
driver = webdriver.Firefox()
try:
    for question_url in urls_to_scrape:
        print(question_url)
        problem = parseProblem(question_url, driver, dest_dir)
        # Pause between pages to avoid hammering the site.
        time.sleep(2)
        print(problem)
        # The last path segment of the URL is used as the file name.
        slug = question_url.rstrip("/").split("/")[-1]
        filename = os.path.join(dest_dir, f"{slug}.json")
        print(filename)
        with open(filename, "w", encoding="utf-8") as problem_file:
            json.dump(problem, problem_file)
finally:
    # Always end the browser session, even if a page fails mid-run.
    driver.quit()

## Assemble all questions into one single JSON file

In [98]:
output_json = "data.json"
data = []
# Sorted so the aggregated file has the same order on every run.
for fname in sorted(os.listdir(dest_dir)):
    # Only per-question JSON files are aggregated. A previously aggregated
    # output file that was moved into dest_dir must not be nested into itself.
    if not fname.endswith(".json") or fname == output_json:
        continue
    fpath = os.path.join(dest_dir, fname)
    try:
        with open(fpath, "r", encoding="utf-8") as problem_file:
            data.append(json.load(problem_file))
    except ValueError as exc:
        # Covers malformed JSON and undecodable bytes; report and keep going.
        print(fpath, exc)
with open(output_json, "w", encoding="utf-8") as out_file:
    json.dump(data, out_file)

## Remove individual JSON files (make sure all data have been obtained before doing this)

In [99]:
# Refuse to delete anything unless the aggregated file is ready to be moved in.
# Otherwise running this cell twice would wipe the data and then fail.
if not os.path.isfile(output_json):
    raise FileNotFoundError(
        f"{output_json} not found; run the aggregation cell before cleaning up"
    )

for fname in os.listdir(dest_dir):
    # Remove the per-question files but keep any earlier aggregated output
    # until the new one has replaced it.
    if fname.endswith(".json") and fname != output_json:
        os.unlink(os.path.join(dest_dir, fname))

# os.replace overwrites an existing target on every platform.
os.replace(output_json, os.path.join(dest_dir, output_json))