diff --git a/src/web_crawler/benchmark.py b/src/web_crawler/benchmark.py index 22591f0..de1fc9a 100644 --- a/src/web_crawler/benchmark.py +++ b/src/web_crawler/benchmark.py @@ -2,47 +2,62 @@ from __future__ import annotations +import functools import time from logging import INFO, basicConfig, info from typing import Callable +from crawler_playwright.benchmark import ( + playwright_extract_lots, + playwright_extract_table, + use_playwright, +) +from crawler_selenium.benchmark import ( + selenium_extract_lots, + selenium_extract_table, + use_selenium, +) + basicConfig(level=INFO) -def benchmark_crawler( - url: str, - selector: str, - callback: Callable, +def run_benchmark( + callback: Callable | functools.partial, series: int = 10, ) -> float: - """Benchmark crawler. + """Run a benchmark series with a callback. Parameters ---------- - url : str - The url to fetch. - selector : str - The selector to extract data from. - callback : Callable - The callback to fetch data. - series : int - The number of tests to run. + callback : Callable | partial + The callback for the benchmark series. + series : int, optional + The number of times to run, by default 10. + + Returns + ------- + float + The average time for the benchmark series.
""" times: list[float] = [] - function = callback.__name__ + name = ( + callback.func.__name__ + if isinstance(callback, functools.partial) + else callback.__name__ + ) - info(f"Start Benchmark ({series}) - {function}") + info(f"Start Benchmark ({series}) - {name}") for index in range(series): start = time.time() - callback(url=url, selector=selector) + callback() end = time.time() total = end - start times.append(total) info(f"Benchmark {index + 1} - Time {total:.2f}s") - info(f"Stop Benchmark - {function}") + info("Stop Benchmark") average = sum(times) / series @@ -51,28 +66,109 @@ def benchmark_crawler( return average -if __name__ == "__main__": - from crawler_playwright.benchmark import benchmark_playwright - from crawler_selenium.benchmark import benchmark_selenium +def run_benchmarks( + callback_playwright: Callable, + callback_selenium: Callable, + series: int | None = 10, +) -> None: + """Run benchmarks series with callbacks. + + Parameters + ---------- + callback_playwright : Callable + The Playwright callback. + callback_selenium : Callable + The Selenium callback. + series : int | None, optional + The number of times to run, by default 10; None also means 10. + """ + series = 10 if series is None else series + time_playwright = run_benchmark(callback=callback_playwright, series=series) + time_selenium = run_benchmark(callback=callback_selenium, series=series) + time_diff = abs(time_selenium - time_playwright) + crawler = "Playwright" if time_playwright < time_selenium else "Selenium" + info(f"On average, Playwright took {time_playwright:.2f}s.") + info(f"On average, Selenium took {time_selenium:.2f}s.") + info(f"Fastest was {crawler} by {time_diff:.2f}s.") + + +def benchmarks_extract_lots( + series: int | None = None, +) -> None: + """Run extract lots benchmarks.
+ + Parameters + ---------- + series : int | None, optional + The number of times to run, by default None + """ + lots = ["6227707", "4433198", "3920154"] + url = "https://demeter.cptaq.gouv.qc.ca/" + + run_benchmarks( + callback_playwright=functools.partial( + use_playwright, + callback=playwright_extract_lots, + url=url, + lots=lots, + ), + callback_selenium=functools.partial( + use_selenium, + callback=selenium_extract_lots, + url=url, + lots=lots, + ), + series=series, + ) + + +def benchmarks_extract_table( + series: int | None = None, +) -> None: + """Run extract table benchmarks. + + Parameters + ---------- + series : int | None, optional + The number of times to run, by default None + """ url = "https://webscraper.io/test-sites/tables" selector = "table.table td" - time_playwright = benchmark_crawler( - callback=benchmark_playwright, - selector=selector, - url=url, + run_benchmarks( + callback_playwright=functools.partial( + use_playwright, + callback=playwright_extract_table, + url=url, + selector=selector, + ), + callback_selenium=functools.partial( + use_selenium, + callback=selenium_extract_table, + url=url, + selector=selector, + ), + series=series, ) - time_selenium = benchmark_crawler( - callback=benchmark_selenium, - selector=selector, - url=url, + +def benchmarks_launch( + series: int | None = None, +) -> None: + """Run launch benchmarks. 
+ + Parameters + ---------- + series : int | None, optional + The number of times to run, by default None + """ + run_benchmarks( + callback_playwright=use_playwright, + callback_selenium=use_selenium, + series=series, ) - time_diff = abs(time_selenium - time_playwright) - crawler = "Playwright" if time_playwright < time_selenium else "Selenium" - info(f"On average, Playwright took {time_playwright:.2f}s.") - info(f"On average, Selenium took {time_selenium:.2f}s.") - info(f"Fastest was {crawler} by {time_diff:.2f}s.") +if __name__ == "__main__": + benchmarks_launch(series=5) diff --git a/src/web_crawler/crawler_playwright/benchmark.py b/src/web_crawler/crawler_playwright/benchmark.py index e6c6d1d..5bf7e4b 100644 --- a/src/web_crawler/crawler_playwright/benchmark.py +++ b/src/web_crawler/crawler_playwright/benchmark.py @@ -2,31 +2,63 @@ from __future__ import annotations -from playwright.sync_api import sync_playwright +from typing import Any, Callable +from playwright.sync_api import Page, sync_playwright -def benchmark_playwright(url: str, selector: str) -> list[str]: - """Benchmark Playwright. 
+ +def playwright_extract_table( + page: Page, + url: str, + selector: str, +) -> list[str]: + """Extract table using Playwright.""" + page.goto(url=url) + page.wait_for_selector(selector=selector) + elements = page.locator(selector=f"css={selector}") + return [element.text_content() for element in elements.element_handles()] + + +def playwright_extract_lots( + page: Page, + url: str, + lots: list[str], +) -> dict[str, float]: + """Extract lots using Playwright.""" + page.goto(url=url) + page.locator(selector="button[mat-dialog-close]").click() + search_input = page.locator(selector="#mat-input-0") + + areas = {} + + for lot in lots: + search_input.fill("") + search_input.type(text=lot) + search_input.press(key="Enter") + page.locator(selector=f"//h4[contains(text(), '{lot}')]").click() + area = page.locator( + selector="//td[contains(text(), 'Superficie (pi2)')]/following-sibling::td", + ).text_content() + areas[lot] = float(area.replace(" ", "").replace(",", ".")) + + return areas + + +def use_playwright( + callback: Callable | None = None, + **kwargs: dict[str, Any], +) -> None: + """Use Playwright with a callback. Parameters ---------- - url : str - The url to fetch. - selector : str - The selector to extract data from. - - Returns - ------- - list[str] - The extracted data. + callback : Callable + The callback to use with the browser context.
""" with sync_playwright() as pw: browser = pw.chromium.launch() context = browser.new_context() page = context.new_page() - page.goto(url=url) - page.wait_for_selector(selector=selector) - elements = page.locator(selector=f"css={selector}") - data = [element.text_content() for element in elements.element_handles()] + if callback: + callback(page=page, **kwargs) browser.close() - return data diff --git a/src/web_crawler/crawler_selenium/benchmark.py b/src/web_crawler/crawler_selenium/benchmark.py index 17d5d7c..3009ed1 100644 --- a/src/web_crawler/crawler_selenium/benchmark.py +++ b/src/web_crawler/crawler_selenium/benchmark.py @@ -3,6 +3,7 @@ from __future__ import annotations import logging +from typing import TYPE_CHECKING, Any, Callable from selenium import webdriver from selenium.webdriver.chrome.service import Service as ChromeService @@ -12,21 +13,75 @@ from webdriver_manager.chrome import ChromeDriverManager from webdriver_manager.core.logger import set_logger +if TYPE_CHECKING: + from selenium.webdriver.chrome.webdriver import WebDriver -def benchmark_selenium(url: str, selector: str) -> list[str]: - """Benchmark Selenium. 
+ +def selenium_extract_table( + driver: WebDriver, + driver_wait: WebDriverWait, + url: str, + selector: str, +) -> list[str]: + """Extract table using Selenium.""" + driver.get(url=url) + driver_wait.until( + expected_conditions.presence_of_all_elements_located( + (By.CSS_SELECTOR, selector), + ), + ) + elements = driver.find_elements(by=By.CSS_SELECTOR, value=selector) + return [element.text for element in elements] + + +def selenium_extract_lots( + driver: WebDriver, + driver_wait: WebDriverWait, + url: str, + lots: list[str], +) -> dict[str, float]: + """Extract lots using Selenium.""" + driver.get(url=url) + driver_wait.until( + expected_conditions.presence_of_element_located( + (By.CSS_SELECTOR, "button[mat-dialog-close]"), + ), + ).click() + search_input = driver.find_element(by=By.CSS_SELECTOR, value="#mat-input-0") + + areas = {} + + for lot in lots: + search_input.clear() + search_input.send_keys(lot) + driver_wait.until( + expected_conditions.presence_of_element_located( + (By.XPATH, f"//h4[contains(text(), '{lot}')]"), + ), + ).click() + area = driver_wait.until( + expected_conditions.presence_of_element_located( + ( + By.XPATH, + "//td[contains(text(), 'Superficie (pi2)')]/following-sibling::td", + ), + ), + ).get_attribute("innerText") + areas[lot] = float(area.replace(" ", "").replace(",", ".")) + + return areas + + +def use_selenium( + callback: Callable | None = None, + **kwargs: dict[str, Any], +) -> None: + """Use Selenium with a callback. Parameters ---------- - url : str - The url to fetch. - selector : str - The selector to extract data from. - - Returns - ------- - list[str] - The extracted data. + callback : Callable + The callback to use with the browser context.
""" logger = logging.getLogger("selenium") logger.setLevel(level=logging.CRITICAL) @@ -36,13 +91,6 @@ def benchmark_selenium(url: str, selector: str) -> list[str]: service = ChromeService(ChromeDriverManager().install()) driver = webdriver.Chrome(options=options, service=service) driver_wait = WebDriverWait(driver=driver, timeout=10) - driver.get(url=url) - driver_wait.until( - expected_conditions.presence_of_all_elements_located( - (By.CSS_SELECTOR, selector), - ), - ) - elements = driver.find_elements(by=By.CSS_SELECTOR, value=selector) - data = [element.text for element in elements] + if callback: + callback(driver=driver, driver_wait=driver_wait, **kwargs) driver.quit() - return data