Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
161 changes: 128 additions & 33 deletions src/web_crawler/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,47 +2,62 @@

from __future__ import annotations

import functools
import time
from logging import INFO, basicConfig, info
from typing import Callable

from crawler_playwright.benchmark import (
playwright_extract_lots,
playwright_extract_table,
use_playwright,
)
from crawler_selenium.benchmark import (
selenium_extract_lots,
selenium_extract_table,
use_selenium,
)

basicConfig(level=INFO)


def benchmark_crawler(
url: str,
selector: str,
callback: Callable,
def run_benchmark(
callback: Callable | functools.partial,
series: int = 10,
) -> float:
"""Benchmark crawler.
"""Run a benchmark series with a callback.

Parameters
----------
url : str
The url to fetch.
selector : str
The selector to extract data from.
callback : Callable
The callback to fetch data.
series : int
The number of tests to run.
callback : Callable | partial
The callback for the benchmark series.
series : int, optional
The number of times to run, by default 10.

Returns
-------
float
The average time for the benchmark series.
"""
times: list[float] = []
function = callback.__name__
name = (
callback.func.__name__
if isinstance(callback, functools.partial)
else callback.__name__
)

info(f"Start Benchmark ({series}) - {function}")
info(f"Start Benchmark ({series}) - {name}")

for index in range(series):
start = time.time()
callback(url=url, selector=selector)
callback()
end = time.time()
total = end - start
times.append(total)

info(f"Benchmark {index + 1} - Time {total:.2f}s")

info(f"Stop Benchmark - {function}")
info("Stop Benchmark")

average = sum(times) / series

Expand All @@ -51,28 +66,108 @@ def benchmark_crawler(
return average


if __name__ == "__main__":
from crawler_playwright.benchmark import benchmark_playwright
from crawler_selenium.benchmark import benchmark_selenium
def run_benchmarks(
    callback_playwright: Callable,
    callback_selenium: Callable,
    series: int | None = 10,
) -> None:
    """Run benchmark series with both crawler callbacks and compare them.

    Parameters
    ----------
    callback_playwright : Callable
        The Playwright callback; called with no arguments.
    callback_selenium : Callable
        The Selenium callback; called with no arguments.
    series : int | None, optional
        The number of times to run, by default 10. ``None`` is treated
        as the default so callers may forward an optional value.
    """
    # The benchmarks_* helpers forward series=None; normalise it here so
    # run_benchmark always receives a usable int.
    runs = 10 if series is None else series

    time_playwright = run_benchmark(callback=callback_playwright, series=runs)
    time_selenium = run_benchmark(callback=callback_selenium, series=runs)
    time_diff = abs(time_selenium - time_playwright)
    crawler = "Playwright" if time_playwright < time_selenium else "Selenium"

    info(f"On average, Playwright took {time_playwright:.2f}s.")
    info(f"On average, Selenium took {time_selenium:.2f}s.")
    info(f"Fastest was {crawler} by {time_diff:.2f}s.")


def benchmarks_extract_lots(
    series: int | None = None,
) -> None:
    """Run the lot-extraction benchmarks against the CPTAQ site.

    Parameters
    ----------
    series : int | None, optional
        The number of times to run, by default None (falls back to the
        run_benchmarks default of 10 runs).
    """
    lots = ["6227707", "4433198", "3920154"]
    url = "https://demeter.cptaq.gouv.qc.ca/"

    run_benchmarks(
        callback_playwright=functools.partial(
            use_playwright,
            callback=playwright_extract_lots,
            url=url,
            lots=lots,
        ),
        callback_selenium=functools.partial(
            use_selenium,
            callback=selenium_extract_lots,
            url=url,
            lots=lots,
        ),
        # run_benchmarks expects an int series; None previously leaked
        # through and crashed range()/division in run_benchmark.
        series=10 if series is None else series,
    )


def benchmarks_extract_table(
    series: int | None = None,
) -> None:
    """Run the table-extraction benchmarks against the test site.

    Parameters
    ----------
    series : int | None, optional
        The number of times to run, by default None (falls back to the
        run_benchmarks default of 10 runs).
    """
    url = "https://webscraper.io/test-sites/tables"
    selector = "table.table td"

    run_benchmarks(
        callback_playwright=functools.partial(
            use_playwright,
            callback=playwright_extract_table,
            url=url,
            selector=selector,
        ),
        callback_selenium=functools.partial(
            use_selenium,
            callback=selenium_extract_table,
            url=url,
            selector=selector,
        ),
        # run_benchmarks expects an int series; None previously leaked
        # through and crashed range()/division in run_benchmark.
        series=10 if series is None else series,
    )

def benchmarks_launch(
    series: int | None = None,
) -> None:
    """Run the browser-launch benchmarks (start-up only, no page work).

    Parameters
    ----------
    series : int | None, optional
        The number of times to run, by default None (falls back to the
        run_benchmarks default of 10 runs).
    """
    run_benchmarks(
        callback_playwright=use_playwright,
        callback_selenium=use_selenium,
        # run_benchmarks expects an int series; None previously leaked
        # through and crashed range()/division in run_benchmark.
        series=10 if series is None else series,
    )

time_diff = abs(time_selenium - time_playwright)
crawler = "Playwright" if time_playwright < time_selenium else "Selenium"

info(f"On average, Playwright took {time_playwright:.2f}s.")
info(f"On average, Selenium took {time_selenium:.2f}s.")
info(f"Fastest was {crawler} by {time_diff:.2f}s.")
# Script entry point: run the cheapest suite (browser launch only) with a
# short series so a manual run finishes quickly.
if __name__ == "__main__":
    benchmarks_launch(series=5)
66 changes: 49 additions & 17 deletions src/web_crawler/crawler_playwright/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,31 +2,63 @@

from __future__ import annotations

from playwright.sync_api import sync_playwright
from typing import Any, Callable

from playwright.sync_api import Page, sync_playwright

def benchmark_playwright(url: str, selector: str) -> list[str]:
"""Benchmark Playwright.

def playwright_extract_table(
    page: Page,
    url: str,
    selector: str,
) -> list[str]:
    """Extract the text of every table cell matched by *selector*.

    Navigates *page* to *url*, waits until the selector is present,
    then collects the text content of each matching element.
    """
    page.goto(url=url)
    page.wait_for_selector(selector=selector)
    cells = page.locator(selector=f"css={selector}")
    texts = []
    for handle in cells.element_handles():
        texts.append(handle.text_content())
    return texts


def playwright_extract_lots(
    page: Page,
    url: str,
    lots: list[str],
) -> dict[str, float]:
    """Extract lot areas using Playwright.

    Parameters
    ----------
    page : Page
        The Playwright page to drive.
    url : str
        The url of the lot-search site.
    lots : list[str]
        The lot numbers to look up.

    Returns
    -------
    dict[str, float]
        The area found for each lot number.
    """
    page.goto(url=url)
    # Dismiss the modal dialog that covers the page on load.
    page.locator(selector="button[mat-dialog-close]").click()
    search_input = page.locator(selector="#mat-input-0")

    areas = {}

    for lot in lots:
        # Reset the field, type the lot number, then trigger the search.
        search_input.fill("")
        search_input.type(text=lot)
        search_input.press(key="Enter")
        # Click the search result whose heading contains the lot number.
        page.locator(selector=f"//h4[contains(text(), '{lot}')]").click()
        area = page.locator(
            selector="//td[contains(text(), 'Superficie (pi2)')]/following-sibling::td",
        ).text_content()
        # Normalise locale formatting (space thousand separators, decimal
        # comma) into a Python float.
        areas[lot] = float(area.replace(" ", "").replace(",", "."))

    return areas


def use_playwright(
    callback: Callable | None = None,
    **kwargs: Any,
) -> None:
    """Launch a Chromium browser with Playwright and run *callback*.

    Parameters
    ----------
    callback : Callable | None, optional
        The callback to run against a fresh page; it receives the page
        as the ``page`` keyword argument plus any extra ``kwargs``.
        When None, the browser is only launched and closed (useful for
        benchmarking start-up cost alone).
    **kwargs : Any
        Extra keyword arguments forwarded to the callback.
    """
    with sync_playwright() as pw:
        browser = pw.chromium.launch()
        try:
            context = browser.new_context()
            page = context.new_page()
            if callback:
                callback(page=page, **kwargs)
        finally:
            # Close the browser even when the callback raises; previously
            # an exception skipped browser.close().
            browser.close()
88 changes: 68 additions & 20 deletions src/web_crawler/crawler_selenium/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from __future__ import annotations

import logging
from typing import TYPE_CHECKING, Any, Callable

from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
Expand All @@ -12,21 +13,75 @@
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.core.logger import set_logger

if TYPE_CHECKING:
from selenium.webdriver.chrome.webdriver import WebDriver

def benchmark_selenium(url: str, selector: str) -> list[str]:
"""Benchmark Selenium.

def selenium_extract_table(
    driver: WebDriver,
    driver_wait: WebDriverWait,
    url: str,
    selector: str,
) -> list[str]:
    """Extract the text of every table cell matched by *selector*.

    Loads *url*, waits until all matching elements are present, then
    returns their visible text.
    """
    driver.get(url=url)
    all_present = expected_conditions.presence_of_all_elements_located(
        (By.CSS_SELECTOR, selector),
    )
    driver_wait.until(all_present)
    texts = []
    for element in driver.find_elements(by=By.CSS_SELECTOR, value=selector):
        texts.append(element.text)
    return texts


def selenium_extract_lots(
    driver: WebDriver,
    driver_wait: WebDriverWait,
    url: str,
    lots: list[str],
) -> dict[str, float]:
    """Extract lot areas using Selenium.

    Parameters
    ----------
    driver : WebDriver
        The Selenium driver to drive.
    driver_wait : WebDriverWait
        The wait helper used to synchronise with the page.
    url : str
        The url of the lot-search site.
    lots : list[str]
        The lot numbers to look up.

    Returns
    -------
    dict[str, float]
        The area found for each lot number.
    """
    driver.get(url=url)
    # Dismiss the modal dialog that covers the page on load.
    driver_wait.until(
        expected_conditions.presence_of_element_located(
            (By.CSS_SELECTOR, "button[mat-dialog-close]"),
        ),
    ).click()
    search_input = driver.find_element(by=By.CSS_SELECTOR, value="#mat-input-0")

    areas = {}

    for lot in lots:
        # BUG FIX: the original called clear() AFTER send_keys(), wiping the
        # lot number it had just typed. Clear first, then type, matching the
        # Playwright counterpart.
        search_input.clear()
        search_input.send_keys(lot)
        # NOTE(review): the Playwright version also presses Enter after
        # typing — confirm whether this site needs it to start the search.
        driver_wait.until(
            expected_conditions.presence_of_element_located(
                (By.XPATH, f"//h4[contains(text(), '{lot}')]"),
            ),
        ).click()
        area = driver_wait.until(
            expected_conditions.presence_of_element_located(
                (
                    By.XPATH,
                    "//td[contains(text(), 'Superficie (pi2)')]/following-sibling::td",
                ),
            ),
        ).get_attribute("innerText")
        # Normalise locale formatting (space thousand separators, decimal
        # comma) into a Python float.
        areas[lot] = float(area.replace(" ", "").replace(",", "."))

    return areas


def use_selenium(
callback: Callable | None = None,
**kwargs: dict[str, Any],
) -> list[str]:
"""Use Selenium with a callback.

Parameters
----------
url : str
The url to fetch.
selector : str
The selector to extract data from.

Returns
-------
list[str]
The extracted data.
callback : Callable
The callback to use with the browser context.
"""
logger = logging.getLogger("selenium")
logger.setLevel(level=logging.CRITICAL)
Expand All @@ -36,13 +91,6 @@ def benchmark_selenium(url: str, selector: str) -> list[str]:
service = ChromeService(ChromeDriverManager().install())
driver = webdriver.Chrome(options=options, service=service)
driver_wait = WebDriverWait(driver=driver, timeout=10)
driver.get(url=url)
driver_wait.until(
expected_conditions.presence_of_all_elements_located(
(By.CSS_SELECTOR, selector),
),
)
elements = driver.find_elements(by=By.CSS_SELECTOR, value=selector)
data = [element.text for element in elements]
if callback:
callback(driver=driver, driver_wait=driver_wait, **kwargs)
driver.quit()
return data