Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"python.analysis.extraPaths": ["./src/web_crawler"]
}
410 changes: 409 additions & 1 deletion poetry.lock

Large diffs are not rendered by default.

10 changes: 5 additions & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -103,32 +103,32 @@ comments_min_spaces_from_content = 1
preserve_quotes = true
whitelines = 1


[tool.poetry.dependencies]
python = "^3.11.1"
python-dotenv = "^1.0.0"


[tool.poetry.group.dev.dependencies]
commitizen = "^3.7.0"
mypy = "^1.5.1"
yamlfix = "^1.14.0"


[tool.poetry.group.lint.dependencies]
ruff = "^0.0.286"
black = "^23.7.0"
pymarkdownlnt = "^0.9.13.3"
yamllint = "^1.32.0"
djlint = "^1.32.1"


[tool.poetry.group.test.dependencies]
pytest = "^7.4.0"
pytest-mock = "^3.11.1"
pytest-cov = "^4.1.0"


[tool.poetry.group.docs.dependencies]
mkdocs = { extras = ["i18n"], version = "^1.5.2" }
mkdocstrings = { extras = ["python"], version = "^0.22.0" }

[tool.poetry.group.webdriver.dependencies]
selenium = "^4.15.2"
playwright = "^1.39.0"
webdriver-manager = "^4.0.1"
1 change: 1 addition & 0 deletions src/web_crawler/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Package Web Crawler."""
78 changes: 78 additions & 0 deletions src/web_crawler/benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
"""Module Webdriver Benchmark."""

from __future__ import annotations

import time
from logging import INFO, basicConfig, info
from typing import Callable

basicConfig(level=INFO)


def benchmark_crawler(
    url: str,
    selector: str,
    callback: Callable,
    series: int = 10,
) -> float:
    """Benchmark crawler.

    Runs ``callback(url=url, selector=selector)`` *series* times, logging
    each run's duration, and returns the average wall-clock time per run.

    Parameters
    ----------
    url : str
        The url to fetch.
    selector : str
        The selector to extract data from.
    callback : Callable
        The callback to fetch data; must accept ``url`` and ``selector``
        keyword arguments.
    series : int
        The number of tests to run. Must be at least 1.

    Returns
    -------
    float
        The average duration of one callback run, in seconds.

    Raises
    ------
    ValueError
        If ``series`` is less than 1 (would otherwise divide by zero).
    """
    if series < 1:
        raise ValueError(f"series must be >= 1, got {series}")

    times: list[float] = []
    function = callback.__name__

    info(f"Start Benchmark ({series}) - {function}")

    for index in range(series):
        # perf_counter is monotonic and high-resolution; time.time can jump
        # backwards on system clock adjustments and skew the measurement.
        start = time.perf_counter()
        callback(url=url, selector=selector)
        total = time.perf_counter() - start
        times.append(total)

        info(f"Benchmark {index + 1} - Time {total:.2f}s")

    info(f"Stop Benchmark - {function}")

    average = sum(times) / series

    info(f"Benchmark average {average:.2f}s\n")

    return average


if __name__ == "__main__":
    from crawler_playwright.benchmark import benchmark_playwright
    from crawler_selenium.benchmark import benchmark_selenium

    url = "https://webscraper.io/test-sites/tables"
    selector = "table.table td"

    # Run the same benchmark harness against each crawler implementation.
    time_playwright = benchmark_crawler(
        callback=benchmark_playwright,
        selector=selector,
        url=url,
    )
    time_selenium = benchmark_crawler(
        callback=benchmark_selenium,
        selector=selector,
        url=url,
    )

    # Report which crawler won, and by what margin.
    if time_playwright < time_selenium:
        crawler = "Playwright"
    else:
        crawler = "Selenium"
    time_diff = abs(time_selenium - time_playwright)

    info(f"On average, Playwright took {time_playwright:.2f}s.")
    info(f"On average, Selenium took {time_selenium:.2f}s.")
    info(f"Fastest was {crawler} by {time_diff:.2f}s.")
1 change: 1 addition & 0 deletions src/web_crawler/crawler_playwright/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Package Crawler Playwright."""
32 changes: 32 additions & 0 deletions src/web_crawler/crawler_playwright/benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""Module Benchmark Playwright."""

from __future__ import annotations

from playwright.sync_api import sync_playwright


def benchmark_playwright(url: str, selector: str) -> list[str]:
    """Benchmark Playwright.

    Launches a headless Chromium browser, navigates to *url*, waits for
    *selector* to appear, and collects the text of every matching element.

    Parameters
    ----------
    url : str
        The url to fetch.
    selector : str
        The selector to extract data from.

    Returns
    -------
    list[str]
        The extracted data.
    """
    with sync_playwright() as playwright:
        chromium = playwright.chromium.launch()
        page = chromium.new_context().new_page()
        page.goto(url=url)
        # Block until at least one matching element is rendered.
        page.wait_for_selector(selector=selector)
        handles = page.locator(selector=f"css={selector}").element_handles()
        extracted = [handle.text_content() for handle in handles]
        chromium.close()
    return extracted
1 change: 1 addition & 0 deletions src/web_crawler/crawler_selenium/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Package Crawler Selenium."""
48 changes: 48 additions & 0 deletions src/web_crawler/crawler_selenium/benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
"""Module Benchmark Selenium."""

from __future__ import annotations

import logging

from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.core.logger import set_logger


def benchmark_selenium(url: str, selector: str) -> list[str]:
    """Benchmark Selenium.

    Fetches *url* with a headless Chrome driver, waits for all elements
    matching *selector* to be present, and returns their text contents.

    Parameters
    ----------
    url : str
        The url to fetch.
    selector : str
        The selector to extract data from.

    Returns
    -------
    list[str]
        The extracted data.
    """
    # Silence selenium's logger (also handed to webdriver-manager) so the
    # benchmark output stays readable.
    logger = logging.getLogger("selenium")
    logger.setLevel(level=logging.CRITICAL)
    set_logger(logger=logger)
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    service = ChromeService(ChromeDriverManager().install())
    driver = webdriver.Chrome(options=options, service=service)
    try:
        driver_wait = WebDriverWait(driver=driver, timeout=10)
        driver.get(url=url)
        driver_wait.until(
            expected_conditions.presence_of_all_elements_located(
                (By.CSS_SELECTOR, selector),
            ),
        )
        elements = driver.find_elements(by=By.CSS_SELECTOR, value=selector)
        return [element.text for element in elements]
    finally:
        # Always release the browser process, even when navigation or the
        # explicit wait raises (e.g. TimeoutException) — otherwise each
        # failed run leaks a headless Chrome instance.
        driver.quit()