Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,14 @@ All notable changes to this project will be documented in this file.
<!-- git-cliff-unreleased-start -->
## 0.6.12 - **not yet released**

### 🚀 Features

- Add `retire_browser_after_page_count` parameter for `BrowserPool` ([#1266](https://github.com/apify/crawlee-python/pull/1266)) ([603aa2b](https://github.com/apify/crawlee-python/commit/603aa2b192ef4bc42d88244bd009fffdb0614c06)) by [@Mantisus](https://github.com/Mantisus)

### 🐛 Bug Fixes

- Use `perf_counter_ns` for request duration tracking ([#1260](https://github.com/apify/crawlee-python/pull/1260)) ([9e92f6b](https://github.com/apify/crawlee-python/commit/9e92f6b54400ce5004fbab770e2e4ac42f73148f)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1256](https://github.com/apify/crawlee-python/issues/1256)


<!-- git-cliff-unreleased-end -->
## [0.6.11](https://github.com/apify/crawlee-python/releases/tag/v0.6.11) (2025-06-23)
Expand Down
5 changes: 5 additions & 0 deletions src/crawlee/browsers/_browser_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,11 @@ class BrowserController(ABC):
def pages(self) -> list[Page]:
"""Return the list of opened pages."""

@property
@abstractmethod
def total_opened_pages(self) -> int:
    """Return how many pages have been opened in total since the browser was launched."""

@property
@abstractmethod
def pages_count(self) -> int:
Expand Down
16 changes: 15 additions & 1 deletion src/crawlee/browsers/_browser_pool.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ def __init__(
browser_inactive_threshold: timedelta = timedelta(seconds=10),
identify_inactive_browsers_interval: timedelta = timedelta(seconds=20),
close_inactive_browsers_interval: timedelta = timedelta(seconds=30),
retire_browser_after_page_count: int = 100,
) -> None:
"""Initialize a new instance.

Expand All @@ -67,7 +68,10 @@ def __init__(
as retired.
close_inactive_browsers_interval: The interval at which the pool checks for inactive browsers
and closes them. The browser is considered as inactive if it has no active pages and has been idle
for the specified period.
for the specified period. The browser is considered as retired if it has no active pages and has total
pages count greater than or equal to `retire_browser_after_page_count`.
retire_browser_after_page_count: The maximum number of processed pages after which the browser is considered
as retired.
"""
self._plugins = plugins or [PlaywrightBrowserPlugin()]
self._operation_timeout = operation_timeout
Expand All @@ -91,6 +95,7 @@ def __init__(
)

self._total_pages_count = 0
self._retire_browser_after_page_count = retire_browser_after_page_count
self._pages = WeakValueDictionary[str, CrawleePage]() # Track the pages in the pool
self._plugins_cycle = itertools.cycle(self._plugins) # Cycle through the plugins

Expand Down Expand Up @@ -305,6 +310,9 @@ async def _get_new_page(
except RuntimeError as exc:
raise RuntimeError('Browser pool is not initialized.') from exc

if browser_controller.total_opened_pages >= self._retire_browser_after_page_count:
self._retire_browser(browser_controller)

crawlee_page = CrawleePage(id=page_id, page=page, browser_type=plugin.browser_type)
self._pages[page_id] = crawlee_page
self._total_pages_count += 1
Expand All @@ -321,6 +329,12 @@ def _pick_browser_with_free_capacity(

return None

def _retire_browser(self, browser: BrowserController) -> None:
    """Retire a browser by moving it from the active list to the inactive list.

    Nothing happens when the browser is not in the active list.

    Args:
        browser: The browser controller to retire.
    """
    if browser not in self._active_browsers:
        return

    self._active_browsers.remove(browser)
    self._inactive_browsers.append(browser)

async def _launch_new_browser(self, plugin: BrowserPlugin) -> BrowserController:
"""Launch a new browser instance using the specified plugin."""
browser = await plugin.new_browser()
Expand Down
9 changes: 9 additions & 0 deletions src/crawlee/browsers/_playwright_browser_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,11 +74,18 @@ def __init__(
self._pages = list[Page]()
self._last_page_opened_at = datetime.now(timezone.utc)

self._total_opened_pages = 0

@property
@override
def pages(self) -> list[Page]:
    """Return the list of pages stored on this controller."""
    return self._pages

@property
@override
def total_opened_pages(self) -> int:
    """Return the value of the controller's opened-pages counter."""
    return self._total_opened_pages

@property
@override
def pages_count(self) -> int:
Expand Down Expand Up @@ -160,6 +167,8 @@ async def new_page(
self._pages.append(page)
self._last_page_opened_at = datetime.now(timezone.utc)

self._total_opened_pages += 1

return page

@override
Expand Down
9 changes: 5 additions & 4 deletions src/crawlee/statistics/_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from __future__ import annotations

import math
import time
from datetime import datetime, timedelta, timezone
from logging import Logger, getLogger
from typing import TYPE_CHECKING, Generic, Literal
Expand All @@ -27,22 +28,22 @@ class RequestProcessingRecord:
"""Tracks information about the processing of a request."""

def __init__(self) -> None:
    """Create a record for a request that has not been started yet."""
    # Reading of `time.perf_counter_ns()` taken by the latest `run()`; `None` until the first run.
    self._last_run_at_ns: int | None = None
    # Number of times `run()` has been called.
    self._runs = 0
    # Duration measured by the latest `finish()`; `None` until the first finish.
    self.duration: timedelta | None = None

def run(self) -> int:
    """Mark the job as started.

    Returns:
        The number of times the job has been started so far, including this call.
    """
    self._last_run_at_ns = time.perf_counter_ns()
    self._runs += 1
    return self._runs

def finish(self) -> timedelta:
    """Mark the job as finished.

    Returns:
        The time elapsed since the latest `run()` call, rounded up to whole microseconds.

    Raises:
        RuntimeError: If `run()` has not been called before.
    """
    if self._last_run_at_ns is None:
        raise RuntimeError('Invalid state')

    elapsed_ns = time.perf_counter_ns() - self._last_run_at_ns
    # `timedelta` stores whole microseconds; rounding up keeps a sub-microsecond measurement from becoming zero.
    self.duration = timedelta(microseconds=math.ceil(elapsed_ns / 1000))
    return self.duration

@property
Expand Down
17 changes: 17 additions & 0 deletions tests/unit/_statistics/test_request_processing_record.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from datetime import timedelta

from crawlee.statistics._statistics import RequestProcessingRecord


def test_tracking_time_resolution() -> None:
    """Check that `RequestProcessingRecord` measures durations with sufficient resolution.

    A record that is started and finished immediately must still report a non-zero duration. Linux is generally
    unaffected, but on Windows some packages in older Python versions might rely on system timers with coarse
    resolution (some sources estimate about 15 ms). With such a time source, two successive measurements could use
    the same timing sample, and this test would start failing.
    """
    processing_record = RequestProcessingRecord()
    processing_record.run()
    processing_record.finish()

    # The first assert rejects both a missing (`None`) and a zero duration; the second rejects a negative one.
    assert processing_record.duration
    assert processing_record.duration > timedelta(seconds=0)
26 changes: 26 additions & 0 deletions tests/unit/browsers/test_browser_pool.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,3 +160,29 @@ async def test_with_plugin_contains_page_options(server_url: URL) -> None:
await test_page.page.goto(str(server_url / 'user-agent'))
assert 'My Best User-Agent' in await test_page.page.content()
await test_page.page.close()


@pytest.mark.parametrize(
    ('retire_after_page_count', 'expect_equal_browsers'),
    [
        pytest.param(2, True, id='Two pages opened in the same browser'),
        pytest.param(1, False, id='Each page opened in a new browser.'),
    ],
)
async def test_browser_pool_retire_browser_after_page_count(
    retire_after_page_count: int, *, expect_equal_browsers: bool
) -> None:
    """Open two pages one after another and compare the browser contexts they were created in.

    Whether both pages are expected to share a context is given by `expect_equal_browsers` for the
    `retire_browser_after_page_count` limit under test.
    """
    async with BrowserPool(retire_browser_after_page_count=retire_after_page_count) as browser_pool:
        # The same variable is deliberately reused for both pages so the first page wrapper is released
        # before the second page is requested.
        crawlee_page = await browser_pool.new_page()
        first_browser = crawlee_page.page.context
        await crawlee_page.page.close()

        crawlee_page = await browser_pool.new_page()
        second_browser = crawlee_page.page.context
        await crawlee_page.page.close()

        same_browser = first_browser is second_browser
        assert same_browser is expect_equal_browsers