Skip to content

Commit

Permalink
Refactor LogStats extension to log IPM and RPM to stats on spider_close (scrapy#4111)
Browse files Browse the repository at this point in the history
  • Loading branch information
BurnzZ committed Feb 16, 2024
1 parent c5fdba9 commit 9bb973d
Show file tree
Hide file tree
Showing 2 changed files with 96 additions and 10 deletions.
44 changes: 34 additions & 10 deletions scrapy/extensions/logstats.py
Expand Up @@ -9,7 +9,10 @@


class LogStats:
"""Log basic scraping stats periodically"""
"""Log basic scraping stats periodically like:
* RPM - Requests per Minute
* IPM - Items per Minute
"""

def __init__(self, stats, interval=60.0):
self.stats = stats
Expand All @@ -35,24 +38,45 @@ def spider_opened(self, spider):
self.task.start(self.interval)

def log(self, spider):
    """Log the current crawl counters and per-minute rates.

    Called periodically by the LoopingCall started in spider_opened.
    Delegates the counter/rate bookkeeping to calculate_stats() and
    emits a single INFO line with lazy %-style args.
    """
    # Refresh self.pages/self.items and the per-minute rates
    # (self.prate/self.irate) from the stats collector.
    self.calculate_stats()

    msg = (
        "Crawled %(pages)d pages (at %(pagerate)d pages/min), "
        "scraped %(items)d items (at %(itemrate)d items/min)"
    )
    log_args = {
        "pages": self.pages,
        "pagerate": self.prate,
        "items": self.items,
        "itemrate": self.irate,
    }
    logger.info(msg, log_args, extra={"spider": spider})

def calculate_stats(self):
self.items = self.stats.get_value("item_scraped_count", 0)
self.pages = self.stats.get_value("response_received_count", 0)
self.irate = (self.items - self.itemsprev) * self.multiplier
self.prate = (self.pages - self.pagesprev) * self.multiplier
self.pagesprev, self.itemsprev = self.pages, self.items

def spider_closed(self, spider, reason):
if self.task and self.task.running:
self.task.stop()

rpm_final, ipm_final = self.calculate_final_stats(spider)
self.stats.set_value("responses_per_minute", rpm_final)
self.stats.set_value("items_per_minute", ipm_final)

def calculate_final_stats(self, spider):
start_time = self.stats.get_value("start_time")
finished_time = self.stats.get_value("finished_time")

if not start_time or not finished_time:
return None, None

mins_elapsed = (finished_time - start_time).seconds / 60

items = self.stats.get_value("item_scraped_count", 0)
pages = self.stats.get_value("response_received_count", 0)

return (pages / mins_elapsed), (items / mins_elapsed)
62 changes: 62 additions & 0 deletions tests/test_logstats.py
@@ -0,0 +1,62 @@
import unittest
from datetime import datetime

from scrapy.extensions.logstats import LogStats
from scrapy.utils.test import get_crawler
from tests.spiders import SimpleSpider


class TestLogStats(unittest.TestCase):
    """Unit tests for the LogStats extension's rate calculations."""

    def setUp(self):
        self.crawler = get_crawler(SimpleSpider)
        self.spider = self.crawler._create_spider("spidey")
        self.stats = self.crawler.stats

        # Seed the stats collector as if a crawl were already in progress.
        self.stats.set_value("response_received_count", 4802)
        self.stats.set_value("item_scraped_count", 3201)

    def test_stats_calculations(self):
        logstats = LogStats.from_crawler(self.crawler)

        # The *prev counters only exist after spider_opened().
        # NOTE: each attribute lookup needs its own assertRaises context;
        # a single context exits on the first raise, leaving any further
        # statements inside it unexecuted (and therefore unchecked).
        with self.assertRaises(AttributeError):
            logstats.pagesprev
        with self.assertRaises(AttributeError):
            logstats.itemsprev

        logstats.spider_opened(self.spider)
        self.assertEqual(logstats.pagesprev, 4802)
        self.assertEqual(logstats.itemsprev, 3201)

        # First tick right after opening: counters snapshot, zero rates.
        logstats.calculate_stats()
        self.assertEqual(logstats.items, 3201)
        self.assertEqual(logstats.pages, 4802)
        self.assertEqual(logstats.irate, 0.0)
        self.assertEqual(logstats.prate, 0.0)
        self.assertEqual(logstats.pagesprev, 4802)
        self.assertEqual(logstats.itemsprev, 3201)

        # Simulate what happens after a minute
        self.stats.set_value("response_received_count", 5187)
        self.stats.set_value("item_scraped_count", 3492)
        logstats.calculate_stats()
        self.assertEqual(logstats.items, 3492)
        self.assertEqual(logstats.pages, 5187)
        self.assertEqual(logstats.irate, 291.0)
        self.assertEqual(logstats.prate, 385.0)
        self.assertEqual(logstats.pagesprev, 5187)
        self.assertEqual(logstats.itemsprev, 3492)

        # Simulate when spider closes after running for 30 mins
        self.stats.set_value("start_time", datetime.fromtimestamp(1655100172))
        self.stats.set_value("finished_time", datetime.fromtimestamp(1655101972))
        logstats.spider_closed(self.spider, "test reason")
        self.assertEqual(self.stats.get_value("responses_per_minute"), 172.9)
        self.assertEqual(self.stats.get_value("items_per_minute"), 116.4)

    def test_stats_calculations_no_time(self):
        """The stat values should be None since the start and finish time are
        not available.
        """
        logstats = LogStats.from_crawler(self.crawler)
        logstats.spider_closed(self.spider, "test reason")
        self.assertIsNone(self.stats.get_value("responses_per_minute"))
        self.assertIsNone(self.stats.get_value("items_per_minute"))

0 comments on commit 9bb973d

Please sign in to comment.