diff --git a/scrapy/extensions/logstats.py b/scrapy/extensions/logstats.py
index 78874a6dbd7..9f63e9c4bb6 100644
--- a/scrapy/extensions/logstats.py
+++ b/scrapy/extensions/logstats.py
@@ -9,7 +9,10 @@
 
 
 class LogStats:
-    """Log basic scraping stats periodically"""
+    """Log basic scraping stats periodically like:
+    * RPM - Requests per Minute
+    * IPM - Items per Minute
+    """
 
     def __init__(self, stats, interval=60.0):
         self.stats = stats
@@ -35,24 +38,53 @@ def spider_opened(self, spider):
         self.task.start(self.interval)
 
     def log(self, spider):
-        items = self.stats.get_value("item_scraped_count", 0)
-        pages = self.stats.get_value("response_received_count", 0)
-        irate = (items - self.itemsprev) * self.multiplier
-        prate = (pages - self.pagesprev) * self.multiplier
-        self.pagesprev, self.itemsprev = pages, items
+        self.calculate_stats()
 
         msg = (
             "Crawled %(pages)d pages (at %(pagerate)d pages/min), "
             "scraped %(items)d items (at %(itemrate)d items/min)"
         )
         log_args = {
-            "pages": pages,
-            "pagerate": prate,
-            "items": items,
-            "itemrate": irate,
+            "pages": self.pages,
+            "pagerate": self.prate,
+            "items": self.items,
+            "itemrate": self.irate,
         }
         logger.info(msg, log_args, extra={"spider": spider})
 
+    def calculate_stats(self):
+        """Refresh page/item totals and their per-minute rates since the
+        previous call."""
+        self.items = self.stats.get_value("item_scraped_count", 0)
+        self.pages = self.stats.get_value("response_received_count", 0)
+        self.irate = (self.items - self.itemsprev) * self.multiplier
+        self.prate = (self.pages - self.pagesprev) * self.multiplier
+        self.pagesprev, self.itemsprev = self.pages, self.items
+
     def spider_closed(self, spider, reason):
         if self.task and self.task.running:
             self.task.stop()
+
+        rpm_final, ipm_final = self.calculate_final_stats(spider)
+        self.stats.set_value("responses_per_minute", rpm_final)
+        self.stats.set_value("items_per_minute", ipm_final)
+
+    def calculate_final_stats(self, spider):
+        """Return (responses/min, items/min) averaged over the whole crawl,
+        or (None, None) when the timing stats are unavailable."""
+        start_time = self.stats.get_value("start_time")
+        # CoreStats sets "finish_time" (not "finished_time"), so read that key.
+        finish_time = self.stats.get_value("finish_time")
+
+        if not start_time or not finish_time:
+            return None, None
+
+        # total_seconds(), not .seconds: .seconds wraps at 24 hours.
+        mins_elapsed = (finish_time - start_time).total_seconds() / 60
+        # Guard against ZeroDivisionError for sub-second crawls.
+        if not mins_elapsed:
+            return None, None
+
+        items = self.stats.get_value("item_scraped_count", 0)
+        pages = self.stats.get_value("response_received_count", 0)
+
+        return (pages / mins_elapsed), (items / mins_elapsed)
diff --git a/tests/test_logstats.py b/tests/test_logstats.py
new file mode 100644
index 00000000000..d87285df785
--- /dev/null
+++ b/tests/test_logstats.py
@@ -0,0 +1,64 @@
+import unittest
+from datetime import datetime
+
+from scrapy.extensions.logstats import LogStats
+from scrapy.utils.test import get_crawler
+from tests.spiders import SimpleSpider
+
+
+class TestLogStats(unittest.TestCase):
+    def setUp(self):
+        self.crawler = get_crawler(SimpleSpider)
+        self.spider = self.crawler._create_spider("spidey")
+        self.stats = self.crawler.stats
+
+        self.stats.set_value("response_received_count", 4802)
+        self.stats.set_value("item_scraped_count", 3201)
+
+    def test_stats_calculations(self):
+        logstats = LogStats.from_crawler(self.crawler)
+
+        # Each attribute checked in its own block; a single block would stop
+        # at the first raise and never exercise the second lookup.
+        with self.assertRaises(AttributeError):
+            logstats.pagesprev
+        with self.assertRaises(AttributeError):
+            logstats.itemsprev
+
+        logstats.spider_opened(self.spider)
+        self.assertEqual(logstats.pagesprev, 4802)
+        self.assertEqual(logstats.itemsprev, 3201)
+
+        logstats.calculate_stats()
+        self.assertEqual(logstats.items, 3201)
+        self.assertEqual(logstats.pages, 4802)
+        self.assertEqual(logstats.irate, 0.0)
+        self.assertEqual(logstats.prate, 0.0)
+        self.assertEqual(logstats.pagesprev, 4802)
+        self.assertEqual(logstats.itemsprev, 3201)
+
+        # Simulate what happens after a minute
+        self.stats.set_value("response_received_count", 5187)
+        self.stats.set_value("item_scraped_count", 3492)
+        logstats.calculate_stats()
+        self.assertEqual(logstats.items, 3492)
+        self.assertEqual(logstats.pages, 5187)
+        self.assertEqual(logstats.irate, 291.0)
+        self.assertEqual(logstats.prate, 385.0)
+        self.assertEqual(logstats.pagesprev, 5187)
+        self.assertEqual(logstats.itemsprev, 3492)
+
+        # Simulate when spider closes after running for 30 mins
+        self.stats.set_value("start_time", datetime.fromtimestamp(1655100172))
+        self.stats.set_value("finish_time", datetime.fromtimestamp(1655101972))
+        logstats.spider_closed(self.spider, "test reason")
+        self.assertEqual(self.stats.get_value("responses_per_minute"), 172.9)
+        self.assertEqual(self.stats.get_value("items_per_minute"), 116.4)
+
+    def test_stats_calculations_no_time(self):
+        """The stat values should be None since the start and finish time are
+        not available.
+        """
+        logstats = LogStats.from_crawler(self.crawler)
+        logstats.spider_closed(self.spider, "test reason")
+        self.assertIsNone(self.stats.get_value("responses_per_minute"))
+        self.assertIsNone(self.stats.get_value("items_per_minute"))