From 2f54d007106ea391535126d863f69a0a505e9a10 Mon Sep 17 00:00:00 2001 From: ReenigneArcher <42013603+ReenigneArcher@users.noreply.github.com> Date: Sun, 17 May 2026 08:28:58 -0400 Subject: [PATCH 1/5] fix: Fetch GitHub stats in process to allow timeout Run PyGithub commit-activity retrieval in a separate process so it can be terminated if the library keeps retrying 202 responses. Added a top-level _fetch_stats_commit_activity_with_pygithub helper that constructs a Github client and returns raw week data or None. Switched from ThreadPoolExecutor to ProcessPoolExecutor in _get_stats_with_timeout, submit the helper with repo full name and GITHUB_TOKEN, and call pool.terminate_workers() on timeout. Also stop remapping week.raw_data when writing commitActivity (write the returned raw-data list directly). Updated unit tests to cover the new helper, environment token usage, ProcessPoolExecutor behavior and termination, and added a raise_for_status helper on FakeResponse. --- src/updater.py | 29 +++++++++++++++++---- tests/unit/test_updater.py | 53 +++++++++++++++++++++++++++++++++----- 2 files changed, 70 insertions(+), 12 deletions(-) diff --git a/src/updater.py b/src/updater.py index 3cbefedcc..4c8db97e7 100644 --- a/src/updater.py +++ b/src/updater.py @@ -2,7 +2,7 @@ import json import math import os -from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeout +from concurrent.futures import ProcessPoolExecutor, TimeoutError as FuturesTimeout from datetime import datetime, timezone from threading import Thread @@ -193,6 +193,20 @@ def update_fb(): helpers.write_json_files(file_path=file_path, data=data) +def _fetch_stats_commit_activity_with_pygithub(repo_full_name: str, token: str) -> list | None: + """ + Fetch commit activity with PyGithub. + + This is intentionally a top-level function so it can run in a child process + and be terminated if PyGithub keeps retrying 202 responses. + """ + g = Github(auth=Auth.Token(token), timeout=helpers.DEFAULT_TIMEOUT) + commit_activity = g.get_repo(repo_full_name, lazy=True).get_stats_commit_activity() + if not commit_activity: + return None + return [week.raw_data for week in commit_activity] + + def _get_stats_with_timeout(repo, timeout=60): """ Fetch commit activity for a repo, capping total wait time. @@ -210,11 +224,17 @@ def _get_stats_with_timeout(repo, timeout=60): list or None Weekly commit-activity objects, or None on timeout. """ - with ThreadPoolExecutor(max_workers=1) as pool: - future = pool.submit(repo.get_stats_commit_activity) + repo_full_name = f'{repo.owner.login}/{repo.name}' + with ProcessPoolExecutor(max_workers=1) as pool: + future = pool.submit( + _fetch_stats_commit_activity_with_pygithub, + repo_full_name, + os.environ["GITHUB_TOKEN"], + ) try: return future.result(timeout=timeout) except FuturesTimeout: + pool.terminate_workers() log.warning(f'Timeout fetching commit activity for {repo.name}, skipping.') return None @@ -449,9 +469,8 @@ def _process_github_repo(repo, headers: dict, graphql_url: str) -> None: # commit activity (last year, weekly buckets) commit_activity = _get_stats_with_timeout(repo) if commit_activity: - commits = [week.raw_data for week in commit_activity] file_path = os.path.join(BASE_DIR, 'github', 'commitActivity', repo.name) - helpers.write_json_files(file_path=file_path, data=commits) + helpers.write_json_files(file_path=file_path, data=commit_activity) # open pull requests pulls_data = [] diff --git a/tests/unit/test_updater.py b/tests/unit/test_updater.py index 42759f87a..501bc86c7 100644 --- a/tests/unit/test_updater.py +++ b/tests/unit/test_updater.py @@ -25,6 +25,10 @@ def json(self): raise self._raises return self._payload + def raise_for_status(self): + if self.status_code >= 400: + raise requests.exceptions.HTTPError(self.text) + class FakeWeek: def __init__(self, week, total): @@ -229,10 +233,37 @@ def fake_get(url): assert 'paging' not in writes[0][1] +def test_fetch_stats_commit_activity_with_pygithub(monkeypatch): + class FakeGithub: + def __init__(self, auth, timeout): + self.auth = auth + self.timeout = timeout + + def get_repo(self, repo_full_name, lazy=False): + assert repo_full_name == 'owner/x' + assert lazy + return SimpleNamespace(get_stats_commit_activity=lambda: [FakeWeek(1, 2)]) + + monkeypatch.setattr(updater, 'Github', FakeGithub) + monkeypatch.setattr(updater.Auth, 'Token', lambda token: token) + + assert updater._fetch_stats_commit_activity_with_pygithub('owner/x', 'token') == [{'week': 1, 'total': 2}] + + class EmptyGithub(FakeGithub): + def get_repo(self, repo_full_name, lazy=False): + return SimpleNamespace(get_stats_commit_activity=lambda: None) + + monkeypatch.setattr(updater, 'Github', EmptyGithub) + assert updater._fetch_stats_commit_activity_with_pygithub('owner/x', 'token') is None + + def test_get_stats_with_timeout_success_and_timeout(monkeypatch): + repo = SimpleNamespace(name='x', owner=SimpleNamespace(login='owner')) + monkeypatch.setenv('GITHUB_TOKEN', 'token') + class FutureOk: def result(self, timeout): - return [1] + return [{'week': 1, 'total': 2}] class FutureTimeout: def result(self, timeout): @@ -248,18 +279,26 @@ def __enter__(self): def __exit__(self, *args): return False - def submit(self, func): + def submit(self, func, repo_full_name, token): + submissions.append((func, repo_full_name, token)) return self.future - monkeypatch.setattr(updater, 'ThreadPoolExecutor', lambda max_workers: Pool(FutureOk())) - repo = SimpleNamespace(name='x', get_stats_commit_activity=lambda: [1]) - assert updater._get_stats_with_timeout(repo) == [1] + def terminate_workers(self): + terminated.append(True) + + submissions = [] + terminated = [] + monkeypatch.setattr(updater, 'ProcessPoolExecutor', lambda max_workers: Pool(FutureOk())) + + assert updater._get_stats_with_timeout(repo) == [{'week': 1, 'total': 2}] + assert submissions == [(updater._fetch_stats_commit_activity_with_pygithub, 'owner/x', 'token')] warnings = [] monkeypatch.setattr(updater.log, 'warning', lambda msg: warnings.append(msg)) - monkeypatch.setattr(updater, 'ThreadPoolExecutor', lambda max_workers: Pool(FutureTimeout())) + monkeypatch.setattr(updater, 'ProcessPoolExecutor', lambda max_workers: Pool(FutureTimeout())) assert updater._get_stats_with_timeout(repo) is None assert warnings + assert terminated == [True] def test_seed_star_history(monkeypatch): @@ -331,7 +370,7 @@ def test_process_github_repo(monkeypatch, tmp_path): 'save_image_from_url', lambda **kwargs: writes.append(('img', kwargs['file_path'])) ) - monkeypatch.setattr(updater, '_get_stats_with_timeout', lambda repo: [FakeWeek(1, 1)]) + monkeypatch.setattr(updater, '_get_stats_with_timeout', lambda repo: [{'week': 1, 'total': 1}]) monkeypatch.setattr(updater, '_collect_star_history', lambda repo: [{'date': '2026-01-01', 'stars': 1}]) monkeypatch.setattr(updater, '_fetch_code_scanning_alerts', lambda repo: []) monkeypatch.setattr( From 52647ec412d97695641a63a6b40219430bb53b27 Mon Sep 17 00:00:00 2001 From: ReenigneArcher <42013603+ReenigneArcher@users.noreply.github.com> Date: Sun, 17 May 2026 11:50:03 -0400 Subject: [PATCH 2/5] Write commit-activity raw_data when available Convert PyGithub WeeklyStat objects to their raw_data before writing commit activity JSON so stored data is serializable and includes fields like 'days'. Also tweak the _get_stats_with_timeout docstring wording and update unit tests to expect the raw_data format for commit activity. --- src/updater.py | 8 ++++++-- tests/unit/test_updater.py | 18 +++++++++++++++--- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/src/updater.py b/src/updater.py index 4c8db97e7..33b2b12b8 100644 --- a/src/updater.py +++ b/src/updater.py @@ -222,7 +222,7 @@ def _get_stats_with_timeout(repo, timeout=60): Returns ------- list or None - Weekly commit-activity objects, or None on timeout. + Weekly commit-activity data, or None on timeout. """ repo_full_name = f'{repo.owner.login}/{repo.name}' with ProcessPoolExecutor(max_workers=1) as pool: @@ -469,8 +469,12 @@ def _process_github_repo(repo, headers: dict, graphql_url: str) -> None: # commit activity (last year, weekly buckets) commit_activity = _get_stats_with_timeout(repo) if commit_activity: + commits = [ + week.raw_data if hasattr(week, 'raw_data') else week + for week in commit_activity + ] file_path = os.path.join(BASE_DIR, 'github', 'commitActivity', repo.name) - helpers.write_json_files(file_path=file_path, data=commit_activity) + helpers.write_json_files(file_path=file_path, data=commits) # open pull requests pulls_data = [] diff --git a/tests/unit/test_updater.py b/tests/unit/test_updater.py index 501bc86c7..934526846 100644 --- a/tests/unit/test_updater.py +++ b/tests/unit/test_updater.py @@ -242,12 +242,16 @@ def __init__(self, auth, timeout): def get_repo(self, repo_full_name, lazy=False): assert repo_full_name == 'owner/x' assert lazy - return SimpleNamespace(get_stats_commit_activity=lambda: [FakeWeek(1, 2)]) + week = FakeWeek(1, 2) + week.raw_data['days'] = [0, 1, 1, 0, 0, 0, 0] + return SimpleNamespace(get_stats_commit_activity=lambda: [week]) monkeypatch.setattr(updater, 'Github', FakeGithub) monkeypatch.setattr(updater.Auth, 'Token', lambda token: token) - assert updater._fetch_stats_commit_activity_with_pygithub('owner/x', 'token') == [{'week': 1, 'total': 2}] + assert updater._fetch_stats_commit_activity_with_pygithub('owner/x', 'token') == [ + {'days': [0, 1, 1, 0, 0, 0, 0], 'total': 2, 'week': 1} + ] class EmptyGithub(FakeGithub): def get_repo(self, repo_full_name, lazy=False): @@ -370,7 +374,11 @@ def test_process_github_repo(monkeypatch, tmp_path): 'save_image_from_url', lambda **kwargs: writes.append(('img', kwargs['file_path'])) ) - monkeypatch.setattr(updater, '_get_stats_with_timeout', lambda repo: [{'week': 1, 'total': 1}]) + monkeypatch.setattr( + updater, + '_get_stats_with_timeout', + lambda repo: [{'week': 1, 'total': 1, 'days': [0, 0, 0, 0, 0, 0, 1]}], + ) monkeypatch.setattr(updater, '_collect_star_history', lambda repo: [{'date': '2026-01-01', 'stars': 1}]) monkeypatch.setattr(updater, '_fetch_code_scanning_alerts', lambda repo: []) monkeypatch.setattr( @@ -387,6 +395,10 @@ def post_ok(url, json, headers): updater._process_github_repo(FakeRepo(name='demo'), {'Authorization': 'x'}, 'https://api.github.com/graphql') assert any(path.endswith('languages\\demo') or path.endswith('languages/demo') for path, _ in writes) + assert any( + data == [{'days': [0, 0, 0, 0, 0, 0, 1], 'total': 1, 'week': 1}] + for _, data in writes + ) assert any(path.endswith('codeScanning\\demo') or path.endswith('codeScanning/demo') for path, _ in writes) assert any( path.endswith('codeScanningHistory\\demo') or path.endswith('codeScanningHistory/demo') for path, _ in writes) From 5f32360bde188288b9591725e01b249ebd34b843 Mon Sep 17 00:00:00 2001 From: ReenigneArcher <42013603+ReenigneArcher@users.noreply.github.com> Date: Sun, 17 May 2026 13:11:13 -0400 Subject: [PATCH 3/5] Refactor GitHub commit-activity fetching Replace the previous ProcessPoolExecutor + PyGithub timeout trick with a two-step approach: prime GitHub's async stats job via a lightweight requests GET to /stats/commit_activity, then fetch commit activity using repo.get_stats_commit_activity with proper GithubException handling. Add a dedicated _write_commit_activity helper to cache results and only process non-archived repos; update update_github to prime all active repos, process repos, then write commit activity. Remove concurrent.futures usage and related imports, simplify error handling and logging, and update unit tests to match the new priming/fetching flow and behaviors. --- src/updater.py | 99 +++++++++++++++--------------- tests/unit/test_updater.py | 122 ++++++++++++++++++------------------- 2 files changed, 107 insertions(+), 114 deletions(-) diff --git a/src/updater.py b/src/updater.py index 33b2b12b8..c7e5f08c9 100644 --- a/src/updater.py +++ b/src/updater.py @@ -2,7 +2,6 @@ import json import math import os -from concurrent.futures import ProcessPoolExecutor, TimeoutError as FuturesTimeout from datetime import datetime, timezone from threading import Thread @@ -193,50 +192,48 @@ def update_fb(): helpers.write_json_files(file_path=file_path, data=data) -def _fetch_stats_commit_activity_with_pygithub(repo_full_name: str, token: str) -> list | None: +def _prime_commit_activity(repo, headers: dict) -> None: """ - Fetch commit activity with PyGithub. + Start GitHub's async commit-activity stats job for a repo. - This is intentionally a top-level function so it can run in a child process - and be terminated if PyGithub keeps retrying 202 responses. + PyGithub waits and retries automatically on 202 responses, which is what + we want for the final fetch. This warm-up request intentionally does not + use PyGithub so all repos can be started before any one repo blocks. """ - g = Github(auth=Auth.Token(token), timeout=helpers.DEFAULT_TIMEOUT) - commit_activity = g.get_repo(repo_full_name, lazy=True).get_stats_commit_activity() - if not commit_activity: - return None - return [week.raw_data for week in commit_activity] + url = f'{repo.url}/stats/commit_activity' + try: + response = helpers.s.get(url=url, headers=headers, timeout=5) + except requests.exceptions.RequestException as e: + log.warning(f'Could not prime commit activity for {repo.name}: {e}') + return + + if response.status_code not in (200, 202): + log.warning(f'Could not prime commit activity for {repo.name}: HTTP {response.status_code}') -def _get_stats_with_timeout(repo, timeout=60): +def _get_commit_activity(repo) -> list | None: """ - Fetch commit activity for a repo, capping total wait time. + Fetch commit activity for a repo. + """ + try: + return repo.get_stats_commit_activity() + except GithubException as e: + log.warning(f'Could not fetch commit activity for {repo.name}: {e}') + return None - Parameters - ---------- - repo : - PyGithub Repository object. - timeout : int - Maximum seconds to wait before giving up (GitHub may return 202 while - computing stats, causing PyGithub to retry indefinitely without this guard). - Returns - ------- - list or None - Weekly commit-activity data, or None on timeout. - """ - repo_full_name = f'{repo.owner.login}/{repo.name}' - with ProcessPoolExecutor(max_workers=1) as pool: - future = pool.submit( - _fetch_stats_commit_activity_with_pygithub, - repo_full_name, - os.environ["GITHUB_TOKEN"], - ) - try: - return future.result(timeout=timeout) - except FuturesTimeout: - pool.terminate_workers() - log.warning(f'Timeout fetching commit activity for {repo.name}, skipping.') - return None +def _write_commit_activity(repo) -> None: + """ + Fetch and cache commit activity for a repo. + """ + commit_activity = _get_commit_activity(repo) + if commit_activity: + commits = [ + week.raw_data if hasattr(week, 'raw_data') else week + for week in commit_activity + ] + file_path = os.path.join(BASE_DIR, 'github', 'commitActivity', repo.name) + helpers.write_json_files(file_path=file_path, data=commits) def _seed_star_history(repo, total: int, initial_samples: int) -> list[dict]: @@ -466,16 +463,6 @@ def _process_github_repo(repo, headers: dict, graphql_url: str) -> None: file_path = os.path.join(BASE_DIR, 'github', 'languages', repo.name) helpers.write_json_files(file_path=file_path, data=languages) - # commit activity (last year, weekly buckets) - commit_activity = _get_stats_with_timeout(repo) - if commit_activity: - commits = [ - week.raw_data if hasattr(week, 'raw_data') else week - for week in commit_activity - ] - file_path = os.path.join(BASE_DIR, 'github', 'commitActivity', repo.name) - helpers.write_json_files(file_path=file_path, data=commits) - # open pull requests pulls_data = [] for pr in repo.get_pulls(state='open'): @@ -569,14 +556,26 @@ def update_github(): } graphql_url = 'https://api.github.com/graphql' + active_repos = [repo for repo in repos if not repo.archived] + + for repo in tqdm( + iterable=active_repos, + desc='Priming GitHub commit activity', + ): + _prime_commit_activity(repo, headers) + for repo in tqdm( - iterable=repos, + iterable=active_repos, desc='Updating GitHub data', ): - if repo.archived: - continue _process_github_repo(repo, headers, graphql_url) + for repo in tqdm( + iterable=active_repos, + desc='Updating GitHub commit activity', + ): + _write_commit_activity(repo) + def update_patreon(): """ diff --git a/tests/unit/test_updater.py b/tests/unit/test_updater.py index 934526846..2c8c827eb 100644 --- a/tests/unit/test_updater.py +++ b/tests/unit/test_updater.py @@ -1,6 +1,6 @@ # standard imports import json -from concurrent.futures import TimeoutError as FuturesTimeout +import os from datetime import datetime, timezone from types import SimpleNamespace @@ -71,6 +71,7 @@ def __init__(self, name='repo1', archived=False, stars=4): self.owner = SimpleNamespace(login='owner') self.stargazers_count = stars self.raw_data = {'name': name, 'archived': archived} + self.url = f'https://api.github.com/repos/owner/{name}' def get_languages(self): return {'Python': 100} @@ -233,76 +234,73 @@ def fake_get(url): assert 'paging' not in writes[0][1] -def test_fetch_stats_commit_activity_with_pygithub(monkeypatch): - class FakeGithub: - def __init__(self, auth, timeout): - self.auth = auth - self.timeout = timeout +def test_prime_commit_activity(monkeypatch): + repo = FakeRepo(name='demo') + calls = [] - def get_repo(self, repo_full_name, lazy=False): - assert repo_full_name == 'owner/x' - assert lazy - week = FakeWeek(1, 2) - week.raw_data['days'] = [0, 1, 1, 0, 0, 0, 0] - return SimpleNamespace(get_stats_commit_activity=lambda: [week]) + def fake_get(url, headers, timeout): + calls.append((url, headers, timeout)) + return FakeResponse(status=202) - monkeypatch.setattr(updater, 'Github', FakeGithub) - monkeypatch.setattr(updater.Auth, 'Token', lambda token: token) + monkeypatch.setattr(updater.helpers.s, 'get', fake_get) + updater._prime_commit_activity(repo, {'Authorization': 'x'}) - assert updater._fetch_stats_commit_activity_with_pygithub('owner/x', 'token') == [ - {'days': [0, 1, 1, 0, 0, 0, 0], 'total': 2, 'week': 1} - ] + assert calls == [(f'{repo.url}/stats/commit_activity', {'Authorization': 'x'}, 5)] - class EmptyGithub(FakeGithub): - def get_repo(self, repo_full_name, lazy=False): - return SimpleNamespace(get_stats_commit_activity=lambda: None) + warnings = [] + monkeypatch.setattr(updater.log, 'warning', lambda msg: warnings.append(msg)) + monkeypatch.setattr(updater.helpers.s, 'get', lambda url, headers, timeout: FakeResponse(status=500)) + updater._prime_commit_activity(repo, {'Authorization': 'x'}) - monkeypatch.setattr(updater, 'Github', EmptyGithub) - assert updater._fetch_stats_commit_activity_with_pygithub('owner/x', 'token') is None + def raise_timeout(url, headers, timeout): + raise requests.exceptions.Timeout('boom') + monkeypatch.setattr(updater.helpers.s, 'get', raise_timeout) + updater._prime_commit_activity(repo, {'Authorization': 'x'}) -def test_get_stats_with_timeout_success_and_timeout(monkeypatch): - repo = SimpleNamespace(name='x', owner=SimpleNamespace(login='owner')) - monkeypatch.setenv('GITHUB_TOKEN', 'token') + assert len(warnings) == 2 - class FutureOk: - def result(self, timeout): - return [{'week': 1, 'total': 2}] - class FutureTimeout: - def result(self, timeout): - raise FuturesTimeout() +def test_get_commit_activity(monkeypatch): + week = FakeWeek(1, 2) + repo = SimpleNamespace(name='x', get_stats_commit_activity=lambda: [week]) + assert updater._get_commit_activity(repo) == [week] - class Pool: - def __init__(self, future): - self.future = future + repo_empty = SimpleNamespace(name='x', get_stats_commit_activity=lambda: None) + assert updater._get_commit_activity(repo_empty) is None - def __enter__(self): - return self + warnings = [] + monkeypatch.setattr(updater.log, 'warning', lambda msg: warnings.append(msg)) - def __exit__(self, *args): - return False + def raise_github_exception(): + raise GithubException(status=409, data={'message': 'empty'}) - def submit(self, func, repo_full_name, token): - submissions.append((func, repo_full_name, token)) - return self.future + repo_error = SimpleNamespace(name='x', get_stats_commit_activity=raise_github_exception) + assert updater._get_commit_activity(repo_error) is None + assert warnings - def terminate_workers(self): - terminated.append(True) - submissions = [] - terminated = [] - monkeypatch.setattr(updater, 'ProcessPoolExecutor', lambda max_workers: Pool(FutureOk())) +def test_write_commit_activity(monkeypatch): + monkeypatch.setattr(updater, 'BASE_DIR', 'base') - assert updater._get_stats_with_timeout(repo) == [{'week': 1, 'total': 2}] - assert submissions == [(updater._fetch_stats_commit_activity_with_pygithub, 'owner/x', 'token')] + week = FakeWeek(1, 2) + week.raw_data['days'] = [0, 1, 1, 0, 0, 0, 0] + monkeypatch.setattr(updater, '_get_commit_activity', lambda repo: [week]) - warnings = [] - monkeypatch.setattr(updater.log, 'warning', lambda msg: warnings.append(msg)) - monkeypatch.setattr(updater, 'ProcessPoolExecutor', lambda max_workers: Pool(FutureTimeout())) - assert updater._get_stats_with_timeout(repo) is None - assert warnings - assert terminated == [True] + writes = [] + monkeypatch.setattr(updater.helpers, 'write_json_files', lambda file_path, data: writes.append((file_path, data))) + + updater._write_commit_activity(FakeRepo(name='demo')) + + assert writes == [( + os.path.join('base', 'github', 'commitActivity', 'demo'), + [{'days': [0, 1, 1, 0, 0, 0, 0], 'total': 2, 'week': 1}], + )] + + monkeypatch.setattr(updater, '_get_commit_activity', lambda repo: None) + writes.clear() + updater._write_commit_activity(FakeRepo(name='demo')) + assert writes == [] def test_seed_star_history(monkeypatch): @@ -374,11 +372,6 @@ def test_process_github_repo(monkeypatch, tmp_path): 'save_image_from_url', lambda **kwargs: writes.append(('img', kwargs['file_path'])) ) - monkeypatch.setattr( - updater, - '_get_stats_with_timeout', - lambda repo: [{'week': 1, 'total': 1, 'days': [0, 0, 0, 0, 0, 0, 1]}], - ) monkeypatch.setattr(updater, '_collect_star_history', lambda repo: [{'date': '2026-01-01', 'stars': 1}]) monkeypatch.setattr(updater, '_fetch_code_scanning_alerts', lambda repo: []) monkeypatch.setattr( @@ -395,10 +388,6 @@ def post_ok(url, json, headers): updater._process_github_repo(FakeRepo(name='demo'), {'Authorization': 'x'}, 'https://api.github.com/graphql') assert any(path.endswith('languages\\demo') or path.endswith('languages/demo') for path, _ in writes) - assert any( - data == [{'days': [0, 0, 0, 0, 0, 0, 1], 'total': 1, 'week': 1}] - for _, data in writes - ) assert any(path.endswith('codeScanning\\demo') or path.endswith('codeScanning/demo') for path, _ in writes) assert any( path.endswith('codeScanningHistory\\demo') or path.endswith('codeScanningHistory/demo') for path, _ in writes) @@ -408,7 +397,6 @@ def post_ok(url, json, headers): def test_process_github_repo_error_and_avatar_skip(monkeypatch, tmp_path): monkeypatch.setattr(updater, 'BASE_DIR', str(tmp_path / 'gh-pages')) monkeypatch.setattr(updater.helpers, 'write_json_files', lambda **kwargs: None) - monkeypatch.setattr(updater, '_get_stats_with_timeout', lambda repo: None) monkeypatch.setattr(updater, '_collect_star_history', lambda repo: []) monkeypatch.setattr(updater, '_fetch_code_scanning_alerts', lambda repo: []) monkeypatch.setattr(updater, '_build_code_scanning_history', lambda alerts: []) @@ -454,14 +442,20 @@ def get_user(self, name): writes = [] monkeypatch.setattr(updater.helpers, 'write_json_files', lambda file_path, data: writes.append((file_path, data))) + primed = [] + monkeypatch.setattr(updater, '_prime_commit_activity', lambda repo, headers: primed.append(repo.name)) processed = [] monkeypatch.setattr(updater, '_process_github_repo', lambda repo, headers, graphql_url: processed.append(repo.name)) + commit_activity = [] + monkeypatch.setattr(updater, '_write_commit_activity', lambda repo: commit_activity.append(repo.name)) monkeypatch.setattr(updater, 'BASE_DIR', 'base') updater.update_github() assert any(path.endswith('github\\repos') or path.endswith('github/repos') for path, _ in writes) + assert primed == ['active'] assert processed == ['active'] + assert commit_activity == ['active'] def test_update_patreon(monkeypatch): From e8b3529fc4f4dc3a5d840e97160d95faa79e8eb2 Mon Sep 17 00:00:00 2001 From: ReenigneArcher <42013603+ReenigneArcher@users.noreply.github.com> Date: Sun, 17 May 2026 13:42:26 -0400 Subject: [PATCH 4/5] Poll and cache GitHub commit activity Switch commit-activity fetching to use direct HTTP requests and add polling/caching logic. _get_commit_activity now accepts headers and calls helpers.s.get, handling 202 (processing), 204 (empty), 200 (JSON), non-200 responses and JSON decode errors. _write_commit_activity signature changed to accept commit_activity and only write when data exists. Added _update_commit_activity to poll the GitHub stats endpoint with a timeout, progress reporting, and retries before warning on timeout. update_github now uses the new polling function. Corresponding unit tests were updated to reflect the new request/response flow and polling behavior. --- src/updater.py | 65 ++++++++++++++++++++++++++++------ tests/unit/test_updater.py | 71 +++++++++++++++++++++++++++++--------- 2 files changed, 109 insertions(+), 27 deletions(-) diff --git a/src/updater.py b/src/updater.py index c7e5f08c9..982e8ab0d 100644 --- a/src/updater.py +++ b/src/updater.py @@ -2,6 +2,7 @@ import json import math import os +import time from datetime import datetime, timezone from threading import Thread @@ -211,22 +212,36 @@ def _prime_commit_activity(repo, headers: dict) -> None: log.warning(f'Could not prime commit activity for {repo.name}: HTTP {response.status_code}') -def _get_commit_activity(repo) -> list | None: +def _get_commit_activity(repo, headers: dict) -> list | None: """ Fetch commit activity for a repo. """ + url = f'{repo.url}/stats/commit_activity' try: - return repo.get_stats_commit_activity() - except GithubException as e: + response = helpers.s.get(url=url, headers=headers, timeout=10) + except requests.exceptions.RequestException as e: log.warning(f'Could not fetch commit activity for {repo.name}: {e}') return None + if response.status_code == 202: + return None + if response.status_code == 204: + return [] + if response.status_code != 200: + log.warning(f'Could not fetch commit activity for {repo.name}: HTTP {response.status_code}') + return [] + + try: + return response.json() or [] + except requests.exceptions.JSONDecodeError as e: + log.warning(f'Could not parse commit activity for {repo.name}: {e}') + return [] + -def _write_commit_activity(repo) -> None: +def _write_commit_activity(repo, commit_activity: list) -> None: """ - Fetch and cache commit activity for a repo. + Cache commit activity for a repo. """ - commit_activity = _get_commit_activity(repo) if commit_activity: commits = [ week.raw_data if hasattr(week, 'raw_data') else week @@ -236,6 +251,38 @@ def _write_commit_activity(repo) -> None: helpers.write_json_files(file_path=file_path, data=commits) +def _update_commit_activity(repos: list, headers: dict, max_wait: int = 1200, poll_interval: int = 15) -> None: + """ + Poll commit activity for all repos until ready or timeout. + """ + pending = list(repos) + deadline = time.monotonic() + max_wait + + with tqdm(total=len(pending), desc='Updating GitHub commit activity') as progress: + while pending and time.monotonic() < deadline: + remaining = [] + for repo in pending: + commit_activity = _get_commit_activity(repo, headers) + if commit_activity is None: + remaining.append(repo) + continue + + _write_commit_activity(repo, commit_activity) + progress.update(1) + + pending = remaining + if pending: + progress.set_postfix_str(f'{len(pending)} pending') + progress.write(f'Waiting for GitHub commit activity: {len(pending)} repos pending') + progress.refresh() + sleep_for = min(poll_interval, max(0, deadline - time.monotonic())) + if sleep_for: + time.sleep(sleep_for) + + for repo in pending: + log.warning(f'Timeout fetching commit activity for {repo.name}, skipping.') + + def _seed_star_history(repo, total: int, initial_samples: int) -> list[dict]: """ Fetch evenly-spaced pages from the stargazers API for a first-time seed. @@ -570,11 +617,7 @@ def update_github(): ): _process_github_repo(repo, headers, graphql_url) - for repo in tqdm( - iterable=active_repos, - desc='Updating GitHub commit activity', - ): - _write_commit_activity(repo) + _update_commit_activity(active_repos, headers) def update_patreon(): diff --git a/tests/unit/test_updater.py b/tests/unit/test_updater.py index 2c8c827eb..8e49109ef 100644 --- a/tests/unit/test_updater.py +++ b/tests/unit/test_updater.py @@ -262,22 +262,31 @@ def raise_timeout(url, headers, timeout): def test_get_commit_activity(monkeypatch): - week = FakeWeek(1, 2) - repo = SimpleNamespace(name='x', get_stats_commit_activity=lambda: [week]) - assert updater._get_commit_activity(repo) == [week] + responses = [ + FakeResponse(status=202), + FakeResponse(status=204), + FakeResponse([{'week': 1, 'total': 2}], status=200), + FakeResponse(status=500), + FakeResponse(status=200, raises=requests.exceptions.JSONDecodeError('x', 'y', 0)), + ] + monkeypatch.setattr(updater.helpers.s, 'get', lambda url, headers, timeout: responses.pop(0)) - repo_empty = SimpleNamespace(name='x', get_stats_commit_activity=lambda: None) - assert updater._get_commit_activity(repo_empty) is None + repo = FakeRepo(name='x') + assert updater._get_commit_activity(repo, {'Authorization': 'x'}) is None + assert updater._get_commit_activity(repo, {'Authorization': 'x'}) == [] + assert updater._get_commit_activity(repo, {'Authorization': 'x'}) == [{'week': 1, 'total': 2}] warnings = [] monkeypatch.setattr(updater.log, 'warning', lambda msg: warnings.append(msg)) + assert updater._get_commit_activity(repo, {'Authorization': 'x'}) == [] + assert updater._get_commit_activity(repo, {'Authorization': 'x'}) == [] - def raise_github_exception(): - raise GithubException(status=409, data={'message': 'empty'}) + def raise_timeout(url, headers, timeout): + raise requests.exceptions.Timeout('boom') - repo_error = SimpleNamespace(name='x', get_stats_commit_activity=raise_github_exception) - assert updater._get_commit_activity(repo_error) is None - assert warnings + monkeypatch.setattr(updater.helpers.s, 'get', raise_timeout) + assert updater._get_commit_activity(repo, {'Authorization': 'x'}) is None + assert len(warnings) == 3 def test_write_commit_activity(monkeypatch): @@ -285,24 +294,50 @@ def test_write_commit_activity(monkeypatch): week = FakeWeek(1, 2) week.raw_data['days'] = [0, 1, 1, 0, 0, 0, 0] - monkeypatch.setattr(updater, '_get_commit_activity', lambda repo: [week]) - writes = [] monkeypatch.setattr(updater.helpers, 'write_json_files', lambda file_path, data: writes.append((file_path, data))) - updater._write_commit_activity(FakeRepo(name='demo')) + updater._write_commit_activity(FakeRepo(name='demo'), [week]) assert writes == [( os.path.join('base', 'github', 'commitActivity', 'demo'), [{'days': [0, 1, 1, 0, 0, 0, 0], 'total': 2, 'week': 1}], )] - monkeypatch.setattr(updater, '_get_commit_activity', lambda repo: None) writes.clear() - updater._write_commit_activity(FakeRepo(name='demo')) + updater._write_commit_activity(FakeRepo(name='demo'), []) assert writes == [] +def test_update_commit_activity(monkeypatch): + repos = [FakeRepo(name='ready'), FakeRepo(name='pending')] + calls = [] + + def fake_get_commit_activity(repo, headers): + calls.append(repo.name) + if repo.name == 'ready': + return [{'week': 1, 'total': 2}] + return None + + writes = [] + sleeps = [] + warnings = [] + times = iter([0, 0, 0, 2]) + + monkeypatch.setattr(updater, '_get_commit_activity', fake_get_commit_activity) + monkeypatch.setattr(updater, '_write_commit_activity', lambda repo, data: writes.append((repo.name, data))) + monkeypatch.setattr(updater.time, 'monotonic', lambda: next(times)) + monkeypatch.setattr(updater.time, 'sleep', lambda seconds: sleeps.append(seconds)) + monkeypatch.setattr(updater.log, 'warning', lambda msg: warnings.append(msg)) + + updater._update_commit_activity(repos, {'Authorization': 'x'}, max_wait=1, poll_interval=10) + + assert calls == ['ready', 'pending'] + assert writes == [('ready', [{'week': 1, 'total': 2}])] + assert sleeps == [1] + assert warnings + + def test_seed_star_history(monkeypatch): repo = FakeRepo(stars=250) history = updater._seed_star_history(repo, total=250, initial_samples=5) @@ -447,7 +482,11 @@ def get_user(self, name): processed = [] monkeypatch.setattr(updater, '_process_github_repo', lambda repo, headers, graphql_url: processed.append(repo.name)) commit_activity = [] - monkeypatch.setattr(updater, '_write_commit_activity', lambda repo: commit_activity.append(repo.name)) + monkeypatch.setattr( + updater, + '_update_commit_activity', + lambda repos, headers: commit_activity.extend(repo.name for repo in repos), + ) monkeypatch.setattr(updater, 'BASE_DIR', 'base') updater.update_github() From 4205157eee4ac5c7e3e82eea013c24981f5b1a79 Mon Sep 17 00:00:00 2001 From: ReenigneArcher <42013603+ReenigneArcher@users.noreply.github.com> Date: Sun, 17 May 2026 19:04:48 -0400 Subject: [PATCH 5/5] Improve commit activity handling and fallback Change _get_commit_activity to return (data, status) and handle more HTTP statuses and parse errors explicitly. Add _build_commit_activity_from_commits to synthesize 52-week commit activity from the commits API as a fallback when GitHub stats are not ready or time out. Update _update_commit_activity to collect status counts, show a status summary in the progress output, and use the commits-based fallback for repos that time out. Adjust imports to use timedelta and update unit tests to reflect the new return shapes and fallback behavior. --- src/updater.py | 83 ++++++++++++++++++++++++++++++++------ tests/unit/test_updater.py | 77 ++++++++++++++++++++++++++++++----- 2 files changed, 138 insertions(+), 22 deletions(-) diff --git a/src/updater.py b/src/updater.py index 982e8ab0d..75e6aafc9 100644 --- a/src/updater.py +++ b/src/updater.py @@ -3,7 +3,7 @@ import math import os import time -from datetime import datetime, timezone +from datetime import datetime, timedelta, timezone from threading import Thread # lib imports @@ -212,7 +212,7 @@ def _prime_commit_activity(repo, headers: dict) -> None: log.warning(f'Could not prime commit activity for {repo.name}: HTTP {response.status_code}') -def _get_commit_activity(repo, headers: dict) -> list | None: +def _get_commit_activity(repo, headers: dict) -> tuple[list | None, int | str]: """ Fetch commit activity for a repo. """ @@ -221,22 +221,72 @@ def _get_commit_activity(repo, headers: dict) -> list | None: response = helpers.s.get(url=url, headers=headers, timeout=10) except requests.exceptions.RequestException as e: log.warning(f'Could not fetch commit activity for {repo.name}: {e}') - return None + return None, 'error' - if response.status_code == 202: - return None - if response.status_code == 204: - return [] + if response.status_code in (202, 204): + return None, response.status_code + if response.status_code in (403, 429) or response.status_code >= 500: + log.warning(f'Could not fetch commit activity for {repo.name}: HTTP {response.status_code}') + return None, response.status_code if response.status_code != 200: log.warning(f'Could not fetch commit activity for {repo.name}: HTTP {response.status_code}') - return [] + return [], response.status_code try: - return response.json() or [] + return response.json() or [], response.status_code except requests.exceptions.JSONDecodeError as e: log.warning(f'Could not parse commit activity for {repo.name}: {e}') + return [], 'parse_error' + + +def _build_commit_activity_from_commits(repo) -> list: + """ + Build commit activity from the commits API when GitHub stats do not become ready. + """ + today = datetime.now(tz=timezone.utc).replace(hour=0, minute=0, second=0, microsecond=0) + latest_week_start = today - timedelta(days=(today.weekday() + 1) % 7) + first_week_start = latest_week_start - timedelta(weeks=51) + end = latest_week_start + timedelta(days=7) + + commit_activity = [ + { + 'days': [0, 0, 0, 0, 0, 0, 0], + 'total': 0, + 'week': int((first_week_start + timedelta(weeks=i)).timestamp()), + } + for i in range(52) + ] + + try: + commits = repo.get_commits(since=first_week_start) + for commit in commits: + if len(getattr(commit, 'parents', [])) > 1: + continue + + commit_data = getattr(commit, 'commit', None) + author = getattr(commit_data, 'author', None) + committer = getattr(commit_data, 'committer', None) + commit_date = getattr(author, 'date', None) or getattr(committer, 'date', None) + if commit_date is None: + continue + if commit_date.tzinfo is None: + commit_date = commit_date.replace(tzinfo=timezone.utc) + else: + commit_date = commit_date.astimezone(timezone.utc) + + if commit_date < first_week_start or commit_date >= end: + continue + + days_since_start = (commit_date.date() - first_week_start.date()).days + week_index, day_index = divmod(days_since_start, 7) + commit_activity[week_index]['days'][day_index] += 1 + commit_activity[week_index]['total'] += 1 + except GithubException as e: + log.warning(f'Could not build commit activity for {repo.name}: {e}') return [] + return commit_activity + def _write_commit_activity(repo, commit_activity: list) -> None: """ @@ -261,8 +311,10 @@ def _update_commit_activity(repos: list, headers: dict, max_wait: int = 1200, po with tqdm(total=len(pending), desc='Updating GitHub commit activity') as progress: while pending and time.monotonic() < deadline: remaining = [] + statuses = {} for repo in pending: - commit_activity = _get_commit_activity(repo, headers) + commit_activity, status = _get_commit_activity(repo, headers) + statuses[status] = statuses.get(status, 0) + 1 if commit_activity is None: remaining.append(repo) continue @@ -272,15 +324,22 @@ def _update_commit_activity(repos: list, headers: dict, max_wait: int = 1200, po pending = remaining if pending: + status_summary = ', '.join( + f'{status}: {count}' for status, count in sorted(statuses.items(), key=lambda item: str(item[0])) + ) progress.set_postfix_str(f'{len(pending)} pending') - progress.write(f'Waiting for GitHub commit activity: {len(pending)} repos pending') + progress.write( + f'Waiting for GitHub commit activity: {len(pending)} repos pending ({status_summary})' + ) progress.refresh() sleep_for = min(poll_interval, max(0, deadline - time.monotonic())) if sleep_for: time.sleep(sleep_for) for repo in pending: - log.warning(f'Timeout fetching commit activity for {repo.name}, skipping.') + log.warning(f'Timeout fetching commit activity stats for {repo.name}, using commits API fallback.') + commit_activity = _build_commit_activity_from_commits(repo) + _write_commit_activity(repo, commit_activity) def _seed_star_history(repo, total: int, initial_samples: int) -> list[dict]: diff --git a/tests/unit/test_updater.py b/tests/unit/test_updater.py index 8e49109ef..02b4812b2 100644 --- a/tests/unit/test_updater.py +++ b/tests/unit/test_updater.py @@ -267,26 +267,76 @@ def test_get_commit_activity(monkeypatch): FakeResponse(status=204), FakeResponse([{'week': 1, 'total': 2}], status=200), FakeResponse(status=500), + FakeResponse(status=404), FakeResponse(status=200, raises=requests.exceptions.JSONDecodeError('x', 'y', 0)), ] monkeypatch.setattr(updater.helpers.s, 'get', lambda url, headers, timeout: responses.pop(0)) repo = FakeRepo(name='x') - assert updater._get_commit_activity(repo, {'Authorization': 'x'}) is None - assert updater._get_commit_activity(repo, {'Authorization': 'x'}) == [] - assert updater._get_commit_activity(repo, {'Authorization': 'x'}) == [{'week': 1, 'total': 2}] + assert updater._get_commit_activity(repo, {'Authorization': 'x'}) == (None, 202) + assert updater._get_commit_activity(repo, {'Authorization': 'x'}) == (None, 204) + assert updater._get_commit_activity(repo, {'Authorization': 'x'}) == ([{'week': 1, 'total': 2}], 200) warnings = [] monkeypatch.setattr(updater.log, 'warning', lambda msg: warnings.append(msg)) - assert updater._get_commit_activity(repo, {'Authorization': 'x'}) == [] - assert updater._get_commit_activity(repo, {'Authorization': 'x'}) == [] + assert updater._get_commit_activity(repo, {'Authorization': 'x'}) == (None, 500) + assert updater._get_commit_activity(repo, {'Authorization': 'x'}) == ([], 404) + assert updater._get_commit_activity(repo, {'Authorization': 'x'}) == ([], 'parse_error') def raise_timeout(url, headers, timeout): raise requests.exceptions.Timeout('boom') monkeypatch.setattr(updater.helpers.s, 'get', raise_timeout) - assert updater._get_commit_activity(repo, {'Authorization': 'x'}) is None - assert len(warnings) == 3 + assert updater._get_commit_activity(repo, {'Authorization': 'x'}) == (None, 'error') + assert len(warnings) == 4 + + +def test_build_commit_activity_from_commits(monkeypatch): + fixed_today = datetime(2026, 3, 20, tzinfo=timezone.utc) + + class FixedDatetime(datetime): + @classmethod + def now(cls, tz=None): + return fixed_today + + class FakeCommit: + def __init__(self, author_date, committer_date=None, parents=None): + self.commit = SimpleNamespace( + author=SimpleNamespace(date=author_date), + committer=SimpleNamespace(date=committer_date), + ) + self.parents = parents or [object()] + + class FakeCommitRepo(FakeRepo): + def get_commits(self, since): + assert since == datetime(2025, 3, 23, tzinfo=timezone.utc) + return [ + FakeCommit(datetime(2026, 3, 15, tzinfo=timezone.utc)), + FakeCommit(datetime(2026, 3, 18)), + FakeCommit(None, datetime(2026, 3, 19, tzinfo=timezone.utc)), + FakeCommit(datetime(2026, 3, 19, tzinfo=timezone.utc), parents=[object(), object()]), + FakeCommit(None), + FakeCommit(datetime(2025, 3, 22, tzinfo=timezone.utc)), + ] + + monkeypatch.setattr(updater, 'datetime', FixedDatetime) + + activity = updater._build_commit_activity_from_commits(FakeCommitRepo(name='x')) + assert len(activity) == 52 + assert activity[0] == {'days': [0, 0, 0, 0, 0, 0, 0], 'total': 0, 'week': 1742688000} + assert activity[-1] == {'days': [1, 0, 0, 1, 1, 0, 0], 'total': 3, 'week': 1773532800} + + +def test_build_commit_activity_from_commits_warning(monkeypatch): + warnings = [] + monkeypatch.setattr(updater.log, 'warning', lambda msg: warnings.append(msg)) + + class BrokenRepo(FakeRepo): + def get_commits(self, since): + raise GithubException(status=409, data={'message': 'empty'}) + + assert updater._build_commit_activity_from_commits(BrokenRepo(name='empty')) == [] + assert warnings def test_write_commit_activity(monkeypatch): @@ -316,16 +366,22 @@ def test_update_commit_activity(monkeypatch): def fake_get_commit_activity(repo, headers): calls.append(repo.name) if repo.name == 'ready': - return [{'week': 1, 'total': 2}] - return None + return [{'week': 1, 'total': 2}], 200 + return None, 202 writes = [] + fallback = [] sleeps = [] warnings = [] times = iter([0, 0, 0, 2]) monkeypatch.setattr(updater, '_get_commit_activity', fake_get_commit_activity) monkeypatch.setattr(updater, '_write_commit_activity', lambda repo, data: writes.append((repo.name, data))) + monkeypatch.setattr( + updater, + '_build_commit_activity_from_commits', + lambda repo: fallback.append(repo.name) or [{'week': 2, 'total': 3}], + ) monkeypatch.setattr(updater.time, 'monotonic', lambda: next(times)) monkeypatch.setattr(updater.time, 'sleep', lambda seconds: sleeps.append(seconds)) monkeypatch.setattr(updater.log, 'warning', lambda msg: warnings.append(msg)) @@ -333,7 +389,8 @@ def fake_get_commit_activity(repo, headers): updater._update_commit_activity(repos, {'Authorization': 'x'}, max_wait=1, poll_interval=10) assert calls == ['ready', 'pending'] - assert writes == [('ready', [{'week': 1, 'total': 2}])] + assert writes == [('ready', [{'week': 1, 'total': 2}]), ('pending', [{'week': 2, 'total': 3}])] + assert fallback == ['pending'] assert sleeps == [1] assert warnings