diff --git a/src/updater.py b/src/updater.py index 3cbefedcc..75e6aafc9 100644 --- a/src/updater.py +++ b/src/updater.py @@ -2,8 +2,8 @@ import json import math import os -from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeout -from datetime import datetime, timezone +import time +from datetime import datetime, timedelta, timezone from threading import Thread # lib imports @@ -193,30 +193,153 @@ def update_fb(): helpers.write_json_files(file_path=file_path, data=data) -def _get_stats_with_timeout(repo, timeout=60): +def _prime_commit_activity(repo, headers: dict) -> None: """ - Fetch commit activity for a repo, capping total wait time. + Start GitHub's async commit-activity stats job for a repo. - Parameters - ---------- - repo : - PyGithub Repository object. - timeout : int - Maximum seconds to wait before giving up (GitHub may return 202 while - computing stats, causing PyGithub to retry indefinitely without this guard). + PyGithub waits and retries automatically on 202 responses, which is what + we want for the final fetch. This warm-up request intentionally does not + use PyGithub so all repos can be started before any one repo blocks. + """ + url = f'{repo.url}/stats/commit_activity' + try: + response = helpers.s.get(url=url, headers=headers, timeout=5) + except requests.exceptions.RequestException as e: + log.warning(f'Could not prime commit activity for {repo.name}: {e}') + return - Returns - ------- - list or None - Weekly commit-activity objects, or None on timeout. 
+ if response.status_code not in (200, 202): + log.warning(f'Could not prime commit activity for {repo.name}: HTTP {response.status_code}') + + +def _get_commit_activity(repo, headers: dict) -> tuple[list | None, int | str]: """ - with ThreadPoolExecutor(max_workers=1) as pool: - future = pool.submit(repo.get_stats_commit_activity) - try: - return future.result(timeout=timeout) - except FuturesTimeout: - log.warning(f'Timeout fetching commit activity for {repo.name}, skipping.') - return None + Fetch commit activity for a repo. + """ + url = f'{repo.url}/stats/commit_activity' + try: + response = helpers.s.get(url=url, headers=headers, timeout=10) + except requests.exceptions.RequestException as e: + log.warning(f'Could not fetch commit activity for {repo.name}: {e}') + return None, 'error' + + if response.status_code in (202, 204): + return None, response.status_code + if response.status_code in (403, 429) or response.status_code >= 500: + log.warning(f'Could not fetch commit activity for {repo.name}: HTTP {response.status_code}') + return None, response.status_code + if response.status_code != 200: + log.warning(f'Could not fetch commit activity for {repo.name}: HTTP {response.status_code}') + return [], response.status_code + + try: + return response.json() or [], response.status_code + except requests.exceptions.JSONDecodeError as e: + log.warning(f'Could not parse commit activity for {repo.name}: {e}') + return [], 'parse_error' + + +def _build_commit_activity_from_commits(repo) -> list: + """ + Build commit activity from the commits API when GitHub stats do not become ready. 
+ """ + today = datetime.now(tz=timezone.utc).replace(hour=0, minute=0, second=0, microsecond=0) + latest_week_start = today - timedelta(days=(today.weekday() + 1) % 7) + first_week_start = latest_week_start - timedelta(weeks=51) + end = latest_week_start + timedelta(days=7) + + commit_activity = [ + { + 'days': [0, 0, 0, 0, 0, 0, 0], + 'total': 0, + 'week': int((first_week_start + timedelta(weeks=i)).timestamp()), + } + for i in range(52) + ] + + try: + commits = repo.get_commits(since=first_week_start) + for commit in commits: + if len(getattr(commit, 'parents', [])) > 1: + continue + + commit_data = getattr(commit, 'commit', None) + author = getattr(commit_data, 'author', None) + committer = getattr(commit_data, 'committer', None) + commit_date = getattr(author, 'date', None) or getattr(committer, 'date', None) + if commit_date is None: + continue + if commit_date.tzinfo is None: + commit_date = commit_date.replace(tzinfo=timezone.utc) + else: + commit_date = commit_date.astimezone(timezone.utc) + + if commit_date < first_week_start or commit_date >= end: + continue + + days_since_start = (commit_date.date() - first_week_start.date()).days + week_index, day_index = divmod(days_since_start, 7) + commit_activity[week_index]['days'][day_index] += 1 + commit_activity[week_index]['total'] += 1 + except GithubException as e: + log.warning(f'Could not build commit activity for {repo.name}: {e}') + return [] + + return commit_activity + + +def _write_commit_activity(repo, commit_activity: list) -> None: + """ + Cache commit activity for a repo. 
+ """ + if commit_activity: + commits = [ + week.raw_data if hasattr(week, 'raw_data') else week + for week in commit_activity + ] + file_path = os.path.join(BASE_DIR, 'github', 'commitActivity', repo.name) + helpers.write_json_files(file_path=file_path, data=commits) + + +def _update_commit_activity(repos: list, headers: dict, max_wait: int = 1200, poll_interval: int = 15) -> None: + """ + Poll commit activity for all repos until ready or timeout. + """ + pending = list(repos) + deadline = time.monotonic() + max_wait + + with tqdm(total=len(pending), desc='Updating GitHub commit activity') as progress: + while pending and time.monotonic() < deadline: + remaining = [] + statuses = {} + for repo in pending: + commit_activity, status = _get_commit_activity(repo, headers) + statuses[status] = statuses.get(status, 0) + 1 + if commit_activity is None: + remaining.append(repo) + continue + + _write_commit_activity(repo, commit_activity) + progress.update(1) + + pending = remaining + if pending: + status_summary = ', '.join( + f'{status}: {count}' for status, count in sorted(statuses.items(), key=lambda item: str(item[0])) + ) + progress.set_postfix_str(f'{len(pending)} pending') + progress.write( + f'Waiting for GitHub commit activity: {len(pending)} repos pending ({status_summary})' + ) + progress.refresh() + sleep_for = min(poll_interval, max(0, deadline - time.monotonic())) + if sleep_for: + time.sleep(sleep_for) + + for repo in pending: + log.warning(f'Timeout fetching commit activity stats for {repo.name}, using commits API fallback.') + commit_activity = _build_commit_activity_from_commits(repo) + _write_commit_activity(repo, commit_activity) def _seed_star_history(repo, total: int, initial_samples: int) -> list[dict]: @@ -446,13 +569,6 @@ def _process_github_repo(repo, headers: dict, graphql_url: str) -> None: file_path = os.path.join(BASE_DIR, 'github', 'languages', repo.name) helpers.write_json_files(file_path=file_path, data=languages) - # commit activity (last 
year, weekly buckets) - commit_activity = _get_stats_with_timeout(repo) - if commit_activity: - commits = [week.raw_data for week in commit_activity] - file_path = os.path.join(BASE_DIR, 'github', 'commitActivity', repo.name) - helpers.write_json_files(file_path=file_path, data=commits) - # open pull requests pulls_data = [] for pr in repo.get_pulls(state='open'): @@ -546,14 +662,22 @@ def update_github(): } graphql_url = 'https://api.github.com/graphql' + active_repos = [repo for repo in repos if not repo.archived] + for repo in tqdm( - iterable=repos, + iterable=active_repos, + desc='Priming GitHub commit activity', + ): + _prime_commit_activity(repo, headers) + + for repo in tqdm( + iterable=active_repos, desc='Updating GitHub data', ): - if repo.archived: - continue _process_github_repo(repo, headers, graphql_url) + _update_commit_activity(active_repos, headers) + def update_patreon(): """ diff --git a/tests/unit/test_updater.py b/tests/unit/test_updater.py index 42759f87a..02b4812b2 100644 --- a/tests/unit/test_updater.py +++ b/tests/unit/test_updater.py @@ -1,6 +1,6 @@ # standard imports import json -from concurrent.futures import TimeoutError as FuturesTimeout +import os from datetime import datetime, timezone from types import SimpleNamespace @@ -25,6 +25,10 @@ def json(self): raise self._raises return self._payload + def raise_for_status(self): + if self.status_code >= 400: + raise requests.exceptions.HTTPError(self.text) + class FakeWeek: def __init__(self, week, total): @@ -67,6 +71,7 @@ def __init__(self, name='repo1', archived=False, stars=4): self.owner = SimpleNamespace(login='owner') self.stargazers_count = stars self.raw_data = {'name': name, 'archived': archived} + self.url = f'https://api.github.com/repos/owner/{name}' def get_languages(self): return {'Python': 100} @@ -229,36 +234,164 @@ def fake_get(url): assert 'paging' not in writes[0][1] -def test_get_stats_with_timeout_success_and_timeout(monkeypatch): - class FutureOk: - def result(self, 
def test_prime_commit_activity(monkeypatch):
    """The warm-up request hits the stats endpoint once and only warns on failures."""
    repo = FakeRepo(name='demo')
    headers = {'Authorization': 'x'}
    calls = []

    def fake_get(url, headers, timeout):
        calls.append((url, headers, timeout))
        return FakeResponse(status=202)

    monkeypatch.setattr(updater.helpers.s, 'get', fake_get)
    updater._prime_commit_activity(repo, headers)

    assert calls == [(f'{repo.url}/stats/commit_activity', headers, 5)]

    warnings = []
    monkeypatch.setattr(updater.log, 'warning', lambda msg: warnings.append(msg))

    # unexpected HTTP status -> one warning
    monkeypatch.setattr(updater.helpers.s, 'get', lambda url, headers, timeout: FakeResponse(status=500))
    updater._prime_commit_activity(repo, headers)

    def raise_timeout(url, headers, timeout):
        raise requests.exceptions.Timeout('boom')

    # transport error -> one more warning, no exception
    monkeypatch.setattr(updater.helpers.s, 'get', raise_timeout)
    updater._prime_commit_activity(repo, headers)

    assert len(warnings) == 2
    assert all('demo' in msg for msg in warnings)


def test_get_commit_activity(monkeypatch):
    """Each response class maps to the documented (data, status) pair."""
    responses = [
        FakeResponse(status=202),
        FakeResponse(status=204),
        FakeResponse([{'week': 1, 'total': 2}], status=200),
        FakeResponse(status=500),
        FakeResponse(status=404),
        FakeResponse(status=200, raises=requests.exceptions.JSONDecodeError('x', 'y', 0)),
    ]
    monkeypatch.setattr(updater.helpers.s, 'get', lambda url, headers, timeout: responses.pop(0))

    repo = FakeRepo(name='x')
    headers = {'Authorization': 'x'}

    # not ready yet / success: no warning expected
    assert updater._get_commit_activity(repo, headers) == (None, 202)
    assert updater._get_commit_activity(repo, headers) == (None, 204)
    assert updater._get_commit_activity(repo, headers) == ([{'week': 1, 'total': 2}], 200)

    warnings = []
    monkeypatch.setattr(updater.log, 'warning', lambda msg: warnings.append(msg))

    # retryable error, final error, unparsable body
    assert updater._get_commit_activity(repo, headers) == (None, 500)
    assert updater._get_commit_activity(repo, headers) == ([], 404)
    assert updater._get_commit_activity(repo, headers) == ([], 'parse_error')

    def raise_timeout(url, headers, timeout):
        raise requests.exceptions.Timeout('boom')

    monkeypatch.setattr(updater.helpers.s, 'get', raise_timeout)
    assert updater._get_commit_activity(repo, headers) == (None, 'error')
    assert len(warnings) == 4


def test_build_commit_activity_from_commits(monkeypatch):
    """Commits are bucketed into 52 Sunday-based weeks; merges and out-of-range commits are ignored."""
    fixed_today = datetime(2026, 3, 20, tzinfo=timezone.utc)  # a Friday

    class FixedDatetime(datetime):
        @classmethod
        def now(cls, tz=None):
            return fixed_today

    class FakeCommit:
        def __init__(self, author_date, committer_date=None, parents=None):
            self.commit = SimpleNamespace(
                author=SimpleNamespace(date=author_date),
                committer=SimpleNamespace(date=committer_date),
            )
            self.parents = parents or [object()]

    class FakeCommitRepo(FakeRepo):
        def get_commits(self, since):
            assert since == datetime(2025, 3, 23, tzinfo=timezone.utc)
            return [
                FakeCommit(datetime(2026, 3, 15, tzinfo=timezone.utc)),  # Sunday of the last week
                FakeCommit(datetime(2026, 3, 18)),  # naive date, treated as UTC
                FakeCommit(None, datetime(2026, 3, 19, tzinfo=timezone.utc)),  # committer date fallback
                FakeCommit(datetime(2026, 3, 19, tzinfo=timezone.utc), parents=[object(), object()]),  # merge
                FakeCommit(None),  # no usable date
                FakeCommit(datetime(2025, 3, 22, tzinfo=timezone.utc)),  # before the first bucket
            ]

    monkeypatch.setattr(updater, 'datetime', FixedDatetime)

    activity = updater._build_commit_activity_from_commits(FakeCommitRepo(name='x'))
    assert len(activity) == 52
    assert activity[0] == {'days': [0, 0, 0, 0, 0, 0, 0], 'total': 0, 'week': 1742688000}
    assert activity[-1] == {'days': [1, 0, 0, 1, 1, 0, 0], 'total': 3, 'week': 1773532800}
    assert sum(week['total'] for week in activity) == 3


def test_build_commit_activity_from_commits_warning(monkeypatch):
    """A GithubException while listing commits yields an empty result and a warning."""
    warnings = []
    monkeypatch.setattr(updater.log, 'warning', lambda msg: warnings.append(msg))

    class BrokenRepo(FakeRepo):
        def get_commits(self, since):
            raise GithubException(status=409, data={'message': 'empty'})

    assert updater._build_commit_activity_from_commits(BrokenRepo(name='empty')) == []
    assert warnings


def test_write_commit_activity(monkeypatch):
    """Weekly objects are unwrapped via raw_data; empty activity writes nothing."""
    monkeypatch.setattr(updater, 'BASE_DIR', 'base')

    week = FakeWeek(1, 2)
    week.raw_data['days'] = [0, 1, 1, 0, 0, 0, 0]
    writes = []
    monkeypatch.setattr(updater.helpers, 'write_json_files', lambda file_path, data: writes.append((file_path, data)))

    updater._write_commit_activity(FakeRepo(name='demo'), [week])

    assert writes == [(
        os.path.join('base', 'github', 'commitActivity', 'demo'),
        [{'days': [0, 1, 1, 0, 0, 0, 0], 'total': 2, 'week': 1}],
    )]

    writes.clear()
    updater._write_commit_activity(FakeRepo(name='demo'), [])
    assert writes == []


def test_update_commit_activity(monkeypatch):
    """Ready repos are written immediately; repos still pending at the deadline use the fallback."""
    repos = [FakeRepo(name='ready'), FakeRepo(name='pending')]
    calls = []

    def fake_get_commit_activity(repo, headers):
        calls.append(repo.name)
        if repo.name == 'ready':
            return [{'week': 1, 'total': 2}], 200
        return None, 202

    writes = []
    fallback = []
    sleeps = []
    warnings = []

    # Fake clock that only advances when the code sleeps. Unlike a fixed list of
    # readings, it does not depend on how many times time.monotonic() is read.
    clock = {'now': 0}

    def fake_sleep(seconds):
        sleeps.append(seconds)
        clock['now'] += seconds

    monkeypatch.setattr(updater, '_get_commit_activity', fake_get_commit_activity)
    monkeypatch.setattr(updater, '_write_commit_activity', lambda repo, data: writes.append((repo.name, data)))
    monkeypatch.setattr(
        updater,
        '_build_commit_activity_from_commits',
        lambda repo: fallback.append(repo.name) or [{'week': 2, 'total': 3}],
    )
    monkeypatch.setattr(updater.time, 'monotonic', lambda: clock['now'])
    monkeypatch.setattr(updater.time, 'sleep', fake_sleep)
    monkeypatch.setattr(updater.log, 'warning', lambda msg: warnings.append(msg))

    updater._update_commit_activity(repos, {'Authorization': 'x'}, max_wait=1, poll_interval=10)

    # one polling round only: the sleep is capped at the 1s deadline, which then expires
    assert calls == ['ready', 'pending']
    assert writes == [('ready', [{'week': 1, 'total': 2}]), ('pending', [{'week': 2, 'total': 3}])]
    assert fallback == ['pending']
    assert sleeps == [1]
    assert warnings
monkeypatch.setattr( + updater, + '_update_commit_activity', + lambda repos, headers: commit_activity.extend(repo.name for repo in repos), + ) monkeypatch.setattr(updater, 'BASE_DIR', 'base') updater.update_github() assert any(path.endswith('github\\repos') or path.endswith('github/repos') for path, _ in writes) + assert primed == ['active'] assert processed == ['active'] + assert commit_activity == ['active'] def test_update_patreon(monkeypatch):