In [1]:
import time
import sqlite3
import re
import sys
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
BASE_URL = "https://github.com"
ORG_URL = "https://github.com/orgs/google/repositories"
DB_PATH = "google_repos.db"
# 一般的なブラウザ UA（ブロック回避のため強めのUA）
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9,ja;q=0.8",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
}
def parse_star_count(text):
    """
    '1.2k', '3M', '1,234' のような GitHub のスター表記を整数へ変換。
    """
    text = text.strip().lower()
    if not text:
        return 0
    text = text.replace(",", "")  # "1,234" => "1234"
    m = re.match(r"^([\d\.]+)\s*([km]?)$", text)
    if not m:
        try:
            return int(text)
        except ValueError:
            return 0
    num = float(m.group(1))
    suffix = m.group(2)
    if suffix == "k":
        return int(num * 1_000)
    if suffix == "m":
        return int(num * 1_000_000)
    return int(num)
def get_soup(url):
    """
    URLからHTMLを取得してBeautifulSoupを返す。各取得後にtime.sleep(1)を必ず実施。
    """
    resp = requests.get(url, headers=HEADERS, timeout=30)
    resp.raise_for_status()
    # 取得HTMLがSPA等で内容薄い場合の簡易チェック（必要ならログ）
    html = resp.text
    time.sleep(1)  # polite delay
    return BeautifulSoup(html, "html.parser")
def extract_repos_from_page(soup):
    """
    組織のリポジトリ一覧ページから
    {name, language, stars} の辞書リストを抽出。
    HTML構造変化に備えて複数のフォールバックセレクタを使用。
    """
    repos = []
    # コンテナ（新UIで使われることがある）
    container = soup.select_one('div[data-testid="results-list"]')
    if container:
        cards = container.select('li, div.Box-row')
    else:
        # 旧UI・一般フォールバック
        cards = soup.select('li[itemprop="owns"], div.Box-row, li')
    for card in cards:
        # リポジトリ名の抽出
        name = None
        # 最も確実：org名を含むリンク
        name_link = card.select_one('a[href^="/google/"]')
        if not name_link:
            # フォールバック
            name_link = card.select_one('a[itemprop="name codeRepository"], h3 a, a[data-hovercard-type="repository"]')
        if name_link:
            text = name_link.get_text(strip=True)
            # "google/guava" の場合は末尾要素をリポジトリ名に
            parts = [p for p in text.split('/') if p]
            name = parts[-1] if parts else text
        # 主要言語
        language = None
        lang_el = card.select_one('span[itemprop="programmingLanguage"]')
        if lang_el:
            language = lang_el.get_text(strip=True)
        else:
            # 言語色ドット隣や補助テキストの候補
            lang_candidates = card.select(
                '.f6 .mt-2 span, '
                '.mr-3 .text-bold, '
                'span[data-testid="repo-language-color"] + span, '
                'li.d-inline span'
            )
            for c in lang_candidates:
                t = c.get_text(strip=True)
                # ノイズ除去
                if t and len(t) <= 30 and not any(x in t.lower() for x in ["updated", "star", "fork", "issue"]):
                    language = t
                    break
        # スター数
        stars = 0
        star_link = card.select_one('a[href$="/stargazers"]')
        if star_link:
            stars = parse_star_count(star_link.get_text(strip=True))
        if name:
            repos.append({"name": name, "language": language if language else None, "stars": stars})
    return repos
def find_next_page(soup):
    """
    ページネーションの「次へ」を見つけてURLを返す。
    ない場合はNone。
    """
    next_link = soup.select_one('a.next_page, a[rel="next"]')
    if not next_link:
        # テキストベースのフォールバック
        for a in soup.select('a'):
            if a.get_text(strip=True).lower() in ("next", "older"):
                next_link = a
                break
    if next_link and next_link.get("href"):
        return urljoin(BASE_URL, next_link["href"])
    return None
def init_db(conn):
    cur = conn.cursor()
    cur.execute("""
        CREATE TABLE IF NOT EXISTS repositories (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT NOT NULL,
            language TEXT,
            stars INTEGER NOT NULL,
            UNIQUE(name)
        );
    """)
    # よく使う並び替え・フィルタ用のインデックス（任意）
    cur.execute("CREATE INDEX IF NOT EXISTS idx_repositories_stars ON repositories(stars);")
    cur.execute("CREATE INDEX IF NOT EXISTS idx_repositories_language ON repositories(language);")
    conn.commit()
def save_repos(conn, repos):
    cur = conn.cursor()
    for r in repos:
        cur.execute("""
            INSERT INTO repositories (name, language, stars)
            VALUES (?, ?, ?)
            ON CONFLICT(name) DO UPDATE SET
                language = excluded.language,
                stars    = excluded.stars;
        """, (r["name"], r["language"], r["stars"]))
    conn.commit()
def show_saved(conn):
    cur = conn.cursor()
    cur.execute("""
        SELECT name, language, stars
        FROM repositories
        ORDER BY stars DESC, name ASC;
    """)
    rows = cur.fetchall()
    print("\nSaved repositories (sorted by stars desc):")
    if not rows:
        print("- no rows -")
    for name, language, stars in rows:
        print(f"- {name} | {language if language else '(unknown)'} | {stars} stars")
def main():
    print(f"Target: {ORG_URL}")
    conn = sqlite3.connect(DB_PATH)
    try:
        init_db(conn)
        url = ORG_URL
        total = 0
        page_idx = 1
        while url:
            print(f"\nFetching page {page_idx}: {url}")
            try:
                soup = get_soup(url)
            except requests.HTTPError as e:
                print(f"HTTP error: {e}", file=sys.stderr)
                break
            except requests.RequestException as e:
                print(f"Request error: {e}", file=sys.stderr)
                break
            repos = extract_repos_from_page(soup)
            print(f"Found {len(repos)} repositories on this page.")
            if repos:
                # サンプルを表示
                print("Sample:", repos[:3])
            else:
                # デバッグ用にページタイトルや一部テキストを表示
                title = soup.title.get_text(strip=True) if soup.title else "(no title)"
                print(f"Page title: {title}")
                first_text = soup.get_text(" ", strip=True)[:500]
                print(f"Page snippet: {first_text}")
            save_repos(conn, repos)
            total += len(repos)
            next_url = find_next_page(soup)
            if next_url and next_url != url:
                url = next_url
                page_idx += 1#
            else:
                url = None  # 次ページなしで終了
        print(f"\nTotal repositories processed: {total}")
        show_saved(conn)
    finally:
        conn.close()
if __name__ == "__main__":
    main()









Target: https://github.com/orgs/google/repositories

Fetching page 1: https://github.com/orgs/google/repositories
Found 30 repositories on this page.
Sample: [{'name': 'tunix', 'language': None, 'stars': 1900}, {'name': 'meridian', 'language': None, 'stars': 1200}, {'name': 'skia', 'language': None, 'stars': 10000}]

Fetching page 2: https://github.com#2
Found 0 repositories on this page.
Page title: GitHub · Change is constant. GitHub keeps you ahead. · GitHub
Page snippet: GitHub · Change is constant. GitHub keeps you ahead. · GitHub Skip to content Navigation Menu Toggle navigation Sign in Platform GitHub Copilot Write better code with AI GitHub Spark New Build and deploy intelligent apps GitHub Models New Manage and compare prompts GitHub Advanced Security Find and fix vulnerabilities Actions Automate any workflow Codespaces Instant dev environments Issues Plan and track work Code Review Manage code changes Discussions Collaborate outside of code Code Search Fin

Total repositories