In [1]:
import requests
from bs4 import BeautifulSoup
import time

base_url = "https://github.com/google?tab=repositories&page="
headers = {"User-Agent": "Mozilla/5.0"}

repos_data = []

for page in range(1, 6): 
    print(f"Fetching page {page}...")

    response = requests.get(base_url + str(page), headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")

    repo_list = soup.find_all("li", class_="Box-row")

    if not repo_list:
        break

    for repo in repo_list:
        name_tag = repo.find("a", itemprop="name codeRepository")
        lang_tag = repo.find("span", itemprop="programmingLanguage")
        star_tag = repo.find("a", href=lambda x: x and x.endswith("stargazers"))

        name = name_tag.text.strip() if name_tag else None
        language = lang_tag.text.strip() if lang_tag else None
        stars = star_tag.text.strip() if star_tag else "0"

        # k対応
        if "k" in stars:
            stars = int(float(stars.replace("k", "")) * 1000)
        else:
            stars = int(stars.replace(",", ""))

        repos_data.append((name, language, stars))

        time.sleep(1)

print("Total repos:", len(repos_data))
repos_data[:3]

Fetching page 1...
Fetching page 2...
Fetching page 3...
Fetching page 4...
Fetching page 5...
Total repos: 50


[('oss-fuzz', 'Shell', 11663),
 ('jetpack-camera-app', 'Kotlin', 277),
 ('eclipsa-audio-plugin', 'C++', 56)]

In [2]:
import sqlite3

# DBに接続
conn = sqlite3.connect("google_repos.db")
cur = conn.cursor()

# テーブル作成
cur.execute("""
CREATE TABLE IF NOT EXISTS repos (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    name TEXT,
    language TEXT,
    stars INTEGER
)
""")

# 既存データを消したい場合
cur.execute("DELETE FROM repos")

# repos_data の内容を DB に保存
cur.executemany("""
INSERT INTO repos (name, language, stars)
VALUES (?, ?, ?)
""", repos_data)

conn.commit()
conn.close()

print("データをDBに保存しました！")

データをDBに保存しました！


In [3]:
import sqlite3

conn = sqlite3.connect("google_repos.db")
cur = conn.cursor()

cur.execute("SELECT * FROM repos")
rows = cur.fetchall()

for row in rows:
    print(row)

conn.close()

('oss-fuzz', 'Shell', 11663)
('jetpack-camera-app', 'Kotlin', 277)
('eclipsa-audio-plugin', 'C++', 56)
('orbax', 'Python', 456)
('adk-go', 'Go', 5090)
('crubit', 'C++', 924)
('meridian', 'Python', 1190)
('aarch64-esr-decoder', 'Rust', 99)
('osv-scanner', 'Go', 8091)
('dawn', 'C++', 783)
('oss-fuzz', 'Shell', 11663)
('jetpack-camera-app', 'Kotlin', 277)
('eclipsa-audio-plugin', 'C++', 56)
('orbax', 'Python', 456)
('adk-go', 'Go', 5090)
('crubit', 'C++', 924)
('meridian', 'Python', 1190)
('aarch64-esr-decoder', 'Rust', 99)
('osv-scanner', 'Go', 8091)
('dawn', 'C++', 783)
('osv.dev', 'Python', 2388)
('oss-fuzz', 'Shell', 11663)
('jetpack-camera-app', 'Kotlin', 277)
('eclipsa-audio-plugin', 'C++', 56)
('orbax', 'Python', 456)
('adk-go', 'Go', 5090)
('crubit', 'C++', 924)
('meridian', 'Python', 1190)
('aarch64-esr-decoder', 'Rust', 99)
('osv-scanner', 'Go', 8091)
('osv.dev', 'Python', 2388)
('oss-fuzz', 'Shell', 11663)
('jetpack-camera-app', 'Kotlin', 277)
('eclipsa-audio-plugin', 'C++', 56