In [15]:
# Install dependencies
!apt-get update > /dev/null
!apt install chromium-chromedriver > /dev/null
!pip install selenium > /dev/null

import sys
sys.path.insert(0, '/usr/lib/chromium-browser/chromedriver')

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time
import xml.etree.ElementTree as ET

# Configure Chrome to run without a visible window, then start the driver.
chrome_options = Options()
for switch in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
    chrome_options.add_argument(switch)

driver = webdriver.Chrome(options=chrome_options)

# Account whose public repository listing is scraped.
username = "Jantsagdorj"
# Build the listing URL from `username` so the account is configured in one place
# (previously the name was hard-coded a second time inside the URL).
base_url = f"https://github.com/{username}?tab=repositories"

driver.get(base_url)
time.sleep(3)  # fixed pause to give the page time to render

# Scroll to the bottom in case further entries are rendered lazily.
# NOTE(review): if the repositories tab is paginated rather than infinite-scroll,
# only the first page of results is captured here - confirm for accounts with
# many repositories.
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(2)

# Parse the rendered listing and collect the anchor of every repository entry.
soup = BeautifulSoup(driver.page_source, "html.parser")
repo_links = soup.select('li[itemprop="owns"] a[itemprop="name codeRepository"]')

# One dict per repository: url, about, last_updated, languages, commits.
repos_data = []

# try/finally guarantees the browser process is shut down even when a page
# load or a parsing step raises part-way through the crawl.
try:
    for a in repo_links:
        repo_name = a.text.strip()
        repo_url = "https://github.com" + a['href']

        driver.get(repo_url)
        time.sleep(2)  # fixed pause to give the page time to render

        # Read the page source once so the emptiness check and the parsed
        # tree are guaranteed to describe the same snapshot of the page.
        html = driver.page_source
        page = BeautifulSoup(html, "html.parser")
        is_empty = "This repository is empty" in html

        # About: use the description paragraph when present; otherwise fall
        # back to the repository name, or None for an empty repository.
        about_tag = page.select_one("p.f4.my-3")
        about = about_tag.text.strip() if about_tag else (None if is_empty else repo_name)

        # Last updated: first <relative-time> element on the page. `.get`
        # yields None instead of raising if the attribute is missing.
        last_updated_tag = page.select_one("relative-time")
        last_updated = last_updated_tag.get('datetime') if last_updated_tag else None

        # Languages and commits are only looked up for non-empty repositories.
        languages = None
        commits = None
        if not is_empty:
            # Languages: primary selector first, then a fallback selector
            # when the first one matches nothing.
            languages = [
                lang.text.strip()
                for lang in page.select("li.d-inline span.color-fg-default.text-bold.mr-1")
            ]
            if not languages:
                for lang in page.select("li a[href*='/languages']"):
                    lang_name = lang.select_one("span")
                    if lang_name:
                        languages.append(lang_name.text.strip())
            # An empty list is normalised to None.
            languages = languages or None

            # Commits: try the commits link first, then the alternative layout.
            commits_tag = (
                page.select_one('li a[href$="/commits"] span strong')
                or page.select_one("li span.d-none.d-sm-inline strong")
            )
            if commits_tag:
                # Strip thousands separators, e.g. "1,234" -> "1234".
                commits = commits_tag.text.strip().replace(",", "")

        repos_data.append({
            "url": repo_url,
            "about": about,
            "last_updated": last_updated,
            "languages": languages,
            "commits": commits
        })
finally:
    driver.quit()

# Serialise the collected records into an XML document.
root = ET.Element("repositories")

for repo in repos_data:
    repo_el = ET.SubElement(root, "repository")

    # Scalar fields: a falsy value is written as the literal text "None".
    ET.SubElement(repo_el, "url").text = repo["url"]
    for field in ("about", "last_updated"):
        ET.SubElement(repo_el, field).text = repo[field] or "None"

    # Languages: one <language> child per entry, or the text "None" when absent.
    langs_el = ET.SubElement(repo_el, "languages")
    if not repo["languages"]:
        langs_el.text = "None"
    else:
        for lang in repo["languages"]:
            ET.SubElement(langs_el, "language").text = lang

    ET.SubElement(repo_el, "commits").text = repo["commits"] or "None"

# Write the tree to disk with an XML declaration, encoded as UTF-8.
tree = ET.ElementTree(root)
tree.write("github_repos.xml", encoding="utf-8", xml_declaration=True)

print("done!")


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)


Scraping done!


In [17]:
# Colab-only helper module; this import fails outside Google Colab.
from google.colab import files
# Request a download of the XML file written by the scraping cell.
# NOTE(review): presumably triggers a browser download of the file - confirm.
files.download("github_repos.xml")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>