In [1]:
import requests
from bs4 import BeautifulSoup
import json
import time


In [2]:
def get_model_urls(pages=1, timeout=30):
    """Collect candidate model-page URLs from the Hugging Face model listing.

    Requests ``https://huggingface.co/models?p=<n>`` for ``n`` in ``1..pages``
    and keeps every anchor whose ``href`` is site-relative and splits into
    exactly three parts on ``/`` (i.e. has the shape ``/owner/name``).  The
    filter is purely structural, so any other link of that shape on the page
    is collected as well.

    NOTE(review): pages are requested as ``p=1..pages``; confirm whether the
    site's ``p`` parameter is zero-based, in which case the first listing page
    is never fetched.

    Args:
        pages: Number of listing pages to request.  Values below 1 request
            nothing and yield an empty list.
        timeout: Seconds handed to ``requests.get`` so that a stalled
            connection cannot block the notebook indefinitely.

    Returns:
        Absolute URLs, de-duplicated, in the order they were first seen.

    Raises:
        Exceptions raised by ``requests.get`` itself (connection failures,
        timeouts) are not caught here and propagate to the caller.
    """
    site_root = "https://huggingface.co"
    base_url = f"{site_root}/models"
    urls = []

    for page in range(1, pages + 1):
        res = requests.get(f"{base_url}?p={page}", timeout=timeout)

        if res.ok:
            soup = BeautifulSoup(res.content, "html.parser")
            for a in soup.find_all("a", href=True):
                href = a["href"]
                # "/owner/name".split("/") -> ["", "owner", "name"]
                if href.startswith("/") and len(href.split("/")) == 3:
                    urls.append(site_root + href)
        else:
            # An error page is not a listing; parsing it would only hide the failure.
            print(f"Skipping listing page {page}: HTTP {res.status_code}")

        time.sleep(1)  # pause between listing requests

    # dict.fromkeys drops duplicates while keeping first-seen order, so the
    # result is reproducible across runs (a set would shuffle it).
    return list(dict.fromkeys(urls))

def scrape_model_page(url, timeout=30):
    """Fetch one page and extract ``<pre>`` blocks that look like usage code.

    A ``<pre>`` block is kept when its stripped text contains any of the
    substrings ``"from transformers"``, ``"pipeline"``, ``"AutoModel"`` or
    ``"AutoTokenizer"``.

    Args:
        url: Absolute URL of the page to scrape.
        timeout: Seconds handed to ``requests.get`` so that a stalled
            connection cannot block the whole crawl.

    Returns:
        A dict with keys ``"url"``, ``"title"`` (stripped text of the first
        ``<h1>``, or the last path segment of ``url`` when there is none) and
        ``"code_snippets"`` (list of matching block texts), or ``None`` when
        no block matched or when anything went wrong.  Failures are reported
        with ``print`` instead of being raised, so one bad page does not
        abort a batch.
    """
    keywords = ("from transformers", "pipeline", "AutoModel", "AutoTokenizer")

    try:
        res = requests.get(url, timeout=timeout)
        # Turn HTTP error statuses into exceptions so an error page is
        # reported below rather than parsed as if it were a model card.
        res.raise_for_status()
        soup = BeautifulSoup(res.content, "html.parser")

        heading = soup.find("h1")  # looked up once, reused for the check and the text
        title = heading.text.strip() if heading is not None else url.split("/")[-1]

        code_blocks = []
        for pre in soup.find_all("pre"):
            code = pre.text.strip()
            if any(kw in code for kw in keywords):
                code_blocks.append(code)

        if not code_blocks:
            return None

        return {
            "url": url,
            "title": title,
            "code_snippets": code_blocks
        }
    except Exception as e:
        # Deliberate best-effort: log and move on to the next page.
        print(f"Error scraping {url}: {e}")
        return None

def scrape_huggingface_models(max_pages=1):
    """Crawl the model listing and scrape every discovered page in turn.

    Discovers URLs with ``get_model_urls(pages=max_pages)``, scrapes each one
    with ``scrape_model_page``, and pauses 1.2 seconds after every page.
    Progress is reported with ``print``.

    Args:
        max_pages: Number of listing pages passed on to ``get_model_urls``.

    Returns:
        List of the truthy results returned by ``scrape_model_page``; pages
        that produced ``None`` are left out.
    """
    model_urls = get_model_urls(pages=max_pages)
    print(f"Found {len(model_urls)} model pages.")

    collected = []
    for page_url in model_urls:
        print(f"Scraping: {page_url}")
        record = scrape_model_page(page_url)
        if not record:
            # Nothing usable on this page; still pause before the next request.
            time.sleep(1.2)
            continue
        collected.append(record)
        time.sleep(1.2)

    return collected

def save_to_json(data, filename):
    """Write ``data`` to ``filename`` as JSON indented by two spaces.

    The payload is serialized in memory before the file is opened, so a
    serialization failure leaves any existing file at ``filename`` untouched
    instead of replacing it with an empty or half-written document.

    Args:
        data: JSON-serializable object that supports ``len()``.
        filename: Path of the file to create or overwrite.

    Raises:
        TypeError: If ``data`` contains a value ``json`` cannot serialize.
        OSError: If the file cannot be opened or written.
    """
    serialized = json.dumps(data, indent=2)

    # Explicit encoding keeps the output independent of the platform default.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(serialized)

    print(f"Saved {len(data)} entries to {filename}")


In [3]:
# Crawl two listing pages, then persist whatever snippets were extracted.
LISTING_PAGES = 2
OUTPUT_FILE = "huggingface_models.json"

hf_data = scrape_huggingface_models(max_pages=LISTING_PAGES)
save_to_json(hf_data, OUTPUT_FILE)


Found 61 model pages.
Scraping: https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct
Scraping: https://huggingface.co/google/gemma-3-27b-it
Scraping: https://huggingface.co/black-forest-labs/FLUX.1-schnell
Scraping: https://huggingface.co/google/gemma-3-1b-it
Scraping: https://huggingface.co/Qwen/Qwen3-0.6B
Scraping: https://huggingface.co/ds4sd/SmolDocling-256M-preview
Scraping: https://huggingface.co/tiiuae/Falcon-H1-34B-Instruct
Scraping: https://huggingface.co/BAAI/bge-m3
Scraping: https://huggingface.co/TheDrummer/Valkyrie-49B-v1
Scraping: https://huggingface.co/deepseek-ai/DeepSeek-V3-0324
Scraping: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503
Scraping: https://huggingface.co/a-m-team/AM-Thinking-v1
Scraping: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
Scraping: https://huggingface.co/stabilityai/stable-diffusion-3.5-large
Scraping: https://huggingface.co/BLIP3o/BLIP3o-Model-8B
Scraping: https://huggingface.co/aleksa-codes/flux-ghibsky-ill