In [2]:
from dotenv import load_dotenv
import nest_asyncio

# Load variables from a local .env file into the process environment; a later
# cell reads OPENAI_API_KEY via os.getenv.
load_dotenv()
# Patch asyncio so the asyncio.run(...) calls in the cells below can be used
# even though the notebook kernel already has an event loop running.
nest_asyncio.apply()

# First Crawl

In [4]:
import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    """Crawl https://example.com once and print the start of its markdown."""
    async with AsyncWebCrawler() as session:
        page = await session.arun("https://example.com")
        # Only the first 300 characters are shown to keep the output short.
        preview = page.markdown[:300]
        print(preview)

if __name__ == "__main__":
    asyncio.run(main())

# Example Domain
This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission.
[More information...](https://www.iana.org/domains/example)



# Basic Configuration (Light Introduction)

In [5]:
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode

# 1. BrowserConfig: Controls browser behavior (headless or full UI, user agent, JavaScript toggles, etc.).
# 2. CrawlerRunConfig: Controls how each crawl runs (caching, extraction, timeouts, hooking, etc.).

async def main():
    """Crawl https://example.com with explicit browser and run settings.

    The browser is configured headless and the run is configured with
    CacheMode.BYPASS; the page's markdown is printed in full.
    """
    # Use headless=False instead to see the browser.
    browser_settings = BrowserConfig(headless=True)
    run_settings = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)

    async with AsyncWebCrawler(config=browser_settings) as session:
        page = await session.arun(url="https://example.com", config=run_settings)
        print(page.markdown)

if __name__ == "__main__":
    asyncio.run(main())

# Example Domain
This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission.
[More information...](https://www.iana.org/domains/example)



# Generating Markdown Output

In [6]:
# Example: Using a Filter with DefaultMarkdownGenerator

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

md_generator = DefaultMarkdownGenerator(
    content_filter=PruningContentFilter(threshold=0.4, threshold_type="fixed")
)

config = CrawlerRunConfig(
    cache_mode=CacheMode.BYPASS,
    markdown_generator=md_generator
)

async with AsyncWebCrawler() as crawler:
    result = await crawler.arun("https://news.ycombinator.com", config=config)
    print("Raw Markdown length:", len(result.markdown.raw_markdown))
    print("Fit Markdown length:", len(result.markdown.fit_markdown))

Raw Markdown length: 16487
Fit Markdown length: 14015


# Simple Data Extraction (CSS-based)

In [10]:
from crawl4ai import JsonCssExtractionStrategy
from crawl4ai import LLMConfig

# Generate a schema (one-time cost)
# Sample markup handed to the schema generator.
html = "<div class='product'><h2>Gaming Laptop</h2><span class='price'>$999.99</span></div>"

# Using OpenAI (requires API token)
# NOTE(review): no api_token is passed to LLMConfig here; presumably it is
# resolved from the environment (e.g. OPENAI_API_KEY loaded by load_dotenv) — confirm.
schema = JsonCssExtractionStrategy.generate_schema(
    html,
    llm_config = LLMConfig(provider="openai/gpt-4o")  # Required for OpenAI
)

# Or using Ollama (open source, no token needed)
# schema = JsonCssExtractionStrategy.generate_schema(
#     html,
#     llm_config = LLMConfig(provider="ollama/llama3.3", api_token=None)  # Not needed for Ollama
# )

# Use the schema for fast, repeated extractions
# (the generated schema is passed straight to the CSS extraction strategy).
strategy = JsonCssExtractionStrategy(schema)

In [11]:
import asyncio
import json
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai import JsonCssExtractionStrategy

async def main():
    """Extract a title/link record from an inline HTML snippet via a CSS schema.

    The snippet is handed to the crawler through a ``raw://`` URL, and the
    JSON text found in ``extracted_content`` is decoded and printed.
    """
    raw_html = "<div class='item'><h2>Item 1</h2><a href='https://example.com/item1'>Link 1</a></div>"

    # Field definitions referenced by the schema below.
    title_field = {"name": "title", "selector": "h2", "type": "text"}
    link_field = {"name": "link", "selector": "a", "type": "attribute", "attribute": "href"}
    schema = {
        "name": "Example Items",
        "baseSelector": "div.item",
        "fields": [title_field, link_field],
    }

    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        extraction_strategy=JsonCssExtractionStrategy(schema),
    )

    async with AsyncWebCrawler() as session:
        outcome = await session.arun(url="raw://" + raw_html, config=run_config)
        # The JSON output is stored as text in 'extracted_content'.
        print(json.loads(outcome.extracted_content))

if __name__ == "__main__":
    asyncio.run(main())


[{'title': 'Item 1', 'link': 'https://example.com/item1'}]


# Simple Data Extraction (LLM-based)

In [12]:
# - We define a Pydantic schema (OpenAIModelFee) describing the fields we want.
# - The LLM extraction strategy uses that schema and your instructions to transform raw text into structured JSON. 
# - Depending on the provider and api_token, you can use local models or a remote API.


import os
import json
import asyncio
from typing import Dict
from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai import LLMExtractionStrategy

# Pydantic model describing one extracted pricing record: a model name plus
# its input and output token fees, all kept as strings. Its JSON schema is
# passed to LLMExtractionStrategy further down, so it is documented with
# comments rather than a docstring to leave the generated schema unchanged.
# NOTE(review): pydantic v2 reserves the `model_` prefix and may warn about
# the `model_name` field — confirm whether that matters here.
class OpenAIModelFee(BaseModel):
    model_name: str = Field(..., description="Name of the OpenAI model.")
    input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
    output_fee: str = Field(
        ..., description="Fee for output token for the OpenAI model."
    )

async def extract_structured_data_using_llm(
    provider: str, api_token: str = None, extra_headers: Dict[str, str] = None
):
    """Crawl the OpenAI pricing page and print LLM-extracted model fees.

    Args:
        provider: Provider/model string such as "openai/gpt-4o" or
            "ollama/llama3.3".
        api_token: API token for the provider. May be omitted only when the
            provider's backend is Ollama.
        extra_headers: Optional headers; when given they are added to the
            strategy's ``extra_args`` under the ``"extra_headers"`` key.

    Returns:
        None. The extracted content is printed. When a required token is
        missing, a notice is printed and nothing is crawled.
    """
    print(f"\n--- Extracting Structured Data with {provider} ---")

    # Provider strings have the form "<backend>/<model>", so only the backend
    # part is compared. Comparing the whole string to "ollama" would wrongly
    # skip token-less providers such as "ollama/llama3.3".
    backend = provider.split("/", 1)[0]
    if api_token is None and backend != "ollama":
        print(f"API token is required for {provider}. Skipping this example.")
        return

    # Imported locally so the function does not rely on an earlier notebook
    # cell having put these names into the kernel namespace.
    from crawl4ai import BrowserConfig, CacheMode

    browser_config = BrowserConfig(headless=True)

    # Sampling options handed to the extraction strategy.
    extra_args = {"temperature": 0, "top_p": 0.9, "max_tokens": 2000}
    if extra_headers:
        extra_args["extra_headers"] = extra_headers

    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        word_count_threshold=1,
        page_timeout=80000,
        extraction_strategy=LLMExtractionStrategy(
            llm_config = LLMConfig(provider=provider,api_token=api_token),
            # The pydantic model's JSON schema defines the structure to extract.
            schema=OpenAIModelFee.model_json_schema(),
            extraction_type="schema",
            instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. 
            Do not miss any models in the entire content.""",
            extra_args=extra_args,
        ),
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://openai.com/api/pricing/", config=crawler_config
        )
        print(result.extracted_content)

if __name__ == "__main__":
    # The OpenAI token is read from the environment; it is None when unset.
    asyncio.run(
        extract_structured_data_using_llm(
            provider="openai/gpt-4o",
            api_token=os.environ.get("OPENAI_API_KEY"),
        )
    )


--- Extracting Structured Data with openai/gpt-4o ---


[
    {
        "model_name": "GPT-5",
        "input_fee": "US$1.250 / 一百萬個 token",
        "output_fee": "US$10.000 / 一百萬個 token",
        "error": false
    },
    {
        "model_name": "gpt-5-mini",
        "input_fee": "US$0.250 / 一百萬個 token",
        "output_fee": "US$2.000 / 一百萬個 token",
        "error": false
    },
    {
        "model_name": "gpt-5-nano",
        "input_fee": "US$0.050 / 一百萬個 token",
        "output_fee": "US$0.400 / 一百萬個 token",
        "error": false
    },
    {
        "model_name": "GPT-4.1",
        "input_fee": "US$3.00 / 一百萬個 token",
        "output_fee": "US$12.00 / 一百萬個 token",
        "error": false
    },
    {
        "model_name": "GPT-4.1 mini",
        "input_fee": "US$0.80 / 一百萬個 token",
        "output_fee": "US$3.20 / 一百萬個 token",
        "error": false
    },
    {
        "model_name": "GPT-4.1 nano",
        "input_fee": "US$0.20 / 一百萬個 token",
        "output_fee": "US$0.80 / 一百萬個 token",
        "error": false
    },
    {
        "m

# Adaptive Crawling

In [13]:
import asyncio
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler

async def adaptive_example():
    """Run an adaptive crawl of the Python docs for one query and report on it.

    Prints the adaptive crawler's stats, the number of crawled URLs, and the
    confidence value it reports.
    """
    start_url = "https://docs.python.org/3/"
    query = "async context managers"

    async with AsyncWebCrawler() as web_crawler:
        adaptive = AdaptiveCrawler(web_crawler)

        # Kick off the adaptive crawl for the query.
        result = await adaptive.digest(start_url=start_url, query=query)

        # Report the outcome.
        adaptive.print_stats()
        print(f"Crawled {len(result.crawled_urls)} pages")
        print(f"Achieved {adaptive.confidence:.0%} confidence")

if __name__ == "__main__":
    asyncio.run(adaptive_example())

Crawled 5 pages
Achieved 53% confidence


# Multi-URL Concurrency

In [3]:
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode

async def quick_parallel_example():
    """Crawl three URLs with arun_many, first streaming and then as a batch.

    Each result is reported on one line: "[OK]" with the raw markdown length
    on success, "[ERROR]" with the error message otherwise.
    """
    urls = [
        "https://example.com/page1",
        "https://example.com/page2",
        "https://example.com/page3"
    ]

    def report(result):
        """Print the one-line status for a single crawl result."""
        if not result.success:
            print(f"[ERROR] {result.url} => {result.error_message}")
            return
        print(f"[OK] {result.url}, length: {len(result.markdown.raw_markdown)}")

    # Streaming mode enabled for the first pass.
    streaming_conf = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, stream=True)

    async with AsyncWebCrawler() as crawler:
        # With stream=True the awaited call is consumed with `async for`.
        stream = await crawler.arun_many(urls, config=streaming_conf)
        async for result in stream:
            report(result)

        # Second pass: same config cloned with stream=False, results iterated
        # after the call returns.
        batch_conf = streaming_conf.clone(stream=False)
        for res in await crawler.arun_many(urls, config=batch_conf):
            report(res)

if __name__ == "__main__":
    asyncio.run(quick_parallel_example())

[OK] https://example.com/page1, length: 230


[OK] https://example.com/page2, length: 230
[OK] https://example.com/page3, length: 230


[OK] https://example.com/page2, length: 230
[OK] https://example.com/page3, length: 230
[OK] https://example.com/page1, length: 230
