# Crawl4AI

In [None]:
# Make GEMINI_API_KEY available for this session without storing the secret in the notebook.
import os
from getpass import getpass

# A key that is already exported in the environment is left untouched; the hidden
# prompt only appears when the variable is missing or empty.
if not os.environ.get("GEMINI_API_KEY"):
    entered_key = getpass("Enter your Gemini API key: ").strip()
    # An empty answer leaves the variable unset, so os.getenv("GEMINI_API_KEY")
    # returns None instead of an unusable empty or placeholder token.
    if entered_key:
        os.environ["GEMINI_API_KEY"] = entered_key

In [1]:
%%capture
!pip install -U crawl4ai
!pip install nest_asyncio

In [2]:
# Show which crawl4ai release the kernel actually imported.
import crawl4ai

# `crawl4ai.__version__` is accessed as an object that itself carries the
# `__version__` string, hence the doubled attribute.
installed_version = crawl4ai.__version__.__version__
print(installed_version)

0.7.4


In [3]:
%%capture
# Run the `crawl4ai-setup` command-line tool in a shell; %%capture hides its
# console output from the notebook.
!crawl4ai-setup

In [4]:
# Run the `crawl4ai-doctor` health check in a shell; its log is left visible as the cell output.
!crawl4ai-doctor

[1;36m[[0m[36mINIT[0m[1;36m][0m[36m...[0m[36m. → Running Crawl4AI health check[0m[36m...[0m[36m [0m
[1;36m[[0m[36mINIT[0m[1;36m][0m[36m...[0m[36m. → Crawl4AI [0m[1;36m0.7[0m[36m.[0m[1;36m4[0m[36m [0m
[1;36m[[0m[36mTEST[0m[1;36m][0m[36m...[0m[36m. ℹ Testing crawling capabilities[0m[36m...[0m[36m [0m
[1;36m[[0m[36mEXPORT[0m[1;36m][0m[36m.. ℹ Exporting media [0m[1;36m([0m[36mPDF/MHTML/screenshot[0m[1;36m)[0m[36m took [0m[1;36m0.[0m[36m86s [0m
[1;32m[[0m[32mFETCH[0m[1;32m][0m[32m...[0m[32m ↓ [0m[4;32mhttps://crawl4ai.com[0m[32m                                               [0m
[32m| [0m[32m✓[0m[32m | ⏱: [0m[1;32m4.[0m[32m31s [0m
[1;32m[[0m[32mSCRAPE[0m[1;32m][0m[32m.. ◆ [0m[4;32mhttps://crawl4ai.com[0m[32m                                               [0m
[32m| [0m[32m✓[0m[32m | ⏱: [0m[1;32m0.[0m[32m11s [0m
[1;32m[[0m[32mCOMPLETE[0m[1;32m][0m[32m ● [0m[4;32mhttps://crawl

In [10]:
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from crawl4ai import CacheMode
from pydantic import BaseModel, Field
# from google.colab import userdata
import os, json, asyncio

class OpenAIModelFee(BaseModel):
    # One pricing record for a single OpenAI model. The JSON schema generated from
    # this class is what gets handed to the LLM extraction strategy.
    # Documented with `#` comments on purpose: pydantic copies a class docstring
    # into the schema's "description", which would change the generated schema.

    # Model identifier as shown on the pricing page.
    model_name: str = Field(
        ...,
        description="Name of the OpenAI model.",
    )
    # Price charged for input tokens, kept as free-form text.
    input_fee: str = Field(
        ...,
        description="Fee for input token for the OpenAI model.",
    )
    # Price charged for output tokens, kept as free-form text.
    output_fee: str = Field(
        ...,
        description="Fee for output token for the OpenAI model.",
    )

async def extract_structured_data_using_llm(provider: str, api_token: str = None, extra_headers: dict = None):
    """Crawl the OpenAI pricing page and extract per-model token fees with an LLM.

    Args:
        provider: Provider/model identifier passed to ``LLMConfig``, e.g.
            ``"gemini/gemini-2.0-flash-001"`` or ``"ollama/llama3.2"``.
        api_token: API key for the provider. May be omitted only for Ollama
            providers (``"ollama"`` or ``"ollama/<model>"``).
        extra_headers: Optional extra HTTP headers for the LLM request. They are
            forwarded through the strategy's ``extra_args`` dict.

    Returns:
        The decoded JSON extraction result (a list of records in the sample run),
        or ``None`` when the example is skipped or the crawl/extraction fails.
        The first five items are also printed.
    """
    print(f"\n--- Extracting Structured Data with {provider} ---")

    # Only Ollama providers may run without a token. The provider string normally
    # carries a model suffix ("ollama/llama3.2"), so match on the prefix; an exact
    # comparison with "ollama" would wrongly skip those calls.
    is_ollama = provider == "ollama" or provider.startswith("ollama/")
    if api_token is None and not is_ollama:
        print(f"API token is required for {provider}. Skipping this example.")
        return None

    # NOTE(review): LLMExtractionStrategy documents an `extra_args` dict for
    # provider-call options; a bare `extra_headers=` keyword is presumably ignored
    # by the strategy, so the headers are passed inside `extra_args` instead.
    extra_args = {"extra_headers": extra_headers} if extra_headers else {}

    # Adjacent literals keep the prompt free of stray quotes, newlines and
    # source-code indentation.
    instruction = (
        "Extract all model names along with fees for input and output tokens. "
        "One extracted model JSON format should look like this: "
        '{"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", '
        '"output_fee": "US$30.00 / 1M tokens"}.'
    )

    config = CrawlerRunConfig(
        word_count_threshold=1,
        extraction_strategy=LLMExtractionStrategy(
            llm_config=LLMConfig(provider=provider, api_token=api_token),
            schema=OpenAIModelFee.model_json_schema(),
            extraction_type="schema",
            instruction=instruction,
            extra_args=extra_args,
        ),
        cache_mode=CacheMode.ENABLED,
    )

    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://openai.com/api/pricing/",
            config=config,
        )

    # A failed crawl leaves no JSON to decode; report it instead of letting
    # json.loads raise on a missing payload.
    if not result.success or not result.extracted_content:
        print(f"Extraction failed for {provider}: {result.error_message}")
        return None

    extracted = json.loads(result.extracted_content)
    print(extracted[:5])
    # Return the full result so callers that assign the awaited value get the data.
    return extracted

# Usage:
# await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY"))
# await extract_structured_data_using_llm("ollama/llama3.2")
output = await extract_structured_data_using_llm("gemini/gemini-2.0-flash-001", os.getenv("GEMINI_API_KEY"))


--- Extracting Structured Data with gemini/gemini-2.0-flash-001 ---


[{'model_name': 'GPT-5', 'input_fee': '$1.250 / 1M tokens', 'output_fee': '$10.000 / 1M tokens', 'error': False}, {'model_name': 'GPT-5 mini', 'input_fee': '$0.250 / 1M tokens', 'output_fee': '$2.000 / 1M tokens', 'error': False}, {'model_name': 'GPT-5 nano', 'input_fee': '$0.050 / 1M tokens', 'output_fee': '$0.400 / 1M tokens', 'error': False}, {'model_name': 'GPT-4.1', 'input_fee': '$3.00 / 1M tokens', 'output_fee': '$12.00 / 1M tokens', 'error': False}, {'model_name': 'GPT-4.1 mini', 'input_fee': '$0.80 / 1M tokens', 'output_fee': '$3.20 / 1M tokens', 'error': False}]
