In [1]:
import dotenv
# Load variables from a local .env file into the process environment.
# Presumably this is where OPENAI_API_KEY (read via os.getenv in a later cell)
# comes from — TODO confirm the .env file actually defines it.
dotenv.load_dotenv()


True

In [2]:
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import LLMExtractionStrategy
import os
import json
import asyncio
from pydantic import BaseModel, Field

# Record shape requested from the LLM; its JSON schema is passed to the
# extraction strategy below. Documented with `#` comments on purpose: a class
# docstring on a pydantic model is emitted into the generated JSON schema.
class Competitor(BaseModel):
    # Display name of the company/product.
    name: str = Field(
        ...,
        description="Name of the competitor",
    )
    # True when the product overlaps with the user's idea.
    relevant: bool = Field(
        ...,
        description="Whether the competitor is relevant to the user's idea",
    )
    # Homepage of the product (declared as a required field).
    url: str = Field(
        ...,
        description="Homepage url of the competitor",
    )

async def extract_tech_content():
    """Crawl a Reddit post and print the competitor records extracted by the LLM.

    Opens an ``AsyncWebCrawler``, fetches the hard-coded Reddit thread with the
    cache bypassed, and runs an ``LLMExtractionStrategy`` (``openai/gpt-4o``,
    token read from the ``OPENAI_API_KEY`` environment variable) constrained
    to ``Competitor.model_json_schema()``. The parsed JSON is printed; nothing
    is returned.

    Raises:
        RuntimeError: if the crawl reports failure or yields no extracted
            content (previously this surfaced as an opaque ``TypeError``
            from ``json.loads(None)``).
        json.JSONDecodeError: if the extracted content is not valid JSON.
    """
    target_url = "https://www.reddit.com/r/EventProduction/comments/1d93f6k/nowadays_ai_tool_for_event_planning/"

    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url=target_url,
            extraction_strategy=LLMExtractionStrategy(
                provider="openai/gpt-4o",
                api_token=os.getenv('OPENAI_API_KEY'),
                extraction_type="schema",
                schema=Competitor.model_json_schema(),
                instruction="We are a competitor analysis pipeline for a user's idea. The following content is a new reddit launch post, extract the company/product name, whether it is relevant to the user's idea or not, and if it is, extract its homepage url. User idea is an AI tool for event planning."
            ),
            bypass_cache=True,
        )

    # Fail loudly with context instead of letting json.loads choke on None.
    if not result.success:
        reason = getattr(result, "error_message", None)
        raise RuntimeError(f"Crawl failed for {target_url}: {reason}")
    if not result.extracted_content:
        raise RuntimeError(f"No content was extracted from {target_url}")

    extracted_items = json.loads(result.extracted_content)
    print(f"Number of tech-related items extracted: {len(extracted_items)}")
    print(extracted_items)
# Top-level `await`: relies on the notebook kernel running cells inside an
# event loop, so no asyncio.run() wrapper is used here.
await extract_tech_content()

[LOG] 🌤️  Warming up the AsyncWebCrawler
[LOG] 🌞 AsyncWebCrawler is ready to crawl
[LOG] 🕸️ Crawling https://www.reddit.com/r/EventProduction/comments/1d93f6k/nowadays_ai_tool_for_event_planning/ using AsyncPlaywrightCrawlerStrategy...
[LOG] ✅ Crawled https://www.reddit.com/r/EventProduction/comments/1d93f6k/nowadays_ai_tool_for_event_planning/ successfully!
[LOG] 🚀 Crawling done for https://www.reddit.com/r/EventProduction/comments/1d93f6k/nowadays_ai_tool_for_event_planning/, success: True, time taken: 1.66 seconds
[LOG] 🚀 Content extracted for https://www.reddit.com/r/EventProduction/comments/1d93f6k/nowadays_ai_tool_for_event_planning/, success: True, time taken: 0.22 seconds
[LOG] 🔥 Extracting semantic blocks for https://www.reddit.com/r/EventProduction/comments/1d93f6k/nowadays_ai_tool_for_event_planning/, Strategy: AsyncWebCrawler
[LOG] Call LLM for https://www.reddit.com/r/EventProduction/comments/1d93f6k/nowadays_ai_tool_for_event_planning/ - block index: 0
[LOG] Extracted 1 b

  Expected `PromptTokensDetails` but got `dict` with value `{'audio_tokens': None, 'cached_tokens': 0}` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(


In [6]:
# NOTE(review): `extract_openai_pricing` is not defined in any visible cell, and
# this cell's execution count (6) is out of sequence with its neighbours — on a
# fresh kernel this likely raises NameError. TODO restore or remove the cell.
await extract_openai_pricing()

[LOG] 🚀 Crawling done for https://openai.com/api/pricing/, success: True, time taken: 1.47 seconds
[LOG] 🚀 Content extracted for https://openai.com/api/pricing/, success: True, time taken: 0.01 seconds
[LOG] 🔥 Extracting semantic blocks for https://openai.com/api/pricing/, Strategy: AsyncWebCrawler
[LOG] Call LLM for https://openai.com/api/pricing/ - block index: 0
[LOG] Extracted 1 blocks from URL: https://openai.com/api/pricing/ block index: 0
[LOG] 🚀 Extraction done for https://openai.com/api/pricing/, time taken: 0.93 seconds.
[
    {
        "index": 0,
        "tags": [
            "error message"
        ],
        "content": [
            "Application error: a client-side exception has occurred (see the browser console for more information)."
        ],
        "error": false
    }
]


In [4]:
from app.engine.tools.web_reader import read_webpage
from pydantic import BaseModel, Field
# Record shape requested from the LLM; its JSON schema is passed to
# read_webpage below. NOTE(review): this repeats the Competitor model declared
# in an earlier cell field-for-field. `#` comments are used on purpose: a class
# docstring on a pydantic model is emitted into the generated JSON schema.
class Competitor(BaseModel):
    # Display name of the company/product.
    name: str = Field(
        ...,
        description="Name of the competitor",
    )
    # True when the product overlaps with the user's idea.
    relevant: bool = Field(
        ...,
        description="Whether the competitor is relevant to the user's idea",
    )
    # Homepage of the product (declared as a required field).
    url: str = Field(
        ...,
        description="Homepage url of the competitor",
    )
await read_webpage("https://www.ycombinator.com/launches/J7I-nowadays-ai-copilot-for-event-planning", schema=Competitor.model_json_schema(), instruction="We are a competitor analysis pipeline for a user's idea. The following content is a new reddit launch post, extract the company/product name, whether it is relevant to the user's idea or not, and if it is, extract its homepage url. User idea is an AI tool for event planning.")


[LOG] 🌤️  Warming up the AsyncWebCrawler
[LOG] 🌞 AsyncWebCrawler is ready to crawl
[LOG] 🕸️ Crawling https://www.ycombinator.com/launches/J7I-nowadays-ai-copilot-for-event-planning using AsyncPlaywrightCrawlerStrategy...
[LOG] ✅ Crawled https://www.ycombinator.com/launches/J7I-nowadays-ai-copilot-for-event-planning successfully!
[LOG] 🚀 Crawling done for https://www.ycombinator.com/launches/J7I-nowadays-ai-copilot-for-event-planning, success: True, time taken: 1.69 seconds
[LOG] 🚀 Content extracted for https://www.ycombinator.com/launches/J7I-nowadays-ai-copilot-for-event-planning, success: True, time taken: 0.03 seconds
[LOG] 🔥 Extracting semantic blocks for https://www.ycombinator.com/launches/J7I-nowadays-ai-copilot-for-event-planning, Strategy: AsyncWebCrawler
[LOG] Call LLM for https://www.ycombinator.com/launches/J7I-nowadays-ai-copilot-for-event-planning - block index: 0
[LOG] Extracted 1 blocks from URL: https://www.ycombinator.com/launches/J7I-nowadays-ai-copilot-for-event-pla

  Expected `PromptTokensDetails` but got `dict` with value `{'audio_tokens': None, 'cached_tokens': 0}` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(


WebReaderResult(content=[{'name': 'Nowadays', 'relevant': True, 'url': 'http://getnowadays.com/', 'error': False}], url='https://www.ycombinator.com/launches/J7I-nowadays-ai-copilot-for-event-planning', is_error=False, error_message=None)