In [1]:
import asyncio
import nest_asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy
import json
import time
from pydantic import BaseModel, Field

nest_asyncio.apply()



In [2]:
import json

# Load the products from the JSON file
with open('appsumo-products.json', 'r') as file:
    all_products = json.load(file)

In [3]:
test_data = all_products[:5]
test_data

[{'name': 'Ranklytics',
  'category': 'SEO',
  'shortDescription': 'Leverage 200+ ranking signals to generate loads of SEO content that’s ready to rank',
  'rating': '4.62',
  'noOfRatings': '60',
  'price': '$69',
  'url': 'https://appsumo.com/products/ranklytics/#reviews'},
 {'name': 'Pismo',
  'category': 'Productivity',
  'shortDescription': 'An AI assistant that can rewrite, proofread, and translate whatever you’re writing',
  'rating': '4.85',
  'noOfRatings': '84',
  'price': '$49',
  'url': 'https://appsumo.com/products/pismo/#reviews'},
 {'name': 'SiteBehaviour',
  'category': 'Website analytics',
  'shortDescription': 'Simplify campaign tracking with this cookie-less alternative to Google Analytics',
  'rating': '4.7',
  'noOfRatings': '23',
  'price': '$49',
  'url': 'https://appsumo.com/products/sitebehaviour/#reviews'},
 {'name': 'Butternut AI',
  'category': 'Web builders',
  'shortDescription': 'Use this AI platform to build your dream website in seconds with prompts—wit

In [10]:
import os
import json
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from pydantic import BaseModel, Field
from typing import List

# Define the Pydantic model for the Butternut AI product
class ButternutAIInfo(BaseModel):
    name: str = Field(description="Product name")
    description: str = Field(description="A detailed description of the product, extract the contents from the TLDR section and also combine it with the description provided in the product page and with your own description")
    unique_value_props: List[str] = Field(description="List of unique value props. Point out how this product is different from your knowledge of common products in the market")
    key_features: List[str] = Field(description="List of key features. Extract from the Core Features section, extract the actual meaningful features")
    best_for: List[str] = Field(description="Best suited for user categories. Extract from the Best For section")
    integrations: List[str] = Field(description="List of integrations. Extract from the Integrations section")
    alternativeTo: List[str] = Field(description="Alternative products. Extract from the Alternative section")
    summaryOfReviews: str = Field(description="Summary from AskSumo. You should be able to extract it from <div class='bg-asksumo'></div>")

# Scraper function to extract product details
async def extract_butternut_ai_info():
    url = 'https://appsumo.com/products/butternut-ai/'

    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url=url,
            word_count_threshold=1,
            extraction_strategy=LLMExtractionStrategy(
                provider="openai/gpt-4o",  # Replace with your OpenAI model
                api_token=os.getenv('OPENAI_API_KEY'),
                schema=ButternutAIInfo.model_json_schema(),
                extraction_type="schema",
                instruction=(
                    "From the crawled content, extract the product information in the following format: "
                    "name, description, key features (as a list), best for categories (as a list), pricing info, "
                    "integrations (as a list), a summary of AskSumo reviews, and a list of alternative products. "
                    "Ensure that all the data is accurate and reflects the given structure. Example JSON: "
                    '{ "name": "Butternut AI", "description": "A generative AI platform for building websites", '
                    '"key_features": ["AI-driven design", "SEO optimizations", "Custom chatbot"], '
                    '"best_for": ["Small businesses", "Solopreneurs"], "pricing": "$69", '
                    '"integrations": ["Calendly", "Google Analytics", "YouTube"], '
                    '"summaryOfReviews": "Users rate Butternut AI highly for its ease of use.", '
                    '"alternative_to": ["Squarespace", "Wix", "WordPress"] }'
                )
            ),
            bypass_cache=True,
        )

    product_info = json.loads(result.extracted_content)
    print(f"Extracted product information: {product_info}")

    # Save the extracted data
    with open(".data/butternut_ai_info.json", "w", encoding="utf-8") as f:
        json.dump(product_info, f, indent=2)

# Run the asynchronous scraper
asyncio.run(extract_butternut_ai_info())


[LOG] 🌤️  Warming up the AsyncWebCrawler
[LOG] 🌞 AsyncWebCrawler is ready to crawl
[LOG] 🕸️ Crawling https://appsumo.com/products/butternut-ai/ using AsyncPlaywrightCrawlerStrategy...
[LOG] ✅ Crawled https://appsumo.com/products/butternut-ai/ successfully!
[LOG] 🚀 Crawling done for https://appsumo.com/products/butternut-ai/, success: True, time taken: 7.68 seconds
[LOG] 🚀 Content extracted for https://appsumo.com/products/butternut-ai/, success: True, time taken: 0.08 seconds
[LOG] 🔥 Extracting semantic blocks for https://appsumo.com/products/butternut-ai/, Strategy: AsyncWebCrawler
[LOG] Call LLM for https://appsumo.com/products/butternut-ai/ - block index: 3


  Expected `PromptTokensDetails` but got `dict` with value `{'audio_tokens': None, 'cached_tokens': 0}` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(


[LOG] Extracted 1 blocks from URL: https://appsumo.com/products/butternut-ai/ block index: 3
[LOG] Call LLM for https://appsumo.com/products/butternut-ai/ - block index: 4
[LOG] Extracted 1 blocks from URL: https://appsumo.com/products/butternut-ai/ block index: 4
[LOG] Call LLM for https://appsumo.com/products/butternut-ai/ - block index: 7
[LOG] Extracted 1 blocks from URL: https://appsumo.com/products/butternut-ai/ block index: 7
[LOG] 🚀 Extraction done for https://appsumo.com/products/butternut-ai/, time taken: 12.07 seconds.
Extracted product information: [{'name': 'Butternut AI', 'description': 'Butternut AI is a generative AI platform designed to simplify the process of building full-fledged, multi-page websites from a single prompt in seconds. It aims to make website building accessible to everyone, regardless of technical expertise, by providing hyper-personalized websites tailored to business niches and target audiences.', 'unique_value_props': ['Generates websites from a sin

FileNotFoundError: [Errno 2] No such file or directory: '.data/butternut_ai_info.json'

In [34]:
import json
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

async def scrape_appsumo_product_page(url):
    # Define the extraction schema
    schema = {
    "name": "Butternut AI Product Info",
    "baseSelector": "body",
    "fields": [
        {
            "name": "name",
            "selector": "h1.font-header",
            "type": "text",
        },
        {
            "name": "summary",
            "selector": "div.flex.flex-col.rounded.bg-gray-100.px-4",
            "type": "text",
        },
        {
            "name": "description",
            "selector": "#overview .prose",
            "type": "text",
        },
        {
            "name": "key_features",
            "selector": "#pricePlans ul li",
            "type": "list",
            "fields": [
                {
                    "name": "feature",
                    "type": "text"
                }
            ]
        },
        {
            "name": "audience-integrations-alternativeProducts",
            "selector": "div.grid.grid-cols-1.gap-x-2.gap-y-4 ul",
            "type": "nested_list",
            "fields": [
                {
                    "name": "detail",
                    "selector": "ul.mt-2.list-inside.list-disc li",
                    "type": "list",
                    "fields": [
                        {
                            "name": "item",
                            "type": "text"
                        }
                    ]
                },
            ]
        },
        {
            "name": "summaryOfReviews",
            "selector": ".bg-asksumo",
            "type": "text",
        },
    ],
}

    # Create the extraction strategy
    extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
    
    # Use the AsyncWebCrawler with the extraction strategy
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://appsumo.com/products/butternut-ai/",
            extraction_strategy=extraction_strategy,
            bypass_cache=True,
        )

        assert result.success, "Failed to crawl the page"

        # Parse the extracted content
        product_info = json.loads(result.extracted_content)
        print(f"Extracted product information: {json.dumps(product_info, indent=2)}")


    return product_info[0]

asyncio.run(scrape_appsumo_product_page("https://appsumo.com/products/butternut-ai/"))

[LOG] 🌤️  Warming up the AsyncWebCrawler
[LOG] 🌞 AsyncWebCrawler is ready to crawl
[LOG] 🕸️ Crawling https://appsumo.com/products/butternut-ai/ using AsyncPlaywrightCrawlerStrategy...
[LOG] ✅ Crawled https://appsumo.com/products/butternut-ai/ successfully!
[LOG] 🚀 Crawling done for https://appsumo.com/products/butternut-ai/, success: True, time taken: 7.10 seconds
[LOG] 🚀 Content extracted for https://appsumo.com/products/butternut-ai/, success: True, time taken: 0.08 seconds
[LOG] 🔥 Extracting semantic blocks for https://appsumo.com/products/butternut-ai/, Strategy: AsyncWebCrawler
[LOG] 🚀 Extraction done for https://appsumo.com/products/butternut-ai/, time taken: 0.12 seconds.
Extracted product information: [
  {
    "name": "Butternut AI",
    "summary": "Build a stunning, fully-functional website with zero codeAccess features like built-in SEO, tailored content, seamless integrations, advanced customization options, and more",
    "description": "Butternut AI is a generative AI pla

{'name': 'Butternut AI',
 'summary': 'Build a stunning, fully-functional website with zero codeAccess features like built-in SEO, tailored content, seamless integrations, advanced customization options, and more',
 'description': 'Butternut AI is a generative AI platform that can build professional, mobile-ready websites in seconds with simple prompting.Build a website with AIButternut AI gives you everything you need tocreate a website in just seconds—all without writing a line of code.Simply enter text todescribe your business to an advanced AI modelGenerate a fully functional, stunning website with content, images, sections, and layoutsthat align with your business category and target audienceEnter your business name and relevant keywords to generate a full-fledged website in a snap.Get full creative controlEven better, you’ll be able toedit every aspect of your website, including the content, images, components, and layout, to match your creative vision.Add new sections and pages, 

In [46]:
from pydantic import BaseModel, Field
from typing import List

class RefinedProductInfo(BaseModel):
    name: str = Field(description="Name of the product")
    refined_description: str = Field(description="Combined key points from the description, key features, audience, integrations, and alternatives it is replacing")
    potential_usecases: List[str] = Field(description="A few potential use cases where this product would be extremely useful, especially when combined with other technologies")
    ideal_users: List[str] = Field(description="Refined list of ideal customers facing specific issues that will use this product")
    scenarios_where_tool_excels: List[str] = Field(description="Predicted scenarios where this tool stands out and solves specific problems")
    integration_potential: List[str] = Field(description="Predicted ways this tool can effectively integrate with other tools and platforms")
    tags: List[str] = Field(description="Key feature tags to categorize the product")
    
REFINEMENT_PROMPT = """
Analyze the following product information and provide a refined output:

Product Information:
{product_info}

Please provide the following:
1. A refined description that combines key points from the description, key features, audience, integrations, and alternatives it is replacing.
2. Generate 3-5 potential use cases where this product would be extremely useful, especially when combined with other technologies.
3. Further refine the audience and generate 3-5 ideal customer profiles facing specific issues that will use this product.
4. Predict and list 3-5 scenarios where this tool will stand out and solve specific problems.
5. Predict and list 3-5 ways this tool can effectively integrate with other tools and platforms.
6. Add relevant feature tags to categorize the product.

Format your response as a JSON object with the following structure:
{{
    "name": "Product Name",
    "refined_description": "Refined description...",
    "potential_usecases": ["Use case 1", "Use case 2", "Use case 3"],
    "ideal_users": ["Ideal user 1", "Ideal user 2", "Ideal user 3"],
    "scenarios_where_tool_excels": ["Scenario 1", "Scenario 2", "Scenario 3"],
    "integration_potential": ["Integration 1", "Integration 2", "Integration 3"],
    "tags": ["Tag 1", "Tag 2", "Tag 3"],
}}
""" 

In [47]:
from openai import OpenAI
import json
from typing import Dict, Any

client = OpenAI()

async def refine_product_info(product_info: Dict[str, Any]) -> RefinedProductInfo:
    # Ensure your OpenAI API key is set in the environment variables
    del product_info["summaryOfReviews"]

    # Prepare the prompt with the product information
    prompt = REFINEMENT_PROMPT.format(product_info=json.dumps(product_info, indent=2))

    # Make the API call to OpenAI
    response = client.beta.chat.completions.parse(
        model="gpt-4o-mini",  # or whichever model you prefer
        messages=[
            {"role": "system", "content": "You are a helpful AI assistant that analyzes product information and provides refined insights."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.7,
        response_format=RefinedProductInfo,
    )

    return response.choices[0].message.parsed


In [49]:
# productinput:
#  {'name': 'VideoToPage',
#   'category': 'Video',
#   'shortDescription': 'Transcribe video and audio to create text content like tutorials, blog posts, or SOPs',
#   'rating': '4.65',
#   'noOfRatings': '17',
#   'price': '$69',
#   'url': 'https://appsumo.com/products/videotopage/#reviews'}

async def scrape_pipeline(product: dict):
    scraped_product_info = await scrape_appsumo_product_page(product["url"])
    product["reviewsSummary"] = scraped_product_info["summaryOfReviews"]
    product["name"] = scraped_product_info["name"]

    refined_product_info = await refine_product_info(scraped_product_info)
    product.update(refined_product_info.model_dump())

    return product


In [50]:
async def main():
    for product in test_data[:3]:
        result = await scrape_pipeline(product)
        print(result)

asyncio.run(main())

[LOG] 🌤️  Warming up the AsyncWebCrawler
[LOG] 🌞 AsyncWebCrawler is ready to crawl
[LOG] 🕸️ Crawling https://appsumo.com/products/butternut-ai/ using AsyncPlaywrightCrawlerStrategy...
[LOG] ✅ Crawled https://appsumo.com/products/butternut-ai/ successfully!
[LOG] 🚀 Crawling done for https://appsumo.com/products/butternut-ai/, success: True, time taken: 7.62 seconds
[LOG] 🚀 Content extracted for https://appsumo.com/products/butternut-ai/, success: True, time taken: 0.07 seconds
[LOG] 🔥 Extracting semantic blocks for https://appsumo.com/products/butternut-ai/, Strategy: AsyncWebCrawler
[LOG] 🚀 Extraction done for https://appsumo.com/products/butternut-ai/, time taken: 0.12 seconds.
Extracted product information: [
  {
    "name": "Butternut AI",
    "summary": "Build a stunning, fully-functional website with zero codeAccess features like built-in SEO, tailored content, seamless integrations, advanced customization options, and more",
    "description": "Butternut AI is a generative AI pla