In [1]:
import asyncio
import nest_asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy
import json
import time
from pydantic import BaseModel, Field

nest_asyncio.apply()



In [4]:
async def simple_crawl():
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(url="https://appsumo.com/software/?sort=rating")
        print(f"Basic crawl result: {result.markdown[:500]}")  # Print first 500 characters
await simple_crawl()

[LOG] 🌤️  Warming up the AsyncWebCrawler
[LOG] 🌞 AsyncWebCrawler is ready to crawl
[LOG] 🚀 Content extracted for https://appsumo.com/software/?sort=rating, success: True, time taken: 0.04 seconds
[LOG] 🚀 Extraction done for https://appsumo.com/software/?sort=rating, time taken: 0.04 seconds.
Basic crawl result: Open menu

AppSumo![](https://appsumo2next-cdn.appsumo.com/_next/static/media/as-appsumo-logo-dark.fbc325ee.svg)

Submit search

![Sell on AppSumo](https://appsumo2next-cdn.appsumo.com/_next/static/media/light-bulb-confetti.1be8d096.svg)

Sell on AppSumoEarn money selling digital products

Log in

Cart

  * Software
  * Courses & more
  * New arrivals
  * Ending soon



##### Filters

Close Filters

##### Shop by category:

  * Software
    * Operations
    * Marketing & sales
    * Build it you


In [119]:
import json
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

all_products = []

async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        url = "https://appsumo.com/collections/new/"
        session_id = "appsumo_products_session"

        schema = {
            "name": "Product Extractor",
            "baseSelector": 'div[class*="rounded bg-white md:border md:border-gray-300 md:shadow-md"]',
            "fields": [
                {
                    "name": "name",
                    "selector": "span.overflow-hidden.text-ellipsis.whitespace-nowrap.font-bold",
                    "type": "text",
                    "transform": "strip",
                },
                {
                    "name": "category",
                    "selector": "span.max-md\\:text-xs a",
                    "type": "text",
                    "transform": "strip",
                },
                {
                    "name": "shortDescription",
                    "selector": "div.my-1.line-clamp-3",
                    "type": "text",
                    "transform": "strip",
                },
                {
                    "name": "rating",
                    "selector": "img[alt$='stars']",
                    "type": "attribute",
                    "attribute": "alt",
                    "transform": "lambda x: x.split()[0]",
                },
                {
                    "name": "noOfRatings",
                    "selector": "a[href$='#reviews'] span",
                    "type": "text",
                    "transform": "lambda x: x.split()[0]",
                },
                {
                    "name": "price",
                    "selector": "div.font-medium.md\\:text-2xl span",
                    "type": "text",
                    "transform": "lambda x: x.split('/')[0]",
                },
            ],
        }
        extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)

        # Page 1 with auto loading
        js_code = """
        (async () => {
            window.scrollTo(0, document.body.scrollHeight);
            await new Promise(resolve => setTimeout(resolve, 500));
            window.scrollTo(0, document.body.scrollHeight);
            await new Promise(resolve => setTimeout(resolve, 500));
            let loadMoreButton = null;
            for (let i = 0; i < 5; i++) {
                loadMoreButton = document.querySelector('button.rounded-full.px-12.py-2');
                loadMoreButton.click();
                if (i == 4) {
                    const products = document.querySelectorAll('div[class*="rounded bg-white md:border md:border-gray-300 md:shadow-md"]');
                
                    if (products.length > 70) {
                        throw new Error("Clicked load more 4 tiems. " + "Found " + products.length + " products");
                    }
                }
                await new Promise(resolve => setTimeout(resolve, 500));
                window.scrollTo(0, document.body.scrollHeight);
                await new Promise(resolve => setTimeout(resolve, 500));
            }
        })();
        """

        wait_for = """() => {
            const products = document.querySelectorAll('div[class*="rounded bg-white md:border md:border-gray-300 md:shadow-md"]');
            return products.length > 70;
        }"""

        result = await crawler.arun(
            url=url,
            session_id=session_id,
            css_selector=schema["baseSelector"],
            extraction_strategy=extraction_strategy,
            js_code=js_code,
            # wait_for=wait_for,
            bypass_cache=True,
            headless=True,
            screenshot=True,
        )

        products = json.loads(result.extracted_content)
        all_products.extend(products)
        import base64
        # Save the screenshot to a file
        with open("screenshot.png", "wb") as f:
            f.write(base64.b64decode(result.screenshot))
        await crawler.crawler_strategy.kill_session(session_id)
        print(f"Successfully crawled {len(all_products)}")
        print(json.dumps(all_products, indent=2))
asyncio.run(main())

[LOG] 🌤️  Warming up the AsyncWebCrawler
[LOG] 🌞 AsyncWebCrawler is ready to crawl
[LOG] 🕸️ Crawling https://appsumo.com/software/?sort=recommended using AsyncPlaywrightCrawlerStrategy...
[ERROR] 🚫 Failed to crawl https://appsumo.com/software/?sort=recommended, error: Failed to crawl https://appsumo.com/software/?sort=recommended: Page.evaluate: Error: Clicked load more 4 tiems. Found 80 products
    at eval (eval at evaluate (:234:30), <anonymous>:14:31)
    at async <anonymous>:260:30


TypeError: the JSON object must be str, bytes or bytearray, not NoneType

In [110]:
# get all product names
product_names = [product['name'] for product in all_products]
print(product_names)
len(product_names)


['Ranklytics', 'Pismo', 'SiteBehaviour', 'Butternut AI', 'VideoToPage', 'IKI.AI', 'Octolens', 'Hopper HQ', 'SparkReceipt', 'Kroto', 'Scrab.in', 'Evolup', 'InsertChat', 'Diffshop', 'SurveyNoodle', 'Binder', 'Legitt AI', 'Blastable', 'Signum.AI', 'OneSuite', 'SeoRocket.ai', 'Blue', 'Brizy Cloud', 'AskYourDatabase', 'TidyCal', 'Tarvent', 'Fraud Blocker - Plus Exclusive', 'Zync', 'GenPage', 'Video To Blog', 'Rocky AI', 'Acadle', 'Leadfwd', 'MimicPC', 'Qura', 'RanksPro', 'AppMySite', 'PowerIn', 'Followr', 'Doplac', 'Transcript.LOL', 'SendFox', 'GoZen HyperReach.Ai', 'viewflip', 'Claspo', 'Tapflow', 'mavic', 'AahSheet', 'Emailit', 'ZeroBounce Email Verification', 'KingSumo', 'Taja', 'Reoon Email Verifier', 'Socrates', 'Sociamonials', 'Fox Signals', 'Job Boardly', 'Pin Generator - Automated Pinterest Marketing', 'Lancepilot', 'BreezeDoc', 'Writeseed - AI Content Writer', 'Visitor Tracking - Unlimited Website Tracking', 'Formly - Forms, Surveys & Quizzes', 'Periodix', 'GoBrunch', 'MyMemo', 'Le

80