In [1]:
import crawl4ai
print(crawl4ai.__version__.__version__)

0.6.3


In [2]:
!crawl4ai-doctor

[1;36m[[0m[36mINIT[0m[1;36m][0m[36m...[0m[36m. → Running Crawl4AI health check[0m[36m...[0m[36m [0m
[1;36m[[0m[36mINIT[0m[1;36m][0m[36m...[0m[36m. → Crawl4AI [0m[1;36m0.6[0m[36m.[0m[1;36m3[0m[36m [0m
[1;36m[[0m[36mTEST[0m[1;36m][0m[36m...[0m[36m. ℹ Testing crawling capabilities[0m[36m...[0m[36m [0m
[1;36m[[0m[36mEXPORT[0m[1;36m][0m[36m.. ℹ Exporting media [0m[1;36m([0m[36mPDF/MHTML/screenshot[0m[1;36m)[0m[36m took [0m[1;36m1.[0m[36m72s [0m
[1;32m[[0m[32mFETCH[0m[1;32m][0m[32m...[0m[32m ↓ [0m[4;32mhttps://crawl4ai.com[0m[32m                                               [0m
[32m| [0m[32m✓[0m[32m | ⏱: [0m[1;32m7.[0m[32m28s [0m
[1;32m[[0m[32mSCRAPE[0m[1;32m][0m[32m.. ◆ [0m[4;32mhttps://crawl4ai.com[0m[32m                                               [0m
[32m| [0m[32m✓[0m[32m | ⏱: [0m[1;32m0.[0m[32m12s [0m
[1;32m[[0m[32mCOMPLETE[0m[1;32m][0m[32m ● [0m[4;32mhttps://crawl

# Run the code below to verify that Playwright is installed and working properly.

In [3]:
import asyncio
import nest_asyncio
nest_asyncio.apply()

In [4]:
import asyncio
from playwright.async_api import async_playwright

async def test_browser():
    """Smoke-test Playwright: open example.com in headless Chromium and print the page title."""
    async with async_playwright() as playwright:
        chromium = await playwright.chromium.launch(headless=True)
        tab = await chromium.new_page()
        await tab.goto('https://example.com')
        # Fetch the title first so the print statement stays a plain f-string.
        title = await tab.title()
        print(f'Title: {title}')
        await chromium.close()

asyncio.run(test_browser())

Title: Example Domain


# simple crawling

In [8]:
from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig, CacheMode

async def simple_crawl():
    """Crawl a single page with a default crawler and print a one-line preview of its markdown."""
    # Caching is enabled for this run; CacheMode.BYPASS would mean no cache is used.
    run_config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
    target_url = "https://mushfiqur-rahman-robin.github.io/"

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url=target_url, config=run_config)
        # Keep only the first 500 characters and flatten newlines so the preview fits on one line.
        preview = result.markdown.raw_markdown[:500]
        print(preview.replace("\n", " -- "))

asyncio.run(simple_crawl())

[ Md. Mushfiqur Rahman ](https://Mushfiqur-Rahman-Robin.github.io/) --   * [ About Me ](https://Mushfiqur-Rahman-Robin.github.io/about/ "About Me page") --   * [ Achievements ](https://Mushfiqur-Rahman-Robin.github.io/achievements/ "Achievements page") --   * [ Certifications ](https://Mushfiqur-Rahman-Robin.github.io/certifications/ "Certifications page") --   * [ Contact Me ](https://Mushfiqur-Rahman-Robin.github.io/contact/ "Contact Me page") --   * [ Projects ](https://Mushfiqur-Rahman-Robin.github.io/post


# dynamic content crawling

In [12]:
async def crawl_dynamic_content():
    """Run a "Load More" click script on the NBC News business page and print a markdown preview.

    Optional extension (not enabled here): pass ``wait_for`` to CrawlerRunConfig so the
    result is only returned once a condition is met. It can be a JavaScript predicate:

        wait_for = '''() => {
            return Array.from(document.querySelectorAll('article.tease-card')).length > 10;
        }'''

    or simply a CSS selector:

        wait_for = "article.tease-card:nth-child(10)"
    """
    async with AsyncWebCrawler() as crawler:
        # js_code lets you run custom JavaScript in the context of the web page before
        # scraping. This is especially useful for dynamic content: clicking buttons,
        # filling forms, or waiting for data to load via JavaScript.
        # This snippet clicks the first button whose text contains 'Load More'.
        click_load_more = """
            var btns = document.querySelectorAll('button');
            for (var i = 0; i < btns.length; i++) {
                if (btns[i].textContent.includes('Load More')) {
                    btns[i].click();
                    break;
                }
            }
            """
        run_config = CrawlerRunConfig(
            cache_mode=CacheMode.ENABLED,
            js_code=[click_load_more],
            # wait_for=wait_for,  # see the docstring for the two supported forms
        )
        result = await crawler.arun(url="https://www.nbcnews.com/business", config=run_config)

        # Show the first 500 characters, with newlines flattened for a compact preview.
        preview = result.markdown.raw_markdown[:500]
        print(preview.replace("\n", " -- "))

asyncio.run(crawl_dynamic_content())

IE 11 is not supported. For an optimal experience visit our site on another browser. -- Skip to Content -- [NBC News Logo](https://www.nbcnews.com) -- Sponsored By --   * [ Politics](https://www.nbcnews.com/politics) --   * Local --   * [New York](https://www.nbcnews.com/new-york) --   * [Los Angeles](https://www.nbcnews.com/los-angeles) --   * [Chicago](https://www.nbcnews.com/chicago) --   * [Dallas-Fort Worth](https://www.nbcnews.com/dallas-fort-worth) --   * [Philadelphia](https://www.nbcnews.com/philadelphia) --   * [Washi


# content cleaning

In [14]:
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

async def clean_content():
    """Crawl the Wikipedia "Apple" article, strip boilerplate, and save the filtered markdown.

    Prints the length of the raw markdown and of the filtered ("fit") markdown,
    then writes the fit markdown to ``output/apple_markdown.md``. The output
    directory is created if it does not exist yet.
    """
    # Local import keeps this cell runnable on a fresh kernel.
    import os

    output_path = "output/apple_markdown.md"

    async with AsyncWebCrawler(verbose=True) as crawler:
        config = CrawlerRunConfig(
            cache_mode=CacheMode.ENABLED,
            # Removes HTML sections like <nav>, <footer>, and <aside>
            # (often boilerplate or repeated UI).
            excluded_tags=['nav', 'footer', 'aside'],
            # Automatically removes modal overlays/popups, like cookie consent banners.
            remove_overlay_elements=True,
            markdown_generator=DefaultMarkdownGenerator(
                content_filter=PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0),
                options={
                    "ignore_links": True
                }
            ),
        )
        result = await crawler.arun(
            url="https://en.wikipedia.org/wiki/Apple",
            config=config,
        )

        raw_markdown = result.markdown.raw_markdown or ""
        # Treat a missing fit_markdown as empty so len() and write() below cannot fail.
        # NOTE(review): the recorded run printed "Fit Markdown Length: 0"; this may come
        # from a cached result produced without the content filter - re-run with
        # CacheMode.BYPASS to confirm.
        fit_markdown = result.markdown.fit_markdown or ""

        print(f"Full Markdown Length: {len(raw_markdown)}")
        print(f"Fit Markdown Length: {len(fit_markdown)}")

        # Create output/ first: open(..., "w") does not create missing directories.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        # Explicit UTF-8 so non-ASCII article text is written identically on every platform.
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(fit_markdown)

# This cell:
# - Visits a webpage (the Wikipedia article on Apple 🍎)
# - Cleans it (removes UI elements and overlays)
# - Converts it into clean, minimal markdown
# - Filters out low-value content (using a scoring mechanism)
# - Prints both the full and the filtered markdown lengths and saves the filtered markdown to output/apple_markdown.md


asyncio.run(clean_content())

Full Markdown Length: 92190
Fit Markdown Length: 0


# link analysis

In [21]:
async def link_analysis():
    """Collect the internal and external links found on the NBC News business page.

    Prints how many links of each kind were found, writes all of them to
    ``output/link_analysis_markdown.md`` (creating ``output/`` if needed), and
    prints the first five links of each kind.
    """
    # Local import keeps this cell runnable on a fresh kernel.
    import os

    report_path = "output/link_analysis_markdown.md"

    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            cache_mode=CacheMode.ENABLED,
            exclude_external_links=False,  # If you want to collect & save them
            exclude_social_media_links=True,
            # NOTE(review): the recorded output still lists a www.facebook.com share
            # link among the external links - confirm these exclusions behave as intended.
            exclude_domains=["facebook.com", "twitter.com"]
        )
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            config=config,
        )

        internal_links = result.links.get('internal', [])
        external_links = result.links.get('external', [])

        print(f"Found {len(internal_links)} internal links")
        print(f"Found {len(external_links)} external links")

        # Create output/ first: open(..., "w") does not create missing directories.
        os.makedirs(os.path.dirname(report_path), exist_ok=True)
        sections = (
            ("# Internal Links\n", internal_links),
            ("\n# External Links\n", external_links),
        )
        # Explicit UTF-8 so non-ASCII link text is written identically on every platform.
        with open(report_path, "w", encoding="utf-8") as f:
            for heading, links in sections:
                f.write(heading)
                for link in links:
                    href = link.get('href', '')
                    # Links without visible text would render as nothing in
                    # markdown, so fall back to the URL as the label.
                    label = link.get('text') or href
                    f.write(f"- [{label}]({href})\n")

        for link in internal_links[:5]:
            print(f"Href: {link.get('href', '')}\nText: {link.get('text', '')}\n")

        for link in external_links[:5]:
            print(f"External Href: {link.get('href', '')}\nText: {link.get('text', '')}\n")


asyncio.run(link_analysis())

Found 109 internal links
Found 56 external links
Href: https://www.nbcnews.com
Text: NBC News Logo

Href: https://www.nbcnews.com/politics
Text: Politics

Href: https://www.nbcnews.com/new-york
Text: New York

Href: https://www.nbcnews.com/los-angeles
Text: Los Angeles

Href: https://www.nbcnews.com/chicago
Text: Chicago

External Href: https://www.facebook.com/sharer/sharer.php?u=null&cid=article_share_facebook
Text: 

External Href: https://x.com/intent/post?text=&via=nbcnews&url=null&original_referer=URL&cid=article_share_twitter
Text: 

External Href: mailto:?subject=&body=null
Text: 

External Href: https://www.today.com/
Text: Today

External Href: https://www.msnbc.com/
Text: MSNBC



# media handling

In [40]:
import os
def _render_media_summary(images):
    """Build the markdown summary text (alt text, score, link, inline preview) for `images`."""
    lines = ["# 🖼️ All Images from the Website\n"]

    if not images:
        lines.append("_No images found on the page._")

    for number, img in enumerate(images, start=1):
        image_url = img.get('src', '')
        alt_text = img.get('alt', 'No alt text')
        score = img.get('score', 0)

        lines.append(f"### Image {number}")
        lines.append(f"- **Alt**: {alt_text}")
        lines.append(f"- **Score**: {score}")
        lines.append(f"- **Link**: [{image_url}]({image_url})")
        lines.append(f"![Image {number}]({image_url})\n")

    return "\n".join(lines)


async def media_handling():
    """Crawl the publications page, preview its images, and save an image summary.

    Prints the first five images (URL, alt text, score) and writes a markdown
    summary of every image to ``output/media_summary.md``.
    """
    # Also creates the parent output/ directory that the summary file is written to.
    os.makedirs("output/media", exist_ok=True)

    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            cache_mode=CacheMode.ENABLED,
            exclude_external_images=False,
            screenshot=False,  # No screenshot needed
            image_score_threshold=0.5,
            exclude_internal_links=False
        )
        result = await crawler.arun(
            url="https://mushfiqur-rahman-robin.github.io/publications/",
            config=config,
        )

        # Look the image list up once, tolerating a missing 'images' key, and use
        # the same defaults for the preview as for the saved summary.
        all_images = result.media.get('images', [])

        for img in all_images[:5]:
            src = img.get('src', '')
            alt = img.get('alt', 'No alt text')
            score = img.get('score', 0)
            print(f"Image URL: {src}, Alt: {alt}, Score: {score}")

        # Explicit UTF-8: the summary heading contains an emoji, which fails to
        # encode under some platform-default encodings.
        with open("output/media_summary.md", "w", encoding="utf-8") as md_file:
            md_file.write(_render_media_summary(all_images))

asyncio.run(media_handling())

Image URL: https://Mushfiqur-Rahman-Robin.github.io/images/BIMConference.PNG, Alt: , Score: 2
Image URL: https://Mushfiqur-Rahman-Robin.github.io/images/EICT_presentation_certificate.PNG, Alt: , Score: 1


In [36]:
async def media_handling():
    """Crawl the NBC News business page and print the first five images it reports.

    Note: this definition reuses the name ``media_handling`` and therefore replaces
    any earlier function of that name in the running kernel.
    """
    async with AsyncWebCrawler() as crawler:
        run_config = CrawlerRunConfig(
            cache_mode=CacheMode.ENABLED,
            exclude_external_images=False,
            # screenshot=True  # Set this to True if you want to take a screenshot
        )
        result = await crawler.arun(url="https://www.nbcnews.com/business", config=run_config)

        first_images = result.media['images'][:5]
        for image in first_images:
            src, alt, score = image['src'], image['alt'], image['score']
            print(f"Image URL: {src}, Alt: {alt}, Score: {score}")

asyncio.run(media_handling())

Image URL: https://media-cldnry.s-nbcnews.com/image/upload/t_focal-762x508,f_auto,q_auto:best/rockcms/2025-06/250602-jamie-dimon-se-517p-f32603.jpg, Alt: Jamie Dimon., Score: 6
Image URL: https://media-cldnry.s-nbcnews.com/image/upload/t_focal-762x508,f_auto,q_auto:best/rockcms/2025-05/250514-uber-ch-1315-82ca9e.jpg, Alt: Uber headquarters., Score: 6
Image URL: https://media-cldnry.s-nbcnews.com/image/upload/t_focal-80x80,f_auto,q_auto:best/rockcms/2025-05/250516-Musk-RS-cbcca6.jpg, Alt: Elon Musk, Score: 6
Image URL: https://media-cldnry.s-nbcnews.com/image/upload/t_focal-80x80,f_auto,q_auto:best/rockcms/2025-06/250602-empty-office-stock-se-505p-55a14f.jpg, Alt: Empty table and chair against window at new workplace, Score: 6
Image URL: https://media-cldnry.s-nbcnews.com/image/upload/t_focal-80x80,f_auto,q_auto:best/rockcms/2025-06/250602-Byron-Allen-2019-ac-510p-b3ed94.jpg, Alt: byron allen portrait, Score: 6


# Using Hooks for Custom Workflow