## Crawl4ai
- The product pages on the AXA website contain /pk/ in their URLs (e.g., https://www.axa.de/pk/gesundheit/p/zahnzusatzversicherung).
- The crawler uses this pattern to pick out product pages from the internal links found on the homepage.
### Next:
- Integrate with Agents




In [None]:
import asyncio
from crawl4ai import AsyncWebCrawler
import os

# Shared cap on simultaneous crawls: at most two tasks may hold this
# semaphore at once (raise the value to 3 for more throughput).
SEMAPHORE = asyncio.Semaphore(value=2)

# Function to fetch product page links from the homepage
async def fetch_product_links(limit=200):
    """Collect product-page URLs linked from the AXA homepage.

    Crawls ``https://www.axa.de``, keeps the internal links whose ``href``
    contains ``'/pk/'``, removes duplicates (first occurrence wins, order
    preserved) and returns at most ``limit`` of them.

    Args:
        limit: Maximum number of links to return. ``None`` returns all.

    Returns:
        A list of href strings; empty if the homepage crawl fails or no
        matching link is found.
    """
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://www.axa.de")

    if not result.success:
        print("Failed to crawl: https://www.axa.de")
        return []

    # Tolerate a missing 'internal' bucket or entries without an 'href'
    # instead of raising KeyError.
    internal_links = (result.links or {}).get('internal', [])
    hrefs = (link.get('href') or '' for link in internal_links)

    # dict.fromkeys de-duplicates while keeping the original link order, so
    # the same page is not crawled (and its file overwritten) twice.
    product_links = list(dict.fromkeys(href for href in hrefs if '/pk/' in href))
    return product_links[:limit]

# Function to crawl a page and save it as Markdown
def _markdown_filename(link):
    """Return a filesystem-safe ``<slug>.md`` name derived from *link*."""
    import re
    from urllib.parse import urlsplit

    # Look at the URL path only, so query strings and fragments never leak
    # into the filename, and ignore empty segments so a trailing slash does
    # not collapse every such URL to 'index'.
    segments = [part for part in urlsplit(link).path.split('/') if part]
    slug = segments[-1] if segments else 'index'
    # Replace anything that is not a word character, dot or dash.
    slug = re.sub(r'[^\w.-]+', '_', slug).strip('._') or 'index'
    return f"{slug}.md"


async def crawl_and_save(link, session, output_dir="markdownPages"):
    """Crawl *link* and store the resulting Markdown under *output_dir*.

    Args:
        link: URL of the page to crawl.
        session: An already-started crawler exposing ``arun`` (``main``
            passes an open ``AsyncWebCrawler``). It is reused for the
            request; if ``None``, a temporary crawler is created instead.
        output_dir: Directory for the ``.md`` files; created if missing.

    Returns:
        None. Prints ``Saved: <file>`` on success or ``Failed to crawl:
        <link>`` when the crawl result is unsuccessful.
    """
    async with SEMAPHORE:  # Limit concurrent crawls
        if session is not None:
            # Reuse the caller's crawler rather than launching a new one
            # for every single link.
            result = await session.arun(url=link)
        else:
            async with AsyncWebCrawler() as crawler:
                result = await crawler.arun(url=link)

    if not result.success:
        print(f"Failed to crawl: {link}")
        return

    os.makedirs(output_dir, exist_ok=True)
    filename = os.path.join(output_dir, _markdown_filename(link))
    with open(filename, 'w', encoding='utf-8') as file:
        # A successful result may still carry no markdown; write an empty
        # file rather than raising on ``None``.
        file.write(result.markdown or "")
    print(f"Saved: {filename}")


async def main():
    """Fetch the product links and crawl each one to a Markdown file.

    Up to 200 links are requested from ``fetch_product_links``. One
    ``AsyncWebCrawler`` is opened and handed to every ``crawl_and_save``
    call; the calls are awaited together with ``asyncio.gather``.
    """
    print("Fetching product page links...")
    links = await fetch_product_links(limit=200)
    print(f"Found {len(links)} product page links.")

    if links:  # Do not launch a browser when there is nothing to crawl.
        async with AsyncWebCrawler() as session:
            tasks = [crawl_and_save(link, session) for link in links]
            # Collect exceptions instead of letting the first one abort the
            # batch while sibling tasks are still using the crawler.
            outcomes = await asyncio.gather(*tasks, return_exceptions=True)

        for link, outcome in zip(links, outcomes):
            if isinstance(outcome, BaseException):
                print(f"Failed to crawl: {link} ({outcome!r})")

    print("Crawling and saving completed.")


# Top-level ``await`` is only valid inside a notebook/IPython cell, which
# already runs an event loop. In a plain script use ``asyncio.run(main())``.
await main()
