# Check if website is being scraped

In [5]:
import time
import threading
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# ==============================================================================
# 1. SETUP THE SELENIUM DRIVER
# ==============================================================================
# We only need one driver for this test script.
driver = None

def initialize_driver():
    """Initializes a single Selenium driver instance."""
    global driver
    if driver is None:
        print("🚀 Initializing Selenium driver...")
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--window-size=1920,1080")
        # Add headers to appear more like a real user
        chrome_options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
        driver = webdriver.Chrome(options=chrome_options)
        print("✅ Driver is ready.")

def close_driver():
    """Closes the driver if it was initialized."""
    global driver
    if driver:
        driver.quit()
        driver = None
        print("✅ Driver has been closed.")

# ==============================================================================
# 2. THE HYBRID SCRAPING FUNCTION
# ==============================================================================
# This is the exact same function from our main pipeline, for an accurate test.

def scrape_website_with_hybrid_approach(url: str) -> str:
    """
    Implements the hybrid scraping strategy. Tries a fast 'requests' scrape first.
    If it fails or returns minimal content, it falls back to Selenium.
    """
    page_text = ""
    MIN_TEXT_LENGTH = 300  # Threshold to decide if the fast scrape was successful

    print(f"\n--- Attempt 1: Fast Scrape on {url} ---")
    try:
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
        response = requests.get(url, headers=headers, timeout=10)
        
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            page_text = soup.get_text(" ", strip=True)
            print(f"  - Fast Scrape got {len(page_text)} characters.")
        else:
            print(f"  - Fast Scrape failed with status code: {response.status_code}")
            
    except requests.RequestException as e:
        print(f"  - Fast Scrape failed with exception: {e}")
        page_text = ""

    if len(page_text) > MIN_TEXT_LENGTH:
        print("✅ Fast Scrape was successful. Using this result.")
        return page_text

    print(f"\n--- Attempt 2: Selenium Fallback on {url} ---")
    try:
        # Ensure the driver is running before we use it
        initialize_driver()
        
        driver.get(url)
        print("  - Page requested with Selenium.")
        time.sleep(3) # Wait for JavaScript to render
        
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        page_text = soup.get_text(" ", strip=True)
        print(f"  - Selenium Scrape got {len(page_text)} characters.")
        
    except Exception as e:
        print(f"❌ Selenium Scrape failed with exception: {e}")
        return ""
    
    return page_text

# ==============================================================================
# 3. INTERACTIVE TEST LOOP
# ==============================================================================

if __name__ == "__main__":
    try:
        while True:
            # Get URL from user
            test_url = input("\nEnter the full website URL to test (or type 'quit' to exit): ").strip()

            if test_url.lower() == 'quit':
                break
            
            if not test_url.startswith('http'):
                print("⚠️ Please enter a full URL, including 'http://' or 'https://'.")
                continue

            # Run the scraper
            scraped_content = scrape_website_with_hybrid_approach(test_url)

            # --- Report Results ---
            print("\n" + "="*20 + " SCRAPE RESULT " + "="*20)
            if scraped_content and len(scraped_content) > 10:
                print(f"✅ SUCCESS! Successfully scraped {len(scraped_content)} characters.")
                print("\n--- Full scraped text: ---")
                print(scraped_content)
            else:
                print("❌ FAILED. The scraper could not retrieve meaningful content from this URL.")
            print("="*55 + "\n")

    finally:
        # Make sure the driver is closed when the loop ends
        close_driver()
        print("\n👋 Exiting test script.")


--- Attempt 1: Fast Scrape on https://hansolo.com/ ---
  - Fast Scrape got 1643 characters.
✅ Fast Scrape was successful. Using this result.

✅ SUCCESS! Successfully scraped 1643 characters.

--- Full scraped text: ---
HANSOLO Building Services™ – Tenant Improvements and Commercial Property Maintenance Services FAQs Portfolio Projects Resources Workorder Contact Home HANSOLO Building Services™ – Los Angeles, California Tenant Improvements, Structural Repairs, Concrete Flatwork, Parking Lot Repair and Commercial Property Maintenance services in the Greater Los Angeles California area and surrounding communities. 24-Hour Emergency Response! 2-hour emergency response within Los Angeles County for established clients. We do not provide residential services. Commercial Building Services and Maintenance in Los Angeles Office Remodeling and Tenant Improvements Demolition Services Framing, Drywall and Plaster Work ADA Restrooms Asphalt Repair and Replacement Asphalt Slurry Seal Coating Interi

# Web crawling with email finding


In [6]:
import time
import threading
import requests
import re
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from urllib.parse import urljoin, urlparse

# ==============================================================================
# 1. SETUP THE SELENIUM DRIVER (Unchanged)
# ==============================================================================
driver = None

def initialize_driver():
    """Initializes a single Selenium driver instance if not already running."""
    global driver
    if driver is None:
        print("🚀 Initializing Selenium driver...")
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--window-size=1920,1080")
        chrome_options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
        driver = webdriver.Chrome(options=chrome_options)
        print("✅ Driver is ready.")

def close_driver():
    """Closes the driver if it was initialized."""
    global driver
    if driver:
        driver.quit()
        driver = None
        print("✅ Driver has been closed.")

# ==============================================================================
# 2. MODIFIED HYBRID SCRAPING FUNCTION (Returns full HTML source)
# ==============================================================================
# This is modified to return the full page source, which we need for
# finding links and emails, not just the visible text.

def get_page_source_hybrid(url: str) -> str:
    """
    Implements the hybrid scraping strategy. Returns the full page source HTML.
    Tries 'requests' first, falls back to Selenium if content is minimal.
    """
    html_source = ""
    MIN_HTML_LENGTH = 500  # Threshold for raw HTML to decide if scrape was successful

    # --- Attempt 1: Fast Scrape (requests) ---
    try:
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
        response = requests.get(url, headers=headers, timeout=10)
        
        if response.status_code == 200:
            html_source = response.text
        else:
            print(f"  - Fast Scrape on {url} failed with status code: {response.status_code}")
            
    except requests.RequestException as e:
        print(f"  - Fast Scrape on {url} failed with exception: {e}")
        html_source = ""

    if len(html_source) > MIN_HTML_LENGTH:
        return html_source

    # --- Attempt 2: Selenium Fallback ---
    try:
        initialize_driver()
        driver.get(url)
        time.sleep(3) # Wait for JavaScript to render
        html_source = driver.page_source
        
    except Exception as e:
        print(f"❌ Selenium Scrape on {url} failed with exception: {e}")
        return ""
    
    return html_source

# ==============================================================================
# 3. THE CRAWLER & EXTRACTOR LOGIC
# ==============================================================================

def crawl_and_extract(start_url: str):
    """
    Crawls a website starting from a given URL, scraping every internal page
    to find emails and links.
    """
    # Use urlparse to get the domain name, which we'll use to stay on the site
    base_netloc = urlparse(start_url).netloc
    
    # --- Data structures to manage the crawl ---
    urls_to_visit = [start_url]  # A queue of pages to crawl
    visited_urls = set()         # A set of pages we've already crawled to avoid loops
    found_emails = set()         # A set to store unique emails found

    print(f"\nCrawling starting from {start_url}")
    print(f"Will only crawl pages on domain: {base_netloc}\n")
    
    while urls_to_visit:
        current_url = urls_to_visit.pop(0) # Get the next URL to visit

        if current_url in visited_urls:
            continue
            
        print(f"🔎 Scraping: {current_url}")
        visited_urls.add(current_url)

        # Get the full HTML of the page using our hybrid function
        html_content = get_page_source_hybrid(current_url)

        if not html_content:
            print(f"  - Skipping page, no content retrieved.")
            continue
        
        # --- 1. Extract Emails from the current page ---
        # A simple but effective regex for finding emails
        email_regex = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
        emails_on_page = re.findall(email_regex, html_content)
        
        if emails_on_page:
            new_emails = set(emails_on_page) - found_emails
            if new_emails:
                print(f"  ✅ Found {len(new_emails)} new email(s): {', '.join(new_emails)}")
                found_emails.update(new_emails)

        # --- 2. Find all links and add internal ones to the queue ---
        soup = BeautifulSoup(html_content, 'html.parser')
        links_on_page = soup.find_all('a', href=True)

        for link_tag in links_on_page:
            href = link_tag['href']
            # Use urljoin to handle relative links (e.g., '/about-us')
            absolute_link = urljoin(current_url, href)
            
            # Parse the link to check its components
            parsed_link = urlparse(absolute_link)

            # Check if the link is on the same domain and is an http/https link
            if parsed_link.netloc == base_netloc and parsed_link.scheme in ['http', 'https']:
                # Clean up fragments (#) and query params (?) for cleaner crawling
                clean_link = parsed_link._replace(query="", fragment="").geturl()
                
                if clean_link not in visited_urls and clean_link not in urls_to_visit:
                    urls_to_visit.append(clean_link)
    
    return found_emails, visited_urls


# ==============================================================================
# 4. INTERACTIVE TEST LOOP
# ==============================================================================

if __name__ == "__main__":
    try:
        while True:
            start_url = input("\nEnter the full website URL to crawl (or 'quit' to exit): ").strip()

            if start_url.lower() == 'quit':
                break
            
            if not start_url.startswith('http'):
                print("⚠️ Please enter a full URL, including 'http://' or 'https://'.")
                continue

            # Run the crawler
            start_time = time.time()
            all_emails, all_crawled_urls = crawl_and_extract(start_url)
            end_time = time.time()

            # --- Report Results ---
            print("\n" + "="*20 + " CRAWL COMPLETE " + "="*20)
            print(f"Crawl finished in {end_time - start_time:.2f} seconds.")
            print(f"Crawled a total of {len(all_crawled_urls)} pages.")
            
            if all_emails:
                print(f"\n✅ SUCCESS! Found {len(all_emails)} unique email address(es):")
                for i, email in enumerate(sorted(list(all_emails)), 1):
                    print(f"  {i}. {email}")
            else:
                print("\n❌ No email addresses were found on this website.")
            print("="*58 + "\n")

    finally:
        # Make sure the driver is closed when the loop ends
        close_driver()
        print("\n👋 Exiting test script.")


Crawling starting from https://hansolo.com/
Will only crawl pages on domain: hansolo.com

🔎 Scraping: https://hansolo.com/
🔎 Scraping: https://hansolo.com/services
🔎 Scraping: https://hansolo.com/faqs
🔎 Scraping: https://hansolo.com/portfolio
🔎 Scraping: https://hansolo.com/projects/
🔎 Scraping: https://hansolo.com/resources
🔎 Scraping: https://hansolo.com/workorder
🔎 Scraping: https://hansolo.com/contact
🔎 Scraping: https://hansolo.com/tenant-improvements
🔎 Scraping: https://hansolo.com/structural-repairs
🔎 Scraping: https://hansolo.com/concrete-flatwork
🔎 Scraping: https://hansolo.com/parking-lot-maintenance
🔎 Scraping: https://hansolo.com/privacy
🔎 Scraping: https://hansolo.com/projects/beach-and-lincoln
🔎 Scraping: https://hansolo.com/projects/hq
🔎 Scraping: https://hansolo.com/projects/kling
🔎 Scraping: https://hansolo.com/projects/safety-net
🔎 Scraping: https://hansolo.com/projects/silo
🔎 Scraping: https://hansolo.com/yard
🔎 Scraping: https://hansolo.com/images/bl/beach-and-linc

In [7]:
import time
import re
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from urllib.parse import urljoin, urlparse

# ==============================================================================
# 1. SETUP THE SELENIUM DRIVER (Unchanged)
# ==============================================================================
# We only need one driver for the entire crawl session.
driver = None

def initialize_driver():
    """Initializes a single Selenium driver instance if not already running."""
    global driver
    if driver is None:
        print("🚀 Initializing Selenium driver...")
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--window-size=1920,1080")
        chrome_options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
        driver = webdriver.Chrome(options=chrome_options)
        print("✅ Driver is ready.")

def close_driver():
    """Closes the driver if it was initialized."""
    global driver
    if driver:
        driver.quit()
        driver = None
        print("✅ Driver has been closed.")

# ==============================================================================
# 2. MODIFIED HYBRID SCRAPING FUNCTION (Returns full HTML source)
# ==============================================================================
# We now return the full HTML source to find links and emails, not just text.

def get_page_source_hybrid(url: str) -> str:
    """
    Implements the hybrid scraping strategy. Returns the full page source HTML.
    Tries 'requests' first, falls back to Selenium if content is minimal.
    """
    html_source = ""
    MIN_HTML_LENGTH = 500  # Threshold for raw HTML to decide if scrape was successful

    # --- Attempt 1: Fast Scrape (requests) ---
    try:
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
        response = requests.get(url, headers=headers, timeout=10)
        
        if response.status_code == 200:
            html_source = response.text
        else:
            # Don't print for every failed page, it clutters the output.
            # print(f"  - Fast Scrape on {url} failed with status code: {response.status_code}")
            pass
            
    except requests.RequestException:
        html_source = ""

    if len(html_source) > MIN_HTML_LENGTH:
        return html_source

    # --- Attempt 2: Selenium Fallback ---
    try:
        initialize_driver()
        driver.get(url)
        # A small dynamic wait can be better than a fixed sleep
        time.sleep(2) 
        html_source = driver.page_source
        
    except Exception as e:
        print(f"  - ❌ Selenium Scrape on {url} failed: {e}")
        return ""
    
    return html_source

# ==============================================================================
# 3. NEW: THE CRAWLER & EXTRACTOR LOGIC
# ==============================================================================

def crawl_and_extract(start_url: str):
    """
    Crawls a website starting from a given URL, scraping every internal page
    to find emails and aggregate all text content.
    """
    base_netloc = urlparse(start_url).netloc
    
    urls_to_visit = [start_url]
    visited_urls = set()
    found_emails = set()
    total_text_scraped = 0

    print(f"\n🕷️  Starting crawl of '{base_netloc}'...")
    
    while urls_to_visit:
        current_url = urls_to_visit.pop(0)

        if current_url in visited_urls:
            continue
            
        print(f"  - Scraping: {current_url}")
        visited_urls.add(current_url)

        # Get the full HTML of the page using our hybrid function
        html_content = get_page_source_hybrid(current_url)

        if not html_content:
            continue
        
        soup = BeautifulSoup(html_content, 'html.parser')
        
        # --- 1. Aggregate total text content ---
        page_text = soup.get_text()
        total_text_scraped += len(page_text)

        # --- 2. Extract Emails from the current page's HTML ---
        email_regex = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
        emails_on_page = re.findall(email_regex, html_content)
        if emails_on_page:
            found_emails.update(emails_on_page)

        # --- 3. Find all links and add internal ones to the queue ---
        for link_tag in soup.find_all('a', href=True):
            absolute_link = urljoin(current_url, link_tag['href'])
            parsed_link = urlparse(absolute_link)

            # Check if the link is on the same domain and is http/https
            if parsed_link.netloc == base_netloc and parsed_link.scheme in ['http', 'https']:
                clean_link = parsed_link._replace(query="", fragment="").geturl()
                
                if clean_link not in visited_urls and clean_link not in urls_to_visit:
                    urls_to_visit.append(clean_link)
    
    return found_emails, visited_urls, total_text_scraped


# ==============================================================================
# 4. UPDATED INTERACTIVE TEST LOOP
# ==============================================================================

if __name__ == "__main__":
    try:
        while True:
            start_url = input("\nEnter the full website URL to crawl (or 'quit' to exit): ").strip()

            if start_url.lower() == 'quit':
                break
            
            if not start_url.startswith('http'):
                print("⚠️ Please enter a full URL, including 'http://' or 'https://'.")
                continue

            # Run the new crawler function
            start_time = time.time()
            all_emails, all_crawled_urls, total_chars = crawl_and_extract(start_url)
            end_time = time.time()

            # --- Report Comprehensive Results ---
            print("\n" + "="*20 + " CRAWL RESULT " + "="*20)
            print(f"Crawl finished in {end_time - start_time:.2f} seconds.")
            
            # Report on scrapability (total content found)
            if total_chars > 100:
                 print(f"✅ SCRAPE SUCCESS: Found {total_chars:,} characters of text across {len(all_crawled_urls)} pages.")
            else:
                 print(f"❌ SCRAPE FAILED: Little to no text content was found. The site may be blocking scrapers or requires complex interaction.")
            
            # Report on emails found
            if all_emails:
                print(f"\n📧 Found {len(all_emails)} unique email address(es):")
                for i, email in enumerate(sorted(list(all_emails)), 1):
                    print(f"  {i}. {email}")
            else:
                print("\n- No email addresses were found during the crawl.")
            print("="*56 + "\n")

    finally:
        # Make sure the driver is closed when the loop ends
        close_driver()
        print("\n👋 Exiting test script.")


🕷️  Starting crawl of 'www.marshalls.com'...
  - Scraping: https://www.marshalls.com/
🚀 Initializing Selenium driver...
✅ Driver is ready.

Crawl finished in 16.52 seconds.
❌ SCRAPE FAILED: Little to no text content was found. The site may be blocking scrapers or requires complex interaction.

- No email addresses were found during the crawl.

✅ Driver has been closed.

👋 Exiting test script.


# Trying with Playwright

In [4]:
! pip install playwright
! playwright install


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
You are using a frozen webkit browser which does not receive updates anymore on ubuntu20.04-x64. Please update to the latest version of your operating system to test up-to-date browsers.
╔══════════════════════════════════════════════════════╗
║ Host system is missing dependencies to run browsers. ║
║ Missing libraries:                                   ║
║     libwoff2dec.so.1.0.2                             ║
║     libopus.so.0                                     ║
║     libwebpdemux.so.2                                ║
║     libharfbuzz-icu.so.0                             ║
║     libwebpmux.so.3                                  ║
║     libenchant-2.so.2                                ║
║     libhyphen.so.0                

In [5]:
! pip install nest_asyncio


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [7]:
import time
import re
import requests
import asyncio
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright, Page, Error 
from urllib.parse import urljoin, urlparse

# --- Constants are unchanged ---
IGNORED_EXTENSIONS = {'.pdf', '.jpg', '.jpeg', '.png', '.gif', '.zip', '.rar', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.mp3', '.mp4', '.avi', '.mov'}
IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp', '.bmp'}

# ... (The 'get_page_source_hybrid' and 'crawl_and_extract' functions are perfect, no changes needed) ...

async def get_page_source_hybrid(url: str, page: Page) -> str:
    html_source = ""
    try:
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"}
        response = requests.get(url, headers=headers, timeout=10, allow_redirects=True)
        response.raise_for_status()
        content_type = response.headers.get('content-type', '').lower()
        if 'text/html' in content_type:
            html_source = response.text
        else:
            # print(f"  - Skipping non-HTML content at {url}")
            return ""
    except requests.RequestException as e:
        # print(f"  - Fast scrape on {url} failed: {e}. Falling back to browser.")
        html_source = ""
    if len(html_source) > 1000:
        return html_source
    # print(f"  - Using browser fallback for {url}")
    try:
        await page.goto(url, wait_until='networkidle', timeout=20000)
        html_source = await page.content()
    except Error as e:
        print(f"❌ Playwright failed on {url}: {e}")
        return ""
    return html_source

async def crawl_and_extract(start_url: str, page: Page):
    base_netloc = urlparse(start_url).netloc
    urls_to_visit = {start_url}
    visited_urls = set()
    found_emails = set()
    print(f"\n🕷️ Starting crawl of '{base_netloc}'...")
    while urls_to_visit:
        current_url = urls_to_visit.pop()
        if current_url in visited_urls:
            continue
        print(f"🔎 Scraping: {current_url}")
        visited_urls.add(current_url)
        html_content = await get_page_source_hybrid(current_url, page)
        if not html_content:
            continue
        email_regex = r'\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b'
        potential_emails = re.findall(email_regex, html_content)
        emails_on_page = [
            email for email in potential_emails 
            if not any(email.lower().endswith(ext) for ext in IMAGE_EXTENSIONS)
        ]
        if emails_on_page:
            new_emails_found = set(emails_on_page) - found_emails
            if new_emails_found:
                print(f"  ✅ Found {len(new_emails_found)} new email(s): {', '.join(new_emails_found)}")
                found_emails.update(new_emails_found)
        soup = BeautifulSoup(html_content, 'html.parser')
        for link_tag in soup.find_all('a', href=True):
            href = link_tag['href']
            if href.lower().startswith(('mailto:', 'tel:', 'javascript:')):
                continue
            absolute_link = urljoin(current_url, href)
            parsed_link = urlparse(absolute_link)
            path = parsed_link.path
            if any(path.lower().endswith(ext) for ext in IGNORED_EXTENSIONS):
                continue
            if parsed_link.netloc == base_netloc and parsed_link.scheme in ['http', 'https']:
                clean_link = parsed_link._replace(query="", fragment="").geturl()
                if clean_link not in visited_urls:
                    urls_to_visit.add(clean_link)
    return found_emails, visited_urls

async def main():
    async with async_playwright() as p:
        print("🚀 Initializing Playwright browser...")
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
        )
        page = await context.new_page()
        print("✅ Browser is ready.")
        try:
            while True:
                start_url = input("\nEnter the full website URL to crawl (or 'quit' to exit): ").strip()
                if start_url.lower() == 'quit':
                    break
                if not start_url.startswith('http'):
                    print("⚠️ Please enter a full URL, including 'http://' or 'https://'.")
                    continue
                start_time = time.time()
                all_emails, all_crawled_urls = await crawl_and_extract(start_url, page)
                end_time = time.time()
                print("\n" + "="*20 + " CRAWL COMPLETE " + "="*20)
                print(f"Crawl finished in {end_time - start_time:.2f} seconds.")
                print(f"Crawled a total of {len(all_crawled_urls)} pages.")
                if all_emails:
                    print(f"\n✅ SUCCESS! Found {len(all_emails)} unique email address(es):")
                    for i, email in enumerate(sorted(list(all_emails)), 1):
                        print(f"  {i}. {email}")
                else:
                    print("\n❌ No email addresses were found on this website.")
                print("="*58 + "\n")
        finally:
            print("\n👋 Closing browser...")
            await browser.close()
            print("✅ Browser has been closed.")

# ==============================================================================
# 4. FINAL EXECUTION (The only part that changed)
# ==============================================================================
# This will correctly run your main() function in a Jupyter/Colab environment.
await main()

🚀 Initializing Playwright browser...
✅ Browser is ready.

🕷️ Starting crawl of 'hansolo.com'...
🔎 Scraping: https://hansolo.com/
🔎 Scraping: https://hansolo.com/projects/
🔎 Scraping: https://hansolo.com/contact
🔎 Scraping: https://hansolo.com/projects/beach-and-lincoln
🔎 Scraping: https://hansolo.com/resources
🔎 Scraping: https://hansolo.com/tenant-improvements
🔎 Scraping: https://hansolo.com/faqs
🔎 Scraping: https://hansolo.com/projects/safety-net
🔎 Scraping: https://hansolo.com/projects/silo
🔎 Scraping: https://hansolo.com/parking-lot-maintenance
🔎 Scraping: https://hansolo.com/projects/hq
🔎 Scraping: https://hansolo.com/portfolio
🔎 Scraping: https://hansolo.com/workorder
🔎 Scraping: https://hansolo.com/services
🔎 Scraping: https://hansolo.com/privacy
🔎 Scraping: https://hansolo.com/projects/kling
🔎 Scraping: https://hansolo.com/structural-repairs
🔎 Scraping: https://hansolo.com/concrete-flatwork
🔎 Scraping: https://hansolo.com/yard

Crawl finished in 1.98 seconds.
Crawled a total of

In [None]:
! pip install requests beautifulsoup4 playwright tenacity
! playwright install

# More robust approach with playwright 

In [None]:
import time
import re
import requests
import asyncio
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright, Page, Error, TimeoutError as PlaywrightTimeoutError
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser
from collections import deque
from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_exception_type

# ==============================================================================
# 1. CONFIGURATION
# ==============================================================================
# NEW CONFIGURATION FLAG: Set to False to ignore robots.txt rules
RESPECT_ROBOTS_TXT = True  # <-- CHANGE THIS TO False TO SCRAPE ANYWAY

MAX_CONCURRENT_REQUESTS = 5
MAX_PAGES_TO_CRAWL = 100
MAX_CRAWL_DEPTH = 10
REQUEST_DELAY = 1
BROWSER_TIMEOUT = 25000
USER_AGENT = "My-Email-Scraper-Bot/1.0"

IGNORED_EXTENSIONS = {'.pdf', '.zip', '.rar', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.mp3', '.mp4', '.avi', '.mov'}
IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp', '.bmp'}

# ... get_page_source_hybrid function is unchanged ...
@retry(
    stop=stop_after_attempt(3),
    wait=wait_fixed(2),
    retry=retry_if_exception_type((requests.RequestException, PlaywrightTimeoutError)),
    reraise=True
)
async def get_page_source_hybrid(url: str, page: Page) -> str:
    html_source = ""
    try:
        response = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=15, allow_redirects=True)
        response.raise_for_status()
        content_type = response.headers.get('content-type', '').lower()
        if 'text/html' in content_type:
            html_source = response.text
        else:
            return ""
        if len(html_source) > 1000:
            return html_source
    except requests.RequestException:
        pass
    await page.goto(url, wait_until='networkidle', timeout=BROWSER_TIMEOUT)
    return await page.content()

# ==============================================================================
# 3. THE CRAWLER CLASS (MODIFIED)
# ==============================================================================
class Crawler:
    def __init__(self, start_url: str):
        self.start_url = start_url
        self.base_netloc = urlparse(start_url).netloc
        self.urls_to_visit = deque([(start_url, 0)])
        self.visited_urls = set()
        self.found_emails = set()
        self.semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
        
        # MODIFIED: Only initialize the parser if we respect robots.txt
        if RESPECT_ROBOTS_TXT:
            self.robot_parser = self._get_robot_parser(start_url)
        else:
            self.robot_parser = None
            print("⚠️ robots.txt rules are being ignored.")

    def _get_robot_parser(self, url: str) -> RobotFileParser:
        # ... this helper function is unchanged ...
        rp = RobotFileParser()
        robots_url = urljoin(url, '/robots.txt')
        try:
            rp.set_url(robots_url)
            rp.read()
            print(f"✅ Successfully read robots.txt from {robots_url}")
        except Exception as e:
            print(f"⚠️ Could not read robots.txt, proceeding with caution. Error: {e}")
        return rp

    async def _process_page(self, url: str, depth: int, page: Page):
        # MODIFIED: Conditionally check robots.txt
        if self.robot_parser and not self.robot_parser.can_fetch(USER_AGENT, url):
            print(f"🚫 Denied by robots.txt: {url}")
            return
        
        # ... rest of the function is unchanged ...
        try:
            html_content = await get_page_source_hybrid(url, page)
        except Exception as e:
            print(f"❌ Failed to fetch {url} after all retries: {e}")
            return
        email_regex = r'\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b'
        potential_emails = re.findall(email_regex, html_content, re.IGNORECASE)
        clean_emails = {email for email in potential_emails if not any(email.lower().endswith(ext) for ext in IMAGE_EXTENSIONS)}
        new_emails = clean_emails - self.found_emails
        if new_emails:
            print(f"  ✅ Found {len(new_emails)} new email(s) on {url}: {', '.join(new_emails)}")
            self.found_emails.update(new_emails)
        if depth < MAX_CRAWL_DEPTH:
            soup = BeautifulSoup(html_content, 'html.parser')
            for link_tag in soup.find_all('a', href=True):
                href = link_tag['href']
                if href.lower().startswith(('mailto:', 'tel:', 'javascript:')):
                    continue
                absolute_link = urljoin(url, href)
                parsed_link = urlparse(absolute_link)
                path = parsed_link.path.lower()
                if (parsed_link.netloc == self.base_netloc and 
                    parsed_link.scheme in ['http', 'https'] and 
                    not any(path.endswith(ext) for ext in IGNORED_EXTENSIONS.union(IMAGE_EXTENSIONS))):
                    clean_link = parsed_link._replace(query="", fragment="").geturl()
                    if clean_link not in self.visited_urls and clean_link not in {u for u, d in self.urls_to_visit}:
                        self.urls_to_visit.append((clean_link, depth + 1))
    
    # ... _worker and run methods are unchanged ...
    async def _worker(self, page: Page):
        while self.urls_to_visit:
            if len(self.visited_urls) >= MAX_PAGES_TO_CRAWL:
                break
            url, depth = self.urls_to_visit.popleft()
            if url in self.visited_urls:
                continue
            async with self.semaphore:
                self.visited_urls.add(url)
                print(f"🔎 [{len(self.visited_urls)}/{MAX_PAGES_TO_CRAWL} | Depth: {depth}] Scraping: {url}")
                await self._process_page(url, depth, page)
                await asyncio.sleep(REQUEST_DELAY)

    async def run(self, page: Page):
        print(f"\n🕷️ Starting crawl of '{self.base_netloc}' with up to {MAX_CONCURRENT_REQUESTS} concurrent workers.")
        await self._worker(page)
        print("\n🏁 Crawl finished or limit reached.")

# ... main execution block is unchanged ...
async def main():
    async with async_playwright() as p:
        print("🚀 Initializing Playwright browser...")
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(user_agent=USER_AGENT)
        page = await context.new_page()
        print("✅ Browser is ready.")
        try:
            while True:
                start_url = input("\nEnter the full website URL to crawl (or 'quit' to exit): ").strip()
                if start_url.lower() == 'quit':
                    break
                if not start_url.startswith('http'):
                    print("⚠️ Please enter a full URL, including 'http://' or 'https://'.")
                    continue
                crawler = Crawler(start_url)
                start_time = time.time()
                await crawler.run(page)
                end_time = time.time()
                print("\n" + "="*20 + " CRAWL COMPLETE " + "="*20)
                print(f"Crawl finished in {end_time - start_time:.2f} seconds.")
                print(f"Crawled a total of {len(crawler.visited_urls)} pages.")
                if crawler.found_emails:
                    print(f"\n✅ SUCCESS! Found {len(crawler.found_emails)} unique email address(es):")
                    for i, email in enumerate(sorted(list(crawler.found_emails)), 1):
                        print(f"  {i}. {email}")
                else:
                    print("\n❌ No email addresses were found on this website.")
                print("="*58 + "\n")
        finally:
            print("\n👋 Closing browser...")
            await browser.close()
            print("✅ Browser has been closed.")
await main()

🚀 Initializing Playwright browser...
✅ Browser is ready.
✅ Successfully read robots.txt from http://colesfrenchdip.com/robots.txt

🕷️ Starting crawl of 'colesfrenchdip.com' with up to 5 concurrent workers.
🔎 [1/100 | Depth: 0] Scraping: http://colesfrenchdip.com/
  ✅ Found 8 new email(s) on http://colesfrenchdip.com/: example@mysite.com, 605a7baede844d278b89dc95ae0a9123@sentry-next.wixpress.com, 78f7996315bc402f9dcb8a2f974b82d1@sentry.wixpress.com, info@colesfrenchdip.com, 88170cb0c9d64f94b5821ca7fd2d55a4@sentry-next.wixpress.com, 5d1795a2db124a268f1e1bd88f503500@sentry.wixpress.com, 18d2f96d279149989b95faf0a4b41882@sentry-next.wixpress.com, 9a65e97ebe8141fca0c4fd686f70996b@sentry.wixpress.com

🏁 Crawl finished or limit reached.

Crawl finished in 1.49 seconds.
Crawled a total of 1 pages.

✅ SUCCESS! Found 8 unique email address(es):
  1. 18d2f96d279149989b95faf0a4b41882@sentry-next.wixpress.com
  2. 5d1795a2db124a268f1e1bd88f503500@sentry.wixpress.com
  3. 605a7baede844d278b89dc95ae0a

# Using crawl4ai


In [1]:
# 📦 Install required packages
!pip install -q crawl4ai nest-asyncio pandas requests beautifulsoup4


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [2]:
import time
import re
import requests
import asyncio
import csv
from typing import Set, List, Tuple, Dict
from dataclasses import dataclass
from bs4 import BeautifulSoup
from crawl4ai import AsyncWebCrawler
from urllib.parse import urljoin, urlparse
import nest_asyncio

# Apply nest_asyncio for Colab/Jupyter compatibility
nest_asyncio.apply()

# ==============================================================================
# 1. DATA CLASSES & CONFIGURATION
# ==============================================================================

@dataclass
class CrawlConfig:
    """Configuration for crawling parameters."""
    max_pages: int = 50
    stop_after_finding: int = 10  # Stop after finding this many *total* emails
    batch_size: int = 5
    timeout: int = 5
    delay: float = 0.1
    crawl4ai_max_pages: int = 30

@dataclass
class CrawlResult:
    """Structured result for a crawling operation."""
    business_url: str
    categorized_emails: Dict[str, Set[str]]
    pages_crawled: int
    time_taken: float
    method_used: str

# ==============================================================================
# 2. CORE LOGIC CLASSES
# ==============================================================================

class EmailDetector:
    """Handles detection and categorization of emails and priority pages."""
    IGNORED_EXTENSIONS = {'.pdf', '.jpg', '.jpeg', '.png', '.gif', '.zip', '.rar', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx'}
    IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp', '.bmp'}

    # Keywords for prioritizing pages that are likely to contain contact info
    PRIORITY_KEYWORDS = {"contact", "about", "team", "staff", "directory", "locations", "faq", "help", "support"}
    HR_KEYWORDS = {"career", "jobs", "hr", "human-resources", "employment", "hiring"}
    SALES_KEYWORDS = {"sales", "inquiry", "inquiries", "info", "press", "media", "business", "contact-us"}
    
    def get_url_priority(self, href: str, text: str) -> int:
        """Assigns a priority score to a URL based on its likelihood of containing emails."""
        content = (href + " " + text).lower()
        score = 0
        if any(key in content for key in self.PRIORITY_KEYWORDS): score += 100
        if any(key in content for key in self.HR_KEYWORDS): score += 50
        if any(key in content for key in self.SALES_KEYWORDS): score += 50
        return score

    def categorize_email(self, email: str, context_text: str) -> str:
        """Categorizes an email as HR, Sales, or Other based on its content and surrounding text."""
        email_lower = email.lower()
        context_lower = context_text.lower()

        # Check email address first
        if any(key in email_lower for key in self.HR_KEYWORDS): return "HR"
        if any(key in email_lower for key in self.SALES_KEYWORDS): return "Sales/Contact"
        
        # Check surrounding text for context
        if any(key in context_lower for key in self.HR_KEYWORDS): return "HR"
        if any(key in context_lower for key in self.SALES_KEYWORDS): return "Sales/Contact"
        
        return "Other"

class LinkAndEmailExtractor:
    """Extracts links and categorized emails from a single HTML page."""
    def __init__(self, detector: EmailDetector):
        self.detector = detector

    def extract(self, html: str, base_url: str) -> Tuple[List[Tuple[str, int]], Dict[str, Set[str]]]:
        soup = BeautifulSoup(html, 'html.parser')
        page_links_with_priority = []
        categorized_emails = {"HR": set(), "Sales/Contact": set(), "Other": set()}
        
        # Extract Links with Priority
        base_netloc = urlparse(base_url).netloc
        for tag in soup.find_all("a", href=True):
            href, text = tag['href'], tag.get_text().lower().strip()
            full_url = urljoin(base_url, href)
            parsed = urlparse(full_url)
            if parsed.netloc == base_netloc and parsed.scheme in ["http", "https"]:
                if not any(parsed.path.lower().endswith(ext) for ext in self.detector.IGNORED_EXTENSIONS):
                    clean_url = parsed._replace(query="", fragment="").geturl()
                    priority = self.detector.get_url_priority(href, text)
                    page_links_with_priority.append((clean_url, priority))

        # Extract and Categorize Emails
        page_text = soup.get_text(" ", strip=True)
        email_regex = r'\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b'
        for match in re.finditer(email_regex, page_text, re.IGNORECASE):
            email = match.group(0)
            if any(email.lower().endswith(ext) for ext in self.detector.IMAGE_EXTENSIONS): continue
            
            context_text = page_text[max(0, match.start() - 75) : match.end() + 75]
            category = self.detector.categorize_email(email, context_text)
            categorized_emails[category].add(email)
            
        return page_links_with_priority, categorized_emails

class FastEmailCrawler:
    """Fast, concurrent crawler using requests, designed as a fallback."""
    def __init__(self, config: CrawlConfig):
        self.config = config
        self.extractor = LinkAndEmailExtractor(EmailDetector())

    async def crawl(self, start_url: str) -> Dict[str, Set[str]]:
        urls_to_visit = [(0, start_url)]
        visited_urls = set()
        found_emails = {"HR": set(), "Sales/Contact": set(), "Other": set()}
        
        print(f"🕷️  Fast Crawling '{urlparse(start_url).netloc}'...")
        while urls_to_visit and len(visited_urls) < self.config.max_pages:
            total_found = sum(len(s) for s in found_emails.values())
            if total_found >= self.config.stop_after_finding:
                print(f"🎯 Found {total_found} emails! Stopping fast crawl early.")
                break

            urls_to_visit.sort(key=lambda x: -x[0])
            batch = [u for p, u in urls_to_visit[:self.config.batch_size] if u not in visited_urls]
            urls_to_visit = urls_to_visit[self.config.batch_size:]

            if not batch: continue
            
            tasks = [self._process_url(url, visited_urls) for url in batch]
            results = await asyncio.gather(*tasks, return_exceptions=True)

            for i, res in enumerate(results):
                if isinstance(res, Exception): continue
                new_links, new_emails = res
                for category, emails in new_emails.items():
                    found_emails[category].update(emails)
                for link, priority in new_links:
                    if link not in visited_urls and link not in {u for p, u in urls_to_visit}:
                        urls_to_visit.append((priority, link))

            await asyncio.sleep(self.config.delay)
        
        print(f"📊 Fast Crawl: Visited {len(visited_urls)} pages.")
        return found_emails

    async def _process_url(self, url: str, visited: set) -> Tuple[List, Dict]:
        print(f"  - Fast check: {url}")
        visited.add(url)
        loop = asyncio.get_event_loop()
        try:
            html = await loop.run_in_executor(None, lambda u: requests.get(u, timeout=self.config.timeout).text, url)
            return self.extractor.extract(html, url)
        except Exception:
            return [], {}

class Crawl4AIPrimaryEmailCrawler:
    """Primary crawler using Crawl4AI for its power and simplicity."""
    def __init__(self, config: CrawlConfig):
        self.config = config
        self.detector = EmailDetector()

    async def crawl(self, start_url: str) -> Dict[str, Set[str]]:
        print(f"🔄 Using Crawl4AI as primary method for '{urlparse(start_url).netloc}'...")
        categorized_emails = {"HR": set(), "Sales/Contact": set(), "Other": set()}
        
        try:
            async with AsyncWebCrawler(headless=True, verbose=False) as crawler:
                result = await crawler.arun(url=start_url, max_pages=self.config.crawl4ai_max_pages)
                
                if not result.success:
                    print("  - Crawl4AI did not succeed.")
                    return categorized_emails

                print(f"  - Crawl4AI finished. Processed {result.pages_crawled} pages.")
                clean_text = result.markdown

                for match in re.finditer(EMAIL_REGEX, clean_text, re.IGNORECASE):
                    email = match.group(0)
                    if any(email.lower().endswith(ext) for ext in self.detector.IMAGE_EXTENSIONS): continue
                    context_text = clean_text[max(0, match.start() - 75) : match.end() + 75]
                    category = self.detector.categorize_email(email, context_text)
                    categorized_emails[category].add(email)
                return categorized_emails
        except Exception as e:
            print(f"  - ❌ Error with Crawl4AI: {e}")
            return categorized_emails

# ==============================================================================
# 3. MAIN ORCHESTRATOR CLASS
# ==============================================================================

class AdvancedEmailScraper:
    """Orchestrates the crawling process using a hybrid strategy."""
    def __init__(self, config: CrawlConfig):
        self.config = config
        self.primary_crawler = Crawl4AIPrimaryEmailCrawler(config)
        self.fallback_crawler = FastEmailCrawler(config)

    async def scrape_website(self, start_url: str) -> CrawlResult:
        start_time = time.time()
        
        # Attempt with the powerful Crawl4AI first
        categorized_emails = await self.primary_crawler.crawl(start_url)
        method_used = "Crawl4AI"
        
        total_found = sum(len(s) for s in categorized_emails.values())
        if total_found < self.config.stop_after_finding:
            print("🔄 Crawl4AI found fewer emails than target. Trying fast method as fallback...")
            fallback_emails = await self.fallback_crawler.crawl(start_url)
            method_used = "Hybrid (Crawl4AI + Fast Fallback)"
            # Merge results, giving precedence to the fallback's potentially deeper crawl
            for category, emails in fallback_emails.items():
                categorized_emails[category].update(emails)

        return CrawlResult(
            business_url=start_url,
            categorized_emails=categorized_emails,
            pages_crawled=0, # Note: Page count is complex in hybrid, focusing on results
            time_taken=time.time() - start_time,
            method_used=method_used
        )

    def display_results(self, result: CrawlResult):
        print("\n" + "="*20 + " CRAWL RESULTS " + "="*20)
        print(f"⏱️ Time taken: {result.time_taken:.2f} seconds")
        print(f"🛠️ Method used: {result.method_used}")

        total_emails = sum(len(s) for s in result.categorized_emails.values())
        print(f"📧 Total unique emails found: {total_emails}")
        
        if total_emails > 0:
            print("\n--- Categorized Emails ---")
            for category, emails in result.categorized_emails.items():
                if emails:
                    print(f"  - {category}:")
                    for email in sorted(list(emails)):
                        print(f"    - {email}")
            
            # Save to CSV
            with open("scraped_emails.csv", "a", newline='', encoding='utf-8') as f:
                writer = csv.writer(f)
                if f.tell() == 0:
                    writer.writerow(["Business Website", "Email Category", "Email Address"])
                for category, emails in result.categorized_emails.items():
                    for email in emails:
                        writer.writerow([result.business_url, category, email])
            print(f"\n📄 Saved {total_emails} emails to scraped_emails.csv")
        else:
            print("❌ No email addresses were found.")
        print("="*58 + "\n")

# ==============================================================================
# 4. INTERACTIVE EXECUTION LOOP
# ==============================================================================

async def main():
    print("🚀 Starting Advanced Email Scraper...")
    print("💡 This script uses a hybrid approach to find and categorize emails.")
    
    while True:
        try:
            start_url = input("\nEnter website URL (or 'quit' to exit): ").strip()
            if start_url.lower() == "quit": break
            if not start_url.startswith("http"):
                print("⚠️ Please enter a valid full URL with http/https.")
                continue
            
            stop_count_input = input(f"How many total emails to find before stopping? (default: 10): ").strip()
            stop_after_finding = int(stop_count_input) if stop_count_input else 10

            config = CrawlConfig(stop_after_finding=stop_after_finding)
            scraper = AdvancedEmailScraper(config)
            result = await scraper.scrape_website(start_url)
            scraper.display_results(result)
            
        except KeyboardInterrupt:
            print("\n👋 Interrupted by user.")
            break
        except Exception as e:
            print(f"❌ An unexpected error occurred: {e}")

    print("✅ Done.")

# Run the main loop
await main()

🚀 Starting Advanced Email Scraper...
💡 This script uses a hybrid approach to find and categorize emails.
🔄 Using Crawl4AI as primary method for 'www.magnoliabakery.com'...


  - ❌ Error with Crawl4AI: 'CrawlResult' object has no attribute 'pages_crawled'
🔄 Crawl4AI found fewer emails than target. Trying fast method as fallback...
🕷️  Fast Crawling 'www.magnoliabakery.com'...
  - Fast check: https://www.magnoliabakery.com/
  - Fast check: https://www.magnoliabakery.com/products/glossier-banana-pudding-balm-dotcom
  - Fast check: https://www.magnoliabakery.com/products/blueberry-crisp-pudding-cup
  - Fast check: https://www.magnoliabakery.com/pages/us-franchise
  - Fast check: https://www.magnoliabakery.com/products/the-magnolia-bakery-handbook-of-icebox-desserts
  - Fast check: https://www.magnoliabakery.com/collections/grocery-banana-pudding
  - Fast check: https://www.magnoliabakery.com/pages/us-franchising-help-center
  - Fast check: https://www.magnoliabakery.com/pages/order-now
  - Fast check: https://www.magnoliabakery.com/collections
  - Fast check: https://www.magnoliabakery.com/pages/catering-events
  - Fast check: https://www.magnoliabakery.com/pr