In [2]:
import asyncio
from typing import List, Dict, Optional, Union, Any
from pydantic import BaseModel, Field
from langchain_core.tools import tool
import random
import time

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMExtractionStrategy, LLMConfig, BrowserConfig
from multi_agents.constants.constants import Constants

# Pydantic schemas describing the structured data extracted from Airbnb pages.

# One property listing as summarised in a host profile
# (element type of AirbnbHostProfile.listings).
# NOTE(review): `url` and `rating_text` are Optional but declare no default;
# under Pydantic v2 semantics that presumably keeps them required-but-nullable
# keys — confirm this is intended.
class ListingSummary(BaseModel):
    url: Optional[str] = Field(description="Direct link to the listing page")
    title: str = Field(description="Property listing title/name")
    type: str = Field(description="Property type (e.g., 'Rental unit', 'Riad')")
    rating_text: Optional[str] = Field(description="Rating information (e.g., '4.85 out of 5')")

# A reply attached to a guest review (type of Review.host_response).
# All three fields are plain strings with no default.
class HostResponse(BaseModel):
    responder_name: str = Field(description="Name of the person who responded (usually the host)")
    date: str = Field(description="Date of the host's response")
    text: str = Field(description="The content of the host's response")

# A single guest review (element type of AirbnbHostProfile.reviews).
# `host_response` nests a HostResponse when the host replied; both it and
# `reviewer_location` are annotated Optional, so they may be None.
class Review(BaseModel):
    reviewer_name: str = Field(description="Name of the guest who left the review")
    reviewer_location: Optional[str] = Field(description="Guest's location or 'N/A'")
    date: str = Field(description="Date the review was posted")
    text: str = Field(description="The full content/message of the review")
    host_response: Optional[HostResponse] = Field(description="The host's response to the review, if available")

# One entry of AirbnbHostProfile.places_visited: a location plus free-text
# visit details.
# NOTE(review): `place` is a non-optional str, yet the recorded notebook output
# contains an entry with "place": null — such an entry would presumably fail
# validation against this model; confirm whether it should be Optional.
class PlaceVisited(BaseModel):
    place: str = Field(description="Location name (e.g., 'London, United Kingdom')")
    details: str = Field(description="Visit information (e.g., 'June 2025', '4 trips')")

# Top-level extraction schema for a host profile page. Its JSON schema
# (model_json_schema()) is what get_airbnb_profile_data_fixed hands to the
# LLM extraction strategy, so field names and descriptions here shape the
# extraction output.
class AirbnbHostProfile(BaseModel):
    name: str = Field(description="The host's display name.")
    profile_picture_url: Optional[str] = Field(description="URL of the host's profile picture.")
    bio: Optional[str] = Field(description="The host's personal bio or description text.")
    about_details: List[str] = Field(description="A list of structured details from the 'About' section (work, pets, etc.).")
    places_visited: List[PlaceVisited] = Field(description="List of places the host has visited.")
    listings: List[ListingSummary] = Field(description="A list of all property listings by the host.")
    reviews: List[Review] = Field(description="A list of all reviews left for the host.")

# Pricing information for a listing (type of AirbnbListingDetails.price_details).
# `breakdown` is an open-ended mapping: its keys and value types are not
# constrained by this schema.
class PriceDetails(BaseModel):
    display_price: Optional[str] = Field(description="The main price displayed for the listing (e.g., '$150 / night').")
    breakdown: Optional[Dict[str, Any]] = Field(description="A dictionary of the detailed price breakdown if available.")

# Compact host description embedded in a listing
# (type of AirbnbListingDetails.host_info). Only `name` is non-optional.
class HostInfo(BaseModel):
    name: str = Field(description="The host's name.")
    details: Optional[str] = Field(description="Host status and experience (e.g., 'Superhost | 3 years hosting').")
    profile_url: Optional[str] = Field(description="Link to the host's profile page.")

# Schema for the details of a single listing page.
# NOTE(review): nothing in the visible cells references this model; it is
# presumably consumed by the get_listing_details tool called in a later
# cell — confirm.
# The "up to 5" / "up to 10" limits on image_urls and amenities exist only in
# the field descriptions; the List[str] types themselves enforce no length cap.
class AirbnbListingDetails(BaseModel):
    apartment_name: str = Field(description="The title or name of the property listing.")
    listing_summary: str = Field(description="A brief summary including number of guests, bedrooms, beds, and baths.")
    rating: Optional[str] = Field(description="The overall rating score (e.g., '4.89').")
    reviews_count: Optional[str] = Field(description="The total number of reviews.")
    image_urls: List[str] = Field(description="A list of up to 5 URLs for the property's images.")
    description: str = Field(description="The full text description of the property.")
    host_info: HostInfo = Field(description="Information about the host of the listing.")
    amenities: List[str] = Field(description="A list of up to 10 key amenities available.")
    location_details: Dict[str, str] = Field(description="Location details, including address and neighborhood description.")
    price_details: Optional[PriceDetails] = Field(description="Detailed pricing information for the listing.")

# Session management: module-level state shared by the scraping helpers.
# Maps a domain key to the time.time() value stored by enforce_rate_limiting().
_last_request_time: Dict[str, float] = {}
# Incremented by get_rotating_user_agent() on every call; selects the user agent.
_session_count: int = 0

def run_async_tool(tool_coro):
    """Run a coroutine to completion from synchronous code and return its result.

    Args:
        tool_coro: The coroutine object to execute.

    Returns:
        The value the coroutine returns.

    Raises:
        Any exception raised by the coroutine is propagated to the caller.

    If no event loop is running in the calling thread, the coroutine is driven
    with ``asyncio.run``. If a loop is already running (for example inside a
    Jupyter cell), ``asyncio.run`` cannot be used in that thread, so the
    coroutine is executed on its own event loop in a worker thread and this
    call blocks until it finishes. Scheduling it on the running loop instead
    would hand back a pending Task rather than the result, which a synchronous
    caller has no way to await.
    """
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop in this thread: the simple path.
        return asyncio.run(tool_coro)

    # Local import keeps this block self-contained (no new file-level import).
    from concurrent.futures import ThreadPoolExecutor

    # A loop is already running here. Give the coroutine a private loop on a
    # separate thread and wait for it; .result() re-raises its exceptions.
    with ThreadPoolExecutor(max_workers=1) as executor:
        return executor.submit(asyncio.run, tool_coro).result()

def get_rotating_user_agent():
    """Return the next user-agent string, cycling through a fixed pool.

    Increments the module-level ``_session_count`` and uses the new value,
    modulo the pool size, as the index of the string to return.
    """
    global _session_count

    pool = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2.1 Safari/605.1.15",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
    )

    # The counter is bumped before indexing, so the first call after a reset
    # yields the second entry of the pool.
    _session_count += 1
    position = _session_count % len(pool)
    return pool[position]

async def enforce_rate_limiting(min_delay=45, domain="airbnb.com", jitter=(10, 20)):
    """Wait until at least ``min_delay`` seconds separate requests to ``domain``.

    Args:
        min_delay: Minimum number of seconds between two released requests.
        domain: Key under which the last request time is tracked in the
            module-level ``_last_request_time`` mapping.
        jitter: ``(low, high)`` bounds, in seconds, of the random extra wait
            added whenever a wait is needed.

    The first call for a domain never waits. The stored timestamp is taken
    after any wait has finished, i.e. when the request is actually released.
    Storing the pre-wait time instead would make the next call see an inflated
    elapsed time and skip the delay it is supposed to honour.
    """
    previous = _last_request_time.get(domain)

    if previous is not None:
        elapsed = time.time() - previous
        if elapsed < min_delay:
            delay = min_delay - elapsed + random.uniform(*jitter)
            print(f"⏳ Rate limiting: waiting {delay:.1f} seconds...")
            await asyncio.sleep(delay)

    # Record the release time, not the time at which this call started.
    _last_request_time[domain] = time.time()

def create_stealth_browser_config():
    """Build a headless BrowserConfig carrying the anti-detection launch flags.

    Each call takes the next rotating user agent and draws a random window
    size and position, so consecutive configs differ from one another.
    """
    # Switches that are the same for every browser launch.
    fixed_flags = (
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-blink-features=AutomationControlled",
        "--disable-extensions",
        "--disable-plugins",
        "--disable-web-security",
        "--disable-features=VizDisplayCompositor",
        "--disable-background-timer-throttling",
        "--disable-renderer-backgrounding",
        "--disable-backgrounding-occluded-windows",
        "--disable-ipc-flooding-protection",
        "--disable-client-side-phishing-detection",
        "--disable-component-extensions-with-background-pages",
        "--disable-default-apps",
        "--disable-domain-reliability",
        "--disable-sync",
        "--no-default-browser-check",
        "--no-first-run",
        "--no-pings",
        "--password-store=basic",
        "--use-mock-keychain",
        "--disable-gpu",
        "--disable-software-rasterizer",
    )

    # Per-launch values: user agent first, then width, height, x, y.
    agent = get_rotating_user_agent()
    width, height = random.randint(1200, 1920), random.randint(800, 1080)
    pos_x, pos_y = random.randint(0, 200), random.randint(0, 200)

    # NOTE(review): the user agent is passed only as a command-line switch;
    # confirm it takes effect rather than being overridden by a BrowserConfig
    # default.
    launch_args = [
        *fixed_flags,
        f"--user-agent={agent}",
        f"--window-size={width},{height}",
        f"--window-position={pos_x},{pos_y}",
    ]

    # headless=True: set to False for debugging.
    return BrowserConfig(headless=True, extra_args=launch_args)

# In-page script: after a fixed 4 s settle time, click the first button/link
# whose visible text mentions both "show all"/"see all" and "review".
# Elements are matched on their text content, since querySelectorAll does not
# understand Playwright-only pseudo-selectors such as :has-text().
_JS_CLICK_REVIEWS = """
(async () => {
    try {
        // Wait for page to fully load
        await new Promise(resolve => setTimeout(resolve, 4000));

        const elements = document.querySelectorAll('button, a, div[role="button"]');
        for (const el of elements) {
            const text = (el.textContent || '').toLowerCase();
            if ((text.includes('show all') || text.includes('see all')) &&
                text.includes('review')) {
                console.log('Clicking review button:', text);
                el.click();
                return true;
            }
        }
        console.log('No review button found');
        return false;
    } catch (e) {
        console.log('Error in review click script:', e);
        return false;
    }
})();
"""

# In-page script: after a fixed 6 s wait, locate the reviews dialog, find its
# first scrollable descendant div and scroll it to the bottom up to 6 times,
# stopping early once the scroll height no longer grows.
_JS_SCROLL_REVIEWS_MODAL = """
(async () => {
    try {
        // Wait for modal to open
        await new Promise(resolve => setTimeout(resolve, 6000));

        const modalSelectors = [
            "div[role='dialog']",
            "div[aria-modal='true']",
            ".modal",
            "div[data-testid*='modal']",
            "div[data-testid*='reviews-modal']"
        ];

        let modal = null;
        for (const selector of modalSelectors) {
            modal = document.querySelector(selector);
            if (modal) break;
        }

        if (!modal) {
            console.log('No modal found for scrolling');
            return;
        }

        console.log('Modal found, starting scroll process');

        // Find scrollable container
        const scrollableElements = modal.querySelectorAll('div');
        let scrollableDiv = null;

        for (const div of scrollableElements) {
            if (div.scrollHeight > div.clientHeight &&
                getComputedStyle(div).overflow !== 'visible') {
                scrollableDiv = div;
                break;
            }
        }

        if (!scrollableDiv) {
            console.log('No scrollable div found');
            return;
        }

        let lastHeight = 0;
        let scrollAttempts = 0;
        const maxScrolls = 6;

        while (scrollAttempts < maxScrolls) {
            scrollableDiv.scrollTop = scrollableDiv.scrollHeight;
            await new Promise(resolve => setTimeout(resolve, 3000));

            const currentHeight = scrollableDiv.scrollHeight;
            console.log(`Scroll ${scrollAttempts + 1}: height ${currentHeight}`);

            if (currentHeight === lastHeight) {
                console.log('No new content loaded');
                break;
            }

            lastHeight = currentHeight;
            scrollAttempts++;
        }

        console.log('Scrolling complete');
    } catch (e) {
        console.log('Error in scroll script:', e);
    }
})();
"""


def _build_profile_run_config():
    """Create a fresh crawl configuration for extracting a host profile."""
    llm_config = LLMConfig(
        provider=f"groq/{Constants.MODEL}",
        api_token=Constants.GROQ_API_KEY,
        temperature=0.05
    )

    return CrawlerRunConfig(
        js_code=[_JS_CLICK_REVIEWS, _JS_SCROLL_REVIEWS_MODAL],
        wait_until="load",
        page_timeout=120000,  # 2 minutes timeout
        delay_before_return_html=8.0,  # Wait 8 seconds before extraction
        extraction_strategy=LLMExtractionStrategy(
            llm_config=llm_config,
            schema=AirbnbHostProfile.model_json_schema(),
            extraction_type="schema",
            instruction=(
                "Extract all information from this Airbnb host profile page. "
                "Include: host name, bio, profile picture URL, about details, "
                "places visited, property listings, and all reviews with responses. "
                "Be comprehensive and accurate. If data is missing, use null/empty values."
            )
        )
    )


@tool
def get_airbnb_profile_data_fixed(profile_url: str, max_retries: int = 3) -> Dict[str, Any]:
    """
    Fixed Airbnb profile scraper that works with current Crawl4AI API.
    Uses session isolation and rate limiting to avoid detection.

    Args:
        profile_url (str): The complete URL to the Airbnb host's profile page.
        max_retries (int): Maximum number of retry attempts

    Returns:
        Dict[str, Any]: Structured profile data or error information
    """
    async def scrape_with_isolation():
        # Rate-limit once per tool call; retries below apply their own,
        # longer randomized back-off.
        await enforce_rate_limiting()

        for attempt in range(max_retries):
            is_last_attempt = attempt == max_retries - 1
            try:
                if attempt > 0:
                    delay = random.uniform(60, 120)  # Long delays between retries
                    print(f"🔄 Retry attempt {attempt + 1} after {delay:.1f}s delay...")
                    await asyncio.sleep(delay)

                print(f"🚀 Attempting to scrape {profile_url} (attempt {attempt + 1}/{max_retries})")

                # New browser config per attempt, so every try gets the next
                # user agent and a new random window geometry.
                browser_config = create_stealth_browser_config()
                config = _build_profile_run_config()

                # Create a fresh crawler instance for each attempt
                async with AsyncWebCrawler(config=browser_config) as crawler:
                    result = await crawler.arun(url=profile_url, config=config)

                    if result.success and result.extracted_content:
                        print("✅ Successfully extracted content!")
                        # Returned unchanged; the recorded notebook output
                        # shows this value printing as JSON text.
                        return result.extracted_content

                    # A crawl can "succeed" yet extract nothing, and the error
                    # attribute may be missing or empty; always report
                    # something meaningful instead of "None".
                    error_msg = getattr(result, 'error_message', None) or (
                        "No content was extracted" if result.success else "Unknown error"
                    )
                    print(f"❌ Attempt {attempt + 1} failed: {error_msg}")

                    if is_last_attempt:
                        return {
                            "error": f"Failed after {max_retries} attempts. Last error: {error_msg}",
                            "url": profile_url,
                            "attempts": max_retries,
                            "suggestion": "Try again later, check URL accessibility, or use a VPN"
                        }

            except Exception as e:
                # Deliberately broad: any failure of an attempt is reported
                # and retried rather than aborting the tool call.
                error_msg = str(e)
                print(f"💥 Attempt {attempt + 1} failed with exception: {error_msg}")

                if is_last_attempt:
                    return {
                        "error": f"Scraping failed: {error_msg}",
                        "url": profile_url,
                        "attempts": max_retries,
                        "suggestion": "Check network connection and Crawl4AI installation"
                    }

        # Only reachable when max_retries < 1 (the loop body never ran).
        return {"error": "Unexpected end of retry loop"}

    return run_async_tool(scrape_with_isolation())


@tool
def reset_airbnb_session() -> Dict[str, str]:
    """Reset the scraper session to start fresh"""
    global _session_count

    # Restart user-agent rotation and forget all recorded request times.
    _session_count = 0
    _last_request_time.clear()

    reset_at = str(time.time())
    return dict(
        status="success",
        message="Session reset successfully",
        timestamp=reset_at,
    )


@tool
def quick_airbnb_test(profile_url: str) -> Dict[str, Any]:
    """
    Quick test to check if Airbnb URL is accessible without full scraping
    """
    async def probe():
        # Any failure (config, browser start-up, navigation) is converted into
        # an "inaccessible" report instead of propagating.
        try:
            stealth_config = create_stealth_browser_config()

            # Lightweight run: no JS, no extraction strategy, short timeout.
            run_config = CrawlerRunConfig(
                wait_until="domcontentloaded",
                page_timeout=30000,
                delay_before_return_html=3.0
            )

            async with AsyncWebCrawler(config=stealth_config) as crawler:
                outcome = await crawler.arun(url=profile_url, config=run_config)

                report = {"accessible": outcome.success}
                report["status_code"] = getattr(outcome, 'status_code', 'unknown')
                report["title"] = getattr(outcome, 'title', 'No title')
                report["content_length"] = len(outcome.html) if outcome.html else 0
                report["url"] = profile_url
                # The error text is only reported for unsuccessful crawls.
                if outcome.success:
                    report["error"] = None
                else:
                    report["error"] = getattr(outcome, 'error_message', None)
                return report

        except Exception as e:
            return {
                "accessible": False,
                "error": str(e),
                "url": profile_url
            }

    return run_async_tool(probe())

In [3]:
# Example profile to test
profile_url = "https://www.airbnb.com/users/show/532236013"

# Notebook cell: the tool is called through .ainvoke() and awaited at top
# level, which relies on the notebook's support for top-level `await`.
# The tool takes its arguments as a dict keyed by parameter name.
result = await get_airbnb_profile_data_fixed.ainvoke({"profile_url": profile_url})
print(result)

🚀 Attempting to scrape https://www.airbnb.com/users/show/532236013 (attempt 1/3)


✅ Successfully extracted content!
[
    {
        "name": "Abdel",
        "profile_picture_url": "https://a0.muscache.com/im/pictures/user/User/original/213a678f-2d3c-4b11-886e-df873b318aa4.jpeg?im_w=720",
        "bio": "I'm Abdel, a passionate entrepreneur and Airbnb host in Marrakech. I offer apartments and riad ideally located (Izdihar, Gueliz, Medina, Palmeraie), combining modern comfort and Moroccan authenticity for a unique experience of the city.",
        "about_details": [
            "Where I've always wanted to go: The Moon",
            "My work: Entrepreneur",
            "What makes my home unique: Premium comfort, Local Experiences.",
            "Pets: Enzo, Malinois",
            "Identity verified"
        ],
        "places_visited": [
            {
                "place": null,
                "details": "0 of 0 items showing"
            }
        ],
        "listings": [
            {
                "url": "https://www.airbnb.com/rooms/1168655512551259276",
  

In [2]:
# Example profile to test (same profile URL as the earlier run of this cell)
profile_url = "https://www.airbnb.com/users/show/532236013"

# Notebook cell: the tool is called through .ainvoke() and awaited at top
# level, which relies on the notebook's support for top-level `await`.
result = await get_airbnb_profile_data_fixed.ainvoke({"profile_url": profile_url})
print(result)

🚀 Attempting to scrape https://www.airbnb.com/users/show/532236013 (attempt 1/3)


✅ Successfully extracted content!
[
    {
        "name": "Abdel",
        "profile_picture_url": "https://a0.muscache.com/im/pictures/user/User/original/213a678f-2d3c-4b11-886e-df873b318aa4.jpeg?im_w=720",
        "bio": "I'm Abdel, a passionate entrepreneur and Airbnb host in Marrakech. I offer apartments and riad ideally located (Izdihar, Gueliz, Medina, Palmeraie), combining modern comfort and Moroccan authenticity for a unique experience of the city.",
        "about_details": [
            "My work: Entrepreneur",
            "Where I've always wanted to go: The Moon",
            "For guests, I always: Receive them in person",
            "Pets: Enzo, Malinois",
            "Identity verified"
        ],
        "places_visited": [],
        "listings": [
            {
                "url": "https://www.airbnb.com/rooms/1168655512551259276",
                "title": "Cozy Flat near the city center, 2BR w/Netflix & AC",
                "type": "Rental unit",
                "rat

In [None]:
# Example profile to test (this cell shows no execution count: "In [None]")
profile_url = "https://www.airbnb.com/users/show/532236013"

# Notebook cell: the tool is called through .ainvoke() and awaited at top
# level, which relies on the notebook's support for top-level `await`.
result = await get_airbnb_profile_data_fixed.ainvoke({"profile_url": profile_url})
print(result)

🚀 Attempting to scrape https://www.airbnb.com/users/show/532236013 (attempt 1/3)


In [4]:
# Test listing scraping
# The URL includes a `source_impression_id` query parameter.
listing_url = 'https://www.airbnb.com/rooms/1430288794722556873?source_impression_id=p3_1754183708_P3MvB73ec_TnOnG7'

print("\n=== Testing get_listing_details ===")
# Awaited through .ainvoke(), like the profile tool.
# NOTE(review): get_listing_details is not defined in the cells shown here —
# presumably it is defined elsewhere in the notebook; confirm. The recorded
# output of this cell is an empty list ("[]").
listing_result = await get_listing_details.ainvoke({"listing_url": listing_url})
print(listing_result)


=== Testing get_listing_details ===


Attempting to scrape listing https://www.airbnb.com/rooms/1430288794722556873?source_impression_id=p3_1754183708_P3MvB73ec_TnOnG7 (attempt 1/3)


Successfully extracted listing content!
[]


Running in Jupyter notebook environment
