Dependencies

In [5]:
# Import all required libraries
import os
import re
import csv
import json
import time
import random
import requests
import pandas as pd
from pathlib import Path
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

File structure

In [6]:
# Create directory structure
base_dir = Path("viator-attractions-scraper")
data_dir = base_dir / "data"
raw_dir = data_dir / "raw"
processed_dir = data_dir / "processed"
attractions_dir = raw_dir / "attractions"

# Create all directories
for directory in [base_dir, data_dir, raw_dir, processed_dir, attractions_dir]:
    directory.mkdir(exist_ok=True, parents=True)
    
print(f"Directory structure created at {base_dir.absolute()}")

Directory structure created at /Users/mjsteenberg/Desktop/Desktop - MJ’s MacBook Air - 1/VM/viator-scraper/attractions/viator-attractions-scraper


Part 1: Download Attraction Sitemaps

In [7]:
# Region sitemap URLs
REGION_SITEMAPS = {
    "Asia": "https://www.viator.com/sitemap/Asia/d2-tickets",
    "Australia_Pacific": "https://www.viator.com/sitemap/Australia-and-the-Pacific/d3-tickets",
    "Caribbean": "https://www.viator.com/sitemap/Caribbean/d4-tickets",
    "Central_South_America": "https://www.viator.com/sitemap/Central-and-South-America/d9-tickets",
    "Europe": "https://www.viator.com/sitemap/Europe/d6-tickets",
    "Middle_East_Africa": "https://www.viator.com/sitemap/Middle-East-and-Africa/d1-tickets",
    "North_America": "https://www.viator.com/sitemap/North-America/d8-tickets"
}

def save_webpage_content(url, output_file):
    """
    Save the content of a webpage to a file.
    Handles CAPTCHA challenges by waiting for manual intervention.
    
    Args:
        url: The URL of the webpage to save
        output_file: The path where the HTML content will be saved
    
    Returns:
        bool: True if successful, False otherwise
    """
    try:
        # Set up Chrome options
        chrome_options = Options()
        # Don't run headless - we need to solve CAPTCHA
        chrome_options.add_argument('--start-maximized')
        chrome_options.add_argument('--disable-blink-features=AutomationControlled')
        # Remove automation flags
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        
        # Initialize the driver
        driver = webdriver.Chrome(options=chrome_options)
        driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
        
        try:
            # Get the page
            print(f"Loading page: {url}")
            driver.get(url)
            
            # Wait for human to solve CAPTCHA if needed
            print("Please solve the CAPTCHA in the browser window if one appears...")
            input("Press Enter after solving the CAPTCHA (or if no CAPTCHA appeared)...")
            
            # Wait for main content to load
            time.sleep(5)
            
            # Get the page source after CAPTCHA
            page_source = driver.page_source
            
            # Create parent directories if they don't exist
            output_path = Path(output_file)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            
            # Save the content to a file
            output_path.write_text(page_source, encoding='utf-8')
            print(f"Content successfully saved to {output_file}")
            return True
            
        finally:
            # Always close the driver
            driver.quit()
            
    except Exception as e:
        print(f"Error: {e}")
        return False

In [9]:
# Run this cell to download attraction sitemaps for all regions
# You can modify the list to only download specific regions

# Select which regions to download (comment out any you don't want)
regions_to_download = [
    # "Asia",
    # "Australia_Pacific",
    # "Caribbean",
    # "Central_South_America",
    # "Europe", 
    # "Middle_East_Africa",
    "North_America"
]

for region_name in regions_to_download:
    if region_name not in REGION_SITEMAPS:
        print(f"Unknown region: {region_name}")
        continue
        
    url = REGION_SITEMAPS[region_name]
    output_file = raw_dir / f"{region_name}_attractions_sitemap.html"
    
    print(f"\n{'='*50}")
    print(f"Processing region: {region_name}")
    print(f"{'='*50}")
    
    success = save_webpage_content(url, output_file)
    
    if success:
        print(f"Successfully saved sitemap for {region_name}")
    else:
        print(f"Failed to save sitemap for {region_name}")
    
    # Add a delay between regions to avoid rate limiting
    if region_name != regions_to_download[-1]:  # Skip delay after last item
        delay = random.uniform(10, 20)
        print(f"Waiting for {delay:.2f} seconds before the next region...")
        time.sleep(delay)


Processing region: North_America
Loading page: https://www.viator.com/sitemap/North-America/d8-tickets
Please solve the CAPTCHA in the browser window if one appears...
Content successfully saved to viator-attractions-scraper/data/raw/North_America_attractions_sitemap.html
Successfully saved sitemap for North_America


Part 2: Extract Attractions from Sitemaps

In [10]:
def extract_attraction_links(html_content):
    """
    Extract attraction links and names from HTML content.
    
    Args:
        html_content: The HTML content of the sitemap page
    
    Returns:
        list: List of dictionaries with attraction information
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    attractions = []
    
    # Find all list items with links that match the pattern "Tickets & tours in [Attraction Name]"
    # This regex pattern looks for links with the specific text format
    pattern = re.compile(r'Tickets & tours in\s+(.+)')
    
    # Find all list items (assuming they're in <li> tags)
    list_items = soup.find_all('li')
    
    for item in list_items:
        # Find the anchor tag in the list item
        link = item.find('a')
        if not link:
            continue
            
        # Extract the text from the link
        link_text = link.get_text(strip=True)
        
        # Check if the text matches our pattern
        match = pattern.search(link_text)
        if match:
            attraction_name = match.group(1).strip()
            attraction_url = link.get('href')
            
            # Only include if we have both name and URL
            if attraction_name and attraction_url:
                # Create full URL if it's a relative URL
                if attraction_url.startswith('/'):
                    attraction_url = f"https://www.viator.com{attraction_url}"
                
                attractions.append({
                    'attraction_name': attraction_name,
                    'attraction_url': attraction_url
                })
    
    return attractions

def process_all_sitemaps():
    """
    Process all downloaded sitemap files and extract attractions.
    
    Returns:
        list: Combined list of all attractions with their regions
    """
    all_attractions = []
    
    # Find all HTML files in the input directory
    sitemap_files = list(raw_dir.glob('*_attractions_sitemap.html'))
    
    for sitemap_file in sitemap_files:
        # Extract region name from filename
        region_name = sitemap_file.stem.replace('_attractions_sitemap', '')
        
        print(f"Processing {region_name} sitemap...")
        
        # Read the HTML content
        html_content = sitemap_file.read_text(encoding='utf-8')
        
        # Extract attractions
        attractions = extract_attraction_links(html_content)
        
        # Add region information
        for attraction in attractions:
            attraction['region'] = region_name
        
        all_attractions.extend(attractions)
        
        print(f"Found {len(attractions)} attractions in {region_name}")
    
    return all_attractions

In [11]:
# Run this cell to extract attractions from downloaded sitemaps
attractions = process_all_sitemaps()

# Convert to DataFrame for easy CSV export
df = pd.DataFrame(attractions)

# Save to CSV
csv_file = processed_dir / "attractions.csv"
df.to_csv(csv_file, index=False)
print(f"\nExtracted {len(attractions)} total attractions and saved to {csv_file}")

# Also save as JSON for easier inspection
json_file = processed_dir / "attractions.json"
with open(json_file, 'w', encoding='utf-8') as f:
    json.dump(attractions, f, indent=2)
print(f"Also saved data to {json_file}")

# Print some statistics
print("\nAttractions per region:")
region_counts = df['region'].value_counts()
for region, count in region_counts.items():
    print(f"  {region}: {count} attractions")

# Display a sample of attractions
print("\nSample of attractions:")
df.head(10)

Processing Middle_East_Africa sitemap...
Found 494 attractions in Middle_East_Africa
Processing Asia sitemap...
Found 492 attractions in Asia
Processing Caribbean sitemap...
Found 377 attractions in Caribbean
Processing Australia_Pacific sitemap...
Found 457 attractions in Australia_Pacific
Processing Central_South_America sitemap...
Found 459 attractions in Central_South_America
Processing North_America sitemap...
Found 474 attractions in North_America
Processing Europe sitemap...
Found 484 attractions in Europe

Extracted 3237 total attractions and saved to viator-attractions-scraper/data/processed/attractions.csv
Also saved data to viator-attractions-scraper/data/processed/attractions.json

Attractions per region:
  Middle_East_Africa: 494 attractions
  Asia: 492 attractions
  Europe: 484 attractions
  North_America: 474 attractions
  Central_South_America: 459 attractions
  Australia_Pacific: 457 attractions
  Caribbean: 377 attractions

Sample of attractions:


Unnamed: 0,attraction_name,attraction_url,region
0,Giza Pyramids,https://www.viator.com/Giza-attractions/Giza-P...,Middle_East_Africa
1,Burj Khalifa,https://www.viator.com/Dubai-attractions/Burj-...,Middle_East_Africa
2,Majorelle Garden (Jardin Majorelle),https://www.viator.com/Marrakech-attractions/M...,Middle_East_Africa
3,Abu Simbel Temples,https://www.viator.com/Aswan-attractions/Abu-S...,Middle_East_Africa
4,Valley of the Kings,https://www.viator.com/Luxor-attractions/Valle...,Middle_East_Africa
5,Dubai Marina,https://www.viator.com/Dubai-attractions/Dubai...,Middle_East_Africa
6,Sheikh Zayed Grand Mosque,https://www.viator.com/Abu-Dhabi-attractions/S...,Middle_East_Africa
7,Nile River,https://www.viator.com/Cairo-attractions/Nile-...,Middle_East_Africa
8,Burj Al-Arab Jumeirah,https://www.viator.com/Dubai-attractions/Burj-...,Middle_East_Africa
9,Egyptian Museum (Museum of Egyptian Antiquities),https://www.viator.com/Cairo-attractions/Egypt...,Middle_East_Africa


Part 3: Download Individual Attraction Pages

In [12]:
# Set up constants for attraction page downloads
STATUS_FILE = processed_dir / "download_status.csv"

# User agent for requests
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"

# Delay between requests (seconds)
MIN_DELAY = 5
MAX_DELAY = 15

def get_sanitized_filename(url):
    """
    Create a sanitized filename from a URL.
    
    Args:
        url: The URL of the page
    
    Returns:
        str: A sanitized filename
    """
    # Parse URL to get path components
    parsed = urlparse(url)
    path = parsed.path
    
    # Remove leading and trailing slashes
    path = path.strip('/')
    
    # Replace slashes and other problematic characters with underscores
    path = re.sub(r'[\\/:"*?<>|]', '_', path)
    
    # Add the .html extension
    return f"{path}.html"

def download_page_with_requests(url, output_file):
    """
    Download a page using the requests library.
    
    Args:
        url: The URL to download
        output_file: The file to save the content to
    
    Returns:
        bool: True if successful, False otherwise
    """
    try:
        headers = {
            'User-Agent': USER_AGENT,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Referer': 'https://www.viator.com/',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Cache-Control': 'max-age=0',
        }
        
        response = requests.get(url, headers=headers, timeout=30)
        
        # Check if the request was successful
        if response.status_code == 200:
            # Create parent directories if they don't exist
            output_path = Path(output_file)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            
            # Save the content to a file
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(response.text)
                
            return True
        else:
            print(f"Failed to download {url}: Status code {response.status_code}")
            return False
            
    except Exception as e:
        print(f"Error downloading {url}: {e}")
        return False

def download_page_with_selenium(url, output_file):
    """
    Download a page using Selenium (for pages that might have CAPTCHAs).
    
    Args:
        url: The URL to download
        output_file: The file to save the content to
    
    Returns:
        bool: True if successful, False otherwise
    """
    try:
        # Set up Chrome options
        chrome_options = Options()
        chrome_options.add_argument('--headless')  # Run in headless mode
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument(f'user-agent={USER_AGENT}')
        
        # Initialize the driver
        driver = webdriver.Chrome(options=chrome_options)
        
        try:
            # Load the page
            driver.get(url)
            
            # Wait for content to load
            time.sleep(3)
            
            # Get the page source
            page_source = driver.page_source
            
            # Create parent directories if they don't exist
            output_path = Path(output_file)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            
            # Save the content to a file
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(page_source)
                
            return True
            
        finally:
            # Always close the driver
            driver.quit()
            
    except Exception as e:
        print(f"Error downloading {url}: {e}")
        return False

def load_status_file():
    """
    Load or create a status CSV file to track download progress.
    
    Returns:
        dict: A dictionary mapping attraction URLs to download status
    """
    status_dict = {}
    
    if STATUS_FILE.exists():
        with open(STATUS_FILE, 'r', newline='', encoding='utf-8') as f:
            reader = csv.reader(f)
            next(reader)  # Skip header
            for row in reader:
                if len(row) >= 2:
                    url, status = row[0], row[1]
                    status_dict[url] = status
    
    return status_dict

def update_status_file(status_dict):
    """
    Update the status CSV file with current download status.
    
    Args:
        status_dict: A dictionary mapping attraction URLs to download status
    """
    with open(STATUS_FILE, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['url', 'status'])
        for url, status in status_dict.items():
            writer.writerow([url, status])

In [20]:
# Run this cell to start downloading attraction pages
# You can set the limit to restrict the number of pages to download

# Load attractions list
csv_file = processed_dir / "attractions.csv"
try:
    attractions_df = pd.read_csv(csv_file)
except Exception as e:
    print(f"Error loading attractions CSV: {e}")
    print(f"Make sure {csv_file} exists and is a valid CSV file.")
    raise

# Load or create the status file
status_dict = load_status_file()

# Count total attractions
total_attractions = len(attractions_df)
completed = sum(1 for status in status_dict.values() if status == 'success')

print(f"Found {total_attractions} attractions to process.")
print(f"Already downloaded: {completed}")

# Set a limit on how many to download in this run (set to None to download all)
download_limit = None  # Change this number or set to None for no limit

# Filter by region if desired
regions_to_download = [
    # "Asia",
    # "Australia_Pacific",
    # "Caribbean",
    # "Central_South_America",
    # "Europe", 
    # "Middle_East_Africa",
    "North_America"
]
if regions_to_download:
    attractions_df = attractions_df[attractions_df['region'].isin(regions_to_download)]
    print(f"Filtered to {len(attractions_df)} attractions in specified regions")

# Track processed count
processed = 0

# Iterate through attractions
for index, row in attractions_df.iterrows():
    attraction_url = row['attraction_url']
    attraction_name = row['attraction_name']
    region = row['region']
    
    # Skip if already successfully downloaded
    if attraction_url in status_dict and status_dict[attraction_url] == 'success':
        continue
    
    # Check if we've reached the limit
    if download_limit is not None and processed >= download_limit:
        print(f"\nReached download limit of {download_limit}. Stopping.")
        break
    
    processed += 1
    
    # Create the output filename
    filename = get_sanitized_filename(attraction_url)
    output_file = attractions_dir / region / filename
    
    print(f"\n[{processed}/{total_attractions}] Downloading: {attraction_name}")
    print(f"URL: {attraction_url}")
    print(f"Output: {output_file}")
    
    # Try to download the page
    try:
        # First try with requests (faster)
        success = download_page_with_requests(attraction_url, output_file)
        
        # If that fails, try with Selenium
        if not success:
            print("Requests download failed, trying with Selenium...")
            success = download_page_with_selenium(attraction_url, output_file)
        
        # Update status
        status_dict[attraction_url] = 'success' if success else 'failed'
        update_status_file(status_dict)
        
        # Report success or failure
        if success:
            print(f"Successfully downloaded: {attraction_name}")
        else:
            print(f"Failed to download: {attraction_name}")
            
        # Add a delay between requests to avoid rate limiting
        if index < len(attractions_df) - 1 and (download_limit is None or processed < download_limit):
            delay = random.uniform(MIN_DELAY, MAX_DELAY)
            print(f"Waiting for {delay:.2f} seconds before the next request...")
            time.sleep(delay)
        
    except Exception as e:
        print(f"Error processing {attraction_name}: {e}")
        status_dict[attraction_url] = 'error'
        update_status_file(status_dict)

# Print summary
successful = sum(1 for status in status_dict.values() if status == 'success')
failed = sum(1 for status in status_dict.values() if status in ['failed', 'error'])

print(f"\nDownload completed!")
print(f"Total attractions processed in this run: {processed}")
print(f"Total successfully downloaded (all runs): {successful}")
print(f"Total failed (all runs): {failed}")

Found 3237 attractions to process.
Already downloaded: 2763
Filtered to 474 attractions in specified regions

[1/3237] Downloading: The White House
URL: https://www.viator.com/Washington-DC-attractions/The-White-House/d657-a1102
Output: viator-attractions-scraper/data/raw/attractions/North_America/Washington-DC-attractions_The-White-House_d657-a1102.html
Successfully downloaded: The White House

[2/3237] Downloading: Grand Canyon West Rim
URL: https://www.viator.com/Las-Vegas-attractions/Grand-Canyon-West-Rim/d684-a1594
Output: viator-attractions-scraper/data/raw/attractions/North_America/Las-Vegas-attractions_Grand-Canyon-West-Rim_d684-a1594.html
Successfully downloaded: Grand Canyon West Rim

[3/3237] Downloading: Hoover Dam
URL: https://www.viator.com/Las-Vegas-attractions/Hoover-Dam/d684-a11
Output: viator-attractions-scraper/data/raw/attractions/North_America/Las-Vegas-attractions_Hoover-Dam_d684-a11.html
Successfully downloaded: Hoover Dam

[4/3237] Downloading: Alcatraz
URL: htt

Part 4: Extract Attraction Details

In [25]:
def extract_attraction_details(html_content, attraction_name, region):
    """
    Extract detailed information from an attraction page using verified selectors.
    
    Args:
        html_content: The HTML content of the attraction page
        attraction_name: The name of the attraction
        region: The region of the attraction
        
    Returns:
        dict: A dictionary with attraction details
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    details = {
        'attraction_name': attraction_name,
        'region': region,
        'tour_count': 0,
        'tours': []
    }
    
    try:
        # Extract tour count from meta tag or any text containing "results" or "tours"
        count_meta = soup.find("meta", attrs={"name": "totalItemCount"})
        if count_meta and count_meta.get("content"):
            details['tour_count'] = int(count_meta['content'])
        else:
            # Look for text containing number of results/tours
            count_texts = soup.find_all(string=re.compile(r'(\d+)(\s+)(results|tours|experiences)'))
            for text in count_texts:
                match = re.search(r'(\d+)', text)
                if match:
                    details['tour_count'] = int(match.group(1))
                    break
        
        # Find tour cards using the consistent data-automation attribute
        tour_cards = soup.select('[data-automation="poi-product-list-card"]')
        print(f"Found {len(tour_cards)} tour cards for {attraction_name}")
        
        # Process each tour card
        for card in tour_cards[:10]:  # Limit to first 10 tours
            tour = {}
            
            # Extract tour title using a.productCard__A2Ct span
            title_elem = card.select_one('a.productCard__A2Ct span')
            if title_elem:
                tour['title'] = title_elem.get_text(strip=True)
            
            # Extract rating and review count from aria-label
            rating_elem = card.select_one('[aria-label^="Rated"]')
            if rating_elem:
                rating_match = re.search(r'Rated ([\d.]+)', rating_elem.get('aria-label', ''))
                if rating_match:
                    tour['rating'] = float(rating_match.group(1))
                
                # Extract review count from the same aria-label
                review_match = re.search(r'with ([\d,]+)', rating_elem.get('aria-label', ''))
                if review_match:
                    tour['review_count'] = int(review_match.group(1).replace(',', ''))
            
            # Extract price using regex for "From $"
            price_elem = card.find(string=re.compile(r'From \$'))
            if price_elem:
                price_match = re.search(r'\$([\d,.]+)', str(price_elem))
                if price_match:
                    tour['price'] = price_match.group(1).replace(',', '')
            
            # Add to tours list if we have at least a title
            if tour.get('title'):
                details['tours'].append(tour)
            else:
                print(f"Found a card but couldn't extract a title for {attraction_name}")
        
        # If we didn't find any tours but the count indicates there should be some
        if not details['tours'] and details['tour_count'] > 0:
            print(f"Warning: Found {details['tour_count']} tours according to count, but extracted 0 tour details for {attraction_name}")
        
    except Exception as e:
        print(f"Error extracting details for {attraction_name}: {e}")
    
    return details

def process_attraction_pages():
    """
    Process downloaded attraction pages and extract details.
    
    Returns:
        list: List of attraction details
    """
    # Load attractions list
    attractions_df = pd.read_csv(processed_dir / "attractions.csv")
    
    # Load status file to find successfully downloaded pages
    status_dict = load_status_file()
    
    # Filter to successfully downloaded attractions
    successful_urls = [url for url, status in status_dict.items() if status == 'success']
    successful_attractions = attractions_df[attractions_df['attraction_url'].isin(successful_urls)]
    
    all_details = []
    
    for index, row in successful_attractions.iterrows():
        attraction_url = row['attraction_url']
        attraction_name = row['attraction_name']
        region = row['region']
        
        # Determine the file path
        filename = get_sanitized_filename(attraction_url)
        file_path = attractions_dir / region / filename
        
        if not file_path.exists():
            print(f"File not found for {attraction_name}: {file_path}")
            continue
        
        print(f"Processing {attraction_name}...")
        
        try:
            # Read the HTML content
            html_content = file_path.read_text(encoding='utf-8')
            
            # Extract details
            details = extract_attraction_details(html_content, attraction_name, region)
            
            # Add to results
            all_details.append(details)
            
        except Exception as e:
            print(f"Error processing {attraction_name}: {e}")
    
    return all_details

In [26]:
# Run this cell to extract details from downloaded attraction pages
attraction_details = process_attraction_pages()

# Save the details to a JSON file
details_file = processed_dir / "attraction_details.json"
with open(details_file, 'w', encoding='utf-8') as f:
    json.dump(attraction_details, f, indent=2)

print(f"Extracted details for {len(attraction_details)} attractions and saved to {details_file}")

# Create a DataFrame for analysis
details_df = pd.json_normalize(
    attraction_details, 
    record_path='tours', 
    meta=['attraction_name', 'region', 'tour_count']
)

# Save to CSV
details_csv = processed_dir / "attraction_details.csv"
details_df.to_csv(details_csv, index=False)
print(f"Saved details to {details_csv}")

# Display some basic statistics
if not details_df.empty:
    print("\nTop attractions by tour count:")
    top_by_tours = pd.DataFrame(attraction_details).sort_values('tour_count', ascending=False)
    print(top_by_tours[['attraction_name', 'region', 'tour_count']].head(10))
    
    print("\nSample of tour details:")
    display(details_df.head())
else:
    print("No attraction details found. Make sure you've downloaded some attraction pages first.")

Processing Giza Pyramids...
Found 1 tours in structured data for Giza Pyramids
Processing Burj Khalifa...
Found 1 tours in structured data for Burj Khalifa
Processing Majorelle Garden (Jardin Majorelle)...
Found 100 potential tour containers for Majorelle Garden (Jardin Majorelle)
Container text sample: a Tripadvisor companyDiscoverHomeThings to do in MoroccoThings to do in Central MoroccoThings to do in MarrakechMajorelle Garden (Jardin Majorelle) ToursThings to doMajorelle Garden (Jardin Majorelle)
Container text sample: a Tripadvisor companyDiscoverHomeThings to do in MoroccoThings to do in Central MoroccoThings to do in MarrakechMajorelle Garden (Jardin Majorelle) ToursThings to doMajorelle Garden (Jardin Majorelle)
Container text sample: HomeThings to do in MoroccoThings to do in Central MoroccoThings to do in MarrakechMajorelle Garden (Jardin Majorelle) ToursThings to doMajorelle Garden (Jardin Majorelle) Tours and TicketsWith its bo
Container text sample: 200+results-Best Seller

KeyboardInterrupt: 

Part 5: Additional Analysis

In [None]:
# This cell performs additional analysis on the extracted attraction data
# Only run this after you have extracted attraction details

try:
    # Check if the details file exists
    details_file = processed_dir / "attraction_details.json"
    if not details_file.exists():
        print("No attraction details found. Run the previous cell first.")
    else:
        # Load the attraction details
        with open(details_file, 'r', encoding='utf-8') as f:
            attraction_details = json.load(f)
        
        # Convert to DataFrame for analysis
        details_df = pd.json_normalize(
            attraction_details, 
            record_path='tours', 
            meta=['attraction_name', 'region', 'tour_count']
        )
        
        if details_df.empty:
            print("No tour details found.")
        else:
            # Analysis by region
            print("=== Analysis by Region ===")
            region_analysis = pd.DataFrame(attraction_details).groupby('region').agg(
                attraction_count=('attraction_name', 'count'),
                avg_tours_per_attraction=('tour_count', 'mean'),
                total_tours=('tour_count', 'sum')
            ).reset_index()
            
            print(region_analysis)
            
            # Rating analysis
            if 'rating' in details_df.columns:
                print("\n=== Rating Analysis ===")
                print(f"Average rating: {details_df['rating'].mean():.2f}")
                print(f"Rating distribution:")
                rating_bins = [0, 3, 4, 4.5, 5]
                rating_counts = pd.cut(details_df['rating'], bins=rating_bins).value_counts().sort_index()
                for bin_range, count in rating_counts.items():
                    print(f"  {bin_range}: {count} tours")
                
                # Top rated attractions
                print("\nTop rated attractions:")
                tours_by_attraction = details_df.groupby('attraction_name').agg(
                    avg_rating=('rating', 'mean'),
                    tour_count=('attraction_name', 'count')
                ).reset_index()
                
                top_rated = tours_by_attraction[tours_by_attraction['tour_count'] >= 5].sort_values(
                    'avg_rating', ascending=False
                ).head(10)
                
                print(top_rated)
                
            # Create a simple visualization using pandas
            print("\n=== Price Analysis ===")
            if 'price' in details_df.columns:
                # Convert price to numeric
                details_df['price_numeric'] = pd.to_numeric(details_df['price'], errors='coerce')
                
                price_stats = details_df.groupby('attraction_name').agg(
                    min_price=('price_numeric', 'min'),
                    max_price=('price_numeric', 'max'),
                    avg_price=('price_numeric', 'mean')
                ).reset_index()
                
                # Display the most expensive and least expensive attractions
                print("Most expensive attractions (by average tour price):")
                print(price_stats.sort_values('avg_price', ascending=False).head(5))
                
                print("\nLeast expensive attractions (by average tour price):")
                print(price_stats.sort_values('avg_price').head(5))
    
except Exception as e:
    print(f"Error in analysis: {e}")