In [4]:
"""
Yellow Pages Web Scraper (yellowpages.com - US Version)
Senior Python Automation Engineer Implementation

Features:
1. Uses 'undetected-chromedriver' to bypass Cloudflare blocking.
2. Requests a driver for Chrome major version 143 (to match the installed
   browser) and falls back to automatic version detection if that fails.
3. Targets specific input IDs to fix 'invalid element state' errors.
4. Implements robust pagination with staleness checks.

Required packages:
    pip install selenium undetected-chromedriver openpyxl pandas
"""

import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import time
import pandas as pd
from datetime import datetime
import re
import sys

class YellowPagesScraper:
    """Scrape business names and phone numbers from yellowpages.com.

    The scraper drives a Chrome browser through ``undetected-chromedriver``:
    it opens the homepage, fills in the search form, then walks the paginated
    result list and collects one ``{"Business Name", "Phone Number"}`` dict
    per listing.

    Typical use::

        scraper = YellowPagesScraper()
        try:
            scraper.navigate_to_site()
            scraper.perform_search("Plumber", "Los Angeles CA")
            rows = scraper.extract_listings(max_pages=3)
        finally:
            scraper.close()
    """

    # CSS selectors tried in order until one matches listing cards.
    _LISTING_SELECTORS = (
        ".srp-listing",
        ".result",
        ".search-results .result",
        ".organic",
        "[class*='srp-listing']",
        "[class*='result-item']",
    )

    # CSS selectors (relative to a listing) tried in order for the name.
    _NAME_SELECTORS = (
        "a.business-name",
        ".business-name",
        "h2.n a",
        ".info-section h2 a",
        ".info h2 a",
    )

    # CSS selectors (relative to a listing) tried in order for visible phones.
    _PHONE_SELECTORS = (
        ".phones.phone.primary",
        ".phone.primary",
        ".phones",
        "div.phone",
    )

    # CSS selectors tried in order for the pagination "Next" control.
    _NEXT_BUTTON_SELECTORS = (
        "a.next",
        "a[rel='next']",
        ".pagination a.next",
        "a[aria-label='Next']",
        ".next a",
    )

    def __init__(self, version_main=143, wait_timeout=20):
        """Start the Chrome WebDriver and create the shared explicit wait.

        Args:
            version_main: Chrome major version to request from
                undetected-chromedriver. If starting with this version fails,
                a second attempt is made with automatic version detection.
                Pass ``None`` to skip straight to automatic detection.
            wait_timeout: Seconds used for every ``WebDriverWait`` in this
                class.

        Raises:
            Exception: Whatever the final ``uc.Chrome(...)`` attempt raises.
        """
        print("Initializing Undetected Chrome WebDriver (Bypassing Cloudflare)...")

        if version_main is None:
            self.driver = uc.Chrome(options=self._build_options())
        else:
            try:
                self.driver = uc.Chrome(
                    options=self._build_options(), version_main=version_main
                )
            except Exception as e:
                print(f"Version {version_main} failed, trying default... Error: {e}")
                # A fresh options object is required here: undetected-chromedriver
                # rejects an options instance that was already handed to a
                # previous Chrome() call.
                self.driver = uc.Chrome(options=self._build_options())

        self.wait = WebDriverWait(self.driver, wait_timeout)

        print("WebDriver initialized successfully.\n")

    @staticmethod
    def _build_options():
        """Return a new ChromeOptions object with this scraper's flags."""
        options = uc.ChromeOptions()
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--start-maximized')
        return options

    def navigate_to_site(self):
        """Open the Yellow Pages homepage in the browser."""
        print("Navigating to https://www.yellowpages.com/...")
        self.driver.get("https://www.yellowpages.com/")
        print("Page loaded.\n")

    def perform_search(self, job_type, location):
        """Fill in the homepage search form and wait for the results page.

        Args:
            job_type: Text typed into the 'What' field.
            location: Text typed into the 'Where' field.

        Raises:
            TimeoutException: If the results never appear within the wait
                timeout.
            Exception: If an input field cannot be found or the page looks
                like a Cloudflare block. Every error is printed and re-raised.
        """
        print(f"Searching for: '{job_type}' in '{location}'...")

        try:
            # Give the homepage scripts a moment to settle before interacting.
            time.sleep(3)

            what_field = self._find_first_visible(
                [
                    (By.ID, "query"),
                    (By.NAME, "search_terms"),
                    (By.CSS_SELECTOR, "#query-container input"),
                    (By.CSS_SELECTOR, "input[placeholder*='Find a business']"),
                ],
                "What",
            )
            self._fill_field(what_field, job_type, "What")

            where_field = self._find_first_visible(
                [
                    (By.ID, "location"),
                    (By.NAME, "geo_location_terms"),
                    (By.CSS_SELECTOR, "#location-container input"),
                    (By.CSS_SELECTOR, "input[placeholder*='Where']"),
                ],
                "Where",
            )
            self._fill_field(where_field, location, "Where")

            time.sleep(1)
            search_button = self._find_search_button()

            if search_button:
                # JavaScript click avoids "element not interactable" failures.
                self.driver.execute_script("arguments[0].click();", search_button)
                print("✓ Search button clicked.\n")
            else:
                where_field.send_keys(Keys.RETURN)
                print("✓ Submitted search via Enter key.\n")

            # Wait for results page to load
            time.sleep(4)
            print(f"Navigated to: {self.driver.current_url}\n")

            # NOTE(review): matching "cloudflare" anywhere in the page source
            # is a coarse heuristic; confirm it does not trip on normal pages.
            if "blocked" in self.driver.title.lower() or "cloudflare" in self.driver.page_source.lower():
                print("\n!!! DETECTED CLOUDFLARE BLOCK !!!")
                print("The site blocked the search request. Try running the script again later.")
                raise Exception("Cloudflare Blocked Search Results")

            self.wait.until(
                EC.presence_of_element_located((
                    By.CSS_SELECTOR,
                    ".result, .search-results, .organic, .srp-listing"
                ))
            )
            print("Search results loaded successfully.\n")

        except TimeoutException:
            print("ERROR: Timeout while performing search.")
            raise
        except Exception as e:
            print(f"ERROR: {str(e)}")
            raise

    def _find_first_visible(self, selectors, label):
        """Return the first visible element matched by ``selectors``.

        Each ``(by, selector)`` pair is waited on in turn; a timeout moves on
        to the next pair. Raises ``RuntimeError`` when none of them match.
        """
        for by, selector in selectors:
            try:
                element = self.wait.until(
                    EC.visibility_of_element_located((by, selector))
                )
            except TimeoutException:
                continue
            print(f"Found '{label}' field using: {by} = {selector}")
            return element
        raise RuntimeError(f"Could not find '{label}' input field")

    @staticmethod
    def _fill_field(field, text, label):
        """Focus ``field``, clear it and type ``text`` into it."""
        field.click()
        field.clear()
        time.sleep(0.5)
        field.send_keys(text)
        print(f"✓ Entered '{text}' in '{label}' field.")

    def _find_search_button(self):
        """Return the search submit button, or ``None`` if no selector matches."""
        button_selectors = [
            (By.CSS_SELECTOR, "button.btn-warning"),
            (By.CSS_SELECTOR, "button[type='submit']"),
            (By.XPATH, "//button[contains(text(), 'FIND')]"),
            (By.CSS_SELECTOR, ".search-form button"),
        ]
        for by, selector in button_selectors:
            try:
                button = self.driver.find_element(by, selector)
            except NoSuchElementException:
                continue
            print(f"Found search button using: {by} = {selector}")
            return button
        return None

    def extract_listings(self, max_results=None, max_pages=None):
        """Collect business listings from the current results, page by page.

        Args:
            max_results: Stop once this many listings were collected. A falsy
                value (``None`` or ``0``) means no limit.
            max_pages: Stop after scraping this many pages. A falsy value
                means no limit.

        Returns:
            A list of ``{"Business Name": str, "Phone Number": str}`` dicts.
            Listings without a usable name are skipped; a missing phone is
            recorded as ``"Not Available"``. If an unexpected error interrupts
            the run, the listings gathered so far are returned.
        """
        if max_results:
            print(f"Extracting up to {max_results} listings...\n")
        else:
            print("Extracting ALL listings from all pages...\n")

        results = []
        current_page = 1

        try:
            while True:
                print(f"{'='*60}")
                print(f"SCRAPING PAGE {current_page}")
                print(f"{'='*60}\n")

                listings = self._find_listings(current_page)
                if not listings:
                    print(f"No listings found on page {current_page}. Stopping.")
                    break

                for idx, listing in enumerate(listings, 1):
                    if max_results and len(results) >= max_results:
                        print(f"\nReached maximum of {max_results} results. Stopping.\n")
                        return results

                    print(f"Processing listing #{len(results) + 1} (Page {current_page}, Item {idx})...")

                    try:
                        record = self._extract_listing(listing)
                    except Exception as e:
                        # One broken card must not abort the whole run, but
                        # the failure should be visible.
                        print(f"  Warning: Could not process listing ({e}). Skipping.")
                        continue

                    if record is None:
                        print("  Warning: Business name not found. Skipping this listing.")
                        continue

                    results.append(record)
                    if len(results) % 50 == 0:
                        print(f"  ✓ Progress: {len(results)} listings extracted so far...")

                if max_pages and current_page >= max_pages:
                    print(f"\nReached maximum of {max_pages} pages. Stopping.\n")
                    break

                print(f"\nLooking for 'Next' button to go to page {current_page + 1}...\n")

                if not self._go_to_next_page(current_page):
                    print("No 'Next' button found or last page reached. Stopping pagination.\n")
                    break
                current_page += 1

            print(f"{'='*60}")
            print(f"Extraction complete. Successfully scraped {len(results)} listings from {current_page} page(s).")
            print(f"{'='*60}\n")
            return results

        except Exception as e:
            print(f"ERROR during extraction: {str(e)}")
            return results

    def _find_listings(self, current_page):
        """Return the listing elements on the current page (possibly empty)."""
        for selector in self._LISTING_SELECTORS:
            try:
                listings = self.driver.find_elements(By.CSS_SELECTOR, selector)
            except Exception:
                continue
            if listings:
                print(f"Found {len(listings)} listings on page {current_page} using selector: {selector}\n")
                return listings
        return []

    def _extract_listing(self, listing):
        """Return the result dict for one listing, or ``None`` without a name."""
        self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", listing)
        time.sleep(0.2)

        business_name = self._extract_business_name(listing)
        if not business_name:
            return None

        return {
            "Business Name": business_name,
            "Phone Number": self._extract_phone(listing),
        }

    def _extract_business_name(self, listing):
        """Return the first non-empty business name found, else ``None``."""
        for name_sel in self._NAME_SELECTORS:
            try:
                name_element = listing.find_element(By.CSS_SELECTOR, name_sel)
            except NoSuchElementException:
                continue
            business_name = name_element.text.strip()
            # An element with empty text is treated the same as a missing one.
            if business_name:
                return business_name
        return None

    def _extract_phone(self, listing):
        """Return the listing's phone number, or ``"Not Available"``."""
        # Method 1: visible phone text.
        for phone_sel in self._PHONE_SELECTORS:
            try:
                for phone_el in listing.find_elements(By.CSS_SELECTOR, phone_sel):
                    phone_text = self._clean_phone_text(phone_el.text)
                    if phone_text:
                        return phone_text
            except Exception:
                continue

        # Method 2: tel: links.
        try:
            for phone_link in listing.find_elements(By.CSS_SELECTOR, "a[href^='tel:']"):
                phone_text = self._phone_from_tel_href(phone_link.get_attribute('href'))
                if phone_text:
                    return phone_text
        except Exception:
            pass  # best effort: fall through to the placeholder below

        return "Not Available"

    @staticmethod
    def _clean_phone_text(raw_text):
        """Collapse whitespace; return the text only if it looks like a phone.

        "Looks like a phone" means the first character is a digit or ``(``.
        Returns ``None`` otherwise.
        """
        phone_text = ' '.join((raw_text or '').split())
        if phone_text and (phone_text[0].isdigit() or phone_text.startswith('(')):
            return phone_text
        return None

    @staticmethod
    def _phone_from_tel_href(href):
        """Return the number from a ``tel:`` href, or ``None`` if too short.

        Only a leading ``tel:`` scheme and a leading ``+1`` country code are
        removed, so a "+1" occurring later in the number is left intact.
        """
        if not href:
            return None
        phone_text = href.strip()
        if phone_text.lower().startswith('tel:'):
            phone_text = phone_text[4:].strip()
        if phone_text.startswith('+1'):
            phone_text = phone_text[2:].strip()
        if len(phone_text) >= 10:
            return phone_text
        return None

    def _go_to_next_page(self, current_page):
        """Click the pagination 'Next' control and wait for the page change.

        Returns ``True`` when the browser moved on to a new page. Returns
        ``False`` when there is no usable 'Next' control, when it is disabled,
        or when the click produced no detectable page change.
        """
        for next_sel in self._NEXT_BUTTON_SELECTORS:
            try:
                next_buttons = self.driver.find_elements(By.CSS_SELECTOR, next_sel)
            except Exception:
                continue
            if not next_buttons:
                continue
            next_button = next_buttons[0]

            try:
                button_class = next_button.get_attribute('class') or ''
                if 'disabled' in button_class.lower():
                    # A disabled control means the last page; do not try the
                    # remaining selectors, which may match the same control.
                    print("Next button is disabled. Reached last page.\n")
                    return False

                self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_button)
                time.sleep(1)

                # Keep a handle on the current results so its staleness can
                # signal that the page was replaced.
                try:
                    old_results_container = self.driver.find_element(By.CSS_SELECTOR, ".search-results")
                except NoSuchElementException:
                    old_results_container = self.driver.find_element(By.TAG_NAME, "body")

                url_before_click = self.driver.current_url
                self.driver.execute_script("arguments[0].click();", next_button)
                print(f"✓ Clicked 'Next' button (selector: {next_sel})")
            except Exception as e:
                print(f"  Warning: 'Next' button via '{next_sel}' was unusable ({e}). Trying next selector.")
                continue

            print("  Waiting for new page content to load...")
            try:
                self.wait.until(EC.staleness_of(old_results_container))
                self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".search-results")))
            except TimeoutException:
                if self.driver.current_url == url_before_click:
                    # Nothing indicates a page change; continuing would
                    # re-scrape the same listings, possibly forever.
                    print("  Warning: Page transition timed out and the URL did not change. Stopping pagination.")
                    return False
                print("  Warning: Page transition timed out or didn't refresh strictly.")
                return True

            print(f"✓ Page {current_page + 1} loaded successfully.")
            return True

        return False

    def close(self):
        """Quit the browser; failures are reported but never raised."""
        print("Closing browser...")
        try:
            self.driver.quit()
        except Exception as e:
            print(f"Warning: browser did not shut down cleanly: {e}")
        print("Browser closed.\n")


def save_to_excel(results, job_type, location):
    """Write the scraped rows to a timestamped spreadsheet file.

    The file name is built from ``job_type``, ``location`` and the current
    time, then reduced to alphanumerics plus ``_``, ``.`` and ``-``. If the
    Excel export raises, the same data is written as CSV instead.

    Returns the name of the file written, or ``None`` when ``results`` is
    empty.
    """
    if not results:
        print("No results to save.")
        return None

    frame = pd.DataFrame(results)

    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    raw_name = f"YellowPages_{job_type}_{location}_{stamp}.xlsx"
    kept_punctuation = ('_', '.', '-')
    filename = "".join(
        ch for ch in raw_name if ch.isalnum() or ch in kept_punctuation
    )

    banner = '=' * 80
    try:
        frame.to_excel(filename, index=False, sheet_name='Results')
        print(f"\n{banner}")
        print(f"✓ SUCCESS! Data saved to: {filename}")
        print(f"{banner}\n")
        return filename
    except Exception as e:
        print(f"\nERROR saving to Excel: {str(e)}")
        # Same base name, CSV extension.
        csv_filename = filename.replace('.xlsx', '.csv')
        frame.to_csv(csv_filename, index=False)
        print(f"✓ Saved as CSV instead: {csv_filename}\n")
        return csv_filename


def print_summary(results):
    """Print the total listing count and a preview of the first five rows.

    Does nothing when ``results`` is empty. Each row must provide the
    ``'Business Name'`` and ``'Phone Number'`` keys.
    """
    if not results:
        return

    rule = "=" * 80

    print("\n" + rule)
    print(f"SCRAPING SUMMARY: {len(results)} Total Listings")
    print(rule)

    print("\nFirst 5 Results:")
    preview = results[:5]
    for offset, row in enumerate(preview):
        name = row['Business Name']
        phone = row['Phone Number']
        print(f"{offset + 1}. {name:<40} | {phone}")

    print("\n" + rule + "\n")


def main():
    """Prompt for a search, run the scraper, and save/preview the results.

    Reads the job type and location from standard input and aborts early if
    either is blank. Any exception raised while scraping is reported as a
    fatal error; the browser is always closed once it has been started.
    """
    banner = "=" * 80
    print(banner)
    print("YELLOW PAGES WEB SCRAPER (Bypass Enabled)")
    print(banner)

    job_type = input("Enter Job Type (e.g. Plumber): ").strip()
    location = input("Enter Location (e.g. Los Angeles CA): ").strip()

    if not (job_type and location):
        print("Error: Inputs cannot be empty.")
        return

    scraper = None
    try:
        scraper = YellowPagesScraper()
        scraper.navigate_to_site()
        scraper.perform_search(job_type, location)

        results = scraper.extract_listings()

        if not results:
            print("No results found.")
        else:
            save_to_excel(results, job_type, location)
            print_summary(results)

    except Exception as e:
        print(f"\nFATAL ERROR: {str(e)}")

    finally:
        # scraper stays None when the constructor itself failed.
        if scraper:
            print("\nBrowser closing in 5 seconds...")
            time.sleep(5)
            scraper.close()

    print("Done.")


# Run the interactive scraper only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

YELLOW PAGES WEB SCRAPER (Bypass Enabled)


Enter Job Type (e.g. Plumber):  property management
Enter Location (e.g. Los Angeles CA):  Cameron Park, CA


Initializing Undetected Chrome WebDriver (Bypassing Cloudflare)...
WebDriver initialized successfully.

Navigating to https://www.yellowpages.com/...
Page loaded.

Searching for: 'property management' in 'Cameron Park, CA'...
Found 'What' field using: id = query
✓ Entered 'property management' in 'What' field.
Found 'Where' field using: id = location
✓ Entered 'Cameron Park, CA' in 'Where' field.
Found search button using: css selector = button[type='submit']
✓ Search button clicked.

Navigated to: https://www.yellowpages.com/search?search_terms=property+management&geo_location_terms=Cameron+Park%2C+CA

Search results loaded successfully.

Extracting ALL listings from all pages...

SCRAPING PAGE 1

Found 30 listings on page 1 using selector: .srp-listing

Processing listing #1 (Page 1, Item 1)...
Processing listing #2 (Page 1, Item 2)...
Processing listing #3 (Page 1, Item 3)...
Processing listing #4 (Page 1, Item 4)...
Processing listing #5 (Page 1, Item 5)...
Processing listing #6 (P