In [1]:
%pip install selenium

Collecting selenium
  Downloading selenium-4.33.0-py3-none-any.whl.metadata (7.5 kB)
Collecting trio~=0.30.0 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting typing_extensions~=4.13.2 (from selenium)
  Downloading typing_extensions-4.13.2-py3-none-any.whl.metadata (3.0 kB)
Collecting outcome (from trio~=0.30.0->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.12.2->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.33.0-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.30.0-py3-none-any.whl (499 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m499.2/499.2 kB[0m [31m23.

In [6]:
import os
import re
import time
import random
import json
import pandas as pd
import requests
import threading
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from collections import deque
from concurrent.futures import ThreadPoolExecutor, as_completed
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Constants
# The Google API key is read from the environment only. Never commit a key to
# the notebook: a key that was ever embedded here as a fallback must be treated
# as compromised and rotated. With no key set, Places API calls will be rejected.
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY', "")
TEXT_SEARCH_URL = "https://maps.googleapis.com/maps/api/place/textsearch/json"
PLACE_DETAILS_URL = "https://maps.googleapis.com/maps/api/place/details/json"
# Search terms and locations; every (location, tag) pair is queried.
SEARCH_TAGS = ["restaurant"]
SEARCH_LOCATIONS = ["Los Angeles"]
# Upper bound on pages fetched per business website.
MAX_PAGES_PER_SITE = 15
CAREERS_MAX_PAGES = 5
# Number of businesses processed concurrently.
THREAD_POOL_WORKERS = 5
# Pool of User-Agent strings; one is picked at random per request.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
]

# Third-party job sites to exclude
THIRD_PARTY_JOB_SITES = [
    "indeed.com", "ziprecruiter.com", "linkedin.com/jobs", "monster.com",
    "glassdoor.com", "careerbuilder.com", "simplyhired.com", "dice.com",
    "flexjobs.com", "upwork.com", "freelancer.com", "fiverr.com",
    "snagajob.com", "workday.com", "bamboohr.com", "greenhouse.io"
]

# URL Shortening Configuration
MAX_URL_LENGTH = 80
ELLIPSIS = "..."

# Build the headless Chrome session used by Selenium.
# NOTE(review): nothing visible in this file uses `driver` apart from the final
# `driver.quit()` — confirm whether the browser still needs to be started here.
chrome_options = Options()
for chrome_flag in (
    "--headless",
    "--disable-dev-shm-usage",
    "--no-sandbox",
    "--disable-gpu",
    "--window-size=1920,1080",
):
    chrome_options.add_argument(chrome_flag)
del chrome_flag  # keep the module namespace free of the loop variable
driver = webdriver.Chrome(options=chrome_options)

# ================= URL PROCESSING FUNCTIONS =================

def shorten_url(url, max_length=MAX_URL_LENGTH):
    """Return a display form of ``url`` that is at most ``max_length`` characters.

    URLs that already fit are returned unchanged. Longer ones are cut down to
    ``scheme://host`` plus as much of the path as fits, followed by ``ELLIPSIS``.
    When the URL has no path, only ``scheme://host`` is returned (query string
    and fragment are dropped, no ellipsis is appended).

    Args:
        url: The URL to shorten.
        max_length: Maximum length of the returned string.

    Returns:
        The original or truncated URL string.
    """
    if len(url) <= max_length:
        return url

    # Characters left once the ellipsis is reserved; clamp at zero so a tiny
    # max_length cannot turn into a negative (from-the-end) slice.
    budget = max(max_length - len(ELLIPSIS), 0)

    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. a broken IPv6 literal).
        return url[:budget] + ELLIPSIS

    # Without both scheme and host there is no meaningful "base" to keep;
    # rebuilding one would prepend a bogus "://", so truncate the raw string.
    if not parsed.scheme or not parsed.netloc:
        return url[:budget] + ELLIPSIS

    base_url = f"{parsed.scheme}://{parsed.netloc}"

    if len(base_url) >= budget:
        return base_url[:budget] + ELLIPSIS

    if parsed.path:
        path_part = parsed.path[:budget - len(base_url)]
        return f"{base_url}{path_part}{ELLIPSIS}"

    return base_url

def check_url_status(url, timeout=5):
    """Report whether ``url`` currently answers with a non-error HTTP status.

    A HEAD request is tried first. If it fails or returns an error status, a
    GET request is tried as well, because some servers reject HEAD (403/405)
    for pages that load fine in a browser.

    Args:
        url: URL to probe. Empty values and the "None found" placeholder are
            treated as not working.
        timeout: Per-request timeout in seconds.

    Returns:
        True if either request ends with a status code below 400, else False.
    """
    if not url or url == "None found":
        return False

    headers = {"User-Agent": random.choice(USER_AGENTS)}

    try:
        response = requests.head(url, headers=headers, timeout=timeout, allow_redirects=True)
        if response.status_code < 400:
            return True
    except Exception:
        # Best-effort probe: any failure here just means "try GET instead".
        pass

    try:
        # stream=True fetches only the headers; the body is never downloaded.
        with requests.get(url, headers=headers, timeout=timeout,
                          allow_redirects=True, stream=True) as response:
            return response.status_code < 400
    except Exception:
        return False

def process_urls(url_string):
    """Keep only the reachable URLs of a ';'-separated list, shortened.

    Empty input and the "None found" placeholder are handed back untouched.
    Otherwise each non-empty entry is probed with ``check_url_status``; the
    survivors are passed through ``shorten_url`` and re-joined with "; ".
    If nothing survives, "None found" is returned.
    """
    if url_string == "None found" or not url_string:
        return url_string

    candidates = (piece.strip() for piece in url_string.split(";"))
    reachable = [
        shorten_url(candidate)
        for candidate in candidates
        if candidate and check_url_status(candidate)
    ]

    if not reachable:
        return "None found"
    return "; ".join(reachable)

def is_third_party_job_site(url):
    """Return True when ``url`` contains any entry of THIRD_PARTY_JOB_SITES.

    The comparison is a case-insensitive substring test against the whole URL.
    """
    for site in THIRD_PARTY_JOB_SITES:
        if site in url.lower():
            return True
    return False

# ================= GOOGLE MAPS API FUNCTIONS =================

def search_places(query, location, timeout=10):
    """Run a Google Places text search for ``"<query> in <location>"``.

    Args:
        query: Search term (e.g. a business category).
        location: Free-text location appended to the query.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A pair ``(place_ids, names)`` of parallel lists taken from the
        response's ``results``. Both lists are empty when the request fails,
        the HTTP status is not 200, the body is not JSON, or the API reports
        a non-success ``status``.
    """
    params = {"query": f"{query} in {location}", "key": GOOGLE_API_KEY}
    try:
        # Without a timeout a stalled connection would block the run forever.
        response = requests.get(TEXT_SEARCH_URL, params=params, timeout=timeout)
    except requests.RequestException as e:
        print(f"Error: {e}")
        return [], []

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return [], []

    try:
        payload = response.json()
    except ValueError as e:
        print(f"Error: {e}")
        return [], []

    # The API answers HTTP 200 even for rejected requests (bad key, quota);
    # the real outcome is in the JSON "status" field.
    status = payload.get("status")
    if status not in (None, "OK", "ZERO_RESULTS"):
        print(f"Error: {status} {payload.get('error_message', '')}".rstrip())
        return [], []

    results = payload.get("results", [])
    return [r.get("place_id") for r in results], [r.get("name") for r in results]

def get_place_details(place_id, timeout=10):
    """Fetch name, address, website and phone number for one place.

    Args:
        place_id: Google Places identifier of the business.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The ``result`` dict of the Place Details response (``{}`` when the
        response has none), or ``None`` when the request fails, the HTTP
        status is not 200, the body is not JSON, or the API reports a
        non-success ``status``.
    """
    params = {
        "place_id": place_id,
        "fields": "name,formatted_address,website,formatted_phone_number",
        "key": GOOGLE_API_KEY,
    }
    try:
        # Called from worker threads: an unhandled network error here would
        # surface through future.result() and abort the whole run.
        response = requests.get(PLACE_DETAILS_URL, params=params, timeout=timeout)
    except requests.RequestException as e:
        print(f"Error fetching details for {place_id}: {e}")
        return None

    if response.status_code != 200:
        return None

    try:
        payload = response.json()
    except ValueError:
        return None

    # HTTP 200 does not imply success; the API-level outcome is in "status".
    status = payload.get("status")
    if status not in (None, "OK"):
        print(f"Error fetching details for {place_id}: {status}")
        return None

    return payload.get("result", {})

# ================= EMAIL EXTRACTION FUNCTIONS =================

def extract_emails(text, soup=None):
    """Collect e-mail addresses from page text and, optionally, from links.

    Args:
        text: Plain text to scan.
        soup: Optional parsed document exposing ``find_all("a", href=True)``.
            When given, ``mailto:`` targets (lower-cased) and addresses found
            in the link text are collected too.

    Returns:
        A list of unique addresses, in no particular order.
    """
    pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    emails = set(re.findall(pattern, text))

    if soup:
        for a in soup.find_all("a", href=True):
            href = a['href'].strip()
            if href.lower().startswith("mailto:"):
                # Drop "?subject=..." and split multi-recipient links; each
                # recipient must be an address in full, not merely start with one.
                recipients = href[len("mailto:"):].split("?")[0]
                for candidate in recipients.split(","):
                    candidate = candidate.strip().lower()
                    if re.fullmatch(pattern, candidate):
                        emails.add(candidate)
            # Pull out just the address(es) from the link text, so surrounding
            # words are never stored as part of an "e-mail".
            emails.update(re.findall(pattern, a.get_text(strip=True)))
    return list(emails)

def extract_hr_emails(text):
    """Return the unique addresses in ``text`` whose local part marks them as HR.

    An address qualifies when, compared case-insensitively, it begins with one
    of: hr@, hiring@, recruiting@, talent@, jobs@, careers@. Original casing
    is kept in the result; ordering is unspecified.
    """
    hr_prefixes = ("hr@", "hiring@", "recruiting@", "talent@", "jobs@", "careers@")
    address_re = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")

    matching = {
        address
        for address in address_re.findall(text)
        if address.lower().startswith(hr_prefixes)
    }
    return list(matching)

def extract_general_emails(text):
    """Return the unique general-contact addresses found in ``text``.

    An address qualifies when, compared case-insensitively, it either begins
    with info@, contact@, hello@, support@ or admin@, or ends with @gmail.com
    or @yahoo.com. Original casing is kept; ordering is unspecified.
    """
    role_prefixes = ("info@", "contact@", "hello@", "support@", "admin@")
    free_mail_suffixes = ("@gmail.com", "@yahoo.com")

    collected = set()
    for address in re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", text):
        folded = address.lower()
        if folded.startswith(role_prefixes) or folded.endswith(free_mail_suffixes):
            collected.add(address)

    return list(collected)

def extract_sales_emails(text):
    """Return the unique sales-type addresses found in ``text``.

    An address qualifies when, compared case-insensitively, it begins with
    sales@, business@, partnerships@ or marketing@. Original casing is kept;
    ordering is unspecified.
    """
    sales_prefixes = ("sales@", "business@", "partnerships@", "marketing@")

    def _looks_like_sales(address):
        return address.lower().startswith(sales_prefixes)

    found = re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", text)
    return list(set(filter(_looks_like_sales, found)))

# ================= PAGE DETECTION FUNCTIONS =================

def detect_careers_page(url, text, soup):
    """Decide whether a fetched page is the site's own careers/jobs page.

    URLs recognised by ``is_third_party_job_site`` are rejected outright.
    Otherwise the page counts as a careers page when the URL contains a
    careers keyword, or when the page text contains one AND some <form> on
    the page mentions applying (apply/application/resume/cv/position).

    Returns:
        ``(True, url)`` for a careers page, ``(False, "")`` otherwise.
    """
    not_careers = (False, "")

    if is_third_party_job_site(url):
        return not_careers

    keywords = (
        "career", "careers", "job", "jobs", "employment", "hiring", "positions",
        "join our team", "work with us", "apply now", "open positions",
    )
    form_keywords = ("apply", "application", "resume", "cv", "position")

    lowered_url = url.lower()
    lowered_text = text.lower()

    keyword_in_url = any(keyword in lowered_url for keyword in keywords)
    keyword_in_text = any(keyword in lowered_text for keyword in keywords)

    # Scan forms lazily and stop at the first one that looks like an application form.
    application_form_found = False
    if soup:
        form_bodies = (form.get_text().lower() for form in soup.find_all("form"))
        application_form_found = any(
            any(word in body for word in form_keywords)
            for body in form_bodies
        )

    if keyword_in_url or (keyword_in_text and application_form_found):
        return True, url
    return not_careers

def detect_products_services_page(url, text, soup):
    """Decide whether a fetched page presents products, services or industries.

    Pages with empty text are rejected. Otherwise the page qualifies when the
    URL contains one of the indicator keywords, or when at least two distinct
    indicators occur in the text AND some <ul>/<ol>/<div> element's text
    contains an indicator.

    Returns:
        ``(True, url)`` when the page qualifies, ``(False, "")`` otherwise.
    """
    no_match = (False, "")
    if not text:
        return no_match

    indicators = (
        "products", "services", "solutions", "offerings", "what we do",
        "industries", "sectors", "specialties", "capabilities", "expertise",
        "portfolio", "catalog", "menu", "pricing", "packages",
    )

    lowered_url = url.lower()
    lowered_text = text.lower()

    url_hit = any(word in lowered_url for word in indicators)

    # Number of distinct indicators present somewhere in the page text
    distinct_hits = len([word for word in indicators if word in lowered_text])

    # Any list/container element mentioning an indicator counts as structured content
    structured = False
    if soup:
        container_texts = (
            element.get_text().lower()
            for element in soup.find_all(['ul', 'ol', 'div'])
        )
        structured = any(
            any(word in body for word in indicators)
            for body in container_texts
        )

    if url_hit or (distinct_hits >= 2 and structured):
        return True, url
    return no_match

# ================= WEB CRAWLING FUNCTIONS =================

def crawl_site_comprehensive(base_url, max_pages=15):
    """Crawl one website and collect contact e-mails and notable pages.

    Starting at ``base_url``, pages on the same host (or its subdomains) are
    fetched until the queue is empty or ``max_pages`` fetch attempts have been
    made. Failed fetches count toward that limit. Links whose text or href
    mention career/job/product/service/about/contact are visited first.

    Args:
        base_url: Absolute URL of the site's entry page.
        max_pages: Maximum number of fetch attempts.

    Returns:
        A dict with list values under "hr_emails", "general_emails",
        "sales_emails", "careers_pages" and "products_services_pages", plus
        the integer "pages_crawled".
    """
    visited = set()
    queue = deque([base_url])

    # Email collections
    hr_emails = set()
    general_emails = set()
    sales_emails = set()

    # Page collections
    careers_pages = set()
    products_services_pages = set()

    # Compare host names rather than raw netloc substrings, so that
    # "notexample.com" is not mistaken for "example.com".
    base_host = (urlparse(base_url).hostname or "").lower()
    if base_host.startswith("www."):
        base_host = base_host[len("www."):]

    def _same_site(host):
        """True when ``host`` is the base host or one of its subdomains."""
        host = (host or "").lower()
        return bool(base_host) and (host == base_host or host.endswith("." + base_host))

    priority_keywords = ['career', 'job', 'product', 'service', 'about', 'contact']
    pages_crawled = 0

    while queue and pages_crawled < max_pages:
        url = queue.popleft()
        if url in visited:
            continue
        visited.add(url)

        try:
            headers = {"User-Agent": random.choice(USER_AGENTS)}
            resp = requests.get(url, headers=headers, timeout=10)
            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, "html.parser")
            text = soup.get_text(" ", strip=True)
            anchors = soup.find_all("a", href=True)

            # Addresses often live only in mailto: links, not in visible text,
            # so feed those targets to the extractors alongside the page text.
            mailto_targets = []
            for a in anchors:
                href = a['href'].strip()
                if href.lower().startswith("mailto:"):
                    mailto_targets.append(href[len("mailto:"):].split("?")[0])
            email_text = " ".join([text, *mailto_targets])

            # Extract emails by type
            hr_emails.update(extract_hr_emails(email_text))
            general_emails.update(extract_general_emails(email_text))
            sales_emails.update(extract_sales_emails(email_text))

            # Detect page types
            is_careers, careers_url = detect_careers_page(url, text, soup)
            if is_careers:
                careers_pages.add(careers_url)

            is_products, products_url = detect_products_services_page(url, text, soup)
            if is_products:
                products_services_pages.add(products_url)

            # Discover more links
            for a in anchors:
                try:
                    # Drop the fragment: "#section" variants are the same page
                    # and would otherwise eat into the page budget.
                    parsed_url = urlparse(urljoin(url, a['href']))._replace(fragment="")
                except ValueError:
                    # One malformed href must not abort discovery for the page.
                    continue

                if parsed_url.scheme not in ("http", "https"):
                    continue
                if not _same_site(parsed_url.hostname):
                    continue

                new_url = parsed_url.geturl()
                if new_url in visited:
                    continue

                # Prioritize important pages
                link_text = a.get_text(strip=True).lower()
                href_lower = a['href'].lower()
                if any(keyword in link_text or keyword in href_lower for keyword in priority_keywords):
                    queue.appendleft(new_url)
                else:
                    queue.append(new_url)

            # Small random pause between successful fetches
            time.sleep(random.uniform(0.5, 1.0))

        except Exception as e:
            print(f"Error crawling {url}: {e}")

        pages_crawled += 1

    return {
        "hr_emails": list(hr_emails),
        "general_emails": list(general_emails),
        "sales_emails": list(sales_emails),
        "careers_pages": list(careers_pages),
        "products_services_pages": list(products_services_pages),
        "pages_crawled": pages_crawled
    }

# ================= BUSINESS PROCESSING =================

def process_business_comprehensive(place_id, name):
    """Build the output record for one business.

    Looks up the place details, then — if the business lists a website —
    crawls it and fills in the e-mail and page columns as "; "-joined strings.
    Columns with nothing found stay as empty strings.

    Returns:
        The record dict, or ``None`` when no place details are available.
    """
    print(f"Processing business: {name}")

    # Get Google Places details
    details = get_place_details(place_id)
    if not details:
        return None

    website = details.get("website", "")

    # Crawl-result key -> output column
    column_for = {
        "hr_emails": "Company HR Email",
        "general_emails": "General Email",
        "sales_emails": "Company Sales Email",
        "careers_pages": "Company Careers Page",
        "products_services_pages": "Company Products/Services Page",
    }

    record = {
        "Company Name": name,
        "Company Address": details.get("formatted_address", ""),
    }
    for column in column_for.values():
        record[column] = ""

    if not website:
        return record

    try:
        crawled = crawl_site_comprehensive(website, MAX_PAGES_PER_SITE)
        for crawl_key, column in column_for.items():
            found = crawled[crawl_key]
            if found:
                record[column] = "; ".join(found)
    except Exception as e:
        print(f"Failed to crawl {website}: {e}")

    return record

# ================= MAIN FUNCTION =================

def main():
    """Run the scraper end to end and print a summary report.

    For every (location, tag) pair the Places search is run, each business not
    already recorded is processed on a thread pool, and every finished record
    is appended to the checkpoint CSV immediately, so an interrupted run can
    be resumed. A business whose processing raises is reported and skipped.
    """
    print("🚀 Starting comprehensive business data scraper")

    checkpoint_file = "scrapped_data.csv"

    # Define CSV columns
    csv_columns = [
        "Company Name",
        "Company Address",
        "Company HR Email",
        "General Email",
        "Company Sales Email",
        "Company Careers Page",
        "Company Products/Services Page"
    ]
    email_columns = ["Company HR Email", "General Email", "Company Sales Email"]
    url_columns = ["Company Careers Page", "Company Products/Services Page"]

    # Load existing data
    if os.path.exists(checkpoint_file):
        existing_df = pd.read_csv(checkpoint_file)
        print(f"📄 Loaded {len(existing_df)} existing records")
    else:
        existing_df = pd.DataFrame(columns=csv_columns)

    # Names already recorded or already handled by an earlier search in this
    # run; keeps a business from being processed again for another tag/location.
    seen_names = set(existing_df['Company Name']) if not existing_df.empty else set()

    # Process each location and search tag
    for location in SEARCH_LOCATIONS:
        for tag in SEARCH_TAGS:
            print(f"\n🔍 Searching for {tag} in {location}")
            place_ids, names = search_places(tag, location)
            print(f"📍 Found {len(place_ids)} businesses")

            pending = [
                (pid, name)
                for pid, name in zip(place_ids, names)
                if name not in seen_names
            ]
            # Update only after filtering, so same-named entries inside one
            # result set (e.g. chain locations) are all still processed.
            seen_names.update(name for _, name in pending)

            # Process businesses with threading
            with ThreadPoolExecutor(max_workers=THREAD_POOL_WORKERS) as executor:
                futures = {
                    executor.submit(process_business_comprehensive, pid, name): name
                    for pid, name in pending
                }

                # Results are consumed on this (the calling) thread only, so
                # the DataFrame and CSV need no lock.
                for future in as_completed(futures):
                    try:
                        result = future.result()
                    except Exception as e:
                        # One failing business must not abort the whole run.
                        print(f"❌ Failed to process {futures[future]}: {e}")
                        continue
                    if not result:
                        continue

                    csv_result = result.copy()

                    # Process URLs (remove broken ones, shorten working ones)
                    for column in url_columns:
                        if result[column]:
                            csv_result[column] = process_urls(result[column])
                        else:
                            csv_result[column] = "None found"

                    # Set default values for empty fields
                    for column in email_columns:
                        if not csv_result[column]:
                            csv_result[column] = "None found"

                    # Add to DataFrame and save (checkpoint after every record)
                    new_row = pd.DataFrame([csv_result])
                    if existing_df.empty:
                        # Avoid concatenating with an empty frame.
                        existing_df = new_row
                    else:
                        existing_df = pd.concat([existing_df, new_row], ignore_index=True)
                    existing_df.to_csv(checkpoint_file, index=False)
                    print(f"✅ Saved: {result['Company Name']}")

    # Generate final report
    print("\n" + "="*50)
    print("📊 FINAL METRICS REPORT")
    print("="*50)

    total = len(existing_df)

    def _report(label, column):
        """Print how many rows have something other than 'None found' in ``column``."""
        count = int((existing_df[column] != 'None found').sum())
        # Guard the percentage: with no rows at all, total is 0.
        share = count / total * 100 if total else 0.0
        print(f"{label}: {count} ({share:.1f}%)")

    print(f"📈 Total businesses: {total}")
    _report("👔 With HR emails", 'Company HR Email')
    _report("📧 With general emails", 'General Email')
    _report("💼 With sales emails", 'Company Sales Email')
    _report("🎯 With careers pages", 'Company Careers Page')
    _report("📦 With products/services pages", 'Company Products/Services Page')

    print(f"\n🎉 Scraping completed! Data saved to: {checkpoint_file}")

if __name__ == "__main__":
    try:
        main()
    finally:
        # Always try to release the browser, even when main() raised.
        try:
            driver.quit()
            print("🔧 Selenium driver closed")
        except Exception as e:
            # Best-effort cleanup: report the problem instead of hiding it, and
            # do not catch KeyboardInterrupt/SystemExit as a bare except would.
            print(f"⚠️ Could not close Selenium driver: {e}")

🚀 Starting comprehensive business data scraper

🔍 Searching for restaurant in Los Angeles
📍 Found 20 businesses
Processing business: Girl & the Goat Los Angeles
Processing business: Perch
Processing business: Bottega Louie
Processing business: The Little Door
Processing business: Water Grill
Error crawling http://www.bottegalouie.com/pages/guestservices@bottegalouie.com: 404 Client Error: Not Found for url: https://www.bottegalouie.com/pages/guestservices@bottegalouie.com
Error crawling https://www.bottegalouie.com/pages/guestservices@bottegalouie.com: 404 Client Error: Not Found for url: https://www.bottegalouie.com/pages/guestservices@bottegalouie.com
Processing business: Bestia
Processing business: 71Above
✅ Saved: Bottega Louie
Processing business: Chi Spacca
✅ Saved: Water Grill
✅ Saved: The Little Door
Error crawling https://www.71above.com/: 403 Client Error: Forbidden for url: https://www.71above.com/
Processing business: République Café Bakery & République Restaurant
✅ Saved: 