# Main code starts here

In [1]:
# ==============================================================================
# Cell 1: Imports & Warning Filter
# ==============================================================================
import os
import random
import re
import threading
import time
import warnings
from collections import deque
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urljoin, urlparse

import pandas as pd

# Web scraping libraries for deep crawl
import requests
from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Apify Client for initial scraping
from apify_client import ApifyClient

In [2]:
# V3: Register a global "ignore" filter for BeautifulSoup's XMLParsedAsHTMLWarning
# so that this warning category is no longer shown by later cells.
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)


In [3]:
# ==============================================================================
# Cell 2: Configuration and Constants
# ==============================================================================

# --- Apify Configuration ---
# The API token is a secret, so it is read from the APIFY_TOKEN environment
# variable instead of being written into the notebook (where it would end up in
# version control and in every shared copy of the file).
# NOTE: any token that was ever committed in plain text must be considered
# leaked and should be revoked/rotated in the Apify console.
APIFY_TOKEN = os.environ.get("APIFY_TOKEN", "").strip()
if not APIFY_TOKEN:
    raise RuntimeError(
        "APIFY_TOKEN is not set. Export it before starting Jupyter "
        "(e.g. `export APIFY_TOKEN=<your token>`) instead of hardcoding it."
    )
client = ApifyClient(APIFY_TOKEN)

# --- Web Crawling Configuration ---
MAX_PAGES_PER_SITE = 10 # Reduced, as we are more targeted
THREAD_POOL_WORKERS = 5  # Number of businesses enriched in parallel
# One of these is picked at random for each plain HTTP request.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36",
]

# --- Logic Configuration ---
# Links whose href contains one of these fragments are never reported as a
# business's own careers page.
THIRD_PARTY_JOB_SITES = ["indeed.com", "linkedin.com/jobs", "glassdoor.com", "workday.com", "greenhouse.io"]

# --- Selenium Setup ---
print("Setting up headless Selenium driver...")

# Command-line switches handed to Chrome, applied in this order.
chrome_options = Options()
for chrome_flag in (
    "--headless",
    "--disable-dev-shm-usage",
    "--no-sandbox",
    "--disable-gpu",
    "--window-size=1920,1080",
):
    chrome_options.add_argument(chrome_flag)

# When this cell is re-run, shut down the browser left over from the previous
# run; on a fresh kernel `driver` does not exist yet and the NameError is ignored.
try:
    driver.quit()
except NameError:
    pass

driver = webdriver.Chrome(options=chrome_options)
print("Selenium driver is ready.")


Setting up headless Selenium driver...
Selenium driver is ready.


In [4]:
# ==============================================================================
# Cell 3: V3 Helper Functions (Focused on HR Info)
# ==============================================================================

# Shape of an e-mail address as recognised by this notebook. The TLD may be up
# to 63 letters (the DNS label limit): a short cap such as 6 silently truncates
# addresses on TLDs like ".kitchen" or ".restaurant" into a bogus shorter one.
_EMAIL_RE = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,63}")

# File names such as "logo@2x.png" look like e-mail addresses to the regex.
_ASSET_SUFFIXES = ('.png', '.jpg', '.jpeg', '.gif', '.webp', '.svg', '.bmp', '.ico')


def is_valid_email(email):
    """Checks if an extracted email string is likely a real email and not obfuscated junk.

    Args:
        email: Candidate address. Anything that is not a ``str`` is rejected.

    Returns:
        bool: ``True`` when the whole string has the ``local@domain.tld`` shape,
        contains no consecutive dots and does not end in an image/asset file
        extension; ``False`` otherwise.
    """
    if not isinstance(email, str):
        return False
    # fullmatch (rather than match + "$") also rejects a trailing newline.
    if not _EMAIL_RE.fullmatch(email):
        return False
    if ".." in email:
        return False
    if email.lower().endswith(_ASSET_SUFFIXES):
        return False
    return True


def extract_emails_v3(text, soup=None):
    """Extracts and validates emails to avoid junk data.

    Args:
        text: Plain text to scan; ``None`` or an empty string yields no matches.
        soup: Optional parsed document exposing ``find_all``. When given, the
            recipients of its ``mailto:`` links are collected as well.

    Returns:
        set[str]: Every distinct candidate that passes ``is_valid_email``.
    """
    # Local import: `unquote` is not part of the notebook's import cell.
    from urllib.parse import unquote

    potential_emails = set(_EMAIL_RE.findall(text or ""))

    if soup:
        for anchor in soup.find_all("a", href=True):
            href = anchor["href"].strip()
            # The URI scheme is case-insensitive ("MAILTO:" is legal).
            if not href.lower().startswith("mailto:"):
                continue
            # Drop "?subject=..." style parameters, then decode %40 and friends.
            recipients = unquote(href[len("mailto:"):].split("?", 1)[0])
            # A single mailto link may name several comma-separated recipients.
            for address in recipients.split(","):
                potential_emails.add(address.strip())

    return {email for email in potential_emails if is_valid_email(email)}

def categorize_emails_v3(email_list, category_type='hr'):
    """Categorizes emails as BEST, BETTER, or EXCLUDE for HR or Sales.

    An address is classified by the mailbox name in front of the ``@``. The
    name is compared as a whole and as its ``.``/``_``/``+``/``-`` separated
    parts, so ``la.careers@x.com`` counts as a careers mailbox while
    ``mohr@x.com`` is no longer mistaken for ``hr@`` (which plain substring
    matching would do).

    Args:
        email_list: Iterable of address strings; ``None`` is treated as empty
            and non-string entries are skipped.
        category_type: ``'hr'`` or ``'sales'``.

    Returns:
        dict[str, str]: Keys ``"BEST"``, ``"BETTER"`` and ``"EXCLUDE"``, each
        mapped to the sorted matching addresses joined with ``"; "`` (an empty
        string when there are none). Addresses that are neither excluded nor
        best (generic inboxes, personal addresses, ...) land in ``"BETTER"``.

    Raises:
        KeyError: If ``category_type`` is not a known category.
    """
    mailbox_names = {
        'hr': {
            "best": {"careers", "jobs", "hr", "recruiting", "talent"},
            "exclude": {"sales", "support", "billing", "press"},
        },
        'sales': {
            "best": {"sales", "orders", "partner", "inquiries"},
            "exclude": {"careers", "jobs", "hr", "support"},
        },
    }
    cat_keys = mailbox_names[category_type]
    categorized = {"BEST": set(), "BETTER": set(), "EXCLUDE": set()}

    for email in email_list or []:
        if not isinstance(email, str):
            continue
        local_part = email.lower().split("@", 1)[0]
        # Whole mailbox name plus its separator-delimited components.
        tokens = {local_part, *re.split(r"[._+\-]+", local_part)}

        # Exclusion wins over "best", exactly as in the keyword priority order.
        if tokens & cat_keys["exclude"]:
            categorized["EXCLUDE"].add(email)
        elif tokens & cat_keys["best"]:
            categorized["BEST"].add(email)
        else:
            categorized["BETTER"].add(email)

    return {label: "; ".join(sorted(emails)) for label, emails in categorized.items()}

def detect_careers_page_v3(soup, base_url, excluded_sites=None):
    """Finds the single best careers page URL.

    Every ``<a href>`` whose href or link text mentions a careers keyword is a
    candidate. Candidates are resolved against ``base_url``, stripped of their
    ``#fragment`` and the shortest one wins.

    Args:
        soup: Parsed document exposing ``find_all``.
        base_url: URL of the page the links were found on; used to resolve
            relative hrefs.
        excluded_sites: Optional iterable of lower-case fragments; a link whose
            href contains any of them is skipped. Defaults to the module-level
            ``THIRD_PARTY_JOB_SITES``.

    Returns:
        str: The chosen absolute http(s) URL, or ``""`` when no link qualifies
        (including when every candidate is 200 characters or longer).
    """
    if excluded_sites is None:
        excluded_sites = THIRD_PARTY_JOB_SITES

    career_keywords = ("career", "jobs", "employment", "hiring", "work with us")
    best_link = ""
    min_len = 200  # candidates at least this long are never selected

    for a in soup.find_all("a", href=True):
        href = a['href'].strip()
        href_lower = href.lower()
        text_lower = a.get_text().lower()

        if not any(kw in href_lower or kw in text_lower for kw in career_keywords):
            continue
        if any(site in href_lower for site in excluded_sites):
            continue

        # Compare without the fragment so "/careers#open" is judged as "/careers".
        full_url = urljoin(base_url, href).split('#')[0]

        # "mailto:careers@...", "tel:" or "javascript:" links match the keywords
        # and are short, but they are not pages.
        if urlparse(full_url).scheme not in ("http", "https"):
            continue

        if len(full_url) < min_len:
            min_len = len(full_url)
            best_link = full_url

    return best_link


In [5]:
# ==============================================================================
# Cell 4: V3 Focused Website Crawler
# ==============================================================================

# Guards the single module-level Selenium `driver`. This function is executed
# by several ThreadPoolExecutor workers at once, and one browser session must
# not be navigated by two threads at the same time (one thread would otherwise
# read another thread's page).
_DRIVER_LOCK = threading.Lock()


def crawl_site_for_hr_info_v3(base_url):
    """
    V3: This function is now focused. It only crawls a website to find:
    1. Additional emails.
    2. The primary careers page.
    It no longer looks for social media.

    The page is fetched with `requests` first. If that raises (network error,
    timeout, HTTP 4xx/5xx status, parsing failure) the shared Selenium `driver`
    is used as a fallback. If both attempts fail a warning is printed and empty
    results are returned; no exception escapes.

    Args:
        base_url: Homepage URL of the business. Anything that is not a
            non-empty string yields empty results without any network access.

    Returns:
        dict: ``{"crawled_emails": list[str], "crawled_careers_page": str}``.
    """
    if not base_url or not isinstance(base_url, str):
        return {"crawled_emails": [], "crawled_careers_page": ""}

    def _harvest(html):
        """Parse one HTML document into (set of emails, careers page URL)."""
        soup = BeautifulSoup(html, "html.parser")
        text = soup.get_text(" ", strip=True)
        return extract_emails_v3(text, soup), detect_careers_page_v3(soup, base_url)

    all_emails = set()
    best_careers_page = ""

    try:
        headers = {"User-Agent": random.choice(USER_AGENTS)}
        resp = requests.get(base_url, headers=headers, timeout=10)
        # Without this, a 403/404/500 error page would be parsed as if it were
        # the real site and the browser fallback would never be tried.
        resp.raise_for_status()
        all_emails, best_careers_page = _harvest(resp.text)
    except Exception as http_exc:
        try:
            # Hold the lock from navigation until the source has been read so
            # that the HTML really belongs to `base_url`.
            with _DRIVER_LOCK:
                driver.get(base_url)
                time.sleep(2)  # give client-side scripts a moment to render
                page_source = driver.page_source
            all_emails, best_careers_page = _harvest(page_source)
        except Exception as browser_exc:
            # Best effort: report the failure instead of hiding it, then fall
            # through to the empty result.
            print(
                f"⚠️ Could not crawl {base_url}: "
                f"requests failed ({http_exc!r}); Selenium failed ({browser_exc!r})"
            )

    return {
        "crawled_emails": list(all_emails),
        "crawled_careers_page": best_careers_page,
    }

In [6]:
# ==============================================================================
# Cell 5: V3 Main Orchestration Logic
# ==============================================================================
def process_business_item_v3(item):
    """V3: Enriches an Apify item with focused deep crawl data.

    Args:
        item: One dataset record (dict) from the Apify run. Any of the keys
            read here may be missing or hold ``None``; both are treated as
            "no data".

    Returns:
        dict: One flat output row with the business details, the categorized
        HR/Sales emails, the careers page found by the crawler (``None`` when
        the business has no website) and the first social link of each kind.
    """
    # `dict.get(key, default)` only uses the default when the key is absent;
    # a key that is present with a null value still returns None, hence `or`.
    title = item.get('title') or 'Unknown Business'
    website = item.get("website")

    def _first(key):
        """First entry of the list stored under `key`, or '' if there is none."""
        return next(iter(item.get(key) or []), '')

    print(f"-> Processing: {title}")

    # 1. Get all emails from Apify
    master_email_list = {email for email in (item.get('emails') or []) if email}

    # 2. Run focused crawler for HR info
    crawled_data = {}
    if website:
        crawled_data = crawl_site_for_hr_info_v3(website)
        # Add newly crawled emails to the master list
        master_email_list.update(crawled_data.get('crawled_emails', []))

    # 3. Categorize the complete, combined list of emails
    combined_emails = list(master_email_list)
    hr_emails = categorize_emails_v3(combined_emails, 'hr')
    sales_emails = categorize_emails_v3(combined_emails, 'sales')

    # 4. Assemble the final record, directly using Apify's social data
    final_data = {
        'Title': title,
        'Address': item.get('address'),
        'Website': website,
        'Phone': item.get('phone'),
        'Category': item.get('categoryName'),
        'HR_Best_Email': hr_emails.get('BEST'),
        'HR_Better_Email': hr_emails.get('BETTER'),
        'Sales_Best_Email': sales_emails.get('BEST'),
        'Careers_Page': crawled_data.get('crawled_careers_page'),
        'Instagram': _first('instagrams'),  # Get first link from Apify list
        'Facebook': _first('facebooks'),
        'X_Twitter': _first('twitters'),
        'LinkedIn': _first('linkedIns'),
    }
    return final_data

# --- Step 1: Run Apify Actor ---
print("🚀 [V3] Starting Apify Google Places Scraper...")
run_input = {
    "searchStringsArray": ["restaurant"],
    "locationQuery": "Los Angeles",
    "maxCrawledPlacesPerSearch": 50,
    "language": "en",
    "scrapeContacts": True,
    "includeEnrichments": True,
    "maxImages": 0,
    "skipClosedPlaces": True,
}
# `.call()` starts the actor and only returns once the run has ended, so
# everything below executes after the scrape is over.
run = client.actor("compass/crawler-google-places").call(run_input=run_input)
if not run:
    # No run object means there is no dataset id to read results from.
    raise RuntimeError("Apify actor call returned no run object; cannot fetch results.")
print(f"✅ Apify Actor run ended with status {run.get('status')}. Fetching results... (Run ID: {run.get('id')})")
if run.get("status") != "SUCCEEDED":
    # Keep going: a run that did not succeed may still have stored some items.
    print("⚠️ The run did not succeed; the dataset may be empty or incomplete.")
dataset_items = client.dataset(run["defaultDatasetId"]).list_items().items
print(f"✅ Apify finished. Found {len(dataset_items)} businesses.")

# --- Step 2: Deep crawl with ThreadPool ---
# Rows are collected in completion order, which generally differs from the
# order of `dataset_items`.
enriched_results = []
print(f"\n🚀 [V3] Starting focused deep crawl of {len(dataset_items)} websites for HR info...")
with ThreadPoolExecutor(max_workers=THREAD_POOL_WORKERS) as executor:
    # Map every future back to its business title so that a failure can be
    # attributed to the business that caused it.
    future_to_item = {executor.submit(process_business_item_v3, item): item.get('title') for item in dataset_items}
    for future in as_completed(future_to_item):
        business_title = future_to_item[future]
        try:
            enriched_data = future.result()
        except Exception as exc:
            # One failing business must not abort the whole batch.
            print(f"⚠️ '{business_title}' generated an exception: {exc!r}")
        else:
            enriched_results.append(enriched_data)
            print(f"✔️ Finished processing: {enriched_data['Title']}")

# --- Step 3: Save to CSV ---
print("\n🚀 [V3] Finalizing data and saving to CSV...")
final_df = pd.DataFrame(enriched_results)
# Use "Not found" for every empty cell. `fillna` only covers None/NaN, while
# businesses without emails, careers page or social links produce empty
# strings, so those are replaced explicitly as well.
final_df = final_df.fillna("Not found").replace("", "Not found")
output_filename = "final_business_directory_LA_v3.csv"
final_df.to_csv(output_filename, index=False)
print(f"\n🎉 Success! All data saved to '{output_filename}'")
print(final_df.head())

🚀 [V3] Starting Apify Google Places Scraper...


[36m[apify.crawler-google-places runId:cSptrzDMju9r7J6Xm][0m -> Status: RUNNING, Message: 
[36m[apify.crawler-google-places runId:cSptrzDMju9r7J6Xm][0m -> 2025-06-23T23:43:11.806Z ACTOR: Pulling Docker image of build TIVsfdCTM4nrhY1m7 from registry.
[36m[apify.crawler-google-places runId:cSptrzDMju9r7J6Xm][0m -> 2025-06-23T23:43:11.809Z ACTOR: Creating Docker container.
[36m[apify.crawler-google-places runId:cSptrzDMju9r7J6Xm][0m -> 2025-06-23T23:43:11.883Z ACTOR: Starting Docker container.
[36m[apify.crawler-google-places runId:cSptrzDMju9r7J6Xm][0m -> 2025-06-23T23:43:14.910Z [32mINFO[39m  System info[90m {"apifyVersion":"3.3.2","apifyClientVersion":"2.12.0","crawleeVersion":"3.13.1","osType":"Linux","nodeVersion":"v18.20.8"}[39m
[36m[apify.crawler-google-places runId:cSptrzDMju9r7J6Xm][0m -> 2025-06-23T23:43:16.021Z
[36m[apify.crawler-google-places runId:cSptrzDMju9r7J6Xm][0m -> 2025-06-23T23:43:16.023Z [32mINFO[39m  *** STARTING TO LOOK FOR (GEO)LOCATION ***
[3

✅ Apify Actor run initiated. Waiting for results... (Run ID: cSptrzDMju9r7J6Xm)
✅ Apify finished. Found 50 businesses.

🚀 [V3] Starting focused deep crawl of 50 websites for HR info...
-> Processing: Le Reve Restaurant
-> Processing: The Local Peasant - Sherman Oaks Restaurant and Bar
-> Processing: Gaby's Restaurant
-> Processing: West Restaurant & Lounge
-> Processing: Hu's Szechwan Restaurant
-> Processing: Peppone Restaurant
✔️ Finished processing: Le Reve Restaurant
-> Processing: Il Moro Restaurant
✔️ Finished processing: The Local Peasant - Sherman Oaks Restaurant and Bar
-> Processing: Arth Bar & Kitchen - Indian Restaurant in Culver City
✔️ Finished processing: West Restaurant & Lounge
-> Processing: Haifa Restaurant
-> Processing: Pita Bar + Grill - Mediterranean Restaurant Brentwood LA
-> Processing: Siam Chan Thai Restaurant
✔️ Finished processing: Peppone Restaurant
✔️ Finished processing: Haifa Restaurant
✔️ Finished processing: Pita Bar + Grill - Mediterranean Restaurant

In [7]:
# ==============================================================================
# Cell 6: Cleanup
# ==============================================================================
print("\n🔧 Closing Selenium driver...")
try:
    driver.quit()
except NameError:
    # Same guard as in the setup cell: if the driver was never created in this
    # kernel there is nothing to shut down, and cleanup should not crash.
    print("ℹ️ No Selenium driver found; nothing to close.")
print("✅ Cleanup complete.")


🔧 Closing Selenium driver...
✅ Cleanup complete.
