In [2]:
# ==============================================================================
# Cell 1: Imports & Warning Filter
# ==============================================================================
import os
import random
import re
import threading
import time
import warnings
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urljoin, urlparse

import pandas as pd

# Web Scraping & Fuzzy Matching
import requests
from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from thefuzz import fuzz # V4: For fuzzy string matching

# Apify Client
from apify_client import ApifyClient

warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)

In [7]:
# ==============================================================================
# Cell 2: Configuration and Constants
# ==============================================================================

# The Apify API token is a secret and must never be committed with the notebook.
# It is read from the APIFY_TOKEN environment variable instead.
# NOTE(review): a literal token was previously stored on this line; treat that
# token as compromised and revoke it in the Apify console.
APIFY_TOKEN = os.environ.get("APIFY_TOKEN")
if not APIFY_TOKEN:
    raise RuntimeError(
        "APIFY_TOKEN environment variable is not set. "
        "Export your Apify API token before running this notebook."
    )
client = ApifyClient(APIFY_TOKEN)

# Number of worker threads used when enriching the scraped businesses.
THREAD_POOL_WORKERS = 5
USER_AGENTS = ["Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"]
# Links pointing at these hosts are never accepted as a company's own careers page.
THIRD_PARTY_JOB_SITES = ["indeed.com", "linkedin.com/jobs", "glassdoor.com", "workday.com", "greenhouse.io"]

# V4: Expanded keywords for careers pages
CAREER_KEYWORDS = ["career", "jobs", "employment", "hiring", "work with us", "join us", "join our team", "opportunities", "vacancies"]

# --- Selenium Setup ---
# A single headless Chrome instance is created here and shared by the crawler;
# it is closed again in the cleanup cell at the end of the notebook.
print("Setting up headless Selenium driver...")
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-gpu")
driver = webdriver.Chrome(options=chrome_options)
print("Selenium driver is ready.")

Setting up headless Selenium driver...
Selenium driver is ready.


In [8]:
# ==============================================================================
# Cell 3: V4 Helper Functions (Smarter & More Robust)
# ==============================================================================

def is_valid_email(email):
    """Return True when ``email`` is a plausible e-mail address.

    Args:
        email: Candidate address. Anything that is not a string is rejected.

    Returns:
        bool: False for non-strings, for text that is not exactly one
        ``local@domain.tld`` token, and for scraped asset file names such as
        ``logo@2x.png`` that merely look like addresses.
    """
    if not isinstance(email, str):
        return False
    # fullmatch (rather than match + "$") also rejects a trailing newline.
    # The TLD may be up to 63 letters so that modern TLDs such as
    # ".restaurant" or ".photography" are accepted.
    if not re.fullmatch(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,63}", email):
        return False
    # Retina-style image names ("icon@2x.png") match the pattern above.
    if email.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.webp', '.svg')):
        return False
    return True

def extract_emails_v4(text, soup=None):
    """Collect candidate e-mail addresses from page text and ``mailto:`` links.

    Args:
        text: Visible page text to scan. ``None`` is treated as empty text.
        soup: Optional parsed document; when given, every ``<a href="mailto:...">``
            is inspected as well.

    Returns:
        set[str]: The addresses accepted by ``is_valid_email``.
    """
    # The TLD quantifier is deliberately wide: with a short cap such as {2,6}
    # an address like "info@joes.restaurant" is truncated by findall into the
    # bogus "info@joes.restau" instead of being captured whole.
    pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,63}'
    potential_emails = set(re.findall(pattern, text or ""))
    if soup:
        # URL schemes are case-insensitive, so "MAILTO:" links count too.
        for a in soup.find_all("a", href=lambda href: href and href.lower().startswith("mailto:")):
            # Drop the "mailto:" prefix and any "?subject=..." query string.
            recipients = a['href'][7:].split("?")[0]
            # A single mailto link may list several comma-separated recipients.
            for address in recipients.split(","):
                address = address.strip()
                if address:
                    potential_emails.add(address)
    return {email for email in potential_emails if is_valid_email(email)}

def categorize_emails_v4(email_list):
    """Sort addresses into HR, sales and general-contact buckets.

    An address belongs to a bucket when its local part ends with one of the
    bucket's role names as a whole token, e.g. ``hr@``, ``la-hr@`` or
    ``team.jobs@``. Plain substring hits such as ``sarahr@`` or ``myinfo@``
    are not treated as role addresses. Buckets are tried in the order
    HR_Best, Sales_Best, HR_Better and the first hit wins.

    Args:
        email_list: Iterable of address strings; non-string entries are skipped.

    Returns:
        dict[str, str]: Keys ``HR_Best``, ``HR_Better`` and ``Sales_Best``, each
        mapped to the sorted addresses joined with ``"; "`` (empty string when
        the bucket has no addresses).
    """
    # Checked in priority order.
    role_buckets = (
        ("HR_Best", ("careers", "jobs", "hr", "recruiting")),
        ("Sales_Best", ("sales", "orders", "partner")),
        ("HR_Better", ("info", "contact", "hello")),
    )
    # The role must start the address or follow a separator, and must be
    # immediately followed by "@".
    bucket_matchers = [
        (bucket, re.compile(r"(?:^|[^a-z0-9])(?:%s)@" % "|".join(roles)))
        for bucket, roles in role_buckets
    ]

    categorized = {"HR_Best": set(), "HR_Better": set(), "Sales_Best": set()}
    for email in email_list:
        if not isinstance(email, str):
            continue
        e_lower = email.lower()
        for bucket, matcher in bucket_matchers:
            if matcher.search(e_lower):
                categorized[bucket].add(email)
                break
    return {k: "; ".join(sorted(v)) for k, v in categorized.items()}

def detect_careers_page_v4(soup, base_url):
    """V4: Uses both exact keywords and fuzzy matching to find the best careers page.

    Every ``<a href>`` in ``soup`` is scored:

    * 100 when the link text contains one of ``CAREER_KEYWORDS``;
    * otherwise the best ``fuzz.ratio`` between a URL path segment and a
      keyword, if it exceeds 90 (catches typos such as 'carriers').

    Links to ``THIRD_PARTY_JOB_SITES``, links that are not plain web links
    (``mailto:``, ``tel:``, ``javascript:`` ...) and hrefs that cannot be
    parsed are ignored. The first link with the highest score wins.

    Args:
        soup: Parsed document exposing ``find_all``.
        base_url: URL of the crawled page, used to resolve relative links.

    Returns:
        str: Absolute URL of the best candidate with fragment and query string
        removed, or ``""`` when no link scores above zero.
    """
    best_link = ""
    highest_score = 0
    for a in soup.find_all("a", href=True):
        href = a['href']
        try:
            parsed = urlparse(href)
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal); skip this link only.
            continue
        # Only relative links and http(s) links can be a careers *page*.
        if parsed.scheme not in ("", "http", "https"):
            continue
        href_lower = href.lower()
        if any(site in href_lower for site in THIRD_PARTY_JOB_SITES):
            continue

        text_lower = a.get_text().lower()
        score = 0
        if any(kw in text_lower for kw in CAREER_KEYWORDS):
            score = 100
        else:
            for part in parsed.path.lower().split('/'):
                if not part:
                    continue
                part_score = max(
                    (fuzz.ratio(part.replace('-', ' '), kw) for kw in CAREER_KEYWORDS),
                    default=0,
                )
                if part_score > 90: # A 90% match is great for catching typos like 'carriers'
                    score = part_score
                    break

        if score > highest_score:
            try:
                candidate = urljoin(base_url, href)
            except ValueError:
                # Joining can fail on malformed URLs; keep the previous best.
                continue
            highest_score = score
            best_link = candidate
    return best_link.split('#')[0].split('?')[0] # Return clean URL

def extract_socials_v4(soup, base_url):
    """V4: More robust social media extraction.

    For each network the first ``<a>`` whose ``href`` points at a profile on
    that network is returned.

    Args:
        soup: Parsed document exposing ``find_all``.
        base_url: URL of the crawled page, used to resolve relative links.

    Returns:
        dict[str, str]: Keys ``Instagram``, ``Facebook``, ``X_Twitter`` and
        ``LinkedIn``; each value is an absolute URL or ``""`` when not found.
    """
    links = {"Instagram": "", "Facebook": "", "X_Twitter": "", "LinkedIn": ""}

    # The host must start the href or follow "/" or "."; without this
    # boundary "x.com" also matches "wix.com", "netflix.com", "dropbox.com" ...
    host_boundary = r"(?:^|[/.])"
    # Share / tweet-intent widgets are not profile pages.
    not_share_widget = r"(?!(?:sharer|share|intent)(?:[/?.#]|$))"

    social_patterns = {
        "Facebook": host_boundary + r"facebook\.com/" + not_share_widget + r"[A-Za-z0-9._-]+",
        "Instagram": host_boundary + r"instagram\.com/[A-Za-z0-9._-]+",
        "X_Twitter": host_boundary + r"(?:twitter|x)\.com/" + not_share_widget + r"[A-Za-z0-9_]+",
        "LinkedIn": host_boundary + r"linkedin\.com/company/[A-Za-z0-9._-]+",
    }
    for key, pattern in social_patterns.items():
        # Host names are case-insensitive.
        matcher = re.compile(pattern, re.IGNORECASE)
        for a in soup.find_all("a", href=matcher):
            links[key] = urljoin(base_url, a['href'])
            break # Found the first one, move to the next social network
    return links

In [9]:
# ==============================================================================
# Cell 4: V4 Focused Website Crawler
# ==============================================================================

# The notebook uses ONE global Selenium driver, but crawl_website_v4 is called
# from several ThreadPoolExecutor workers. A WebDriver session must not be used
# by two threads at once: without this lock, thread A's page_source can belong
# to the page thread B just navigated to.
_DRIVER_LOCK = threading.Lock()


def _empty_crawl_result():
    """Return a fresh 'nothing found' result so callers never share mutable state."""
    return {"crawled_emails": [], "crawled_careers_page": "", "crawled_socials": {}}


def crawl_website_v4(base_url):
    """V4: Crawls homepage using Selenium for emails, careers page, and social links.

    Args:
        base_url: Homepage URL. Anything empty or not a string yields an
            empty result without touching the browser.

    Returns:
        dict: ``crawled_emails`` (list of str), ``crawled_careers_page`` (str)
        and ``crawled_socials`` (dict). Any failure while loading or parsing
        the page is reported on stdout and yields the empty result, so one bad
        site never aborts the whole run.
    """
    if not base_url or not isinstance(base_url, str):
        return _empty_crawl_result()
    try:
        # Navigate, wait and snapshot the HTML as one atomic step per thread.
        with _DRIVER_LOCK:
            driver.get(base_url)
            time.sleep(2)  # give client-side scripts a moment to render
            page_source = driver.page_source
        # Parsing works on the snapshot only, so it can run outside the lock.
        soup = BeautifulSoup(page_source, "html.parser")
        text = soup.get_text(" ", strip=True)
        return {
            "crawled_emails": list(extract_emails_v4(text, soup)),
            "crawled_careers_page": detect_careers_page_v4(soup, base_url),
            "crawled_socials": extract_socials_v4(soup, base_url),
        }
    except Exception as exc:
        # Best effort by design: report the failure instead of hiding it.
        print(f"⚠️ Crawl failed for {base_url}: {exc}")
        return _empty_crawl_result()

In [10]:
# ==============================================================================
# Cell 5: V4 Main Orchestration Logic
# ==============================================================================
def _first_or(values, fallback=""):
    """Return the first entry of ``values``; ``fallback`` if it is missing or empty."""
    if isinstance(values, str):
        # A bare string would otherwise yield only its first character.
        return values or fallback
    return next(iter(values or []), fallback)


def process_business_item_v4(item):
    """Enrich one scraped business record with crawled contact data.

    Args:
        item: Mapping for one business. The keys read here are ``title``,
            ``website``, ``address``, ``phone``, ``categoryName``, ``emails``,
            ``instagrams``, ``facebooks``, ``twitters`` and ``linkedIns``.
            Keys that are missing or ``None`` are treated as empty.

    Returns:
        dict: Flat record with the columns written to the final CSV.
    """
    title = item.get('title', 'Unknown Business')
    website = item.get("website")
    print(f"-> Processing: {title}")

    # "or []" covers keys that are present but set to None.
    master_email_list = set(item.get('emails') or [])
    crawled_data = crawl_website_v4(website) if website else {}
    master_email_list.update(crawled_data.get('crawled_emails') or [])

    categorized_emails = categorize_emails_v4(list(master_email_list))
    crawled_socials = crawled_data.get("crawled_socials") or {}

    final_data = {
        'Title': title,
        'Address': item.get('address'),
        'Website': website,
        'Phone': item.get('phone'),
        'Category': item.get('categoryName'),
        'HR_Best_Email': categorized_emails.get('HR_Best'),
        'HR_Better_Email': categorized_emails.get('HR_Better'),
        'Sales_Best_Email': categorized_emails.get('Sales_Best'),
        'Careers_Page': crawled_data.get('crawled_careers_page'),
        # Prioritize Apify's data, but fall back to our crawled data
        'Instagram': _first_or(item.get('instagrams'), crawled_socials.get("Instagram", "")),
        'Facebook': _first_or(item.get('facebooks'), crawled_socials.get("Facebook", "")),
        'X_Twitter': _first_or(item.get('twitters'), crawled_socials.get("X_Twitter", "")),
        'LinkedIn': _first_or(item.get('linkedIns'), crawled_socials.get("LinkedIn", "")),
    }
    return final_data

# --- Main execution block ---
# --- Step 1: run the Apify Google Places Actor -------------------------------
print("🚀 [V4] Starting Apify Google Places Scraper...")
run_input = {
    "searchStringsArray": ["restaurant"], "locationQuery": "Los Angeles", "maxCrawledPlacesPerSearch": 50,
    "language": "en", "scrapeContacts": True, "includeEnrichments": True,
    "maxImages": 0, "skipClosedPlaces": True,
}
# .call() starts the Actor AND waits for it to finish, so the run is already
# over by the time the next line executes.
run = client.actor("compass/crawler-google-places").call(run_input=run_input)
if run is None:
    # NOTE(review): the client is typed as possibly returning None — confirm.
    raise RuntimeError("Apify Actor call returned no run object; cannot fetch results.")
print(f"✅ Apify Actor run finished (Run ID: {run.get('id')}, status: {run.get('status')})")
if run.get("status") not in (None, "SUCCEEDED"):
    print("⚠️ Actor run did not succeed; the dataset may be incomplete.")
dataset_items = client.dataset(run["defaultDatasetId"]).list_items().items
print(f"✅ Apify finished. Found {len(dataset_items)} businesses.")

# --- Step 2: enrich every business with data crawled from its own website ----
enriched_results = []
print("\n🚀 [V4] Starting deep crawl with fuzzy matching...")
with ThreadPoolExecutor(max_workers=THREAD_POOL_WORKERS) as executor:
    # Map each future to its business title so failures can be attributed.
    future_to_item = {executor.submit(process_business_item_v4, item): item.get('title') for item in dataset_items}
    for future in as_completed(future_to_item):
        business_title = future_to_item[future]
        try:
            enriched_data = future.result()
            enriched_results.append(enriched_data)
            print(f"✔️ Finished processing: {enriched_data['Title']}")
        except Exception as exc:
            # One failing business must not abort the remaining ones.
            print(f"⚠️ '{business_title}' generated an exception: {exc}")

# --- Step 3: persist the results ----------------------------------------------
print("\n🚀 [V4] Finalizing data and saving to CSV...")
final_df = pd.DataFrame(enriched_results)
final_df = final_df.fillna("Not found")
output_filename = "final_business_directory_LA_v4.csv"
final_df.to_csv(output_filename, index=False)
print(f"\n🎉 Success! All data saved to '{output_filename}'")
print(final_df.head())

🚀 [V4] Starting Apify Google Places Scraper...


[36m[apify.crawler-google-places runId:aQF0kI7CShKdt7b3z][0m -> Status: RUNNING, Message: 
[36m[apify.crawler-google-places runId:aQF0kI7CShKdt7b3z][0m -> 2025-06-24T18:07:28.766Z ACTOR: Pulling Docker image of build TIVsfdCTM4nrhY1m7 from registry.
[36m[apify.crawler-google-places runId:aQF0kI7CShKdt7b3z][0m -> 2025-06-24T18:07:28.768Z ACTOR: Creating Docker container.
[36m[apify.crawler-google-places runId:aQF0kI7CShKdt7b3z][0m -> 2025-06-24T18:07:28.833Z ACTOR: Starting Docker container.
[36m[apify.crawler-google-places runId:aQF0kI7CShKdt7b3z][0m -> 2025-06-24T18:07:32.158Z [32mINFO[39m  System info[90m {"apifyVersion":"3.3.2","apifyClientVersion":"2.12.0","crawleeVersion":"3.13.1","osType":"Linux","nodeVersion":"v18.20.8"}[39m
[36m[apify.crawler-google-places runId:aQF0kI7CShKdt7b3z][0m -> 2025-06-24T18:07:32.965Z
[36m[apify.crawler-google-places runId:aQF0kI7CShKdt7b3z][0m -> 2025-06-24T18:07:32.967Z [32mINFO[39m  *** STARTING TO LOOK FOR (GEO)LOCATION ***
[3

✅ Apify Actor run initiated. Waiting for results... (Run ID: aQF0kI7CShKdt7b3z)
✅ Apify finished. Found 50 businesses.

🚀 [V4] Starting deep crawl with fuzzy matching...
-> Processing: Fortune House Restaurant
-> Processing: Pupuseria & Restaurant Las Isletas
-> Processing: Anarbagh - Woodland Hills Indian Restaurant
-> Processing: Cafe FooDelicious
-> Processing: Koko's Middle Eastern
-> Processing: Evergreen Restaurant
✔️ Finished processing: Pupuseria & Restaurant Las Isletas
✔️ Finished processing: Evergreen Restaurant
-> Processing: Chiguacle, Restaurant & Tortilleria
-> Processing: Versailles Cuban Food- Encino
-> Processing: El Rancherito Restaurant
-> Processing: The Baker Restaurant Bakery & Cafe
✔️ Finished processing: Fortune House Restaurant
✔️ Finished processing: Anarbagh - Woodland Hills Indian Restaurant
✔️ Finished processing: El Rancherito Restaurant
-> Processing: Flooky's - Woodland Hills
✔️ Finished processing: Cafe FooDelicious
-> Processing: Vinh Loi Tofu
✔️ Fini

In [11]:
# ==============================================================================
# Cell 6: Cleanup
# ==============================================================================
# Shut down the shared headless Chrome session created in the configuration
# cell. Status messages are printed before and after the driver is closed.
closing_message = "\n🔧 Closing Selenium driver..."
done_message = "✅ Cleanup complete."

print(closing_message)
driver.quit()
print(done_message)


🔧 Closing Selenium driver...
✅ Cleanup complete.
