In [3]:
!pip install requests pandas selenium webdriver-manager fuzzywuzzy python-Levenshtein

Collecting pandas
  Downloading pandas-2.3.1-cp311-cp311-win_amd64.whl.metadata (19 kB)
Collecting selenium
  Using cached selenium-4.34.2-py3-none-any.whl.metadata (7.5 kB)
Collecting webdriver-manager
  Using cached webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting fuzzywuzzy
  Using cached fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-Levenshtein
  Using cached python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting numpy>=1.23.2 (from pandas)
  Downloading numpy-2.3.2-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting trio~=0.30.0 (from selenium)
  Using cached trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Using cached trio_websocket-0.12.2-py3-none-any

In [7]:
!pip install google


Collecting google
  Downloading google-3.0.0-py2.py3-none-any.whl.metadata (627 bytes)
Downloading google-3.0.0-py2.py3-none-any.whl (45 kB)
Installing collected packages: google
Successfully installed google-3.0.0


Business Lead Generation and Email Scraping
This notebook scrapes business details from Yelp, detects websites using Yelp and Google Search, categorizes businesses into those with and without websites, and extracts emails from Facebook pages and business websites. It generates CSVs for businesses with and without websites and prepares data for cold emailing with tailored service offers (website development or chatbot integration).
Dependencies:
pip install requests pandas selenium webdriver-manager fuzzywuzzy python-Levenshtein beautifulsoup4 googlesearch-python

Requirements:

A valid Yelp API key (replace API_KEY in the code).
Chrome WebDriver (automatically handled by webdriver-manager).
Manual Facebook login for email scraping.
Optional: Hunter.io API key for improved email extraction (commented out).

Outputs:

businesses_with_websites.csv: Businesses with detected websites.
businesses_without_websites.csv: Businesses without websites.
final_leads.csv: All businesses with emails and service offers.

Compliance:

Ensure compliance with Yelp and Facebook terms of service.
Follow CAN-SPAM Act or GDPR for cold emailing (include unsubscribe links, verify emails).


In [9]:
import requests
import pandas as pd
import csv
import re
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from fuzzywuzzy import fuzz
from bs4 import BeautifulSoup
from googlesearch import search

# Yelp API Key
API_KEY = '8xEkMW9G-MyNjsVSh36yj6pvjmffoSnxl_XlARrk5UyHZv1IeKq65LYTMftgTLBPqqzfa8SVrK1F-fikdMjkwzkNmq9uc8A5vhO7W3iA8DaOhWXIfzwkijLkVnVyaHYx'
HEADERS = {'Authorization': f'Bearer {API_KEY}'}

# Optional: Hunter.io API Key (uncomment if available)
# HUNTER_API_KEY = 'your_hunter_api_key'

In [11]:
def search_yelp(term, zip_code, limit=50):
    """Search Yelp for businesses by term and zip code."""
    url = 'https://api.yelp.com/v3/businesses/search'
    params = {
        'term': term,
        'location': zip_code,
        'limit': limit
    }
    try:
        response = requests.get(url, headers=HEADERS, params=params)
        response.raise_for_status()
        return response.json().get('businesses', [])
    except requests.RequestException as e:
        print(f"❌ Error searching Yelp: {e}")
        return []

In [13]:
def check_website(url):
    """Check if a website URL is valid."""
    try:
        response = requests.get(url, timeout=5)
        return response.status_code == 200
    except:
        return False

def find_website(business_name, address):
    """Search Google for the business website."""
    query = f"{business_name} {address} official website"
    try:
        for url in search(query, num_results=5):
            if 'yelp.com' not in url and 'facebook.com' not in url:
                return url if check_website(url) else ''
        return ''
    except Exception as e:
        print(f"⚠️ Error searching Google for website: {e}")
        return ''

In [15]:
def process_businesses(businesses):
    """Process Yelp businesses, detect websites, and categorize them."""
    businesses_with_websites = []
    businesses_without_websites = []
    
    for b in businesses:
        # Check Yelp for website (avoid Yelp URLs)
        website = b.get('url', '') if not b.get('url', '').lower().startswith('https://www.yelp.com') else ''
        
        # If no website found, try Google Search
        if not website:
            website = find_website(b.get('name', ''), ", ".join(b['location'].get('display_address', [])))
        
        business_data = {
            'Name': b.get('name', ''),
            'Phone': b.get('display_phone', ''),
            'Address': ", ".join(b['location'].get('display_address', [])),
            'Category': ", ".join([cat['title'] for cat in b.get('categories', [])]),
            'Website': website,
            'Source': 'Yelp'
        }
        
        # Categorize based on website presence
        if business_data['Website']:
            businesses_with_websites.append(business_data)
        else:
            businesses_without_websites.append(business_data)
    
    return businesses_with_websites, businesses_without_websites

In [17]:
def save_to_csv(businesses, filename):
    """Save business data to a CSV file."""
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['Name', 'Phone', 'Address', 'Category', 'Website', 'Source'])
        writer.writeheader()
        writer.writerows(businesses)
    print(f"✅ Saved {len(businesses)} leads to {filename}")

In [19]:
def create_driver():
    """Initialize Chrome WebDriver with options."""
    options = webdriver.ChromeOptions()
    options.add_argument("--start-maximized")
    options.add_argument("--disable-notifications")
    service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=service, options=options)

In [21]:
def search_facebook(driver, business_name):
    """Search Facebook for business pages by name."""
    search_url = f"https://www.facebook.com/search/pages?q={business_name.replace(' ', '%20')}"
    try:
        driver.get(search_url)
        time.sleep(6)
        links = driver.find_elements(By.XPATH, "//a[@href and @role='link']")
        valid_links = [link.get_attribute("href") for link in links if link.get_attribute("href") and "facebook.com" in link.get_attribute("href") and not any(x in link.get_attribute("href") for x in ["/search/", "/profile.php", "/people/"])]
        return valid_links[:5]  # Limit to top 5 to avoid excessive scraping
    except Exception as e:
        print(f"❌ Error searching Facebook: {e}")
        return []

In [23]:
def match_address(driver, target_address, page_url):
    """Validate Facebook page by matching address with fuzzy matching."""
    try:
        driver.get(page_url)
        time.sleep(6)
        page_text = driver.find_element(By.TAG_NAME, "body").text
        score = fuzz.token_set_ratio(target_address.lower(), page_text.lower())
        return score >= 70  # Adjustable threshold
    except Exception as e:
        print(f"⚠️ Error matching address for {page_url}: {e}")
        return False

In [25]:
def extract_email_facebook(driver):
    """Extract email from a Facebook page using regex."""
    try:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight / 2);")
        time.sleep(4)
        text = driver.find_element(By.TAG_NAME, "body").text
        emails = re.findall(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", text)
        return emails[0] if emails else ""
    except Exception as e:
        print(f"⚠️ Error extracting email from Facebook: {e}")
        return ""

In [27]:
def extract_email_website(url):
    """Extract email from a business website using BeautifulSoup."""
    try:
        response = requests.get(url, timeout=5)
        soup = BeautifulSoup(response.text, 'html.parser')
        text = soup.get_text()
        emails = re.findall(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", text)
        return emails[0] if emails else ""
    except Exception as e:
        print(f"⚠️ Error extracting email from website {url}: {e}")
        return ""

In [29]:
def get_email_hunter(business_name, website):
    """Placeholder for Hunter.io email extraction (requires API key)."""
    # Uncomment and add your Hunter.io API key to use
    # url = f"https://api.hunter.io/v2/domain-search?domain={website}&api_key={HUNTER_API_KEY}"
    # try:
    #     response = requests.get(url)
    #     response.raise_for_status()
    #     data = response.json()
    #     return data.get('data', {}).get('emails', [{}])[0].get('value', '')
    # except Exception as e:
    #     print(f"⚠️ Error with Hunter.io for {business_name}: {e}")
    #     return ""
    return ""  # Return empty string if not using Hunter.io

In [31]:
def scrape_emails(input_csv, output_csv):
    """Scrape emails from Facebook and websites for businesses in the input CSV."""
    businesses = pd.read_csv(input_csv).to_dict('records')
    driver = create_driver()
    
    print("🔐 Please log in to Facebook manually...")
    driver.get("https://www.facebook.com")
    input("✅ Press Enter after logging into Facebook...")
    
    results = []
    for i, business in enumerate(businesses):
        name = business['Name']
        address = business['Address']
        website = business.get('Website', '')
        print(f"🔍 [{i+1}] Processing: {name} - {address}")
        
        # Try extracting email
        matched_email = ""
        
        # Step 1: Try website (if available)
        if website:
            matched_email = extract_email_website(website)
            if matched_email:
                print(f"📧 Email Found (Website): {matched_email}")
        
        # Step 2: Try Facebook if no email from website
        if not matched_email:
            fb_links = search_facebook(driver, name)
            for link in fb_links:
                if match_address(driver, address, link):
                    matched_email = extract_email_facebook(driver)
                    if matched_email:
                        print(f"📧 Email Found (Facebook): {matched_email}")
                        break
                    else:
                        print(f"⚠️ No email on matched Facebook page")
                time.sleep(2)  # Avoid rate limiting
        
        # Step 3: Try Hunter.io (optional, uncomment if API key available)
        # if not matched_email and website:
        #     matched_email = get_email_hunter(name, website)
        #     if matched_email:
        #         print(f"📧 Email Found (Hunter.io): {matched_email}")
        
        if not matched_email:
            print("❌ No matching email found")
        
        results.append({
            "Name": name,
            "Phone": business.get('Phone', ''),
            "Address": address,
            "Category": business.get('Category', ''),
            "Website": website,
            "Email": matched_email,
            "Source": business.get('Source', 'Yelp'),
            "Service Offered": "Chatbot Integration" if website else "Website Development"
        })
        time.sleep(5)  # Avoid rate limiting
    
    driver.quit()
    
    # Save results to CSV
    with open(output_csv, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=["Name", "Phone", "Address", "Category", "Website", "Email", "Source", "Service Offered"])
        writer.writeheader()
        writer.writerows(results)
    print(f"✅ Done! Leads saved to: {output_csv}")

In [33]:
import smtplib
from email.mime.text import MIMEText

def send_cold_email(to_email, business_name, service_offered, sender_email, sender_password):
    """Send a cold email with a tailored service offer."""
    if not to_email:
        print(f"❌ No email provided for {business_name}")
        return
    
    if service_offered == "Website Development":
        subject = "Boost Your Business with a Professional Website"
        body = f"""Hi {business_name},

We noticed your business doesn’t have a website. Our team specializes in creating affordable, high-quality websites to help you attract more customers.

Reply to schedule a free consultation!
Best,
[Your Name]
Unsubscribe: [Your Unsubscribe Link]
"""
    else:
        subject = "Enhance Your Website with a Smart Chatbot"
        body = f"""Hi {business_name},

Your website looks great! Want to engage visitors 24/7? We offer chatbot integration to answer customer queries and boost conversions.

Reply to learn more!
Best,
[Your Name]
Unsubscribe: [Your Unsubscribe Link]
"""
    
    msg = MIMEText(body)
    msg['Subject'] = subject
    msg['From'] = sender_email
    msg['To'] = to_email
    
    try:
        with smtplib.SMTP('smtp.gmail.com', 587) as server:
            server.starttls()
            server.login(sender_email, sender_password)
            server.send_message(msg)
            print(f"📧 Email sent to {to_email}")
    except Exception as e:
        print(f"❌ Error sending email to {to_email}: {e}")

# Example usage (uncomment to test, replace with your email and app password)
# sender_email = "your_email@gmail.com"
# sender_password = "your_app_password"
# df = pd.read_csv("final_leads.csv")
# for _, row in df.iterrows():
#     send_cold_email(row['Email'], row['Name'], row['Service Offered'], sender_email, sender_password)

In [37]:
# Input section
zip_code = input("Enter ZIP Code (e.g., 90223): ")
keyword = input("Enter business type (e.g., bakery, plumber): ")

# Scrape Yelp data
raw_results = search_yelp(keyword, zip_code)
businesses_with_websites, businesses_without_websites = process_businesses(raw_results)

# Save to separate CSVs
save_to_csv(businesses_with_websites, "businesses_with_websites.csv")
save_to_csv(businesses_without_websites, "businesses_without_websites.csv")

# Scrape emails for all businesses
all_businesses = businesses_with_websites + businesses_without_websites
if all_businesses:
    save_to_csv(all_businesses, "all_businesses.csv")  # Temporary CSV for email scraping
    scrape_emails("all_businesses.csv", "final_leads.csv")

Enter ZIP Code (e.g., 90223):  79707
Enter business type (e.g., bakery, plumber):  Doctor


⚠️ Error searching Google for website: search() got an unexpected keyword argument 'num_results'
⚠️ Error searching Google for website: search() got an unexpected keyword argument 'num_results'
⚠️ Error searching Google for website: search() got an unexpected keyword argument 'num_results'
⚠️ Error searching Google for website: search() got an unexpected keyword argument 'num_results'
⚠️ Error searching Google for website: search() got an unexpected keyword argument 'num_results'
⚠️ Error searching Google for website: search() got an unexpected keyword argument 'num_results'
⚠️ Error searching Google for website: search() got an unexpected keyword argument 'num_results'
⚠️ Error searching Google for website: search() got an unexpected keyword argument 'num_results'
⚠️ Error searching Google for website: search() got an unexpected keyword argument 'num_results'
⚠️ Error searching Google for website: search() got an unexpected keyword argument 'num_results'
⚠️ Error searching Google for 

✅ Press Enter after logging into Facebook... 


🔍 [1] Processing: Patel Raj R MD - 4214 Andrews Hwy, Ste 100B, Midland, TX 79703
⚠️ Error extracting email from website nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?
❌ No matching email found
🔍 [2] Processing: Midland Primary Care - 1300 W Wall St, Midland, TX 79701
⚠️ Error extracting email from website nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?
❌ No matching email found
🔍 [3] Processing: JOYCE ALASE, MD - 207 Tradewinds Blvd C, Ste C, Midland, TX 79706
⚠️ Error extracting email from website nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?
❌ No matching email found
🔍 [4] Processing: Patel Pk MD - 4214 Andrews Hwy, Ste 303, Midland, TX 79703
⚠️ Error extracting email from website nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?
❌ No matching email found
🔍 [5] Processing: Shylesh R Ganta, MD - 3401 Greenbriar, Ste 100, Midland, TX 79707
⚠️ Error extracting email from website 