In [4]:
# ==============================================================================
# Cell 1: Setup and Loading Data
# ==============================================================================
import os
import pandas as pd
import json
import threading
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import WebBaseLoader
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import Counter

In [None]:
# --- Configuration ---
# The OpenAI key is read from the environment (e.g. `export OPENAI_API_KEY=...`
# before starting Jupyter). It is deliberately NOT assigned here: a key pasted
# into a notebook ends up in version control and in every shared copy.
if not os.environ.get("OPENAI_API_KEY"):
    print("❌ ERROR: OPENAI_API_KEY is not set. Export it in your environment before running the analysis cells.")

INPUT_FILENAME = 'business_directory_cleaned.csv'
OUTPUT_KEYWORDS_FILENAME = 'catering_keyword_dictionary_full.json'

# --- Load Data (No Sampling) ---
print(f"🚀 Loading all business data from '{INPUT_FILENAME}'...")

# Defined up front so later cells always find the name, whatever happens below.
website_urls = []

try:
    df = pd.read_csv(INPUT_FILENAME)
    if 'Company Website' not in df.columns:
        raise ValueError("The required column 'Company Website' was not found in the CSV.")

    # Keep only rows whose website looks like a URL. Casting to str first means
    # missing values (NaN) and non-text columns are filtered out instead of
    # making the `.str` accessor raise.
    has_valid_url = df['Company Website'].astype(str).str.startswith('http')
    df = df[has_valid_url]

    # Create the final list of all URLs to be processed
    website_urls = df['Company Website'].tolist()

    print(f"✅ Loaded and prepared to analyze all {len(website_urls)} websites.")

except FileNotFoundError:
    print(f"❌ ERROR: The file '{INPUT_FILENAME}' was not found.")
except ValueError as e:
    # Also covers pandas' EmptyDataError / ParserError, which subclass ValueError.
    print(f"❌ ERROR: {e}")

🚀 Loading all business data from 'business_directory_cleaned.csv'...
✅ Loaded and prepared to analyze all 34 websites.


In [7]:
# ==============================================================================
# Cell 2: AI Keyword Extraction Logic
# ==============================================================================

# Instructions sent to the model; `{context}` is replaced with the scraped page text.
KEYWORD_PROMPT_TEXT = """
You are an expert market research analyst for the food and beverage industry. 
The following is the text content scraped from a single company's website.

Your task is to analyze this text and identify specific keywords and short phrases (2-3 words) that strongly indicate the company offers CATERING services.

Please provide your output as a single, clean JSON object with one key, "keywords". 
If you find no relevant keywords, return an empty list.

- Focus on service-related terms (e.g., "corporate catering", "wedding events").
- Exclude generic business terms like 'contact us', 'about us', 'our menu', 'gallery', 'home', and copyright notices.
- Base your answer *only* on the text provided.

Here is the website content:
---
{context}
---
"""

# Chat model used for extraction (temperature 0).
llm = ChatOpenAI(model="gpt-4o", temperature=0)

# Prompt template built from the instructions above.
prompt = ChatPromptTemplate.from_template(KEYWORD_PROMPT_TEXT)

# Pipeline: fill the prompt -> call the model -> return the reply as a plain string.
output_parser = StrOutputParser()
chain = prompt | llm | output_parser

def _strip_code_fence(text: str) -> str:
    """Return ``text`` without a surrounding Markdown code fence, if present.

    Handles ```` ```json ````, a bare ```` ``` ```` fence, and leading/trailing
    whitespace. Prefix/suffix slicing is used on purpose: ``str.strip("```json")``
    removes any of the characters `` ` j s o n`` from both ends, not the literal
    marker.
    """
    text = text.strip()
    if not text.startswith("```"):
        return text
    text = text[3:]
    if text[:4].lower() == "json":
        text = text[4:]
    if text.endswith("```"):
        text = text[:-3]
    return text.strip()


def _parse_keywords_response(response_str: str):
    """Extract the keyword list from the model reply; return None if it is unusable."""
    try:
        payload = json.loads(_strip_code_fence(response_str))
    except json.JSONDecodeError:
        print(f"  -  WARNING: LLM did not return valid JSON. Response: {response_str}")
        return None

    # The prompt asks for an object with a "keywords" key; anything else is unusable.
    if not isinstance(payload, dict):
        print(f"  -  WARNING: LLM returned JSON that is not an object. Response: {response_str}")
        return None

    keywords = payload.get("keywords", [])
    if not isinstance(keywords, list):
        return None

    # Keep only non-empty strings so downstream counting/sorting cannot fail on
    # mixed types.
    return [kw.strip() for kw in keywords if isinstance(kw, str) and kw.strip()]


def analyze_website_for_keywords(url: str) -> list:
    """
    Scrape a website with LangChain's WebBaseLoader and ask the LLM chain for
    catering-related keywords.

    Args:
        url: Address of the page to load.

    Returns:
        A list of keyword strings. An empty list is returned when the page
        cannot be loaded, has no text, the model reply cannot be parsed, or
        any other error occurs (the error is printed, never raised).
    """
    print(f"  -> Analyzing: {url}")
    try:
        # Use the loader in its simplest, most robust form.
        loader = WebBaseLoader(
            web_paths=(url,),
            requests_kwargs={"headers": {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}}
        )
        docs = loader.load()

        if not docs:
            print(f"  -  WARNING: Could not load any content from {url}")
            return []

        website_content = "\n".join([doc.page_content for doc in docs])

        if not website_content.strip():
            print(f"  -  WARNING: Loaded page from {url} but found no text content.")
            return []

        # Invoke the chain with the scraped content.
        response_str = chain.invoke({"context": website_content})

        keywords = _parse_keywords_response(response_str)
        if keywords is None:
            return []

        print(f"  -  Found keywords: {keywords}")
        return keywords

    except Exception as e:
        # Best-effort by design: one failing site must not abort the whole batch.
        print(f"  -  ERROR analyzing {url}: {e}")
        return []

In [8]:
# ==============================================================================
# Cell 3: Run Analysis at Scale and Create Final Dictionary
# ==============================================================================

# Holds one entry per (website, keyword) pair, so counting entries equals
# counting the number of websites a keyword appeared on.
all_extracted_keywords = []

# Template value that marks an API key that was never filled in.
PLACEHOLDER_API_KEY = "YOUR_OPENAI_API_KEY_HERE"
api_key = os.environ.get("OPENAI_API_KEY")
# The key must be present AND not the placeholder. Comparing only against the
# placeholder would let an unset key (None) through.
api_key_configured = bool(api_key) and api_key != PLACEHOLDER_API_KEY

# Analyze each distinct URL once; a duplicated row must not count as a second website.
urls_to_analyze = list(dict.fromkeys(website_urls)) if 'website_urls' in locals() and website_urls else []

if not api_key_configured:
    print("\n❌ ERROR: Please set your OpenAI API key in Cell 1 before running.")
elif not urls_to_analyze:
    print("\n⚠️ No URLs were loaded. Cannot proceed.")
else:
    print(f"\n🚀 Starting AI analysis for all {len(urls_to_analyze)} websites. This may take some time...")

    with ThreadPoolExecutor(max_workers=5) as executor:
        future_to_url = {executor.submit(analyze_website_for_keywords, url): url for url in urls_to_analyze}

        for future in as_completed(future_to_url):
            try:
                keywords = future.result()
            except Exception as exc:
                print(f"❌ An error occurred for URL {future_to_url[future]}: {exc}")
                continue

            # Normalize (trim + lowercase) and de-duplicate within the site so
            # "Wedding Catering" and "wedding catering" count once per website.
            site_keywords = {
                keyword.strip().lower()
                for keyword in (keywords or [])
                if isinstance(keyword, str) and keyword.strip()
            }
            all_extracted_keywords.extend(site_keywords)

    print("\n✅ AI analysis complete for all websites.")

    # --- Process and save the final keyword dictionary ---
    if all_extracted_keywords:
        # Number of websites each keyword appeared on
        keyword_counts = Counter(all_extracted_keywords)

        # Only keep keywords that appeared on at least 2 different websites
        MINIMUM_OCCURRENCES = 2

        final_keywords = sorted(
            keyword for keyword, count in keyword_counts.items()
            if count >= MINIMUM_OCCURRENCES
        )

        # Create the final dictionary structure
        keyword_dictionary = {
            "Catering": {
                "skill_id": 11,
                "keywords": final_keywords
            }
        }

        # Save the dictionary to a JSON file (explicit encoding so the result
        # does not depend on the platform default).
        with open(OUTPUT_KEYWORDS_FILENAME, 'w', encoding='utf-8') as f:
            json.dump(keyword_dictionary, f, indent=4)

        print(f"\n🎉 Success! Keyword dictionary created and saved to '{OUTPUT_KEYWORDS_FILENAME}'.")
        print("\n--- Final Catering Keywords ---")
        for keyword in final_keywords:
            print(f"- {keyword}")

    else:
        print("\n⚠️ No keywords were extracted from any of the websites.")


🚀 Starting AI analysis for all 34 websites. This may take some time...
  -> Analyzing: http://www.marbled.la/
  -> Analyzing: https://goodheartcatering.com/
  -> Analyzing: https://luxebites.com/
  -> Analyzing: https://www.cratefulcatering.com/
  -> Analyzing: https://www.bitecatering.net/
  -  ERROR analyzing http://www.marbled.la/: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
  -> Analyzing: https://chubscatering.com/
  -  Found keywords: ['corporate events', 'wedding menus', 'crew meals', 'beverage service', 'wedding catering', 'corporate catering', 'production catering', 'bar mitzvah catering', 'kosher catering', 'beverage catering services']
  -> Analyzing: http://alohacateringservicesinc.com/
  -  Found keywords: ['catering company', 'intimate gatherings', 'large, high-end events', 'customized menus', 'corporate events', 'holiday parties', 'weddings', 'rehearsal dinners', 'baby showers', 'nonprofit events', 'church events', 'full-

In [9]:
import pandas as pd
import json
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

# ==============================================================================
# Cell 1: Configuration and Loading Inputs
# ==============================================================================

# --- File Configuration ---
# The original CSV with all your businesses
INPUT_BUSINESS_FILENAME = 'business_directory_cleaned.csv'

# The keyword dictionary file created with the AI.
# NOTE(review): the extraction cell writes 'catering_keyword_dictionary_full.json',
# but this reads 'catering_keyword_dictionary.json' — confirm which file is intended.
INPUT_KEYWORDS_FILENAME = 'catering_keyword_dictionary.json'

# The final, enriched output file
OUTPUT_FILENAME = 'final_tagged_businesses.csv'

# Columns the tagging cells read from the business CSV.
REQUIRED_BUSINESS_COLUMNS = ['Company Name', 'Company Website']

# Safe defaults so later cells never hit an undefined name if loading fails.
CATERING_KEYWORDS = []
CATERING_SKILL_ID = None

# --- Load Keyword Dictionary ---
print(f"🚀 Loading keyword dictionary from '{INPUT_KEYWORDS_FILENAME}'...")
try:
    with open(INPUT_KEYWORDS_FILENAME, 'r', encoding='utf-8') as f:
        keyword_dict = json.load(f)

    if not isinstance(keyword_dict, dict):
        raise ValueError("Expected a JSON object at the top level of the keyword file.")

    # Extract the keywords and skill ID for the 'Catering' category
    catering_info = keyword_dict.get('Catering') or {}
    loaded_keywords = catering_info.get('keywords', [])
    loaded_skill_id = catering_info.get('skill_id')

    # `is None` rather than truthiness, so a legitimate skill_id of 0 is accepted.
    if not loaded_keywords or loaded_skill_id is None:
        raise ValueError("Keywords or skill_id for 'Catering' not found in JSON file.")

    CATERING_KEYWORDS = loaded_keywords
    CATERING_SKILL_ID = loaded_skill_id
    print(f"✅ Successfully loaded {len(CATERING_KEYWORDS)} keywords for Skill ID {CATERING_SKILL_ID}.")

except (FileNotFoundError, ValueError) as e:
    # json.JSONDecodeError is a ValueError subclass, so malformed JSON lands here too.
    print(f"❌ ERROR: Could not load or parse the keyword dictionary. {e}")

# --- Load Business Data ---
print(f"\n🚀 Loading business data from '{INPUT_BUSINESS_FILENAME}'...")
try:
    df = pd.read_csv(INPUT_BUSINESS_FILENAME)

    missing_columns = [col for col in REQUIRED_BUSINESS_COLUMNS if col not in df.columns]
    if missing_columns:
        raise ValueError(f"Required column(s) missing from the CSV: {missing_columns}")

    # Keep only rows whose website looks like a URL; casting to str makes NaN
    # and non-text values fall out of the filter instead of raising.
    df = df[df['Company Website'].astype(str).str.startswith('http')]
    print(f"✅ Loaded {len(df)} businesses with valid websites to be processed.")
except FileNotFoundError:
    print(f"❌ ERROR: The file '{INPUT_BUSINESS_FILENAME}' was not found.")
    df = None
except ValueError as e:
    print(f"❌ ERROR: {e}")
    df = None

🚀 Loading keyword dictionary from 'catering_keyword_dictionary.json'...
✅ Successfully loaded 8 keywords for Skill ID 11.

🚀 Loading business data from 'business_directory_cleaned.csv'...
✅ Loaded 34 businesses with valid websites to be processed.


In [10]:
# ==============================================================================
# Cell 2: Scraper and Tagger Function
# ==============================================================================

def scrape_and_find_matches(url: str, keywords_to_find: list):
    """
    Scrape a single URL and report which keywords occur in its visible text.

    Matching is case-insensitive and whitespace-insensitive: runs of spaces,
    newlines and non-breaking spaces are collapsed to one space on both the
    page text and each keyword, so a phrase split across lines in the HTML
    still matches.

    Args:
        url: Page to fetch. Empty or non-string values yield no matches.
        keywords_to_find: Keywords/phrases to look for.

    Returns:
        The keywords (as given) found on the page, in input order. An empty
        list if the page cannot be fetched or does not answer with HTTP 200.
    """
    if not url or not isinstance(url, str) or not keywords_to_find:
        return []

    try:
        # We can use a simple requests-based scraper here, as we don't need to handle complex JS
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
        response = requests.get(url, headers=headers, timeout=10)

        # Check if the request was successful
        if response.status_code != 200:
            return []

        soup = BeautifulSoup(response.text, 'html.parser')
        # split()/join collapses every whitespace run (including \xa0) to one space.
        page_text = " ".join(soup.get_text(" ", strip=True).lower().split())

        matched_keywords = []
        for keyword in keywords_to_find:
            if not isinstance(keyword, str):
                continue
            normalized = " ".join(keyword.lower().split())
            # Skip blank keywords: "" is a substring of everything.
            if normalized and normalized in page_text:
                matched_keywords.append(keyword)

        return matched_keywords

    except requests.RequestException:
        # If the website is down or fails to load, return an empty list
        return []

In [11]:
# ==============================================================================
# Cell 3: Main Processing Logic
# ==============================================================================

def process_business_row(row_tuple):
    """
    Tag one business row with the catering keywords found on its website.

    `row_tuple` is an (index, row) pair as produced by `df.iterrows()`. The
    row is updated in place with 'Matched_Keywords' and 'Skill_ID' and then
    returned; both fields are empty strings when nothing matched.
    """
    _, record = row_tuple
    name, site = record['Company Name'], record['Company Website']

    print(f"  -> Processing: {name}")

    # Keywords from the catering list that occur on the company's site
    hits = scrape_and_find_matches(site, CATERING_KEYWORDS)

    # A match gets the comma-joined keywords plus the catering skill id;
    # otherwise both columns stay blank.
    joined, skill = (", ".join(hits), CATERING_SKILL_ID) if hits else ("", "")
    record['Matched_Keywords'] = joined
    record['Skill_ID'] = skill

    return record

# --- Main execution block ---
# Scrapes every business website in parallel, tags the rows, and writes the
# enriched table to OUTPUT_FILENAME.
if df is not None and CATERING_KEYWORDS:
    print(f"\n🚀 Starting tagging process for all {len(df)} businesses...")

    all_results = []

    # Use ThreadPoolExecutor to process rows in parallel for speed
    with ThreadPoolExecutor(max_workers=10) as executor:
        # df.iterrows() yields (index, row) pairs; the company name is kept
        # alongside each future for error reporting.
        future_to_name = {
            executor.submit(process_business_row, row_tuple): row_tuple[1]['Company Name']
            for row_tuple in df.iterrows()
        }

        for future in as_completed(future_to_name):
            try:
                # The result is the updated row
                all_results.append(future.result())
            except Exception as exc:
                print(f"❌ An error occurred for business {future_to_name[future]}: {exc}")

    # --- Save the final enriched DataFrame ---
    if all_results:
        # Results arrive in completion order, which varies from run to run.
        # Each row still carries its original index label, so sorting by index
        # restores the input order and makes the output file reproducible.
        final_df = pd.DataFrame(all_results).sort_index()

        # Reorder columns to have the new ones at the end
        original_cols = [col for col in df.columns if col in final_df.columns]
        new_cols = ['Matched_Keywords', 'Skill_ID']
        final_df = final_df[original_cols + new_cols]

        final_df.to_csv(OUTPUT_FILENAME, index=False)
        print(f"\n🎉 Success! Enriched data saved to '{OUTPUT_FILENAME}'.")

        # Display a sample of the results
        print("\n--- Sample of Final Output ---")
        print(final_df[['Company Name', 'Matched_Keywords', 'Skill_ID']].head(10))
    else:
        print("\n⚠️ No businesses were processed.")
else:
    print("\n⚠️ Script did not run. Check that both input files are available and correctly configured.")


🚀 Starting tagging process for all 34 businesses...
  -> Processing: Marbled LA
  -> Processing: Good Heart Catering
  -> Processing: Luxe Bites - LA's Best Charcuterie Boards and Event Catering
  -> Processing: Crateful Catering Los Angeles
  -> Processing: Bite Catering Couture
  -> Processing: Chubby Fingers Catering Co
  -> Processing: Aloha Catering Services Inc
  -> Processing: Heirloom LA
  -> Processing: Las Hermanas Catering
  -> Processing: Spotted Hen Catering
  -> Processing: Felice Italian Catering
  -> Processing: Robert's Catering Services
  -> Processing: TGIS Catering Services
  -> Processing: Bites and Bashes Catering
  -> Processing: Basil Pizza Bar Catering
  -> Processing: The Daily by HC
  -> Processing: Schaffer
  -> Processing: Simon's Caterers
  -> Processing: K Michelle's Kitchen Catering
  -> Processing: Ask 4 Tacos Catering
  -> Processing: Paulina's Catering
  -> Processing: OFF THE SHELF CATERING
  -> Processing: Haute Chefs Los Angeles
  -> Processing: C

In [3]:
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin
import time

# Google Places API setup
import os

# The key is read from the GOOGLE_PLACES_API_KEY environment variable so it is
# never stored in the notebook. Any key that was previously committed in this
# cell should be treated as exposed and rotated in the Google Cloud console.
API_KEY = os.environ.get('GOOGLE_PLACES_API_KEY', '')
if not API_KEY:
    print("Warning: GOOGLE_PLACES_API_KEY is not set; Google Places requests will be rejected.")

query = 'pizzeria in LA'
url = f'https://maps.googleapis.com/maps/api/place/textsearch/json?query={query}&key={API_KEY}'

# Fetch business data from Google Places API
PLACES_TEXT_SEARCH_URL = 'https://maps.googleapis.com/maps/api/place/textsearch/json'
PLACES_DETAILS_URL = 'https://maps.googleapis.com/maps/api/place/details/json'


def get_business_websites(query):
    """
    Return the website URLs of businesses matching a Google Places text query.

    Text Search results do not include a `website` field, so each result's
    website is looked up through Place Details using its `place_id`.

    Args:
        query: Free-text search, e.g. 'pizzeria in LA'.

    Returns:
        A de-duplicated list of website URLs in result order. An empty list is
        returned if the search request itself fails; a failed detail lookup
        only skips that one place.
    """
    try:
        # `params=` lets requests URL-encode the query (spaces, '&', ...).
        response = requests.get(
            PLACES_TEXT_SEARCH_URL,
            params={'query': query, 'key': API_KEY},
            timeout=10,
        )
        response.raise_for_status()
        places = response.json().get('results', [])
    except Exception as e:
        print(f"Error fetching business data: {e}")
        return []

    websites = []
    for place in places:
        # Use the website if the search result happens to carry one...
        website = place.get('website')
        place_id = place.get('place_id')

        # ...otherwise ask Place Details for just the `website` field.
        if not website and place_id:
            try:
                details = requests.get(
                    PLACES_DETAILS_URL,
                    params={'place_id': place_id, 'fields': 'website', 'key': API_KEY},
                    timeout=10,
                )
                details.raise_for_status()
                website = (details.json().get('result') or {}).get('website')
            except Exception as e:
                print(f"Error fetching details for place {place_id}: {e}")
                continue

        if website and website not in websites:
            websites.append(website)

    return websites

# Extract emails from a webpage
def extract_emails(url, max_pages=10):
    """
    Crawl a site starting at `url` and collect the email addresses in its text.

    Only links on the same host as `url` are followed, and at most `max_pages`
    pages are fetched. URL fragments are dropped, so '/contact#top' and
    '/contact#form' count as a single page.

    Args:
        url: Start page.
        max_pages: Upper bound on the number of pages fetched.

    Returns:
        A list of the distinct email addresses found (order not guaranteed).
        Pages that fail to load are reported and skipped.
    """
    # Imported here because the cell's import block only brings in `urljoin`;
    # without `urlparse` the same-domain check raised NameError on every page.
    from urllib.parse import urldefrag, urlparse

    # TLD class is [A-Za-z]; the earlier [A-Z|a-z] also accepted a literal '|'.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    start_domain = urlparse(url).netloc

    emails = set()
    visited_urls = set()
    urls_to_visit = {urldefrag(url)[0]}

    while urls_to_visit and len(visited_urls) < max_pages:
        current_url = urls_to_visit.pop()
        if current_url in visited_urls:
            continue
        visited_urls.add(current_url)

        try:
            response = requests.get(current_url, timeout=5)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract emails from page text
            emails.update(re.findall(email_pattern, soup.get_text()))

            # Queue links to other pages on the same domain
            for link in soup.find_all('a', href=True):
                full_url = urldefrag(urljoin(current_url, link['href']))[0]
                # mailto:/tel:/javascript: links have an empty netloc and are
                # filtered out by the domain comparison.
                if urlparse(full_url).netloc == start_domain and full_url not in visited_urls:
                    urls_to_visit.add(full_url)

            time.sleep(1)  # Respectful scraping delay
        except Exception as e:
            print(f"Error scraping {current_url}: {e}")

    return list(emails)

# Predefined mailbox prefixes for classification (matched at the start of the address)
hr_patterns = ['hr@', 'careers@', 'jobs@', 'humanresources@']
sales_patterns = ['sales@', 'business@', 'info@', 'contact@']

# Classify emails based on patterns
def classify_email(email):
    """
    Classify an email address as 'HR', 'Sales' or 'Other' by its mailbox name.

    The address is trimmed and lowercased, then compared against the prefixes
    in `hr_patterns` and `sales_patterns`. The comparison is anchored at the
    start of the address: a substring test would label 'chr@x.com' as HR
    (it contains 'hr@') and 'myinfo@x.com' as Sales.
    """
    email_lower = email.strip().lower()
    if email_lower.startswith(tuple(hr_patterns)):
        return 'HR'
    if email_lower.startswith(tuple(sales_patterns)):
        return 'Sales'
    return 'Other'

# Main workflow
def main():
    """
    Find business websites for `query`, scrape each for email addresses,
    classify them, and write the result to 'business_emails.csv'.

    Returns:
        A list of dicts with keys 'website', 'email' and 'category'.
    """
    # Local import: this cell's import block does not bring in pandas.
    import pandas as pd

    websites = get_business_websites(query)
    results = []

    for website in websites:
        print(f"Scraping {website}...")
        for email in extract_emails(website):
            results.append({
                'website': website,
                'email': email,
                'category': classify_email(email),
            })

    # Save results to CSV. Naming the columns explicitly means the header row
    # is written even when nothing was found; a bare DataFrame([]) would
    # produce a file with no header at all.
    df = pd.DataFrame(results, columns=['website', 'email', 'category'])
    df.to_csv('business_emails.csv', index=False)
    print("Results saved to business_emails.csv")
    return results

if __name__ == "__main__":
    main()

Results saved to business_emails.csv
