In [None]:
# ==============================================================================
# Cell 1: Setup and Configuration (OpenAI Version)
# ==============================================================================
import os
import pandas as pd
import json
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
import threading

# --- OpenAI / LLM Configuration ---
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# --- API Key Configuration ---
# SECURITY: the OpenAI key is read from the environment and must never be
# written into the notebook itself -- notebooks are routinely shared and
# committed, source and outputs included. Any key that was ever pasted into
# this cell has to be treated as leaked: revoke it and issue a new one.
# Provide the key before launching Jupyter, e.g. `export OPENAI_API_KEY=...`.
if not os.environ.get("OPENAI_API_KEY"):
    print("ERROR: OPENAI_API_KEY is not set. Export it in your environment before running the analysis cells.")

# --- File Configuration ---
# CSV of businesses to analyse.
INPUT_FILENAME = 'business_directory_cleaned.csv'
# JSON file that will receive the aggregated catering keyword dictionary.
OUTPUT_KEYWORDS_FILENAME = 'catering_keyword_dictionary_openai.json'

# --- Selenium Setup (Thread-Safe) ---
# Every thread stores its own driver as the `driver` attribute of this
# thread-local object, so drivers are never shared between worker threads.
thread_local_driver = threading.local()

# Upper bound, in seconds, for a single `driver.get()` page load. Without an
# explicit limit one unresponsive site can keep a worker thread blocked for
# as long as the WebDriver default allows (several minutes).
SELENIUM_PAGE_LOAD_TIMEOUT_SECONDS = 30

def get_driver():
    """Return the calling thread's headless Chrome driver, creating it on first use.

    The driver is cached on `thread_local_driver`, so repeated calls from the
    same thread reuse one browser. A page-load timeout is applied when the
    driver is created; a load that exceeds it raises inside `driver.get()`.
    """
    if not hasattr(thread_local_driver, 'driver'):
        print("  -  Initializing new Selenium driver for a thread...")
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
        driver = webdriver.Chrome(options=chrome_options)
        driver.set_page_load_timeout(SELENIUM_PAGE_LOAD_TIMEOUT_SECONDS)
        thread_local_driver.driver = driver
    return thread_local_driver.driver

def close_driver():
    """Quit and forget the calling thread's driver, if it has one.

    Only the driver owned by the *current* thread is affected. Cleanup is
    best-effort: a failing `quit()` is reported but not raised, and the cached
    reference is always removed so a later `get_driver()` on this thread
    starts a fresh browser instead of reusing a dead one.
    """
    if not hasattr(thread_local_driver, 'driver'):
        return
    try:
        thread_local_driver.driver.quit()
    except Exception as e:
        print(f"  -  (Selenium) WARNING: driver.quit() failed: {e}")
    finally:
        del thread_local_driver.driver

# --- Load Business Data ---
# Read the directory CSV, keep only rows whose 'Company Website' value starts
# with 'http', and expose those URLs as `website_urls` for the later cells.
print(f"🚀 Loading business data from '{INPUT_FILENAME}'...")
try:
    df = pd.read_csv(INPUT_FILENAME)
except FileNotFoundError:
    print(f"❌ ERROR: File '{INPUT_FILENAME}' not found.")
    website_urls = []
else:
    # Drop rows without a website first, then rows whose value is not a URL.
    df = df.dropna(subset=['Company Website'])
    df = df[df['Company Website'].str.startswith('http', na=False)]
    website_urls = df['Company Website'].tolist()
    print(f"✅ Loaded and prepared to analyze all {len(website_urls)} websites.")


🚀 Loading business data from 'business_directory_cleaned.csv'...
✅ Loaded and prepared to analyze all 34 websites.


In [11]:
# ==============================================================================
# Cell 2: Hybrid Scraper and AI Analyzer (OpenAI Version)
# ==============================================================================

# --- Define the Hybrid Scraper ---
def scrape_website_with_hybrid_approach(url: str) -> str:
    """Return the visible text of `url`, trying plain HTTP first and Selenium second.

    A `requests` fetch is attempted first. Its text is used only when the
    response status is 200 and the extracted text is longer than 300
    characters. Otherwise the page is loaded in the calling thread's Selenium
    driver and whatever text that yields is returned. Any exception during the
    Selenium attempt is printed and an empty string is returned.
    """
    min_text_length = 300
    fast_text = ""

    # Attempt 1: plain HTTP request; request errors are ignored on purpose
    # because the Selenium fallback below covers them.
    try:
        browser_like_headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
        reply = requests.get(url, headers=browser_like_headers, timeout=10)
        if reply.status_code == 200:
            fast_text = BeautifulSoup(reply.text, 'html.parser').get_text(" ", strip=True)
    except requests.RequestException:
        pass

    if len(fast_text) > min_text_length:
        print(f"  -  (Fast Scrape) Success for: {url}")
        return fast_text

    # Attempt 2: load the page in a real browser.
    print(f"  -  (Fast Scrape) Failed. Falling back to Selenium for: {url}")
    try:
        browser = get_driver()
        browser.get(url)
        # Fixed pause before reading the page source.
        time.sleep(2)
        rendered = BeautifulSoup(browser.page_source, 'html.parser')
        return rendered.get_text(" ", strip=True)
    except Exception as e:
        print(f"  -  (Selenium) ERROR scraping {url}: {e}")
        return ""

# --- Define the AI Analysis Logic (Using OpenAI) ---
# Chat model used for keyword extraction. temperature=0 is passed so that
# repeated runs are as consistent as the API allows.
llm = ChatOpenAI(model="gpt-4o", temperature=0)

# Prompt sent once per website. `{context}` is its only template variable and
# is filled with the scraped page text by `chain.invoke({"context": ...})`.
# The model is asked to answer with a JSON object holding a "keywords" key.
prompt = ChatPromptTemplate.from_template("""
You are an expert market research analyst for the food and beverage industry. 
The following is the text content scraped from a single company's website.

Your task is to analyze this text and identify specific keywords and short phrases (2-3 words) that strongly indicate the company offers CATERING services.

Please provide your output as a single, clean JSON object with one key, "keywords". 
If you find no relevant keywords, return an empty list.

- Focus on service-related terms (e.g., "corporate catering", "wedding events").
- Exclude generic business terms like 'contact us', 'about us', 'our menu', 'gallery', 'home', and copyright notices.
- Base your answer *only* on the text provided.

Here is the website content:
---
{context}
---
""")

# The chain tells LangChain how to process the data: prompt -> LLM -> clean string output
chain = prompt | llm | StrOutputParser()

def _strip_markdown_fence(raw_reply: str) -> str:
    """Return `raw_reply` without a surrounding Markdown code fence.

    Handles an opening fence with or without a `json` language tag, an
    optional closing fence, and whitespace around either. Text that is not
    fenced is returned stripped of surrounding whitespace only.
    """
    cleaned = raw_reply.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned[3:]
        # Drop the optional language tag. Slicing is used rather than
        # str.strip("json"), which would strip a *set of characters*.
        if cleaned[:4].lower() == "json":
            cleaned = cleaned[4:]
        cleaned = cleaned.rstrip()
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]
    return cleaned.strip()


def analyze_text_for_keywords(text_content: str) -> list:
    """Ask the LLM chain for catering keywords found in `text_content`.

    Args:
        text_content: Text scraped from one website.

    Returns:
        The list of string keywords from the model's JSON reply. An empty list
        is returned when the text is empty/blank, when the LLM call or JSON
        parsing fails (the error is printed), or when the reply does not
        contain a list under the "keywords" key.
    """
    if not text_content or not text_content.strip():
        return []
    try:
        # Invoke the chain, which sends the request to OpenAI
        response_str = chain.invoke({"context": text_content})
        response_json = json.loads(_strip_markdown_fence(response_str))
    except Exception as e:
        # Deliberately broad: one failing site must not abort the whole batch.
        print(f"  -  LLM or JSON parsing error: {e}")
        return []

    if not isinstance(response_json, dict):
        print(f"  -  LLM or JSON parsing error: expected a JSON object, got {type(response_json).__name__}")
        return []

    keywords = response_json.get("keywords", [])
    if not isinstance(keywords, list):
        return []
    # Keep only strings so downstream counting never sees nested structures.
    return [keyword for keyword in keywords if isinstance(keyword, str)]

In [16]:
# ==============================================================================
# Cell 3: Main Orchestration (Putting It All Together)
# ==============================================================================

def process_website(url: str):
    """Worker run on a pool thread: scrape `url`, extract keywords, return them.

    Progress is printed before scraping and again once the keyword list is
    known. The value returned is exactly what `analyze_text_for_keywords`
    produced for the scraped text.
    """
    print(f"-> Processing: {url}")
    keywords = analyze_text_for_keywords(scrape_website_with_hybrid_approach(url))
    print(f"  -  Found keywords for {url}: {keywords}")
    return keywords

# --- Main Execution Block ---
from collections import Counter

# Maximum number of sites processed concurrently.
MAX_WORKERS = 5
# A keyword is kept only if at least this many different sites produced it.
MINIMUM_OCCURRENCES = 2
# How long a cleanup task waits for the other worker threads to join it.
DRIVER_CLEANUP_TIMEOUT_SECONDS = 30


def _close_drivers_on_all_workers(executor, worker_count):
    """Run `close_driver()` on every worker thread of `executor`.

    `close_driver()` only affects the thread that calls it, so a single
    submitted call would leave the other threads' browsers running. Here
    `worker_count` tasks are submitted and each one waits on a shared barrier
    after closing its driver; a waiting thread cannot pick up a second task,
    so the tasks are spread over distinct threads. The barrier timeout only
    bounds the wait if the pool ends up with fewer threads than tasks.
    """
    barrier = threading.Barrier(worker_count)

    def _close_then_wait():
        try:
            close_driver()
        except Exception as exc:
            print(f"  -  (Selenium) WARNING: could not close a driver: {exc}")
        try:
            barrier.wait(timeout=DRIVER_CLEANUP_TIMEOUT_SECONDS)
        except threading.BrokenBarrierError:
            pass  # Timed out or broken by another waiter; the driver is closed already.

    cleanup_futures = [executor.submit(_close_then_wait) for _ in range(worker_count)]
    for cleanup_future in cleanup_futures:
        cleanup_future.result()


all_extracted_keywords = []

# A missing/empty key must block the run, not only the tutorial placeholder.
api_key_is_configured = os.environ.get("OPENAI_API_KEY", "") not in ("", "YOUR_OPENAI_API_KEY_HERE")

if website_urls and api_key_is_configured:
    print(f"\n🚀 Starting hybrid analysis for all {len(website_urls)} websites using OpenAI...")

    worker_count = min(MAX_WORKERS, len(website_urls))
    with ThreadPoolExecutor(max_workers=worker_count) as executor:
        future_to_url = {executor.submit(process_website, url): url for url in website_urls}

        for future in as_completed(future_to_url):
            try:
                keywords = future.result()
            except Exception as exc:
                print(f"❌ An error occurred for URL {future_to_url[future]}: {exc}")
                continue

            # Normalise case/whitespace and de-duplicate per site, so the
            # counts below mean "number of sites that produced this keyword".
            site_keywords = {
                keyword.strip().lower()
                for keyword in (keywords or [])
                if isinstance(keyword, str) and keyword.strip()
            }
            all_extracted_keywords.extend(site_keywords)

        # Clean up all Selenium drivers used by the threads
        _close_drivers_on_all_workers(executor, worker_count)

    print("\n✅ AI analysis complete.")

    # --- Aggregate and Save the Final Keyword Dictionary ---
    if all_extracted_keywords:
        keyword_counts = Counter(all_extracted_keywords)

        final_keywords = sorted(k for k, c in keyword_counts.items() if c >= MINIMUM_OCCURRENCES)

        keyword_dictionary = {
            "Catering": {"skill_id": 11, "keywords": final_keywords}
        }

        with open(OUTPUT_KEYWORDS_FILENAME, 'w', encoding='utf-8') as f:
            json.dump(keyword_dictionary, f, indent=4)

        print(f"\n🎉 Success! Keyword dictionary created and saved to '{OUTPUT_KEYWORDS_FILENAME}'.")
        print("\n--- Final Catering Keywords (from OpenAI) ---")
        print(keyword_dictionary["Catering"]["keywords"])

    else:
        print("\n⚠️ No keywords were extracted.")
elif not website_urls:
    print("\n⚠️ No URLs were loaded. Cannot proceed.")
else:
    print("\n❌ ERROR: Please set your OpenAI API key in Cell 1 before running.")


🚀 Starting hybrid analysis for all 34 websites using OpenAI...
-> Processing: http://www.marbled.la/
-> Processing: https://goodheartcatering.com/
-> Processing: https://luxebites.com/
-> Processing: https://www.cratefulcatering.com/
-> Processing: https://www.bitecatering.net/
  -  (Fast Scrape) Failed. Falling back to Selenium for: http://www.marbled.la/
  -  Initializing new Selenium driver for a thread...
  -  (Fast Scrape) Success for: https://www.bitecatering.net/
  -  (Fast Scrape) Success for: https://goodheartcatering.com/
  -  (Fast Scrape) Success for: https://www.cratefulcatering.com/  -  (Fast Scrape) Success for: https://luxebites.com/

  -  Found keywords for https://luxebites.com/: ['event catering', 'corporate catering', 'production catering', 'wedding catering', 'bar catering', 'smoothie catering', 'small party catering', 'event caterer']
-> Processing: https://chubscatering.com/
  -  (Fast Scrape) Success for: https://chubscatering.com/
  -  Found keywords for https

In [17]:
import pandas as pd
import json
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed

# ==============================================================================
# Cell 1: Configuration and Loading Inputs
# ==============================================================================

# --- File Configuration ---
# The original CSV with all your businesses
INPUT_BUSINESS_FILENAME = 'business_directory_cleaned.csv'

# The keyword dictionary file you just created with OpenAI
INPUT_KEYWORDS_FILENAME = 'catering_keyword_dictionary_openai.json'

# The final, enriched output file
OUTPUT_FILENAME = 'final_tagged_businesses_openai.csv'

# --- Load Keyword Dictionary ---
# On success CATERING_KEYWORDS / CATERING_SKILL_ID hold the 'Catering' entry of
# the JSON file. On any load problem they are set to [] / None so later cells
# can test them without hitting an undefined name.
print(f"🚀 Loading keyword dictionary from '{INPUT_KEYWORDS_FILENAME}'...")
try:
    with open(INPUT_KEYWORDS_FILENAME, 'r', encoding='utf-8') as f:
        keyword_dict = json.load(f)  # malformed JSON raises a ValueError subclass

    if not isinstance(keyword_dict, dict):
        raise ValueError("Top-level JSON value must be an object.")

    # Extract the keywords and skill ID for the 'Catering' category
    catering_info = keyword_dict.get('Catering', {})
    if not isinstance(catering_info, dict):
        raise ValueError("The 'Catering' entry must be a JSON object.")
    CATERING_KEYWORDS = catering_info.get('keywords', [])
    CATERING_SKILL_ID = catering_info.get('skill_id')

    # `is None` rather than truthiness: a skill id of 0 is a valid value.
    if not CATERING_KEYWORDS or CATERING_SKILL_ID is None:
        raise ValueError("Keywords or skill_id for 'Catering' not found in JSON file.")

    print(f"✅ Successfully loaded {len(CATERING_KEYWORDS)} keywords for Skill ID {CATERING_SKILL_ID}.")

except (FileNotFoundError, ValueError) as e:
    print(f"❌ ERROR: Could not load or parse the keyword dictionary. {e}")
    CATERING_KEYWORDS = [] # Ensure the script doesn't fail later
    CATERING_SKILL_ID = None

# --- Load Business Data ---
# Read the directory CSV and keep only rows whose 'Company Website' value
# starts with 'http'. `df` is set to None when the file is missing so the
# main block can detect that nothing was loaded.
print(f"\n🚀 Loading business data from '{INPUT_BUSINESS_FILENAME}'...")
try:
    df = pd.read_csv(INPUT_BUSINESS_FILENAME)
except FileNotFoundError:
    print(f"❌ ERROR: The file '{INPUT_BUSINESS_FILENAME}' was not found.")
    df = None
else:
    # Drop rows without a website first, then rows whose value is not a URL.
    df = df.dropna(subset=['Company Website'])
    df = df[df['Company Website'].str.startswith('http', na=False)]
    print(f"✅ Loaded {len(df)} businesses with valid websites to be processed.")


🚀 Loading keyword dictionary from 'catering_keyword_dictionary_openai.json'...
✅ Successfully loaded 22 keywords for Skill ID 11.

🚀 Loading business data from 'business_directory_cleaned.csv'...
✅ Loaded 34 businesses with valid websites to be processed.


In [18]:
# ==============================================================================
# Cell 2: Scraper and Tagger Function
# ==============================================================================

def scrape_and_find_matches(url: str, keywords_to_find: list):
    """Fetch `url` and report which of `keywords_to_find` occur in its text.

    Matching is a case-insensitive substring test against the page's extracted
    text. The matched keywords are returned in their original spelling and
    order. An empty list is returned for a falsy `url`, a non-200 response,
    or a `requests` error.
    """
    if not url:
        return []

    try:
        browser_like_headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
        reply = requests.get(url, headers=browser_like_headers, timeout=10)

        if reply.status_code != 200:
            return []

        parsed_page = BeautifulSoup(reply.text, 'html.parser')
        lowered_text = parsed_page.get_text(" ", strip=True).lower()

        # Collect every keyword whose lowercase form appears in the page text.
        hits = []
        for candidate in keywords_to_find:
            if candidate.lower() in lowered_text:
                hits.append(candidate)
        return hits

    except requests.RequestException:
        return []

In [19]:
# ==============================================================================
# Cell 3: Main Processing Logic
# ==============================================================================

def process_business_row(row_tuple):
    """Tag one business row with the catering keywords found on its website.

    Args:
        row_tuple: An `(index, row)` pair as produced by `DataFrame.iterrows()`.

    Returns:
        A dict copy of the row with two extra keys: 'Matched_Keywords' (the
        matches joined by ", ") and 'Skill_ID' (CATERING_SKILL_ID). Both are
        empty strings when nothing matched.
    """
    _, business = row_tuple
    company_name = business['Company Name']
    website_url = business['Company Website']

    print(f"  -> Processing: {company_name}")

    hits = scrape_and_find_matches(website_url, CATERING_KEYWORDS)

    # Work on a plain dict so the source row is left untouched.
    enriched = business.to_dict()
    enriched['Matched_Keywords'] = ", ".join(hits) if hits else ""
    enriched['Skill_ID'] = CATERING_SKILL_ID if hits else ""
    return enriched

# --- Main execution block ---
# Tags every loaded business concurrently and writes the enriched table to
# OUTPUT_FILENAME. Rows are written in the same order as the input frame.
if df is not None and CATERING_KEYWORDS:
    print(f"\n🚀 Starting tagging process for all {len(df)} businesses...")

    all_results = []

    with ThreadPoolExecutor(max_workers=10) as executor:
        # Keep the futures in submission order: collecting them with
        # as_completed() would shuffle the output rows on every run.
        submitted = [
            (row_tuple[1]['Company Name'], executor.submit(process_business_row, row_tuple))
            for row_tuple in df.iterrows()
        ]

        for company_name, future in submitted:
            try:
                all_results.append(future.result())
            except Exception as exc:
                print(f"❌ An error occurred for business {company_name}: {exc}")

    if all_results:
        final_df = pd.DataFrame(all_results)

        # Reorder columns for clarity: original columns first, new ones last.
        # The new columns are excluded from the "original" list so they are not
        # selected twice when the input CSV already contains them.
        new_cols = ['Matched_Keywords', 'Skill_ID']
        original_cols = [
            col for col in df.columns
            if col in final_df.columns and col not in new_cols
        ]
        final_df = final_df[original_cols + new_cols]

        final_df.to_csv(OUTPUT_FILENAME, index=False)
        print(f"\n🎉 Success! Enriched data saved to '{OUTPUT_FILENAME}'.")

        print("\n--- Sample of Final Output ---")
        print(final_df[['Company Name', 'Company Website', 'Matched_Keywords', 'Skill_ID']].head(10))
    else:
        print("\n⚠️ No businesses were processed.")
else:
    print("\n⚠️ Script did not run. Check that both input files are available and correctly configured.")


🚀 Starting tagging process for all 34 businesses...
  -> Processing: Marbled LA
  -> Processing: Good Heart Catering
  -> Processing: Luxe Bites - LA's Best Charcuterie Boards and Event Catering
  -> Processing: Crateful Catering Los Angeles
  -> Processing: Bite Catering Couture
  -> Processing: Chubby Fingers Catering Co
  -> Processing: Aloha Catering Services Inc
  -> Processing: Heirloom LA
  -> Processing: Las Hermanas Catering
  -> Processing: Spotted Hen Catering
  -> Processing: Felice Italian Catering
  -> Processing: Robert's Catering Services
  -> Processing: TGIS Catering Services
  -> Processing: Bites and Bashes Catering
  -> Processing: Basil Pizza Bar Catering
  -> Processing: The Daily by HC
  -> Processing: Schaffer
  -> Processing: Simon's Caterers
  -> Processing: K Michelle's Kitchen Catering
  -> Processing: Ask 4 Tacos Catering
  -> Processing: Paulina's Catering
  -> Processing: OFF THE SHELF CATERING
  -> Processing: Haute Chefs Los Angeles
  -> Processing: C