In [24]:
# Install all necessary packages
!pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [35]:
# Import the Colab Drive helper and `os` (later cells use `os` without re-importing it).
from google.colab import drive
import os

# Mount Google Drive under /content/drive.
# This will prompt you for authentication the first time.
print("🚀 Mounting Google Drive...")
drive.mount('/content/drive')

# --- IMPORTANT: Define the path to your main project folder on Google Drive ---
# Make sure this path exactly matches the folder structure you created.
# The configuration cell uses file paths relative to this folder, so this
# chdir has to succeed before any later cell is run.
PROJECT_PATH = "/content/drive/My Drive/AI_Business_Classifier/"
os.chdir(PROJECT_PATH) # Raises FileNotFoundError if the folder does not exist

# Verify that the directory was changed and list files to confirm
print(f"\n✅ Successfully changed directory to: {os.getcwd()}")
print("\nFiles in your project directory:")
# Recursive listing of the project folder (IPython shell escape).
!ls -R

🚀 Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

✅ Successfully changed directory to: /content/drive/My Drive/AI_Business_Classifier

Files in your project directory:
.:
Extracted_Business_Groups_and_Types.csv     Outputs
Extracted_Business_Groups_and_Types.gsheet  Skill_Tag_Rulebooks

./Outputs:
final_classified_businesses.csv  final_classified_businesses.gsheet

./Skill_Tag_Rulebooks:
'Beauty, Massage & Spa.csv'  'Food & Beverage Establishments.csv'


In [None]:
# ==============================================================================
# Cell 2: The Full Pipeline - Setup, Classes, and Execution
# ==============================================================================
import pandas as pd
import json
import re
import time
import threading
from dataclasses import dataclass
from typing import List, Optional, Dict, Any

# LangChain, OpenAI, and Scraping Tools
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from openai import RateLimitError
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium_stealth import stealth
from thefuzz import fuzz
from urllib.parse import urljoin, urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed

# ============================
# 1. CONFIGURATION
# ============================

# --- API keys -----------------------------------------------------------------
# Keys are read from environment variables only. Never paste a real key into
# this notebook -- not even inside a comment -- because notebooks get shared and
# committed. Any key that has ever appeared in this file should be rotated.
# To provide keys interactively, set them in a scratch cell before this one,
# e.g.  os.environ["GOOGLE_API_KEY"] = getpass.getpass("Google API key: ")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or ""
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY") or ""

if not OPENAI_API_KEY:
    # Keep the environment variable defined (as an empty string).
    # NOTE(review): presumably this lets the OpenAI client be constructed so
    # that main() can report the missing key itself -- confirm.
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# Define file and folder paths relative to the project directory
UNIVERSAL_TAGS_FILE = "Extracted_Business_Groups_and_Types.csv"
RULEBOOKS_FOLDER = "Skill_Tag_Rulebooks/"
OUTPUT_FOLDER = "Outputs/"
FINAL_OUTPUT_FILE = os.path.join(OUTPUT_FOLDER, "final_classified_businesses.csv")

# Link text / URL path terms that indicate a careers page.
CAREER_KEYWORDS = ["career", "jobs", "employment", "hiring", "work with us", "join us", "opportunities"]
# Links pointing at these job boards are never reported as a business's own careers page.
THIRD_PARTY_JOB_SITES = ["indeed.com", "linkedin.com/jobs", "glassdoor.com", "workday.com"]

In [37]:
# Shared chat-model client; main() hands it to WebsiteAnalyzer for skill
# classification. It is created with model "gpt-4o" and temperature 0.
llm = ChatOpenAI(model="gpt-4o", temperature=0)

# Thread-local storage for the Selenium driver: get_driver() stores the driver
# on this object as the `driver` attribute and cleanup_driver() reads it back.
thread_local_driver = threading.local()

def get_driver():
    """Return this thread's Selenium Chrome driver, creating it on first use.

    The driver is cached on ``thread_local_driver`` so each thread builds at
    most one browser. A new driver is started headless, patched with
    ``selenium_stealth`` and given a page-load timeout so that a single
    unresponsive site cannot block the pipeline forever.

    Returns:
        The cached or newly created WebDriver, or ``None`` when the driver
        could not be initialised (the error is printed, not raised).
    """
    cached_driver = getattr(thread_local_driver, 'driver', None)
    if cached_driver is not None:
        return cached_driver

    # Upper bound, in seconds, for a single driver.get() page load.
    page_load_timeout_seconds = 30

    print("  -  Initializing new STEALTH Selenium driver...")
    driver = None
    try:
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option('useAutomationExtension', False)

        driver = webdriver.Chrome(options=chrome_options)
        driver.set_page_load_timeout(page_load_timeout_seconds)
        stealth(driver,
            languages=["en-US", "en"],
            vendor="Google Inc.",
            platform="Win32",
            webgl_vendor="Intel Inc.",
            renderer="Intel Iris OpenGL Engine",
            fix_hairline=True
        )
    except Exception as e:
        print(f"  ❌ Error initializing Selenium driver: {e}")
        if driver is not None:
            # The browser process was already started; shut it down so a
            # failure in the later setup steps does not leak a Chrome process.
            try:
                driver.quit()
            except Exception as quit_error:
                print(f"  ⚠️ Could not shut down partially initialized driver: {quit_error}")
        return None

    # Cache only a fully configured driver.
    thread_local_driver.driver = driver
    print("  ✅ Selenium driver initialized successfully")
    return driver

def load_and_parse_configs():
    """Load the universal business-type map and every per-group skill rulebook.

    Reads ``UNIVERSAL_TAGS_FILE`` (columns ``Business Group`` and
    ``Business Type``, the latter a comma-separated list) and every ``*.csv``
    file in ``RULEBOOKS_FOLDER`` (columns ``Skills Tags``, ``Skills IDs`` and
    ``Prompt Rule``). All cells are read as text; missing cells become ``""``
    instead of the literal string ``"nan"``.

    Returns:
        tuple: ``(group_keyword_map, rulebook_data)`` where

        * ``group_keyword_map`` maps a lower-cased business type to its group;
        * ``rulebook_data`` maps a rulebook file stem to a dict with the keys
          ``"rules"`` (tag -> prompt rule), ``"skill_id_map"`` (tag -> IDs text)
          and ``"header_notes"`` (upper-cased header tag -> note text).

        ``({}, {})`` is returned when the universal tags file cannot be
        loaded, and ``(group_keyword_map, {})`` when the rulebooks cannot.
        Errors are printed, not raised.
    """
    print("🚀 Loading and parsing all configuration files...")

    # Rows whose tag is one of these carry notes for the whole rulebook
    # rather than a skill rule.
    header_tags = {'BUSINESS TYPES', 'SPECIAL TAGS', 'IMPORTANT NOTES'}

    def cell_text(value):
        """Return a CSV cell as stripped text; missing values (NaN/None) become ''."""
        return "" if pd.isna(value) else str(value).strip()

    # Load universal tags mapping
    try:
        # dtype=str keeps every cell textual so numeric-looking values are not
        # turned into floats.
        df_universal = pd.read_csv(UNIVERSAL_TAGS_FILE, dtype=str)
        group_keyword_map = {}

        for _, row in df_universal.iterrows():
            group = cell_text(row['Business Group'])
            if not group:
                continue

            for biz_type in cell_text(row['Business Type']).lower().split(','):
                biz_type = biz_type.strip()
                # An empty keyword (blank cell or trailing comma) would be a
                # substring of every query, so it must never be registered.
                if biz_type:
                    group_keyword_map[biz_type] = group

        print(f"  ✅ Loaded {len(group_keyword_map)} business type mappings")
    except FileNotFoundError:
        print(f"  ❌ Error: {UNIVERSAL_TAGS_FILE} not found")
        return {}, {}
    except Exception as e:
        print(f"  ❌ Error loading universal tags: {e}")
        return {}, {}

    # Load rulebooks
    rulebook_data = {}
    try:
        if not os.path.exists(RULEBOOKS_FOLDER):
            print(f"  ❌ Error: {RULEBOOKS_FOLDER} directory not found")
            return group_keyword_map, {}

        for filename in sorted(os.listdir(RULEBOOKS_FOLDER)):
            if not filename.endswith(".csv"):
                continue

            # Drop only the extension; the stem is the business group name.
            business_group_name = os.path.splitext(filename)[0]
            df_rulebook = pd.read_csv(os.path.join(RULEBOOKS_FOLDER, filename), dtype=str)
            df_rulebook.columns = df_rulebook.columns.str.strip()

            rules = {}
            skill_id_map = {}
            header_notes = {}

            for _, row in df_rulebook.iterrows():
                tag = cell_text(row.get('Skills Tags', ''))
                if not tag:
                    # Blank spacer row: nothing to register.
                    continue

                prompt_rule = cell_text(row.get('Prompt Rule', ''))
                if tag.upper() in header_tags:
                    header_notes[tag.upper()] = prompt_rule
                else:
                    rules[tag] = prompt_rule
                    skill_id_map[tag] = cell_text(row.get('Skills IDs', ''))

            rulebook_data[business_group_name] = {
                "rules": rules,
                "skill_id_map": skill_id_map,
                "header_notes": header_notes
            }

        print(f"  ✅ Loaded {len(rulebook_data)} rulebooks")
    except Exception as e:
        print(f"  ❌ Error loading rulebooks: {e}")
        return group_keyword_map, {}

    print("✅ All configurations loaded and parsed successfully")
    return group_keyword_map, rulebook_data

# Load configurations once at cell execution time. Both results are module-level
# globals: main() passes group_keyword_map to BusinessFinder and rulebook_data
# to WebsiteAnalyzer.
group_keyword_map, rulebook_data = load_and_parse_configs()

🚀 Loading and parsing all configuration files...
  ✅ Loaded 215 business type mappings
  ✅ Loaded 2 rulebooks
✅ All configurations loaded and parsed successfully


In [38]:
# ============================
# CELL 4: BUSINESS FINDER CLASS
# ============================

class BusinessFinder:
    """Find businesses for a free-text query via the Google Places web service.

    The query is first mapped to a business group using the keyword map built
    from the universal tags file; the Places Text Search and Place Details
    endpoints are then used to collect businesses that publish a website.
    """

    # Seconds to wait for each Places HTTP call before giving up.
    REQUEST_TIMEOUT_SECONDS = 10
    # Connector words that must not trigger a partial (word-level) match.
    _FILLER_WORDS = frozenset({"and", "the", "for"})
    # Words shorter than this are ignored by the partial match ("&", "of", ...).
    _MIN_PARTIAL_WORD_LENGTH = 3

    def __init__(self, google_api_key, universal_map):
        """Store the API key and the keyword -> business group mapping."""
        self.google_api_key = google_api_key
        self.universal_map = universal_map
        self.places_base_url = "https://maps.googleapis.com/maps/api/place"

    def determine_business_group(self, query):
        """Determine the business group for ``query`` from the keyword map.

        Two passes are made over the map:

        1. Phrase match: the whole keyword occurs in the query. Longer
           keywords are tried first so that a specific phrase
           ("coffee shop") wins over a generic one ("shop").
        2. Word match: any significant word of a keyword occurs in the query.
           Filler words and very short tokens are ignored so that, e.g., the
           "and" of "bar and grill" cannot match "sandwich".

        Empty keywords are skipped because an empty string is a substring of
        every query.

        Returns:
            The matching business group, or ``None`` when nothing matches.
        """
        if not query:
            return None
        query_lower = query.lower()

        keywords = [k for k in self.universal_map if isinstance(k, str) and k.strip()]

        # Pass 1: phrase match, most specific keyword first (sort is stable,
        # so equally long keywords keep their original order).
        for keyword in sorted(keywords, key=len, reverse=True):
            if keyword in query_lower:
                return self.universal_map[keyword]

        # Pass 2: word-level match on the significant words of each keyword.
        for keyword in keywords:
            significant_words = [
                word for word in keyword.split()
                if len(word) >= self._MIN_PARTIAL_WORD_LENGTH and word not in self._FILLER_WORDS
            ]
            if any(word in query_lower for word in significant_words):
                return self.universal_map[keyword]

        return None

    def find_businesses(self, query: str, max_results: int = 15):
        """Search Google Places for ``query`` and keep businesses with a website.

        Args:
            query: Free-text search, e.g. "bakeries in los angeles".
            max_results: Maximum number of businesses to return.

        Returns:
            tuple: ``(businesses, business_group)``. ``businesses`` is a list of
            Place Details result dicts (requested fields: name, website,
            formatted_address, business_status). ``business_group`` is ``None``
            when the query could not be mapped to a group. API and network
            errors are printed and yield an empty list.
        """
        business_group = self.determine_business_group(query)
        if not business_group:
            print(f"⚠️ Could not determine a business group for query: '{query}'")
            return [], None

        print(f"\n🔎 Identified Business Group: '{business_group}'")
        print(f"🔍 Searching Google Places for '{query}'...")

        try:
            params = {
                'query': query,
                'key': self.google_api_key,
                'type': 'establishment'
            }

            response = requests.get(
                f"{self.places_base_url}/textsearch/json",
                params=params,
                timeout=self.REQUEST_TIMEOUT_SECONDS,
            )
            response.raise_for_status()
            data = response.json()

            if data.get('status') != 'OK':
                print(f"⚠️ Google Places API returned status: {data.get('status')}")
                return [], business_group

            businesses = []
            seen_place_ids = set()

            # Look at up to twice as many candidates as requested, because
            # places without a website are filtered out below.
            for place in data.get("results", [])[:max_results * 2]:
                place_id = place.get("place_id")
                if not place_id or place_id in seen_place_ids:
                    continue
                # Remember the id before the details call so a repeated id
                # never costs a second request.
                seen_place_ids.add(place_id)

                details = self._get_details(place_id)
                if details.get("website"):
                    businesses.append(details)

                if len(businesses) >= max_results:
                    break

            print(f"✅ Found {len(businesses)} businesses with websites")
            return businesses, business_group

        except requests.RequestException as e:
            print(f"❌ Error with Google Places API: {e}")
            return [], business_group
        except Exception as e:
            print(f"❌ Unexpected error in find_businesses: {e}")
            return [], business_group

    def _get_details(self, place_id):
        """Fetch Place Details for ``place_id``.

        Returns:
            The ``result`` dict of the Details response, or ``{}`` when the id
            is empty, the API status is not ``OK`` or the request fails.
        """
        if not place_id:
            return {}

        try:
            params = {
                'place_id': place_id,
                'fields': 'name,website,formatted_address,business_status',
                'key': self.google_api_key
            }

            response = requests.get(
                f"{self.places_base_url}/details/json",
                params=params,
                timeout=self.REQUEST_TIMEOUT_SECONDS,
            )
            response.raise_for_status()
            data = response.json()

            if data.get('status') == 'OK':
                return data.get('result', {})

            print(f"⚠️ Details API returned status: {data.get('status')} for place_id: {place_id}")
            return {}

        except requests.RequestException as e:
            print(f"❌ Error getting place details: {e}")
            return {}
        except Exception as e:
            print(f"❌ Unexpected error in _get_details: {e}")
            return {}

print("✅ BusinessFinder class defined")

✅ BusinessFinder class defined


In [39]:
# ============================
# CELL 5: WEBSITE ANALYZER CLASS
# ============================

class WebsiteAnalyzer:
    """Scrape a business website, locate its careers page and classify skills.

    Classification sends the cleaned page text, together with the rules of the
    business group's rulebook, to the LLM chain and keeps only the skill names
    that exist in that rulebook.
    """

    # Maximum number of characters of page text sent to the model.
    MAX_TEXT_CHARS = 10000
    # Number of times a classification call is attempted before giving up.
    MAX_CLASSIFY_ATTEMPTS = 3
    # href prefixes that never point at a web page.
    _NON_PAGE_SCHEMES = ("mailto:", "tel:", "javascript:")

    def __init__(self, llm_instance, rulebook_data):
        """Store the rulebooks and build the prompt | llm | string-parser chain."""
        self.llm_instance = llm_instance
        self.rulebook_data = rulebook_data

        # Create the prompt template
        self.prompt_template = ChatPromptTemplate.from_template("""
You are an expert business classification analyst. Analyze the website text based on the provided rules.

**PRIORITY 1: GLOBAL SPECIAL RULES**
{special_rules}

**PRIORITY 2: INDUSTRY-SPECIFIC RULES**
{dynamic_rules}

**IMPORTANT INSTRUCTIONS:**
- Read the website text carefully
- Apply the rules in order of priority
- Only return skills that have clear evidence in the text
- Be conservative - if unsure, don't include the skill

**OUTPUT FORMAT:**
Return a single JSON object with one key: "applied_skills". The value must be a list of strings, with each string being an EXACT match from the rule names provided above. If no skills apply, return an empty list.

Example:
{{"applied_skills": ["skill1", "skill2"]}}

**WEBSITE TEXT TO ANALYZE:**
{context}
""")

        # Create the chain
        self.chain = self.prompt_template | llm_instance | StrOutputParser()

    def scrape_and_analyze(self, url: str, business_group: str):
        """Scrape ``url`` and classify it against ``business_group``'s rulebook.

        Returns:
            dict: ``{"careers_page": str, "skill_names": list}``; both are empty
            when the page could not be fetched.
        """
        print(f"  -> Scraping and analyzing: {url}")

        # Scrape the website
        soup = self._get_soup_from_url(url)
        if not soup:
            return {"careers_page": "", "skill_names": []}

        # The careers link must be searched BEFORE the text is cleaned:
        # _get_clean_text() removes nav/header/footer elements from the soup.
        careers_page = self._find_careers_page(soup, url)

        # Get clean text for analysis
        scraped_text = self._get_clean_text(soup)

        # Classify with AI
        skill_names = self._classify_with_ai(scraped_text, business_group)

        return {
            "careers_page": careers_page,
            "skill_names": skill_names
        }

    def _get_soup_from_url(self, url):
        """Return a BeautifulSoup for ``url``, or ``None`` when it cannot be fetched.

        A plain ``requests`` GET is tried first; when that fails (or does not
        answer with status 200) the page is loaded through the Selenium driver.
        """
        # Try requests first (faster)
        try:
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
            }
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()

            if response.status_code == 200:
                return BeautifulSoup(response.text, 'html.parser')

        except requests.RequestException as e:
            print(f"  - Requests failed for {url}: {e}")

        # Fallback to Selenium
        print(f"  - Falling back to Selenium for: {url}")
        try:
            driver = get_driver()
            if not driver:
                return None

            driver.get(url)
            time.sleep(3)  # Wait for page to load

            return BeautifulSoup(driver.page_source, 'html.parser')

        except Exception as e:
            print(f"  - Selenium failed for {url}: {e}")
            return None

    def _get_clean_text(self, soup):
        """Return the visible page text, whitespace-normalised and length-capped.

        Note: this mutates ``soup`` -- navigation, header, footer, script and
        similar elements are removed from it.
        """
        # Remove unwanted elements
        for tag in soup.select('nav, header, footer, script, style, noscript, iframe'):
            tag.decompose()

        # Get text and collapse every run of whitespace into a single space
        text = " ".join(soup.get_text(" ", strip=True).split())

        # Limit text length to avoid token limits
        if len(text) > self.MAX_TEXT_CHARS:
            text = text[:self.MAX_TEXT_CHARS] + "..."

        return text

    @staticmethod
    def _score_url_path(href):
        """Score how career-like the path of ``href`` is.

        Returns the best fuzzy ratio above 80 between any path segment and a
        career keyword, or 0 when there is none or the URL cannot be parsed.
        """
        try:
            best = 0
            for part in urlparse(href).path.lower().split('/'):
                if not part:
                    continue
                part_clean = part.replace('-', ' ').replace('_', ' ')
                for keyword in CAREER_KEYWORDS:
                    similarity = fuzz.ratio(part_clean, keyword)
                    if similarity > 80:
                        best = max(best, similarity)
            return best
        except Exception:
            # A malformed href is simply not a candidate.
            return 0

    def _find_careers_page(self, soup, base_url):
        """Return the absolute URL of the most likely careers page, or ``""``.

        A link whose text contains a career keyword scores 100; otherwise the
        link is scored on its URL path. Links to third-party job boards and
        non-page links (mailto:, tel:, javascript:) are never selected. The
        first link with the highest score wins; its fragment and query string
        are removed.
        """
        best_link = ""
        highest_score = 0

        for a in soup.find_all("a", href=True):
            href = (a.get('href') or "").strip()
            if not href:
                continue

            href_lower = href.lower()
            # An e-mail / phone / script link is not a page, even when its
            # text says "Join us".
            if href_lower.startswith(self._NON_PAGE_SCHEMES):
                continue
            # Avoid third-party job sites
            if any(site in href_lower for site in THIRD_PARTY_JOB_SITES):
                continue

            text = a.get_text().lower().strip()
            if any(keyword in text for keyword in CAREER_KEYWORDS):
                score = 100  # Direct text match
            else:
                score = self._score_url_path(href)

            if score > highest_score:
                highest_score = score
                best_link = urljoin(base_url, href)

        # Clean the URL (remove fragments and query params)
        if best_link:
            best_link = best_link.split('#')[0].split('?')[0]

        return best_link

    @staticmethod
    def _extract_json_payload(response_str):
        """Return the JSON object text contained in a model reply.

        Handles replies wrapped in Markdown code fences and replies with prose
        before or after the object.
        """
        text = response_str.strip()
        if "```json" in text:
            text = text.split("```json")[1].split("```")[0].strip()
        elif "```" in text:
            text = text.split("```")[1].split("```")[0].strip()

        # Keep only the outermost {...} so surrounding prose does not break parsing.
        start, end = text.find("{"), text.rfind("}")
        if start != -1 and end > start:
            text = text[start:end + 1]
        return text

    def _classify_with_ai(self, text_content, business_group):
        """Ask the model which rulebook skills ``text_content`` gives evidence for.

        Returns:
            list: Skill names from the group's rulebook, without duplicates, in
            the order the model returned them. ``[]`` when the text is empty,
            the group has no rulebook, or every attempt failed.
        """
        if not text_content or not text_content.strip():
            return []

        if not business_group or business_group not in self.rulebook_data:
            print(f"  - No rulebook found for business group: {business_group}")
            return []

        rulebook = self.rulebook_data[business_group]
        rule_map = rulebook.get('rules') or {}

        # Format the rules for the prompt
        prompt_rules_str = "\n".join(f"- {tag}:\n  {rule}" for tag, rule in rule_map.items())

        # Get special rules; a rulebook may carry no header notes at all.
        header_notes = rulebook.get('header_notes') or {}
        special_notes = header_notes.get('SPECIAL TAGS') or 'No special rules apply.'

        # Try classification with retries
        for attempt in range(self.MAX_CLASSIFY_ATTEMPTS):
            is_last_attempt = attempt == self.MAX_CLASSIFY_ATTEMPTS - 1
            response_str = ""
            try:
                response_str = self.chain.invoke({
                    "special_rules": special_notes,
                    "dynamic_rules": prompt_rules_str,
                    "context": text_content
                })

                result = json.loads(self._extract_json_payload(response_str))
                applied_skills = result.get("applied_skills", []) if isinstance(result, dict) else None
                if not isinstance(applied_skills, list):
                    raise ValueError("response JSON does not contain an 'applied_skills' list")

                # Validate that returned skills exist in our rules
                valid_skills = []
                for skill in applied_skills:
                    if isinstance(skill, str) and skill in rule_map:
                        if skill not in valid_skills:
                            valid_skills.append(skill)
                    else:
                        print(f"  - Warning: AI returned invalid skill '{skill}'")

                return valid_skills

            except RateLimitError:
                if is_last_attempt:
                    # Nothing follows the last attempt, so waiting is pointless.
                    print("  - Rate limit hit on final attempt, giving up")
                    break
                wait_time = 2 ** attempt
                print(f"  - Rate limit hit, waiting {wait_time} seconds...")
                time.sleep(wait_time)

            except json.JSONDecodeError as e:
                print(f"  - JSON decode error (attempt {attempt + 1}): {e}")
                if is_last_attempt:
                    print(f"  - Raw response: {response_str}")

            except Exception as e:
                print(f"  - AI classification error (attempt {attempt + 1}): {e}")

        return []

print("✅ WebsiteAnalyzer class defined")

✅ WebsiteAnalyzer class defined


In [40]:
# ============================
# CELL 6: MAIN EXECUTION FUNCTION
# ============================

def _load_processed_results(path):
    """Read previously saved results from ``path``.

    Returns:
        tuple: ``(df_processed, processed_websites)``; an empty DataFrame and
        an empty set when the file is missing or has no content.
    """
    try:
        df_processed = pd.read_csv(path)
    except FileNotFoundError:
        print("📊 No previous results found, starting fresh")
        return pd.DataFrame(), set()
    except pd.errors.EmptyDataError:
        # A zero-byte results file is treated like a missing one.
        print("📊 Previous results file is empty, starting fresh")
        return pd.DataFrame(), set()

    if 'Website' in df_processed.columns:
        processed_websites = set(df_processed['Website'].dropna())
    else:
        processed_websites = set()
    print(f"📊 Found {len(processed_websites)} previously processed websites")
    return df_processed, processed_websites


def _skill_ids_for(skill_names, skill_id_map):
    """Return the sorted, de-duplicated, comma-joined IDs of ``skill_names``."""
    matched_ids = set()
    for skill in skill_names:
        ids = skill_id_map.get(skill, "")
        if not ids:
            continue
        for token in str(ids).split(','):
            token = token.strip()
            # "nan" is what a blank CSV cell turns into when stringified; it
            # is not a real ID.
            if token and token.lower() != "nan":
                matched_ids.add(token)
    return ", ".join(sorted(matched_ids))


def _result_row(business, business_group, careers_page, skill_ids, skill_names_text):
    """Build one output record for ``business``."""
    return {
        "Business_Name": business.get("name", ""),
        "Website": business.get("website", ""),
        "Address": business.get("formatted_address", ""),
        "Business_Group": business_group,
        "Careers_Page": careers_page,
        "Skill_IDs": skill_ids,
        "Skill_Names": skill_names_text,
        "Processing_Date": pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S")
    }


def _save_and_summarize(new_results, df_processed, completed):
    """Append ``new_results`` to the saved results, write the CSV, print a summary."""
    df_new = pd.DataFrame(new_results)

    # Combine with existing results
    if df_processed.empty:
        df_final = df_new
    else:
        # Ensure column alignment: columns missing on either side are filled with "".
        all_columns = list(df_processed.columns) + [
            col for col in df_new.columns if col not in df_processed.columns
        ]
        df_final = pd.concat(
            [
                df_processed.reindex(columns=all_columns, fill_value=""),
                df_new.reindex(columns=all_columns, fill_value=""),
            ],
            ignore_index=True,
        )

    # Save to CSV, creating the output folder on first use.
    output_dir = os.path.dirname(FINAL_OUTPUT_FILE)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df_final.to_csv(FINAL_OUTPUT_FILE, index=False)

    if completed:
        print(f"\n🎉 Pipeline complete!")
    else:
        print("\n⚠️ Pipeline stopped early - partial results were saved")
    print(f"📊 {len(new_results)} new businesses processed")
    print(f"📁 Results saved to: {FINAL_OUTPUT_FILE}")

    # Show summary
    print("\n--- Summary of New Results ---")
    is_error = df_new['Skill_Names'].str.contains('ERROR:', na=False)
    successful_results = df_new[~is_error]
    if not successful_results.empty:
        skills = successful_results['Skill_Names'].str.split(', ').explode()
        # Businesses without any skill contribute an empty string; do not count it.
        unique_skill_count = skills[skills.str.len() > 0].nunique()
        print(f"✅ Successfully processed: {len(successful_results)} businesses")
        print(f"📊 Skills found: {unique_skill_count} unique skills")

    error_results = df_new[is_error]
    if not error_results.empty:
        print(f"❌ Errors encountered: {len(error_results)} businesses")

    print(f"\n📋 Sample results:")
    print(df_new[['Business_Name', 'Website', 'Skill_Names']].head().to_string(index=False))


def main():
    """Main execution pipeline.

    Validates the API keys, asks for a search query, finds matching businesses,
    skips websites already present in the results file, analyses each remaining
    website and appends the records to ``FINAL_OUTPUT_FILE``. Records collected
    so far are saved even when the run is interrupted or fails part-way.
    """
    print("🚀 Starting Business Analysis Pipeline")

    # Validate API keys
    if not GOOGLE_API_KEY or GOOGLE_API_KEY == "YOUR_GOOGLE_API_KEY_HERE":
        print("❌ CRITICAL ERROR: Google API Key is not set.")
        print("   Please set the GOOGLE_API_KEY environment variable or update the configuration.")
        return

    if not OPENAI_API_KEY or OPENAI_API_KEY == "YOUR_OPENAI_API_KEY_HERE":
        print("❌ CRITICAL ERROR: OpenAI API Key is not set.")
        print("   Please set the OPENAI_API_KEY environment variable or update the configuration.")
        return

    # Initialize components
    finder = BusinessFinder(GOOGLE_API_KEY, group_keyword_map)
    analyzer = WebsiteAnalyzer(llm, rulebook_data)

    # Get user input
    user_query = input("\n🔍 Enter your business search query (e.g., 'bakeries in los angeles'): ").strip()
    if not user_query:
        print("❌ No query provided. Exiting.")
        return

    # Find businesses
    businesses_to_analyze, business_group = finder.find_businesses(user_query, max_results=15)

    if not businesses_to_analyze:
        print("❌ No businesses found or could not determine business group for this query.")
        return

    if not business_group:
        print("❌ Could not determine business group for this query.")
        return

    # Load existing results to avoid duplicates
    df_processed, processed_websites = _load_processed_results(FINAL_OUTPUT_FILE)

    # Filter out already processed businesses
    businesses_to_process = [
        b for b in businesses_to_analyze
        if b.get('website') and b.get('website') not in processed_websites
    ]

    if not businesses_to_process:
        print("✅ All businesses from this search have already been processed.")
        print(f"📊 Check results in: {FINAL_OUTPUT_FILE}")
        return

    print(f"\n🔬 Starting analysis for {len(businesses_to_process)} new businesses...")
    print(f"📊 Business Group: {business_group}")

    # Resolve the ID map once; a group without a rulebook simply has no IDs
    # instead of raising a KeyError for every single business.
    group_rulebook = rulebook_data.get(business_group)
    if group_rulebook is None:
        print(f"⚠️ No rulebook loaded for '{business_group}'; skills cannot be mapped to IDs.")
    skill_id_map = (group_rulebook or {}).get('skill_id_map', {})

    # Process each business
    new_results = []
    completed = False
    try:
        for i, business in enumerate(businesses_to_process, 1):
            print(f"\n--- Processing {i}/{len(businesses_to_process)}: {business.get('name', 'Unknown')} ---")

            try:
                # Analyze the website
                analysis = analyzer.scrape_and_analyze(business['website'], business_group)
                skill_names = analysis.get("skill_names", [])

                new_results.append(_result_row(
                    business,
                    business_group,
                    analysis.get("careers_page", ""),
                    _skill_ids_for(skill_names, skill_id_map),
                    ", ".join(skill_names),
                ))

                print(f"  ✅ Found {len(skill_names)} skills: {', '.join(skill_names)}")

            except Exception as e:
                print(f"  ❌ Error processing {business.get('name', 'Unknown')}: {e}")

                # Add error record
                new_results.append(_result_row(business, business_group, "", "", f"ERROR: {str(e)}"))

            # Small delay between requests
            time.sleep(1)

        completed = True
    finally:
        # Runs on normal completion, on KeyboardInterrupt and on unexpected
        # errors alike, so finished records are never thrown away.
        if new_results:
            _save_and_summarize(new_results, df_processed, completed)
        else:
            print("\n❌ No new results to save.")

def cleanup_driver():
    """Quit this thread's Selenium driver, if any, and forget the cached instance.

    The cached ``driver`` attribute is removed even when ``quit()`` fails, so
    the next ``get_driver()`` call starts a fresh browser instead of handing
    out a driver that has already been shut down (which matters when the
    execution cell is run more than once in the same kernel).
    """
    if not hasattr(thread_local_driver, 'driver'):
        return

    try:
        thread_local_driver.driver.quit()
        print("✅ Selenium driver closed successfully")
    except Exception as e:
        print(f"⚠️ Error closing Selenium driver: {e}")
    finally:
        del thread_local_driver.driver

print("✅ Main execution function defined")

✅ Main execution function defined


In [41]:
# ============================
# CELL 7: EXECUTION BLOCK
# ============================

# Run the pipeline when this cell is executed.
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        # The user stopped the cell while the pipeline was running.
        print("\n🛑 Process interrupted by user")
    except Exception as e:
        # Last-resort handler: only the message is printed, not the traceback.
        print(f"\n❌ Unexpected error: {e}")
    finally:
        # Always release the Selenium driver, whether main() finished,
        # failed or was interrupted.
        cleanup_driver()
        print("🏁 Pipeline execution completed")

# Printed after the block above has finished, i.e. after the pipeline has already run.
print("✅ Complete pipeline ready for execution")

🚀 Starting Business Analysis Pipeline

🔍 Enter your business search query (e.g., 'bakeries in los angeles'): salon in LA

🔎 Identified Business Group: 'Beauty, Massage & Spa'
🔍 Searching Google Places for 'salon in LA'...
✅ Found 15 businesses with websites
📊 Found 21 previously processed websites

🔬 Starting analysis for 15 new businesses...
📊 Business Group: Beauty, Massage & Spa

--- Processing 1/15: Arianna Hair Boutique ---
  -> Scraping and analyzing: https://arianna-hairboutique.com/
  ✅ Found 3 skills: Salon, Women Hair, Makeup

--- Processing 2/15: Hairroin Salon ---
  -> Scraping and analyzing: http://www.hairroinsalon.com/
  ✅ Found 2 skills: Salon, Women Hair

--- Processing 3/15: Atelier by Tiffany ---
  -> Scraping and analyzing: http://atelierbytiffany.com/
  - Requests failed for http://atelierbytiffany.com/: 404 Client Error: Not Found for url: http://atelierbytiffany.com/
  - Falling back to Selenium for: http://atelierbytiffany.com/
  -  Initializing new STEALTH Sele

# Ignore the code below.



In [33]:
# ==============================================================================
# Cell 1: Installations and Google Drive Mount
# ==============================================================================
print("🚀 Installing necessary packages...")
# Install all necessary packages in quiet mode to keep the output clean.
# NOTE(review): no versions are pinned, so a fresh runtime may resolve
# different releases than the ones this notebook was developed against.
!pip install pandas openpyxl requests beautifulsoup4 selenium thefuzz[speedup] openai langchain langchain_openai selenium-stealth -q
print("✅ Packages installed.")

# Colab Drive helper and os (os is used below and by the following cell).
from google.colab import drive
import os

# Mount your Google Drive. This will prompt you for authentication.
print("\n🚀 Mounting Google Drive...")
drive.mount('/content/drive')

# --- IMPORTANT: Define the path to your main project folder on Google Drive ---
# Make sure this path exactly matches your folder structure.
PROJECT_PATH = "/content/drive/My Drive/AI_Business_Classifier/"

# Only change directory when the project folder exists; otherwise print
# instructions and leave the working directory unchanged.
if os.path.exists(PROJECT_PATH):
    os.chdir(PROJECT_PATH) # Change the current working directory
    print(f"\n✅ Successfully changed directory to: {os.getcwd()}")
    print("\nFiles in your project directory:")
    # Use !ls -R to recursively list contents, confirming your setup
    !ls -R
else:
    print(f"❌ ERROR: The specified project path does not exist in your Google Drive.")
    print(f"Please create the folder structure or correct the path: '{PROJECT_PATH}'")

🚀 Installing necessary packages...
✅ Packages installed.

🚀 Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

✅ Successfully changed directory to: /content/drive/My Drive/AI_Business_Classifier

Files in your project directory:
.:
Extracted_Business_Groups_and_Types.csv  Outputs  Skill_Tag_Rulebooks

./Outputs:
final_classified_businesses.csv

./Skill_Tag_Rulebooks:
'Food & Beverage Establishments.csv'


In [34]:
# ==============================================================================
# Cell 2: The Full Pipeline - From Scratch, Simplified and Corrected
# ==============================================================================
import pandas as pd
import json
import re
import time
import threading
from typing import List, Optional, Dict

# LangChain, OpenAI, and Scraping Tools
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from openai import RateLimitError
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium_stealth import stealth
from thefuzz import fuzz
from urllib.parse import urljoin, urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed

# ============================
# 1. CONFIGURATION
# ============================
# API keys are read from the environment; credentials must never be hardcoded
# in a notebook. Any key that was previously committed in this cell has to be
# considered compromised and should be rotated.
# NOTE(review): OPENAI_API_KEY is expected to be set in the environment before
# this cell runs; presumably the ChatOpenAI client below picks it up from
# there -- confirm.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")

# --- Using paths relative to the project folder set in Cell 1 ---
UNIVERSAL_TAGS_FILE = "Extracted_Business_Groups_and_Types.csv"
RULEBOOKS_FOLDER = "Skill_Tag_Rulebooks/"
OUTPUT_FOLDER = "Outputs/"
FINAL_OUTPUT_FILE = os.path.join(OUTPUT_FOLDER, "final_classified_businesses.csv")

# Link text / URL path terms that indicate a careers page.
CAREER_KEYWORDS = ["career", "jobs", "employment", "hiring", "work with us", "join us", "opportunities"]
# Job boards that are never reported as a business's own careers page.
THIRD_PARTY_JOB_SITES = ["indeed.com", "linkedin.com/jobs", "glassdoor.com", "workday.com"]

# ============================
# 2. SETUP (LLM and Selenium)
# ============================
# Chat-model client, created with model "gpt-4o" and temperature 0.
llm = ChatOpenAI(model="gpt-4o", temperature=0)
# Per-thread holder for the Selenium driver; get_driver() stores the driver on
# it as the `driver` attribute and close_thread_drivers() removes it again.
thread_local_driver = threading.local()

def get_driver():
    """Return the calling thread's stealth Chrome driver, building it on first use."""
    # Fast path: this thread already owns a driver.
    if hasattr(thread_local_driver, 'driver'):
        return thread_local_driver.driver

    print("  -  Initializing new STEALTH Selenium driver...")
    opts = Options()
    for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        opts.add_argument(flag)
    opts.add_experimental_option("excludeSwitches", ["enable-automation"])

    browser = webdriver.Chrome(options=opts)
    stealth(browser, languages=["en-US", "en"], vendor="Google Inc.", platform="Win32")

    # Cache the configured driver for later calls from the same thread.
    thread_local_driver.driver = browser
    return thread_local_driver.driver

def close_thread_drivers():
    # This function will be called at the end to clean up
    if hasattr(thread_local_driver, 'driver'):
        thread_local_driver.driver.quit()
        del thread_local_driver.driver

# ============================
# 3. DATA LOADING & PARSING
# ============================
def _clean_cell(value) -> str:
    """Return a CSV cell as stripped text, mapping missing values (NaN/None) to ''."""
    if value is None:
        return ""
    if not isinstance(value, str) and pd.isna(value):
        return ""
    return str(value).strip()

def load_and_parse_configs():
    """Load the universal tag file and every rulebook CSV.

    Reads UNIVERSAL_TAGS_FILE (columns 'Business Group' and 'Business Type',
    the latter a comma-separated list) and every '*.csv' in RULEBOOKS_FOLDER
    (columns 'Skills Tags', 'Skills IDs', 'Prompt Rule').

    Returns:
        (group_keyword_map, rulebook_data) where
        - group_keyword_map maps a lower-cased business type to its group name;
        - rulebook_data maps a rulebook file's base name to
          {"rules": {tag: prompt_rule}, "skill_id_map": {tag: skill_ids}}.

    Missing cells are treated as empty rather than as the literal text 'nan':
    rows without a tag are skipped, tags without IDs get no skill_id_map entry,
    and empty business types (e.g. from a trailing comma) are ignored so they
    cannot become an empty keyword that matches every query.
    """
    print("🚀 Loading and parsing all configuration files...")
    # dtype=str keeps values exactly as written in the CSV (no '12.0' for 12).
    df_universal = pd.read_csv(UNIVERSAL_TAGS_FILE, dtype=str)
    df_universal.columns = df_universal.columns.str.strip()
    group_keyword_map = {}
    for _, row in df_universal.iterrows():
        group = _clean_cell(row['Business Group'])
        types_str = _clean_cell(row['Business Type'])
        if not group or not types_str:
            continue
        for biz_type in types_str.split(','):
            biz_type = biz_type.strip().lower()
            if biz_type:
                group_keyword_map[biz_type] = group

    # Section-header rows inside the rulebooks that are not real skill tags.
    non_tag_rows = ['BUSINESS TYPES', 'SPECIAL TAGS', 'IMPORTANT NOTES']

    rulebook_data = {}
    for filename in sorted(os.listdir(RULEBOOKS_FOLDER)):
        business_group_name, extension = os.path.splitext(filename)
        if extension != ".csv":
            continue
        df_rulebook = pd.read_csv(os.path.join(RULEBOOKS_FOLDER, filename), dtype=str)
        df_rulebook.columns = df_rulebook.columns.str.strip()
        rules, skill_id_map = {}, {}
        for _, row in df_rulebook.iterrows():
            tag = _clean_cell(row.get('Skills Tags'))
            if not tag or tag.upper() in non_tag_rows:
                continue
            rules[tag] = _clean_cell(row.get('Prompt Rule'))
            skill_ids = _clean_cell(row.get('Skills IDs'))
            if skill_ids:
                skill_id_map[tag] = skill_ids
        rulebook_data[business_group_name] = {"rules": rules, "skill_id_map": skill_id_map}

    print("✅ All configurations loaded and parsed.")
    return group_keyword_map, rulebook_data

group_keyword_map, rulebook_data = load_and_parse_configs()

# ============================
# 4. CORE WORKER FUNCTION
# ============================

def _parse_applied_skills(response_str: str) -> List[str]:
    """Extract the 'applied_skills' list from an LLM reply.

    Accepts bare JSON, JSON wrapped in a ``` or ```json fence, and JSON
    surrounded by extra prose (the outermost {...} span is tried as a fallback).

    Raises:
        ValueError: if no JSON object can be decoded from the reply
        (json.JSONDecodeError is a subclass of ValueError).
    """
    text = (response_str or "").strip()
    fenced = re.search(r"```(?:json)?\s*(.*?)```", text, re.DOTALL | re.IGNORECASE)
    if fenced:
        text = fenced.group(1).strip()
    try:
        payload = json.loads(text)
    except json.JSONDecodeError:
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end <= start:
            raise
        payload = json.loads(text[start:end + 1])
    if not isinstance(payload, dict):
        raise ValueError(f"expected a JSON object, got {type(payload).__name__}")

    skills = payload.get("applied_skills") or []
    if isinstance(skills, str):
        skills = [skills]
    # Normalize to clean strings so the later lookup and ", ".join cannot fail.
    return [str(s).strip() for s in skills if s is not None and str(s).strip()]

def find_and_classify_business(business_info: Dict, business_group: str):
    """Scrape one business website, find its careers page and classify its skills.

    Args:
        business_info: dict with 'name' and 'website' keys and, optionally,
            'formatted_address'.
        business_group: key into `rulebook_data` selecting the rules to apply.

    Returns:
        On success, a dict with keys Business_Name, Website, Address,
        Careers_Page, Skill_IDs and Skill_Names. If the page cannot be scraped,
        a dict with a single 'error' key. A failed AI analysis is reported on
        stdout and yields empty Skill_IDs / Skill_Names rather than an error.
    """
    name, website = business_info['name'], business_info['website']
    address = business_info.get('formatted_address', '')
    print(f"\n-> Analyzing: {name} ({website})")

    # --- Scrape Website ---
    soup = None
    try:
        response = requests.get(website, headers={"User-Agent": "Mozilla/5.0..."}, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
    except requests.RequestException:
        # Deliberately not fatal: soup stays None and the Selenium fallback runs.
        pass

    if not soup:
        print(f"  - Falling back to Selenium for: {website}")
        try:
            driver = get_driver()
            driver.get(website)
            time.sleep(3)  # give client-side rendering a moment before reading the DOM
            soup = BeautifulSoup(driver.page_source, 'html.parser')
        except Exception as e:
            print(f"  - Selenium ERROR: {e}")
            return {"error": f"Failed to scrape: {e}"}

    if not soup:
        return {"error": "Scraping failed completely."}

    # --- Extract Careers Page ---
    # Link text containing a career keyword scores 100; otherwise a URL path
    # that fuzzy-matches a keyword scores 90. The first link with the highest
    # score wins, and links to third-party job boards are never selected.
    best_link, highest_score = "", 0
    for a in soup.find_all("a", href=True):
        href = a['href']
        score, text_lower = 0, a.get_text().lower()
        if any(kw in text_lower for kw in CAREER_KEYWORDS):
            score = 100
        if not score:
            try:
                path = urlparse(href).path.lower()
                if any(fuzz.ratio(path, kw) > 90 for kw in CAREER_KEYWORDS):
                    score = 90
            except ValueError:
                # urlparse rejects some malformed hrefs; such links are just not scored.
                pass
        if score > highest_score and not any(site in href.lower() for site in THIRD_PARTY_JOB_SITES):
            highest_score, best_link = score, urljoin(website, href)
    # Drop fragment and query string from the chosen URL.
    careers_page = best_link.split('#')[0].split('?')[0]

    # --- Get Clean Text for AI ---
    for tag in soup.select('nav, header, footer, script, style'):
        tag.decompose()
    scraped_text = " ".join(soup.get_text(strip=True).split())[:12000]

    # --- Classify with AI ---
    skill_names = []
    if scraped_text:
        prompt_rules = "\n".join([f"- {tag}:\n  {rule}" for tag, rule in rulebook_data[business_group]['rules'].items()])
        prompt = ChatPromptTemplate.from_template("You are an expert... {rules} ... {context}") # Your full prompt here
        chain = prompt | llm | StrOutputParser()
        try:
            response_str = chain.invoke({"rules": prompt_rules, "context": scraped_text})
            skill_names = _parse_applied_skills(response_str)
            print(f"  - AI Found Skills: {skill_names}")
        except Exception as e:
            print(f"  - AI Analysis ERROR: {e}")

    # --- Map Skill Names to IDs ---
    skill_id_map = rulebook_data[business_group]['skill_id_map']
    matched_ids = []
    for skill in skill_names:
        ids = skill_id_map.get(skill)
        if ids:
            matched_ids.extend([s.strip() for s in str(ids).split(',') if s.strip()])

    return {
        "Business_Name": name, "Website": website, "Address": address,
        "Careers_Page": careers_page, "Skill_IDs": ", ".join(sorted(set(matched_ids))),
        "Skill_Names": ", ".join(skill_names)
    }

# ============================
# 5. MAIN EXECUTION PIPELINE
# ============================
def main():
    """Run the interactive pipeline: search, scrape, classify and save.

    Steps: validate API keys, read a search query from stdin, map it to a
    business group via `group_keyword_map`, look up businesses with the Google
    Places Text Search and Place Details endpoints, skip websites already
    present in FINAL_OUTPUT_FILE, classify the rest in a thread pool, and
    append each successful result to FINAL_OUTPUT_FILE as soon as it finishes.

    All failures are reported on stdout; the function returns None.
    """
    openai_key = os.environ.get("OPENAI_API_KEY")
    if (not GOOGLE_API_KEY or GOOGLE_API_KEY == "YOUR_GOOGLE_API_KEY_HERE"
            or not openai_key or openai_key == "YOUR_OPENAI_API_KEY_HERE"):
        print("❌ CRITICAL ERROR: Please set your API Keys in the configuration section.")
        return

    user_query = input("🔍 Enter your business search query (e.g., 'bakeries in los angeles'): ").strip()
    if not user_query: return

    # Determine Business Group
    query_lower = user_query.lower()
    business_group = None
    for keyword, group in group_keyword_map.items():
        # An empty keyword is a substring of every query, so it must never match.
        if keyword and keyword in query_lower:
            business_group = group
            break
    if not business_group or business_group not in rulebook_data:
        print(f"⚠️ Could not map query to a known business group. Please be more specific."); return

    # Find Businesses
    print(f"🔎 Identified Business Group: '{business_group}'. Finding businesses...")
    request_timeout = 10  # seconds; without it a stalled connection hangs the cell
    params = {'query': user_query, 'key': GOOGLE_API_KEY}
    try:
        response = requests.get("https://maps.googleapis.com/maps/api/place/textsearch/json",
                                params=params, timeout=request_timeout)
        response.raise_for_status()
        search_payload = response.json()
    except (requests.RequestException, ValueError) as e:
        print(f"❌ Google Places search failed: {e}"); return

    # The Places API reports problems (bad key, quota, ...) in the JSON body,
    # which would otherwise look exactly like "no results".
    status = search_payload.get("status")
    if status not in (None, "OK", "ZERO_RESULTS"):
        print(f"❌ Google Places search failed: {status} {search_payload.get('error_message', '')}".rstrip())
        return
    places = search_payload.get("results", [])

    businesses_to_analyze = []
    seen_websites = set()
    for place in places[:20]:
        place_id = place.get('place_id')
        if not place_id:
            continue
        details_params = {'place_id': place_id, 'fields': 'name,website,formatted_address', 'key': GOOGLE_API_KEY}
        try:
            details_response = requests.get("https://maps.googleapis.com/maps/api/place/details/json",
                                            params=details_params, timeout=request_timeout)
            details_response.raise_for_status()
            details = details_response.json().get('result', {})
        except (requests.RequestException, ValueError) as e:
            print(f"⚠️ Could not fetch details for place {place_id}: {e}")
            continue
        website = details.get("website")
        # Several listings can share one website; analyze each site only once.
        if website and website not in seen_websites:
            seen_websites.add(website)
            businesses_to_analyze.append(details)

    if not businesses_to_analyze: print("No businesses with websites found."); return

    # Load existing results to prevent duplicates
    try:
        df_processed = pd.read_csv(FINAL_OUTPUT_FILE)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        df_processed = pd.DataFrame()
    if 'Website' in df_processed.columns:
        processed_websites = set(df_processed['Website'].dropna())
    else:
        processed_websites = set()

    businesses_to_process = [b for b in businesses_to_analyze if b.get('website') not in processed_websites]
    if not businesses_to_process: print("✅ All businesses from this search have already been processed."); return
    print(f"🔬 Starting analysis for {len(businesses_to_process)} new businesses...")

    # Make sure the output folder exists before workers start producing results.
    os.makedirs(os.path.dirname(FINAL_OUTPUT_FILE) or ".", exist_ok=True)

    # Process in Parallel
    with ThreadPoolExecutor(max_workers=3) as executor:
        future_to_business = {executor.submit(find_and_classify_business, b, business_group): b.get('name') for b in businesses_to_process}

        for future in as_completed(future_to_business):
            business_name = future_to_business[future]
            try:
                result = future.result()
            except Exception as e:
                print(f"❌ Worker thread error for {business_name}: {e}")
                continue

            if "error" in result:
                print(f"⚠️ Skipped {business_name}: {result['error']}")
                continue

            # Append result and save progress immediately
            try:
                write_header = (not os.path.exists(FINAL_OUTPUT_FILE)
                                or os.path.getsize(FINAL_OUTPUT_FILE) == 0)
                pd.DataFrame([result]).to_csv(FINAL_OUTPUT_FILE, mode='a', header=write_header, index=False)
            except OSError as e:
                print(f"❌ Could not save result for {business_name}: {e}")
                continue
            print(f"✔️ Finished & Saved: {result['Business_Name']}")

    print(f"\n🎉 Pipeline complete! All results are saved in '{FINAL_OUTPUT_FILE}'")

# --- Run Pipeline ---
# Entry point of the cell: runs main() and, whether it returns normally or
# raises, calls close_thread_drivers() and prints the closing message.
if __name__ == "__main__":
    try:
        main()
    finally:
        # Final cleanup: runs even if main() raised, so the Selenium cleanup
        # hook is always invoked before the cell finishes.
        close_thread_drivers()
        print("\n✅ Script finished.")

🚀 Loading and parsing all configuration files...
✅ All configurations loaded and parsed.
🔍 Enter your business search query (e.g., 'bakeries in los angeles'): bakeries in LA
🔎 Identified Business Group: 'Food & Beverage Establishments'. Finding businesses...
🔬 Starting analysis for 1 new businesses...

-> Analyzing: Black Forest Bakery (https://www.blackforestbakery.com/)
  - AI Analysis ERROR: Expecting value: line 1 column 1 (char 0)
✔️ Finished & Saved: Black Forest Bakery

🎉 Pipeline complete! All results are saved in 'Outputs/final_classified_businesses.csv'

✅ Script finished.
