In [None]:
pip install selenium

Collecting selenium
  Downloading selenium-4.34.2-py3-none-any.whl.metadata (7.5 kB)
Collecting urllib3~=2.5.0 (from urllib3[socks]~=2.5.0->selenium)
  Downloading urllib3-2.5.0-py3-none-any.whl.metadata (6.5 kB)
Collecting trio~=0.30.0 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting outcome (from trio~=0.30.0->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.12.2->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.34.2-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m38.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.30.0-py3-none-any.whl (499 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m499.2/499.2 kB[0m [31m31

In [None]:
!pip install googlemaps


Collecting googlemaps
  Downloading googlemaps-4.10.0.tar.gz (33 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: googlemaps
  Building wheel for googlemaps (setup.py) ... [?25l[?25hdone
  Created wheel for googlemaps: filename=googlemaps-4.10.0-py3-none-any.whl size=40714 sha256=1e5371c712a7dcb6875e29b93fb811166b1a87598cbdc1a88aadb2aa8fa92e94
  Stored in directory: /root/.cache/pip/wheels/f1/09/77/3cc2f5659cbc62341b30f806aca2b25e6a26c351daa5b1f49a
Successfully built googlemaps
Installing collected packages: googlemaps
Successfully installed googlemaps-4.10.0


In [None]:
import re
import json
import requests
import openai
import time
import csv
from dataclasses import dataclass
from typing import List, Optional, Dict, Any
from datetime import datetime
import os

In [None]:
@dataclass
class Business:
    """A business returned by Google Places, enriched with AI job analysis.

    Instances are built by ``SmartBusinessScraper.process_business`` and
    consumed by ``export_results`` / ``display_results``, which join the list
    fields into strings. To keep those joins safe, any list field left as
    ``None`` is replaced with an empty list in ``__post_init__``.
    """
    name: str
    address: str
    location: Dict[str, float]  # {"lat": ..., "lng": ...}
    website: Optional[str] = None
    phone: Optional[str] = None
    description: str = ""
    place_id: str = ""
    rating: Optional[float] = None
    # The four list fields accept None for backward compatibility but are
    # always lists after construction (see __post_init__).
    business_type: Optional[List[str]] = None
    potential_jobs: Optional[List[str]] = None
    required_skills: Optional[List[str]] = None
    job_categories: Optional[List[str]] = None
    # process_business stores the Places "weekday_text" list here.
    hours: Optional[List[str]] = None
    reviews_count: int = 0
    hiring_likelihood: int = 5

    def __post_init__(self):
        """Replace ``None`` list fields with fresh, per-instance empty lists."""
        for attr in ("business_type", "potential_jobs", "required_skills", "job_categories"):
            if getattr(self, attr) is None:
                setattr(self, attr, [])

@dataclass
class ParsedBusinessQuery:
    """Structured result of parsing a natural-language business search.

    Produced by ``SmartBusinessScraper.parse_business_query`` and read by its
    ``search_businesses`` / ``_build_search_queries`` methods.
    """
    # Business-type terms; only the first three are turned into search queries.
    business_type_keywords: List[str]
    # Candidate locations; search_businesses uses only the first entry.
    location_keywords: List[str]
    # Industry terms; each is combined with a business type as "<industry> <type>".
    industry_keywords: List[str]
    # Size words matched in the query (e.g. "small", "chain").
    # NOTE(review): not read by any code visible in this notebook section.
    size_indicators: List[str]
    # The query exactly as the caller passed it (before lower-casing/stripping).
    original_query: str
    # Heuristic score rounded to two decimals and capped at 1.0; includes a 0.1 base.
    confidence_score: float

In [None]:


class SmartBusinessScraper:
    def __init__(self, google_api_key: str, openai_api_key: str):
        """Store credentials and build the static lookup tables used for parsing.

        Args:
            google_api_key: Key sent as the ``key`` parameter on Places requests.
            openai_api_key: Key used to construct the ``openai.OpenAI`` client.
        """
        self.google_api_key = google_api_key
        self.openai_client = openai.OpenAI(api_key=openai_api_key)
        self.places_base_url = "https://maps.googleapis.com/maps/api/place"

        # Skill vocabulary keyed by job category. The whole dictionary is
        # serialized into the prompt built by analyze_business_with_ai.
        self.skill_dictionary = {
            "food_service": [
                "cooking", "food preparation", "dishwashing", "serving", "bartending",
                "barista", "food safety", "customer service", "cash handling", "menu knowledge",
                "kitchen operations", "food plating", "inventory management", "cleaning",
                "order taking", "table service", "beverage preparation", "food handling"
            ],
            "retail": [
                "sales", "customer service", "cash register", "inventory", "merchandising",
                "product knowledge", "visual display", "stock management", "cashier",
                "point of sale", "customer relations", "loss prevention", "fitting room"
            ],
            "hospitality": [
                "front desk", "housekeeping", "guest services", "reservations", "concierge",
                "room service", "event coordination", "customer relations", "hospitality management",
                "cleaning", "laundry", "maintenance", "security"
            ],
            "healthcare": [
                "patient care", "medical assistance", "receptionist", "scheduling", "filing",
                "insurance processing", "medical records", "customer service", "cleaning",
                "administrative support", "data entry"
            ],
            "office_admin": [
                "data entry", "filing", "receptionist", "phone answering", "scheduling",
                "customer service", "administrative support", "document management",
                "computer skills", "organization", "communication"
            ],
            "manual_labor": [
                "construction", "maintenance", "cleaning", "landscaping", "delivery",
                "warehouse", "loading", "unloading", "assembly", "repair", "installation",
                "heavy lifting", "equipment operation"
            ],
            "creative": [
                "graphic design", "photography", "writing", "social media", "marketing",
                "content creation", "video editing", "web design", "art", "creative writing"
            ],
            "transportation": [
                "driving", "delivery", "logistics", "vehicle maintenance", "customer service",
                "navigation", "time management", "safety protocols"
            ],
            "technology": [
                "software development", "programming", "IT support", "data analysis", "cybersecurity",
                "web development", "database management", "system administration", "technical support"
            ],
            "education": [
                "teaching", "tutoring", "curriculum development", "student support", "administration",
                "classroom management", "educational technology", "assessment", "counseling"
            ],
            "finance": [
                "accounting", "bookkeeping", "financial analysis", "tax preparation", "auditing",
                "banking", "investment", "insurance", "payroll", "budgeting"
            ]
        }

        # One single-group regex per business family; applied with
        # re.IGNORECASE by _extract_business_type_keywords.
        self.business_type_patterns = [
            r'\b(restaurant|cafe|coffee|food|dining|bar|pub|eatery|bistro|diner)\b',
            r'\b(shop|store|retail|boutique|market|mall|outlet|plaza)\b',
            r'\b(hotel|motel|resort|inn|lodge|hospitality|accommodation)\b',
            r'\b(hospital|clinic|medical|healthcare|dental|pharmacy|wellness)\b',
            r'\b(office|corporate|business|professional|consulting|agency)\b',
            r'\b(construction|contractor|builder|renovation|maintenance)\b',
            r'\b(creative|design|marketing|advertising|media|studio)\b',
            r'\b(transportation|logistics|delivery|shipping|trucking)\b',
            r'\b(tech|technology|software|IT|startup|digital)\b',
            r'\b(school|education|university|college|training|academy)\b',
            r'\b(bank|financial|accounting|insurance|investment)\b'
        ]

        # Location regexes used by _extract_location_keywords. The second and
        # third patterns have two capture groups, so re.findall yields tuples
        # for them (the second pattern's first group is the preposition itself).
        self.location_patterns = [
            r'\bin\s+([A-Za-z\s,]+)(?:\s|$)',
            r'\b(near|around|at|located)\s+([A-Za-z\s,]+)(?:\s|$)',
            r'\b([A-Za-z\s]+),?\s+(CA|California|Los Angeles|LA|NY|New York|TX|Texas)\b'
        ]

    def parse_business_query(self, query: str) -> ParsedBusinessQuery:
        """Turn a free-text search into a ParsedBusinessQuery.

        The query is lower-cased and stripped before the extractors run; the
        untouched original is kept on the result as ``original_query``.
        """
        normalized = query.lower().strip()

        type_terms = self._extract_business_type_keywords(normalized)
        place_terms = self._extract_location_keywords(normalized)
        sector_terms = self._extract_industry_keywords(normalized)
        size_terms = self._extract_size_indicators(normalized)

        # Size indicators do not feed into the confidence heuristic.
        confidence = self._calculate_confidence_score(type_terms, place_terms, sector_terms)

        return ParsedBusinessQuery(
            business_type_keywords=type_terms,
            location_keywords=place_terms,
            industry_keywords=sector_terms,
            size_indicators=size_terms,
            original_query=query,
            confidence_score=confidence,
        )

    def _extract_business_type_keywords(self, query: str) -> List[str]:
        """Extract business type keywords from query"""
        keywords = []

        for pattern in self.business_type_patterns:
            matches = re.findall(pattern, query, re.IGNORECASE)
            keywords.extend(matches)

        # Extract other meaningful business words
        words = re.findall(r'\b[a-zA-Z]{3,}\b', query)
        business_words = [
            word for word in words
            if word not in ['looking', 'for', 'find', 'search', 'near', 'in', 'around', 'businesses']
        ]

        keywords.extend(business_words[:3])  # Limit to prevent noise
        return list(set(keywords))

    def _extract_location_keywords(self, query: str) -> List[str]:
        """Extract location information from query"""
        locations = []

        for pattern in self.location_patterns:
            matches = re.findall(pattern, query, re.IGNORECASE)
            for match in matches:
                if isinstance(match, tuple):
                    locations.extend([loc.strip() for loc in match if loc.strip()])
                else:
                    locations.append(match.strip())

        # Default to Los Angeles if no location specified
        if not locations:
            locations = ["Los Angeles, CA"]

        return locations

    def _extract_industry_keywords(self, query: str) -> List[str]:
        """Extract industry-related keywords"""
        industry_patterns = [
            r'\b(tech|technology|healthcare|finance|education|retail|manufacturing|startup|enterprise)\b',
            r'\b(ai|artificial intelligence|machine learning|blockchain|cloud|mobile|web)\b',
            r'\b(hospital|clinic|school|university|bank|restaurant|store)\b'
        ]

        keywords = []
        for pattern in industry_patterns:
            matches = re.findall(pattern, query, re.IGNORECASE)
            keywords.extend(matches)

        return keywords

    def _extract_size_indicators(self, query: str) -> List[str]:
        """Extract company size indicators"""
        size_pattern = r'\b(startup|small|large|enterprise|corporation|big|fortune|chain|local)\b'
        matches = re.findall(size_pattern, query, re.IGNORECASE)
        return matches

    def _calculate_confidence_score(self, business_keywords: List[str],
                                  location_keywords: List[str],
                                  industry_keywords: List[str]) -> float:
        """Calculate confidence score for parsed query"""
        score = 0.0

        if business_keywords:
            score += 0.4 * min(len(business_keywords) / 3, 1.0)

        if location_keywords:
            score += 0.3

        if industry_keywords:
            score += 0.2

        score += 0.1  # Base score

        return round(min(score, 1.0), 2)

    def search_businesses(self, parsed_query: ParsedBusinessQuery, max_results: int = 20) -> List[Dict]:
        """Search for businesses using Google Places API based on parsed query"""
        all_results = []

        # Build search queries
        search_queries = self._build_search_queries(parsed_query)

        for search_query in search_queries:
            try:
                places = self._search_places_api(search_query, parsed_query.location_keywords[0])
                all_results.extend(places)
            except Exception as e:
                print(f"Error searching for '{search_query}': {e}")
                continue

        # Remove duplicates based on place_id
        unique_places = {}
        for place in all_results:
            place_id = place.get('place_id')
            if place_id and place_id not in unique_places:
                unique_places[place_id] = place

        return list(unique_places.values())[:max_results]

    def _build_search_queries(self, parsed_query: ParsedBusinessQuery) -> List[str]:
        """Build search queries based on parsed information"""
        queries = []

        # Primary searches based on business type
        for business_type in parsed_query.business_type_keywords[:3]:
            queries.append(business_type)

            # Combine with industry keywords
            for industry in parsed_query.industry_keywords:
                queries.append(f"{industry} {business_type}")

        # Fallback general searches
        if not queries:
            queries = ["businesses", "companies", "stores", "restaurants", "offices"]

        return queries[:5]  # Limit API calls

    def _search_places_api(self, query: str, location: str) -> List[Dict]:
        """Run one Google Places text search for ``"<query> in <location>"``.

        Args:
            query: Search term, e.g. a business type.
            location: Free-text location appended to the query.

        Returns:
            The ``results`` list from the response, or ``[]`` on ZERO_RESULTS.

        Raises:
            Exception: On a non-200 HTTP status or any API status other than
                OK / ZERO_RESULTS. Network errors and timeouts from
                ``requests`` propagate as well.
        """
        url = f"{self.places_base_url}/textsearch/json"

        params = {
            'query': f"{query} in {location}",
            'key': self.google_api_key,
            'type': 'establishment'
        }

        # Without a timeout requests waits indefinitely on a stalled connection.
        response = requests.get(url, params=params, timeout=10)

        if response.status_code != 200:
            raise Exception(f"Google Places API error: {response.status_code}")

        data = response.json()
        status = data.get("status")

        if status != "OK":
            if status == "ZERO_RESULTS":
                return []
            # Include the status code: error_message is optional in responses.
            raise Exception(
                f"Google Places API error: {data.get('error_message', 'Unknown error')} (status: {status})"
            )

        return data.get("results", [])

    def get_business_details(self, place_id: str) -> Dict:
        """Fetch Place Details for ``place_id``.

        Returns:
            The ``result`` dict when both the HTTP status is 200 and the API
            status is OK; otherwise an empty dict (best-effort lookup).

        Raises:
            requests.RequestException: On network failure or timeout.
        """
        details_url = f"{self.places_base_url}/details/json"

        params = {
            "place_id": place_id,
            "fields": "name,formatted_address,geometry,website,formatted_phone_number,business_status,opening_hours,rating,reviews,types,editorial_summary",
            "key": self.google_api_key
        }

        # Without a timeout requests waits indefinitely on a stalled connection.
        response = requests.get(details_url, params=params, timeout=10)

        if response.status_code == 200:
            data = response.json()
            if data.get("status") == "OK":
                return data.get("result", {})

        return {}

    def analyze_business_with_ai(self, business_info: Dict) -> Dict:
        """Use OpenAI to analyze business and match skills"""
        business_name = business_info.get("name", "")
        business_types = business_info.get("types", [])
        reviews = business_info.get("reviews", [])
        editorial_summary = business_info.get("editorial_summary", {}).get("overview", "")

        # Create context from reviews and business types
        review_text = " ".join([review.get("text", "")[:200] for review in reviews[:3]])
        business_context = f"Business types: {', '.join(business_types)}. Summary: {editorial_summary}. Recent reviews: {review_text}"

        prompt = f"""
        Analyze this business and identify potential job opportunities and required skills:

        Business Name: {business_name}
        Business Context: {business_context}

        Available Skill Categories and Skills:
        {json.dumps(self.skill_dictionary, indent=2)}

        Please provide a JSON response with:
        1. "description": A brief description of the business and what they likely do
        2. "potential_jobs": List of specific job roles this business might hire for
        3. "required_skills": List of skills from the skill dictionary that would be relevant
        4. "job_categories": List of job categories from the skill dictionary that apply
        5. "hiring_likelihood": Score from 1-10 indicating how likely they are to hire entry-level workers

        Focus on entry-level positions that don't require extensive experience.
        Only use skills that are provided in the skill dictionary.
        """

        try:
            response = self.openai_client.chat.completions.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": "You are an expert at analyzing businesses and identifying job opportunities. Always respond with valid JSON."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.3
            )

            content = response.choices[0].message.content
            # Try to extract JSON from the response
            json_match = re.search(r'\{.*\}', content, re.DOTALL)
            if json_match:
                return json.loads(json_match.group())
            else:
                return json.loads(content)

        except Exception as e:
            print(f"AI analysis error: {e}")
            return {
                "description": "Analysis unavailable",
                "potential_jobs": [],
                "required_skills": [],
                "job_categories": [],
                "hiring_likelihood": 5
            }

    def process_business(self, place_data: Dict) -> Business:
        """Build a Business from one text-search hit.

        Looks up Place Details when the hit carries a ``place_id``, overlays
        them on the search data (details win on key clashes), runs the AI
        analysis on the merged record and maps everything onto a Business.
        """
        place_id = place_data.get("place_id")

        details = {}
        if place_id:
            details = self.get_business_details(place_id)

        # Start from the search hit and let the detail fields override it.
        merged = dict(place_data)
        merged.update(details)

        analysis = self.analyze_business_with_ai(merged)

        coords = merged.get("geometry", {}).get("location", {})
        weekday_hours = merged.get("opening_hours", {}).get("weekday_text")

        return Business(
            name=merged.get("name", ""),
            address=merged.get("formatted_address", ""),
            location={"lat": coords.get("lat"), "lng": coords.get("lng")},
            website=merged.get("website"),
            phone=merged.get("formatted_phone_number"),
            description=analysis.get("description", ""),
            place_id=place_id,
            rating=merged.get("rating"),
            business_type=merged.get("types", []),
            potential_jobs=analysis.get("potential_jobs", []),
            required_skills=analysis.get("required_skills", []),
            job_categories=analysis.get("job_categories", []),
            hours=weekday_hours,
            reviews_count=merged.get("user_ratings_total", 0),
            hiring_likelihood=analysis.get("hiring_likelihood", 5),
        )

    def scrape_businesses(self, query: str, max_results: int = 20) -> List[Business]:
        """Main method to scrape businesses based on natural language query"""
        print(f"🔍 Parsing query: '{query}'")

        # Parse the query
        parsed_query = self.parse_business_query(query)

        print(f"📊 Query Analysis:")
        print(f"  • Business Types: {', '.join(parsed_query.business_type_keywords)}")
        print(f"  • Location: {', '.join(parsed_query.location_keywords)}")
        print(f"  • Industry: {', '.join(parsed_query.industry_keywords) or 'General'}")
        print(f"  • Confidence: {parsed_query.confidence_score}/1.0")
        print()

        # Search for businesses
        places = self.search_businesses(parsed_query, max_results)

        print(f"🏢 Found {len(places)} businesses to analyze")

        businesses = []
        for i, place in enumerate(places):
            business_name = place.get('name', 'Unknown')
            print(f"Processing business {i+1}/{len(places)}: {business_name}")

            try:
                business = self.process_business(place)
                businesses.append(business)

                # Rate limiting
                time.sleep(0.5)

            except Exception as e:
                print(f"Error processing {business_name}: {e}")
                continue

        # Sort by hiring likelihood and rating
        businesses.sort(key=lambda b: (b.hiring_likelihood, b.rating or 0), reverse=True)

        return businesses

    def export_results(self, businesses: List[Business], filename: str = "business_opportunities.csv"):
        """Export results to CSV file"""
        with open(filename, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)

            # Write header
            writer.writerow([
                'name', 'address', 'latitude', 'longitude', 'website', 'phone',
                'description', 'rating', 'reviews_count', 'business_type',
                'potential_jobs', 'required_skills', 'job_categories',
                'hiring_likelihood', 'hours'
            ])

            # Write data
            for business in businesses:
                writer.writerow([
                    business.name,
                    business.address,
                    business.location.get('lat', ''),
                    business.location.get('lng', ''),
                    business.website or '',
                    business.phone or '',
                    business.description,
                    business.rating or '',
                    business.reviews_count,
                    '; '.join(business.business_type),
                    '; '.join(business.potential_jobs),
                    '; '.join(business.required_skills),
                    '; '.join(business.job_categories),
                    business.hiring_likelihood,
                    '; '.join(business.hours) if business.hours else ''
                ])

        print(f"✅ Results exported to {filename}")

    def display_results(self, businesses: List[Business]):
        """Display formatted results"""
        if not businesses:
            print("❌ No businesses found matching your criteria.")
            return

        print(f"\n🎯 Found {len(businesses)} Business Opportunities")
        print("=" * 80)

        for i, business in enumerate(businesses, 1):
            print(f"\n{i}. 🏢 {business.name}")
            print(f"    Address: {business.address}")
            print(f"    Phone: {business.phone or 'Not available'}")
            print(f"    Website: {business.website or 'Not available'}")
            print(f"    Rating: {business.rating or 'N/A'}/5.0 ({business.reviews_count} reviews)")
            print(f"    Description: {business.description}")
            print(f"    Potential Jobs: {', '.join(business.potential_jobs) or 'General positions'}")
            print(f"    Required Skills: {', '.join(business.required_skills) or 'Basic skills'}")
            print(f"    Job Categories: {', '.join(business.job_categories) or 'General'}")


            print("-" * 80)
def main():
    """Interactive loop: read API keys, then search, display and optionally export.

    Changes from the previous version:
    * API keys are read with ``getpass`` so they are not echoed into the
      terminal or saved in notebook output.
    * The export filename is reduced to ``[a-z0-9_]`` so a query containing
      path separators or other special characters cannot break the write or
      point it at another directory.
    """
    import getpass  # local import: only this entry point needs it

    print(" Business Scraping with Skill Matching")
    print("=" * 60)

    # Get API keys (hidden input)
    google_api_key = getpass.getpass("Enter your Google Places API Key: ").strip()
    openai_api_key = getpass.getpass("Enter your OpenAI API Key: ").strip()

    if not google_api_key or not openai_api_key:
        print("❌ Error: Both API keys are required")
        return

    scraper = SmartBusinessScraper(google_api_key, openai_api_key)

    print("\n🔍 Enter business search queries in natural language")
    print("Examples:")
    print("  • 'restaurants in downtown LA'")
    print("  • 'retail stores in Santa Monica'")

    print("\nType 'quit' to exit\n")

    while True:
        query = input("🔍 Enter business search: ").strip()

        if query.lower() in ['quit', 'exit', 'q']:
            print("👋 Thanks for using the Smart Business Scraper!")
            break

        if not query:
            continue

        try:
            # Scrape businesses
            businesses = scraper.scrape_businesses(query, max_results=15)

            # Display results
            scraper.display_results(businesses)

            # Ask if user wants to export
            export = input("\n💾 Export results to CSV? (y/n): ").strip().lower()
            if export == 'y':
                # Collapse every run of non-alphanumerics into one underscore.
                slug = re.sub(r'[^a-z0-9]+', '_', query.lower()).strip('_') or "results"
                filename = f"businesses_{slug}.csv"
                scraper.export_results(businesses, filename)

            print("\n" + "=" * 80 + "\n")

        except Exception as e:
            print(f"❌ Error: {e}")
            print("Please try a different search query.\n")

if __name__ == "__main__":
    main()




 Business Scraping with Skill Matching


KeyboardInterrupt: Interrupted by user

In [None]:
import re
import json
import requests
import openai
import time
import csv
import pandas as pd
from dataclasses import dataclass
from typing import List, Optional, Dict, Any
from datetime import datetime
import os
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from google.colab import drive
import urllib.parse

# API keys are read from the environment so secrets never live in the notebook.
# Set GOOGLE_API_KEY and OPENAI_API_KEY before running; both default to "".
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")

@dataclass
class Business:
    """Contact-oriented record for one business.

    NOTE: this redefines the ``Business`` dataclass declared in an earlier
    cell with a different field set; once this cell runs, the earlier
    definition is shadowed.
    """
    place_id: str  # presumably the Google Places place_id - confirm against the code that builds this
    business_name: str
    address: str
    phone_number: Optional[str] = None
    latitude: Optional[float] = None
    longitude: Optional[float] = None
    website: Optional[str] = None
    # Same key names as the dicts returned by extract_emails_and_social_media
    # and find_careers_and_sales_pages; the population code is not visible here.
    hr_email: Optional[str] = None
    sales_email: Optional[str] = None
    careers_page: Optional[str] = None
    sales_page: Optional[str] = None
    # The collection fields below default to None (not empty containers), so
    # consumers must handle None.
    social_media_links: Optional[Dict[str, str]] = None
    business_type: Optional[List[str]] = None
    skill_ids: Optional[List[str]] = None
    skill_names: Optional[List[str]] = None

class SmartBusinessScraper:
    def __init__(self, google_api_key: str, openai_api_key: str):
        """Store credentials, then mount Drive, load skill tags and start Chrome.

        Construction has side effects: it mounts Google Drive at
        ``/content/drive``, reads the skill-tags CSV and launches a headless
        Chrome session kept on ``self.driver``.

        Args:
            google_api_key: Key sent as the ``key`` parameter on Places requests.
            openai_api_key: Key used to construct the ``openai.OpenAI`` client.
        """
        self.google_api_key = google_api_key
        self.openai_client = openai.OpenAI(api_key=openai_api_key)
        self.places_base_url = "https://maps.googleapis.com/maps/api/place"
        self.skill_tags_df = None
        self.website_context = {}  # Scraped page text keyed by normalized URL

        # Mount Google Drive (must happen before the CSV under it is read)
        drive.mount('/content/drive')

        # Load skill tags CSV; on failure skill_tags_df becomes an empty DataFrame
        self.load_skill_tags()

        # Setup selenium driver
        self.setup_driver()

    def setup_driver(self):
        """Start a headless Chrome session and keep it on ``self.driver``."""
        browser_flags = (
            "--headless",
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--disable-gpu",
            "--window-size=1920,1080",
            "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        )

        options = Options()
        for flag in browser_flags:
            options.add_argument(flag)

        self.driver = webdriver.Chrome(options=options)

    def load_skill_tags(self):
        """Load skill tags from CSV file"""
        try:
            file_path = '/content/drive/MyDrive/Kobeyo Business Directory/skill tags.csv'
            self.skill_tags_df = pd.read_csv(file_path)

            # Print column names to debug
            print(f"CSV columns: {list(self.skill_tags_df.columns)}")

            # Map the actual column names to standardized names
            column_mapping = {
                'Skills Tags': 'skills_tags',
                'Prompt Rule': 'prompt_rule',
                'Skills IDs': 'skill_ids',
                'Skills Names': 'skill_names'
            }

            # Rename columns to match our code expectations
            self.skill_tags_df = self.skill_tags_df.rename(columns=column_mapping)

            print(f"✅ Loaded skill tags with {len(self.skill_tags_df)} entries")
            print(f"Columns after mapping: {list(self.skill_tags_df.columns)}")

        except Exception as e:
            print(f"❌ Error loading skill tags: {e}")
            self.skill_tags_df = pd.DataFrame()

    def scrape_website_content(self, url: str) -> str:
        """Scrape website content to extract skills-related information"""
        if not url:
            return ""

        try:
            # Normalize URL
            if not url.startswith(('http://', 'https://')):
                url = 'https://' + url

            self.driver.get(url)
            time.sleep(3)

            # Get page content
            page_source = self.driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')

            # Extract relevant text content
            text_content = soup.get_text()

            # Focus on skills-related content (first 2000 characters)
            skills_context = text_content[:2000].lower()

            # Store in memory
            self.website_context[url] = skills_context

            return skills_context

        except Exception as e:
            print(f"Error scraping website {url}: {e}")
            return ""

    def find_careers_and_sales_pages(self, website_url: str) -> Dict[str, Optional[str]]:
        """Locate careers and sales/contact links on a site's landing page.

        Every ``<a>`` element is collected once and matched case-insensitively
        against both its visible text and the path of its ``href``. The
        previous ``By.PARTIAL_LINK_TEXT`` lookup was case-sensitive and looked
        only at link text, so lower-case slug keywords such as
        ``work-with-us`` rarely matched a link labelled "Work With Us".

        Keywords are tried in priority order; the first keyword that matches
        any link wins, and the first matching link in page order is returned.

        Args:
            website_url: Site address; ``https://`` is prepended when no
                scheme is given.

        Returns:
            ``{"careers_page": url_or_None, "sales_page": url_or_None}``.
            Errors are printed and yield ``None`` values.
        """
        careers_page = None
        sales_page = None

        if not website_url:
            return {"careers_page": careers_page, "sales_page": sales_page}

        careers_keywords = ['careers', 'jobs', 'employment', 'work-with-us', 'join-our-team']
        sales_keywords = ['sales', 'contact', 'get-quote', 'services', 'about']

        def first_match(links, keywords):
            # links: (href, lower-cased "text + path") pairs in page order.
            for keyword in keywords:
                # Slug keywords also match their spaced form in link text.
                needles = (keyword, keyword.replace('-', ' '))
                for href, haystack in links:
                    if any(needle in haystack for needle in needles):
                        return href
            return None

        try:
            if not website_url.startswith(('http://', 'https://')):
                website_url = 'https://' + website_url

            self.driver.get(website_url)
            time.sleep(2)

            links = []
            for anchor in self.driver.find_elements(By.TAG_NAME, "a"):
                try:
                    href = anchor.get_attribute('href')
                    label = anchor.text or ""
                except Exception:
                    # Element went stale while the page was still changing.
                    continue

                # Ignore mailto:, tel:, javascript: and empty links.
                if not href or not href.lower().startswith(('http://', 'https://')):
                    continue

                # Match on the path only, so a domain like "salesforce.com"
                # does not make every link look like a sales page.
                path = urllib.parse.urlparse(href).path
                links.append((href, f"{label} {path}".lower()))

            careers_page = first_match(links, careers_keywords)
            sales_page = first_match(links, sales_keywords)

        except Exception as e:
            print(f"Error finding pages on {website_url}: {e}")

        return {"careers_page": careers_page, "sales_page": sales_page}

    def extract_emails_and_social_media(self, website_url: str) -> Dict[str, Any]:
        """Collect contact emails and social media profile links from a web page.

        The page is loaded through ``self.driver`` and its HTML source is scanned
        with regular expressions.

        Args:
            website_url: Page to inspect. ``https://`` is prepended when the value
                has no ``http://``/``https://`` scheme. A falsy value returns the
                empty result without touching the driver.

        Returns:
            A dict with keys ``hr_email`` and ``sales_email`` (string or ``None``)
            and ``social_media`` (dict with ``facebook``, ``instagram`` and
            ``linkedin`` keys, each a URL string or ``None``). If loading or
            scanning the page raises, the error is printed and whatever was
            collected up to that point is returned.
        """
        hr_email = None
        sales_email = None
        social_media = {"facebook": None, "instagram": None, "linkedin": None}

        if not website_url:
            return {"hr_email": hr_email, "sales_email": sales_email, "social_media": social_media}

        try:
            if not website_url.startswith(('http://', 'https://')):
                website_url = 'https://' + website_url

            self.driver.get(website_url)
            time.sleep(2)  # fixed pause after navigation before reading the page source

            page_source = self.driver.page_source or ""

            # The TLD class is [A-Za-z]; writing it as [A-Z|a-z] would also accept a literal "|".
            email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
            # Retina-style asset names ("logo@2x.png") look like addresses to the pattern above.
            asset_suffixes = ('.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp')
            hr_keywords = ('hr', 'jobs', 'careers', 'hiring')
            sales_keywords = ('sales', 'contact', 'info', 'hello')

            for email in re.findall(email_pattern, page_source):
                email_lower = email.lower()
                if email_lower.endswith(asset_suffixes):
                    continue

                # Classify on the mailbox name only, so a domain such as
                # "infosys.com" does not make every address look like "info@".
                local_part = email_lower.split('@', 1)[0]

                # When several addresses qualify, the last one found on the page is kept.
                if any(keyword in local_part for keyword in hr_keywords):
                    hr_email = email
                elif any(keyword in local_part for keyword in sales_keywords):
                    sales_email = email

            # A profile segment ends at a slash, whitespace, a quote or an angle
            # bracket, so the closing quote of an href attribute is not captured.
            profile_segment = r'''[^/\s"'<>]+'''
            social_patterns = {
                'facebook': r'https?://(?:www\.)?facebook\.com/' + profile_segment,
                'instagram': r'https?://(?:www\.)?instagram\.com/' + profile_segment,
                # LinkedIn profiles sit one level deeper (/company/<slug>, /in/<slug>, ...).
                'linkedin': r'https?://(?:www\.)?linkedin\.com/(?:(?:company|in|school|showcase)/)?' + profile_segment,
            }

            for platform, pattern in social_patterns.items():
                first_hit = re.search(pattern, page_source)
                if first_hit:
                    social_media[platform] = first_hit.group(0)

        except Exception as e:
            print(f"Error extracting emails/social media from {website_url}: {e}")

        return {"hr_email": hr_email, "sales_email": sales_email, "social_media": social_media}

    def search_businesses(self, query: str, location: str = "Los Angeles, CA", max_results: int = 10) -> List[Dict]:
        """Run a Google Places text search and return the raw result entries.

        Args:
            query: What to look for (e.g. ``"cafes"``).
            location: Appended to the query as ``"<query> in <location>"``.
            max_results: Upper bound on the number of entries returned; values
                below zero are treated as zero.

        Returns:
            Up to ``max_results`` items from the response's ``results`` list, or
            an empty list when the API reports ``ZERO_RESULTS``.

        Raises:
            Exception: On a non-200 HTTP status, or on any API status other than
                ``OK``/``ZERO_RESULTS``. Network failures and timeouts raised by
                ``requests`` propagate unchanged.
        """
        # Without a timeout requests waits indefinitely on an unresponsive server.
        request_timeout_seconds = 10

        url = f"{self.places_base_url}/textsearch/json"
        params = {
            'query': f"{query} in {location}",
            'key': self.google_api_key,
            'type': 'establishment'
        }

        response = requests.get(url, params=params, timeout=request_timeout_seconds)

        if response.status_code != 200:
            raise Exception(f"Google Places API error: {response.status_code}")

        data = response.json()

        if data.get("status") != "OK":
            if data.get("status") == "ZERO_RESULTS":
                return []
            raise Exception(f"Google Places API error: {data.get('error_message', 'Unknown error')}")

        # A negative cap would otherwise slice from the end of the list.
        return data.get("results", [])[:max(max_results, 0)]

    def get_business_details(self, place_id: str) -> Dict:
        """Fetch the Place Details record for one business.

        Args:
            place_id: Google Places identifier of the business.

        Returns:
            The ``result`` object of the response when the HTTP status is 200 and
            the API status is ``OK``; otherwise an empty dict.

        Raises:
            Network failures and timeouts raised by ``requests`` propagate
            unchanged.
        """
        # Without a timeout requests waits indefinitely on an unresponsive server.
        request_timeout_seconds = 10

        details_url = f"{self.places_base_url}/details/json"
        params = {
            "place_id": place_id,
            "fields": "name,formatted_address,geometry,website,formatted_phone_number,business_status,opening_hours,rating,reviews,types",
            "key": self.google_api_key
        }

        response = requests.get(details_url, params=params, timeout=request_timeout_seconds)

        if response.status_code == 200:
            data = response.json()
            if data.get("status") == "OK":
                return data.get("result", {})

        return {}

    def get_relevant_skills(self, business_types: List[str]) -> str:
        """Build a short text listing of skill-tag rows relevant to the given business types.

        Rows of ``self.skill_tags_df`` are selected when their ``prompt_rule``
        text contains one of ``business_types`` (case-insensitive, literal
        substring). When nothing matches, up to three rows for the first
        generic food-service keyword that matches are used instead. At most
        five rows are listed, and each row is listed at most once.

        Args:
            business_types: Type labels of the business (e.g. ``"cafe"``).
                Empty or non-string entries are ignored.

        Returns:
            One line per selected row, formatted as
            ``ID: <skill_ids>, Name: <skill_names>, Rule: <first 100 chars>...``,
            or a short explanatory message when the table is empty, lacks the
            required columns, yields no rows, or processing fails.
        """
        if self.skill_tags_df.empty:
            return "No skill tags available"

        # Check if the required columns exist (after mapping)
        required_columns = ['skill_ids', 'skill_names', 'prompt_rule']
        missing_columns = [col for col in required_columns if col not in self.skill_tags_df.columns]

        if missing_columns:
            print(f"⚠️ Missing columns in CSV: {missing_columns}")
            print(f"Available columns: {list(self.skill_tags_df.columns)}")
            return "CSV format issue - missing required columns"

        relevant_skills = []
        seen_rows = set()  # index labels already collected, so a row is listed only once

        def collect(term, limit=None):
            """Add unseen rows whose ``prompt_rule`` contains ``term``; report whether any row matched."""
            hits = self.skill_tags_df[
                # regex=False: the term is plain text, not a pattern.
                self.skill_tags_df['prompt_rule'].str.contains(term, case=False, na=False, regex=False)
            ]
            labelled = list(zip(hits.index, hits.to_dict('records')))
            for label, record in labelled[:limit]:
                if label not in seen_rows:
                    seen_rows.add(label)
                    relevant_skills.append(record)
            return not hits.empty

        try:
            for business_type in business_types or []:
                # An empty term would match every row; a non-string cannot be searched for.
                if isinstance(business_type, str) and business_type:
                    collect(business_type)

            # If no specific matches, get general food/cafe related skills
            if not relevant_skills:
                food_keywords = ['restaurant', 'cafe', 'food', 'beverage', 'kitchen', 'service']
                for keyword in food_keywords:
                    if collect(keyword, limit=3):  # Limit to 3 rows, first matching keyword only
                        break

            # Limit to top 5 relevant skills to reduce tokens
            relevant_skills = relevant_skills[:5]

            if not relevant_skills:
                return "No relevant skills found"

            # Format as concise string
            return "".join(
                f"ID: {skill.get('skill_ids', 'N/A')}, Name: {skill.get('skill_names', 'N/A')}, Rule: {skill.get('prompt_rule', 'N/A')[:100]}...\n"
                for skill in relevant_skills
            )

        except Exception as e:
            print(f"Error processing skills: {e}")
            return "Error processing skills from CSV"

    def tag_skills_with_ai(self, business_info: Dict, website_context: str) -> Dict[str, Any]:
        """Use OpenAI to tag skills based on business information and skill tags CSV"""
        business_name = business_info.get("name", "")
        business_types = business_info.get("types", [])

        # Get only relevant skills to reduce token usage
        relevant_skills = self.get_relevant_skills(business_types)

        # Create much shorter prompt
        prompt = f"""
Business: {business_name}
Types: {', '.join(business_types[:3])}
Context: {website_context[:200]}

Relevant Skills:
{relevant_skills}

Based on this business info, return relevant skill IDs and names as JSON:
{{
  "skill_ids": ["id1", "id2"],
  "skill_names": ["name1", "name2"]
}}
"""

        try:
            response = self.openai_client.chat.completions.create(
                model="gpt-3.5-turbo",  # Switch to cheaper model
                messages=[
                    {"role": "system", "content": "Tag business skills. Respond with valid JSON only."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.3,
                max_tokens=200  # Limit response tokens
            )

            content = response.choices[0].message.content
            json_match = re.search(r'\{.*\}', content, re.DOTALL)

            if json_match:
                return json.loads(json_match.group())
            else:
                return json.loads(content)

        except Exception as e:
            print(f"AI skill tagging error: {e}")
            return {"skill_ids": [], "skill_names": []}

    def process_business(self, place_data: Dict) -> Business:
        """Enrich one Places search hit and package it as a ``Business``.

        The search hit is merged with its Place Details record (detail fields
        win on key collisions). The business website, when there is one, is then
        used to gather page text, careers/sales page links, contact emails and
        social links, and finally the AI skill tagger is consulted.

        Args:
            place_data: One entry from the Places text-search results.

        Returns:
            A ``Business`` populated from the merged record and the lookups.
        """
        place_id = place_data.get("place_id")

        # Details are only fetched when the hit carries an id.
        details = {}
        if place_id:
            details = self.get_business_details(place_id)
        merged = {**place_data, **details}

        site = merged.get("website")

        # The lookups run in this fixed order: page text, page links, contacts, skills.
        page_text = ""
        if site:
            page_text = self.scrape_website_content(site)
        page_links = self.find_careers_and_sales_pages(site)
        contacts = self.extract_emails_and_social_media(site)
        skills = self.tag_skills_with_ai(merged, page_text)

        coordinates = merged.get("geometry", {}).get("location", {})

        fields = {
            "place_id": place_id,
            "business_name": merged.get("name", ""),
            "address": merged.get("formatted_address", ""),
            "phone_number": merged.get("formatted_phone_number"),
            "latitude": coordinates.get("lat"),
            "longitude": coordinates.get("lng"),
            "website": site,
            "hr_email": contacts.get("hr_email"),
            "sales_email": contacts.get("sales_email"),
            "careers_page": page_links.get("careers_page"),
            "sales_page": page_links.get("sales_page"),
            "social_media_links": contacts.get("social_media"),
            "business_type": merged.get("types", []),
            "skill_ids": skills.get("skill_ids", []),
            "skill_names": skills.get("skill_names", []),
        }
        return Business(**fields)

    def scrape_businesses(self, search_query: str, max_results: int = 10) -> List[Business]:
        """Search for businesses and enrich each hit into a ``Business``.

        A query of the form ``"<what> in <where>"`` is split on its last
        ``" in "`` (matched case-insensitively); queries without that form are
        searched in the default location, ``"Los Angeles, CA"``.

        Args:
            search_query: Free-text query, optionally ending in ``in <location>``.
            max_results: Maximum number of search hits to process.

        Returns:
            The businesses that were processed successfully, in search order.
            A hit whose processing raises is reported on stdout and skipped.
        """
        print(f"🔍 Searching for: '{search_query}'")

        # Extract location from query if present, otherwise use default
        location = "Los Angeles, CA"  # Default location

        # Greedy first group => the split happens at the LAST " in ", so
        # "shops in mall in Boston" becomes ("shops in mall", "Boston").
        # IGNORECASE keeps detection and splitting consistent for "Cafes IN Boston".
        parsed = re.match(
            r'^(.*\S)\s+in\s+(\S.*)$',
            search_query.strip(),
            flags=re.IGNORECASE | re.DOTALL,
        )
        if parsed:
            search_query = parsed.group(1)
            location = parsed.group(2)

        # Search for businesses
        places = self.search_businesses(search_query, location, max_results)
        print(f"🏢 Found {len(places)} businesses to analyze")

        businesses = []
        for i, place in enumerate(places):
            business_name = place.get('name', 'Unknown')
            print(f"Processing business {i+1}/{len(places)}: {business_name}")

            try:
                business = self.process_business(place)
                businesses.append(business)

                # Rate limiting
                time.sleep(1)

            except Exception as e:
                # One bad listing should not abort the whole run.
                print(f"Error processing {business_name}: {e}")
                continue

        return businesses

    def save_to_google_drive(self, businesses: List[Business], filename: str = "business_directory.csv"):
        """Write the businesses as CSV rows under the Google Drive results folder.

        The target is ``/content/drive/MyDrive/Kobeyo Business Directory/<filename>``.
        An existing file is appended to; a new file gets a header row first.

        Args:
            businesses: Records to write, one CSV row each.
            filename: Name of the CSV file inside the results folder.

        Raises:
            OSError: If the file cannot be opened (for example when the folder
                does not exist).
        """
        drive_path = f"/content/drive/MyDrive/Kobeyo Business Directory/{filename}"

        def cell(value):
            """Blank out only missing values; a coordinate of 0 is real data."""
            return '' if value is None else value

        def joined(items):
            """Render a list as '; '-separated text, tolerating non-string items."""
            if not items:
                return ''
            if isinstance(items, str):
                return items
            return '; '.join(str(item) for item in items)

        # Check if file exists to append or create new
        file_exists = os.path.exists(drive_path)

        with open(drive_path, 'a' if file_exists else 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)

            # Write header only if file is new
            if not file_exists:
                writer.writerow([
                    'place_id', 'business_name', 'address', 'phone_number', 'latitude', 'longitude',
                    'website', 'hr_email', 'sales_email', 'careers_page', 'sales_page',
                    'social_media_facebook', 'social_media_instagram', 'social_media_linkedin',
                    'business_type', 'skill_ids', 'skill_names'
                ])

            # Write data
            for business in businesses:
                social_media = business.social_media_links or {}
                writer.writerow([
                    business.place_id,
                    business.business_name,
                    business.address,
                    business.phone_number or '',
                    cell(business.latitude),
                    cell(business.longitude),
                    business.website or '',
                    business.hr_email or '',
                    business.sales_email or '',
                    business.careers_page or '',
                    business.sales_page or '',
                    social_media.get('facebook') or '',
                    social_media.get('instagram') or '',
                    social_media.get('linkedin') or '',
                    joined(business.business_type),
                    joined(business.skill_ids),
                    joined(business.skill_names)
                ])

        print(f"✅ Results saved to {drive_path}")

    def display_results(self, businesses: List[Business]):
        """Print a numbered, human-readable summary of the businesses to stdout.

        Each entry shows name, address, website, phone and the tagged skill IDs
        and names. A notice is printed instead when the list is empty.

        Args:
            businesses: Records to show.
        """
        if not businesses:
            print("❌ No businesses found matching your criteria.")
            return

        def entry_lines(position, entry):
            """Yield the output lines for one business, one at a time."""
            yield f"\n{position}. 🏢 {entry.business_name}"
            yield f"   Address: {entry.address}"
            yield f"   Website: {entry.website or 'Not available'}"
            yield f"   Phone: {entry.phone_number or 'Not available'}"
            yield f"   Skill IDs: {', '.join(entry.skill_ids) if entry.skill_ids else 'None'}"
            yield f"   Skill Names: {', '.join(entry.skill_names) if entry.skill_names else 'None'}"
            yield "-" * 80

        # Heading block
        print(f"\n🎯 Found {len(businesses)} Businesses")
        print("=" * 80)

        position = 0
        for entry in businesses:
            position += 1
            for line in entry_lines(position, entry):
                print(line)

    def __del__(self):
        """Clean up selenium driver"""
        if hasattr(self, 'driver'):
            self.driver.quit()

def get_user_input():
    """Prompt on the console for a search query.

    Returns:
        The text the user typed, stripped of surrounding whitespace. When
        nothing is entered, the default ``"cafes in Los Angeles"`` is
        announced and returned.
    """
    for banner_line in ("🔧 Smart Business Scraper", "=" * 40):
        print(banner_line)

    typed = input("\nEnter your search query (e.g., 'cafes in Los Angeles', 'restaurants near me'): ").strip()
    if typed:
        return typed

    fallback_query = "cafes in Los Angeles"
    print(f"Using default: {fallback_query}")
    return fallback_query

def main():
    """Interactive entry point: ask for a query, then scrape, show and save the results.

    The scraper is built from the module-level ``GOOGLE_API_KEY`` and
    ``OPENAI_API_KEY``. Any exception raised while scraping, displaying or
    saving is reported on stdout rather than propagated; the closing banner is
    printed either way.
    """
    query = get_user_input()

    # Cap on businesses per run, kept small to conserve OpenAI tokens.
    result_cap = 10

    scraper = SmartBusinessScraper(GOOGLE_API_KEY, OPENAI_API_KEY)

    try:
        print(f"\n🔍 Processing: {query}")

        found = scraper.scrape_businesses(query, max_results=result_cap)
        scraper.display_results(found)
        scraper.save_to_google_drive(found)

        print("✅ Completed processing")

    except Exception as err:
        print(f"❌ Error processing '{query}': {err}")

    print("\n🎉 Business scraping completed!")

# Entry point: start the interactive scraper when this code runs as the
# top-level module, which is also how a notebook kernel executes the cell.
if __name__ == "__main__":
    main()

🔧 Smart Business Scraper

Enter your search query (e.g., 'cafes in Los Angeles', 'restaurants near me'): cafes in Los Angele
Mounted at /content/drive
❌ Error loading skill tags: [Errno 2] No such file or directory: '/content/drive/MyDrive/Kobeyo Business Directory/skill tags.csv'

🔍 Processing: cafes in Los Angele
🔍 Searching for: 'cafes in Los Angele'
🏢 Found 10 businesses to analyze
Processing business 1/10: Urth Caffe
Processing business 2/10: Jurassic Magic
Processing business 3/10: Blue Elephant Café
Processing business 4/10: De La Tierra Café
Processing business 5/10: Dalian's Café
Processing business 6/10: Alcove
Processing business 7/10: Alchemist Coffee Project
Processing business 8/10: CAFE NIDO: Coffee & Books
Processing business 9/10: Bottega Louie
Processing business 10/10: Cafe Dulce

🎯 Found 10 Businesses

1. 🏢 Urth Caffe
   Address: 459 S Hewitt St, Los Angeles, CA 90013, USA
   Website: https://www.urthcaffe.com/
   Phone: (213) 797-4534
   Skill IDs: BS001, BS002
   