In [None]:
pip install selenium



In [None]:
# API keys - read from the environment so real keys never have to be saved in
# the notebook. Export GOOGLE_API_KEY / OPENAI_API_KEY before starting the
# kernel, or replace the "" fallback below for a quick local run.
import os

GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")

In [None]:
import spacy
import pandas as pd
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import re
import time
import csv
import os
from datetime import datetime
from typing import List, Dict, Optional
from dataclasses import dataclass

# Load the spaCy English model used by the skill extractor below.
# spacy.load raises OSError when the model package is not installed; only in
# that case download it once and retry (other errors should surface as-is).
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    import subprocess
    import sys

    print("Installing spaCy model...")
    # sys.executable targets the interpreter running this code, so the model
    # lands in the same environment; check=True raises if the download fails
    # instead of deferring to a confusing second load error.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    nlp = spacy.load("en_core_web_sm")

@dataclass
class Business:
    """One scraped business together with the skills matched for it.

    ``skill_ids`` and ``skill_names`` are parallel lists: the entry at a given
    index in one corresponds to the entry at the same index in the other.
    """

    # Display name of the business.
    business_name: str
    # Postal address as a single formatted string.
    address: str
    # Website URL; empty string when none is known.
    website: str
    # Identifiers of the matched skills, as strings.
    skill_ids: List[str]
    # Human-readable names of the matched skills.
    skill_names: List[str]

class DirectoryManager:
    """Manage the on-disk folder layout and the business-type/group mapping.

    The manager owns two folders under ``base_path`` ("Output" and
    "Skill tag sheets") and a hardcoded mapping from business group name to
    the business types that belong to it.
    """

    def __init__(self, base_path: str = "/content/drive/MyDrive/Kobeyo Business Directory"):
        """Create the folder layout under ``base_path`` and build the mapping.

        Args:
            base_path: Root folder holding the "Output" and "Skill tag sheets"
                sub-folders. Both sub-folders are created if missing.
        """
        self.base_path = base_path
        self.output_folder = os.path.join(base_path, "Output")
        self.skill_tags_folder = os.path.join(base_path, "Skill tag sheets")

        # Ensure directories exist
        os.makedirs(self.output_folder, exist_ok=True)
        os.makedirs(self.skill_tags_folder, exist_ok=True)

        # Hardcoded business groups mapping.
        # NOTE(review): several entries look misspelled (e.g. "Vetinary",
        # "Pastery", "Couriier"); they are kept verbatim because they are used
        # as lookup/search strings - confirm before correcting them.
        self.business_groups = {
            "Animal Care & Services": ["Pet walking", "Pet grooming", "Pet Boarding & Pet Sitting", "Vetinary Services", "Pet Training", "Animal Shelter"],
            "Food & Beverage Establishments": ["Restaurants", "Quick Service Restaurants", "Bakeries", "Donuts", "Pastery Shops", "Sushi", "Cafes", "Coffee Roasters", "Breweries", "Distilleries", "Wine Bars", "Bars", "Sandwich Shops", "Tacos", "Bars (exclude grocery stores)", "Ice cream"],
            "Real Estate & Property Management": ["Real Estate Agencies", "Escrow Services", "Title Services", "Property Management"],
            "Cleaning & Remediation": ["maid services", "house cleaning services", "office cleaning services", "commercial cleaning services", "Dry cleaning", "Specialty cleaning services", "fire water and mold restoration", "HAZMAT cleaning services", "Window cleaning services", "Pool cleaning", "Carpet & upholstry cleaning services", "Grease trap cleaning", "Portable toilets services", "Septic cleaning services"],
            "Security": ["Security Services", "Security Alarms", "Fire Alarms"],
            "Logistics, Warehousing & Distribution": ["Last Mile Delivery", "Couriier Services", "Long Haul Trucking", "Warehouse", "Distribution Center", "Logistics", "Ports", "Distribution", "Dispatch & Routing", "Transport Services", "Limo Services", "Shuttle Services", "Pet Taxi", "Delivery Specialist", "Cannabis Delivery Specialis", "Freight brokerage services"],
            "Landscaping, Groundskeeping & Pest Control": ["Landscape Architects & Design Firms", "Landscapers", "Landscape Construction", "Tree Trimming Services", "Tree Specialists", "Pest Control", "Golf Course", "Pool Builder", "Pool Maintenance & Service"],
            "Bookkeeping, Accounting & Payroll Services": ["Bookkeeping", "Payroll", "Accounting Firms", "Personal Accountants"],
            "Recruiting & Staffing Agencies": ["Staffing & recruiting house cleaning", "Industrial Staffing & Recruiting", "Staffing & Recruiting Services", "Recruiter", "Accounting & Payroll Staffing & Recruiting", "Customer Service Staffing & Recruiting", "Sales Staffing & Recruiting", "Admin Staffing & Recruiting", "Manufacturing Staffing & Recruiting", "Security Staffing", "Video & Audio Production Staffing & Recruiting", "Property Management Staffing & Recruiting", "Staffing & Recruiting Mechanics"]
        }

    def get_business_group_from_type(self, business_type: str) -> Optional[str]:
        """Find the business group for a business type or group name.

        Matching is case-insensitive and substring based in both directions.
        Business types are checked first (in mapping order); if none match,
        the group names themselves are checked, so an input such as
        "animal care" resolves to "Animal Care & Services".

        Args:
            business_type: Free-text business type or group name.

        Returns:
            The matching group name, or ``None`` when the input is blank or
            nothing matches (a warning is printed in that case).
        """
        needle = business_type.strip().lower() if business_type else ""

        # A blank needle is a substring of every string and would otherwise
        # silently resolve to the first group in the mapping.
        if needle:
            # Pass 1: match against the business types of each group.
            for group_name, business_types in self.business_groups.items():
                for biz_type in business_types:
                    candidate = biz_type.lower()
                    if needle in candidate or candidate in needle:
                        return group_name

            # Pass 2: fall back to the group names. Run only after pass 1 so
            # inputs that already matched a type keep resolving as before.
            for group_name in self.business_groups:
                group_lower = group_name.lower()
                if needle in group_lower or group_lower in needle:
                    return group_name

        print(f"⚠️ No business group found for type: {business_type}")
        return None

    def get_all_business_types_in_group(self, group_name: str) -> List[str]:
        """Return the business types listed under ``group_name``.

        Returns an empty list when the group is not in the mapping.
        """
        return self.business_groups.get(group_name, [])

    def load_skill_tags_for_group(self, group_name: str) -> pd.DataFrame:
        """Load the skill tag sheet for a business group.

        The sheet is read from ``"skill tags - <group_name>.csv"`` inside the
        skill tags folder.

        Returns:
            The sheet as a DataFrame, or an empty DataFrame when the file is
            missing or cannot be read (the reason is printed).
        """
        try:
            # Construct the skill tags file path
            skill_file_path = os.path.join(self.skill_tags_folder, f"skill tags - {group_name}.csv")

            if not os.path.exists(skill_file_path):
                print(f"⚠️ Skill tags file not found: {skill_file_path}")
                return pd.DataFrame()

            df = pd.read_csv(skill_file_path)
            print(f"✅ Loaded skill tags for {group_name} with {len(df)} entries")
            return df
        except Exception as e:
            # Best effort: a broken sheet disables skill matching rather than
            # aborting the whole run.
            print(f"❌ Error loading skill tags for {group_name}: {e}")
            return pd.DataFrame()

    def get_output_file_path(self, group_name: str) -> str:
        """Return the CSV output path for a business group.

        Spaces in the group name become underscores and ``&`` becomes ``and``.
        """
        return os.path.join(self.output_folder, f"businesses_{group_name.replace(' ', '_').replace('&', 'and')}.csv")

class NERBusinessScraper:
    """Find businesses through the Google Places API and tag them with skills.

    For a requested business type the scraper resolves the business group,
    loads that group's skill tag sheet, searches Places for the group's
    business types, scrapes each business website with headless Chrome and
    matches the page text against the skill tags.
    """

    # Seconds to wait for a Google Places HTTP response. Without a timeout a
    # stalled connection would block the whole run indefinitely.
    REQUEST_TIMEOUT = 15

    def __init__(self, google_api_key: str):
        """Store the API key, create the directory manager and start Chrome.

        Args:
            google_api_key: Key sent with every Google Places request.
        """
        self.google_api_key = google_api_key
        self.places_base_url = "https://maps.googleapis.com/maps/api/place"
        self.directory_manager = DirectoryManager()
        self.current_group = None
        self.current_skill_tags_df = pd.DataFrame()
        self.setup_driver()

    def setup_driver(self):
        """Start a headless Chrome driver and store it on ``self.driver``."""
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--window-size=1920,1080")
        chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
        self.driver = webdriver.Chrome(options=chrome_options)

    def extract_skills_with_ner(self, text: str) -> Dict[str, List[str]]:
        """Match ``text`` against the currently loaded skill tags.

        A skill matches when its full name occurs in the text
        (case-insensitive) or when any single word of its name is among the
        spaCy entities / non-stop-word tokens of the text.

        Args:
            text: Text to scan, typically scraped website content.

        Returns:
            ``{"skill_ids": [...], "skill_names": [...]}`` with parallel
            lists. Both lists are empty when no skill tags are loaded or the
            text is blank.
        """
        matched_skills = {"skill_ids": [], "skill_names": []}

        if self.current_skill_tags_df.empty or not text or not text.strip():
            return matched_skills

        # Lower-case once; reused for both spaCy and the substring checks.
        text_lower = text.lower()
        doc = nlp(text_lower)

        # Entities and content tokens form the keyword vocabulary of the text.
        text_features = {ent.text for ent in doc.ents}
        text_features.update(
            token.text for token in doc if not token.is_stop and not token.is_punct
        )

        for _, row in self.current_skill_tags_df.iterrows():
            raw_id = row.get('Skills IDs', row.get('skill_ids', ''))
            raw_name = row.get('Skills Names', row.get('skill_names', ''))

            # Skip rows without a usable name: str(NaN) is "nan", which is a
            # substring of words like "maintenance", and "" is a substring of
            # every text - both would produce bogus matches.
            if pd.isna(raw_name):
                continue
            skill_name = str(raw_name).strip()
            if not skill_name:
                continue

            skill_id = "" if pd.isna(raw_id) else str(raw_id)
            name_lower = skill_name.lower()

            # Full-name hit first, then fall back to per-word keyword hits.
            if name_lower in text_lower or any(
                keyword in text_features for keyword in name_lower.split()
            ):
                matched_skills["skill_ids"].append(skill_id)
                matched_skills["skill_names"].append(skill_name)

        return matched_skills

    def scrape_website_content(self, url: str) -> str:
        """Return up to the first 2000 characters of a page's visible text.

        Args:
            url: Page address; ``https://`` is prepended when no scheme is
                given.

        Returns:
            The extracted text, or ``""`` when ``url`` is empty or the page
            could not be loaded (the error is printed).
        """
        if not url:
            return ""

        try:
            if not url.startswith(('http://', 'https://')):
                url = 'https://' + url

            self.driver.get(url)
            time.sleep(3)  # give client-side rendering a moment to finish

            soup = BeautifulSoup(self.driver.page_source, 'html.parser')

            # Drop non-visible markup so it cannot leak into the text (older
            # BeautifulSoup releases return script/style contents too).
            for tag in soup(["script", "style", "noscript"]):
                tag.decompose()

            # Strip each text fragment and join with single spaces so the
            # 2000-character budget is spent on words, not layout whitespace.
            text_content = soup.get_text(separator=" ", strip=True)

            # Focus on first 2000 characters
            return text_content[:2000]

        except Exception as e:
            print(f"Error scraping website {url}: {e}")
            return ""

    def search_businesses(self, query: str, location: str = "Los Angeles, CA", max_results: int = 10) -> List[Dict]:
        """Search for businesses using the Google Places text search.

        Args:
            query: What to search for, e.g. a business type.
            location: Appended to the query as ``"<query> in <location>"``.
            max_results: Maximum number of places to return.

        Returns:
            At most ``max_results`` raw place dicts; ``[]`` when nothing is
            found or ``max_results`` is not positive.

        Raises:
            Exception: On a non-200 HTTP status or a Places status other than
                ``OK`` / ``ZERO_RESULTS``.
        """
        # Nothing requested: skip the API call (and avoid a negative slice,
        # which would return all but the last items).
        if max_results <= 0:
            return []

        url = f"{self.places_base_url}/textsearch/json"
        params = {
            'query': f"{query} in {location}",
            'key': self.google_api_key,
            'type': 'establishment'
        }

        response = requests.get(url, params=params, timeout=self.REQUEST_TIMEOUT)

        if response.status_code != 200:
            raise Exception(f"Google Places API error: {response.status_code}")

        data = response.json()

        if data.get("status") != "OK":
            if data.get("status") == "ZERO_RESULTS":
                return []
            raise Exception(f"Google Places API error: {data.get('error_message', 'Unknown error')}")

        return data.get("results", [])[:max_results]

    def get_business_details(self, place_id: str) -> Dict:
        """Fetch name, formatted address and website for a place.

        Returns:
            The Places ``result`` dict, or ``{}`` when the HTTP status is not
            200 or the Places status is not ``OK``.
        """
        details_url = f"{self.places_base_url}/details/json"
        params = {
            "place_id": place_id,
            "fields": "name,formatted_address,website",
            "key": self.google_api_key
        }

        response = requests.get(details_url, params=params, timeout=self.REQUEST_TIMEOUT)

        if response.status_code == 200:
            data = response.json()
            if data.get("status") == "OK":
                return data.get("result", {})

        return {}

    def process_business(self, place_data: Dict) -> "Business":
        """Turn one raw search result into a ``Business``.

        Looks up place details, scrapes the website (if any) and matches the
        page text against the loaded skill tags.
        """
        place_id = place_data.get("place_id")
        details = self.get_business_details(place_id) if place_id else {}

        # Detail fields take precedence over the basic search result.
        business_info = {**place_data, **details}

        website = business_info.get("website", "") or ""

        # Scrape website content for skills
        website_content = self.scrape_website_content(website) if website else ""

        skill_info = self.extract_skills_with_ner(website_content)

        return Business(
            business_name=business_info.get("name", ""),
            address=business_info.get("formatted_address", ""),
            website=website,
            skill_ids=skill_info.get("skill_ids", []),
            skill_names=skill_info.get("skill_names", [])
        )

    def scrape_businesses_by_group(self, business_type: str, location: str = "Los Angeles, CA", max_results: int = 10, max_types: int = 2) -> List["Business"]:
        """Scrape businesses for the group that ``business_type`` belongs to.

        Sets ``current_group`` and ``current_skill_tags_df`` as a side effect
        so later calls (skill matching, ``save_results``) use the same group.

        Args:
            business_type: Free-text business type used to pick the group.
            location: Location passed to the Places search.
            max_results: Total result budget, split evenly across the
                searched business types.
            max_types: How many of the group's business types to search
                (the first ``max_types`` in mapping order).

        Returns:
            The processed businesses; ``[]`` when no group matches. Failures
            for a single search or business are printed and skipped.
        """
        print(f"🔍 Analyzing business type: '{business_type}' in {location}")

        # Get business group
        group_name = self.directory_manager.get_business_group_from_type(business_type)
        if not group_name:
            print(f"⚠️ No group found for {business_type}")
            return []

        # Load skill tags
        self.current_group = group_name
        self.current_skill_tags_df = self.directory_manager.load_skill_tags_for_group(group_name)

        # Get business types in group
        all_business_types = self.directory_manager.get_all_business_types_in_group(group_name)

        # Clamp so a negative max_types cannot turn into a from-the-end slice.
        types_to_search = all_business_types[:max(max_types, 0)]
        if not types_to_search:
            return []

        # Even share of the budget per type (max_results // 2 by default).
        per_type_limit = max_results // len(types_to_search)

        all_businesses = []
        for biz_type in types_to_search:
            print(f"🔍 Searching for: {biz_type}")
            try:
                places = self.search_businesses(biz_type, location, per_type_limit)
            except Exception as e:
                print(f"Error searching for {biz_type}: {e}")
                continue

            for place in places:
                business_name = place.get('name', 'Unknown')
                print(f"Processing: {business_name}")

                try:
                    business = self.process_business(place)
                    all_businesses.append(business)
                    time.sleep(1)  # Rate limiting
                except Exception as e:
                    print(f"Error processing {business_name}: {e}")

        return all_businesses

    def save_results(self, businesses: List["Business"]):
        """Write the businesses to the current group's CSV output file.

        Does nothing (apart from printing a notice) when there are no
        businesses or no group has been selected yet. Skill lists are stored
        as ``"; "``-joined strings.
        """
        if not businesses or not self.current_group:
            print("❌ No businesses to save")
            return

        output_file_path = self.directory_manager.get_output_file_path(self.current_group)

        with open(output_file_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(['business_name', 'address', 'website', 'skill_ids', 'skill_names'])

            for business in businesses:
                writer.writerow([
                    business.business_name,
                    business.address,
                    business.website,
                    '; '.join(business.skill_ids),
                    '; '.join(business.skill_names)
                ])

        print(f"✅ Results saved to {output_file_path}")

    def display_results(self, businesses: List["Business"]):
        """Print a numbered, human-readable listing of the businesses."""
        if not businesses:
            print("❌ No businesses found")
            return

        print(f"\n🎯 Found {len(businesses)} businesses")
        print("=" * 50)

        for i, business in enumerate(businesses, 1):
            print(f"\n{i}. {business.business_name}")
            print(f"   Address: {business.address}")
            print(f"   Website: {business.website}")
            print(f"   Skills: {', '.join(business.skill_names)}")
            print("-" * 50)

    def close(self):
        """Quit the Selenium driver if one is running; safe to call twice."""
        driver = getattr(self, "driver", None)
        if driver is None:
            return

        # Clear the attribute first so a failing quit() is not retried.
        self.driver = None
        try:
            driver.quit()
        except Exception as e:
            print(f"Error closing browser: {e}")

    def __del__(self):
        """Clean up selenium driver"""
        # Finalizers may run during interpreter shutdown, when modules are
        # partially torn down; never let cleanup raise from here.
        try:
            self.close()
        except Exception:
            pass

def main():
    """Prompt for a business type and location, then scrape, show and save.

    Blank answers fall back to "restaurant" and "Los Angeles, CA". Any error,
    including a failure to start the browser, is reported on one line instead
    of propagating.
    """
    business_type = input("Enter business type: ").strip() or "restaurant"
    location = input("Enter location: ").strip() or "Los Angeles, CA"

    try:
        # Constructed inside the try: starting the scraper launches Chrome,
        # which can fail (e.g. no browser installed) and should be reported
        # the same way as scraping errors.
        scraper = NERBusinessScraper(GOOGLE_API_KEY)

        businesses = scraper.scrape_businesses_by_group(business_type, location)
        scraper.display_results(businesses)
        scraper.save_results(businesses)

    except Exception as e:
        print(f"❌ Error: {e}")

# Run the interactive flow only when executed directly (script or notebook
# cell), not when this code is imported as a module.
if __name__ == "__main__":
    main()

Enter business type: animal care
Enter location: LA
🔍 Analyzing business type: 'animal care' in LA
⚠️ No business group found for type: animal care
⚠️ No group found for animal care
❌ No businesses found
❌ No businesses to save
