In [None]:
pip install selenium

Collecting selenium
  Downloading selenium-4.33.0-py3-none-any.whl.metadata (7.5 kB)
Collecting trio~=0.30.0 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting typing_extensions~=4.13.2 (from selenium)
  Downloading typing_extensions-4.13.2-py3-none-any.whl.metadata (3.0 kB)
Collecting outcome (from trio~=0.30.0->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.12.2->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.33.0-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m46.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.30.0-py3-none-any.whl (499 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m499.2/499.2 kB[0m [31m27.

In [None]:
pip install requests openai



In [None]:
import requests
import openai
import json
import time
from typing import List, Dict, Optional
from dataclasses import dataclass
import re


In [None]:
@dataclass
class Business:
    """One business record: Google Places data plus the AI-derived job profile.

    Instances are built by ``BusinessScraper.process_business``. Fields that
    come straight from the Places response are optional because that code
    reads them with ``dict.get`` and stores ``None`` when a key is absent.
    """

    name: str
    address: str
    # {"lat": ..., "lng": ...}; either value is None when the place has no geometry.
    location: Dict[str, Optional[float]]
    website: Optional[str]
    phone: Optional[str]
    # Free-text description produced by the AI analysis step.
    description: str
    # None when the search result carried no "place_id".
    place_id: Optional[str]
    rating: Optional[float]
    # The "types" list reported for the place.
    business_type: List[str]
    # The three lists below are taken from the AI analysis.
    potential_jobs: List[str]
    required_skills: List[str]
    job_categories: List[str]


In [None]:
class BusinessScraper:
    """Find businesses with the Google Places API and profile them with OpenAI.

    Pipeline: ``search_businesses`` (text search) -> ``get_business_details``
    (per-place details) -> ``analyze_business_with_ai`` (job/skill analysis)
    -> ``Business`` records, optionally written out by ``export_results``.
    """

    # Seconds to wait for a Google Places HTTP response before giving up;
    # without a timeout ``requests`` can block indefinitely.
    REQUEST_TIMEOUT = 10
    # Pause, in seconds, after each successfully processed business.
    RATE_LIMIT_DELAY = 0.5
    # Chat model used for the business analysis prompt.
    OPENAI_MODEL = "gpt-4"

    def __init__(self, google_api_key: str, openai_api_key: str):
        """Store the Google key, create the OpenAI client and the skill dictionary.

        Args:
            google_api_key: Key sent as ``key`` on every Google Places request.
            openai_api_key: Key used to construct the ``openai.OpenAI`` client.
        """
        self.google_api_key = google_api_key
        self.openai_client = openai.OpenAI(api_key=openai_api_key)

        # Comprehensive skill dictionary organized by categories
        self.skill_dictionary = {
            "food_service": [
                "cooking", "food preparation", "dishwashing", "serving", "bartending",
                "barista", "food safety", "customer service", "cash handling", "menu knowledge",
                "kitchen operations", "food plating", "inventory management", "cleaning",
                "order taking", "table service", "beverage preparation", "food handling"
            ],
            "retail": [
                "sales", "customer service", "cash register", "inventory", "merchandising",
                "product knowledge", "visual display", "stock management", "cashier",
                "point of sale", "customer relations", "loss prevention", "fitting room"
            ],
            "hospitality": [
                "front desk", "housekeeping", "guest services", "reservations", "concierge",
                "room service", "event coordination", "customer relations", "hospitality management",
                "cleaning", "laundry", "maintenance", "security"
            ],
            "healthcare": [
                "patient care", "medical assistance", "receptionist", "scheduling", "filing",
                "insurance processing", "medical records", "customer service", "cleaning",
                "administrative support", "data entry"
            ],
            "office_admin": [
                "data entry", "filing", "receptionist", "phone answering", "scheduling",
                "customer service", "administrative support", "document management",
                "computer skills", "organization", "communication"
            ],
            "manual_labor": [
                "construction", "maintenance", "cleaning", "landscaping", "delivery",
                "warehouse", "loading", "unloading", "assembly", "repair", "installation",
                "heavy lifting", "equipment operation"
            ],
            "creative": [
                "graphic design", "photography", "writing", "social media", "marketing",
                "content creation", "video editing", "web design", "art", "creative writing"
            ],
            "transportation": [
                "driving", "delivery", "logistics", "vehicle maintenance", "customer service",
                "navigation", "time management", "safety protocols"
            ]
        }

    def search_businesses(self, query: str, location: str = "", radius: int = 5000) -> List[Dict]:
        """Search for businesses using Google Places API.

        Args:
            query: Free-text search, e.g. "Cafes in downtown LA".
            location: Optional free text appended to ``query``.
            radius: Sent as the ``radius`` request parameter.
                NOTE(review): ``location`` here is only folded into the query
                text, so ``radius`` may have no effect - confirm against the
                Places documentation.

        Returns:
            The ``results`` list of the response; an empty list when the API
            reports ``ZERO_RESULTS``.

        Raises:
            Exception: On a non-200 HTTP status or any other API status.
            requests.RequestException: On network failure or timeout.
        """

        # Text search endpoint for broader queries
        text_search_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"

        params = {
            "query": f"{query} {location}".strip(),
            "key": self.google_api_key,
            "radius": radius
        }

        response = requests.get(text_search_url, params=params, timeout=self.REQUEST_TIMEOUT)

        if response.status_code != 200:
            raise Exception(f"Google Places API error: {response.status_code}")

        data = response.json()
        status = data.get("status")

        # "No matches" is a normal outcome of a search, not an API failure.
        if status == "ZERO_RESULTS":
            return []

        if status != "OK":
            raise Exception(f"Google Places API error: {data.get('error_message', 'Unknown error')}")

        return data.get("results", [])

    def get_business_details(self, place_id: str) -> Dict:
        """Get detailed information about a specific business.

        Args:
            place_id: Google place identifier taken from a search result.

        Returns:
            The ``result`` object of the details response, or ``{}`` when the
            HTTP status is not 200 or the API status is not ``OK``.

        Raises:
            requests.RequestException: On network failure or timeout.
        """

        details_url = "https://maps.googleapis.com/maps/api/place/details/json"

        params = {
            "place_id": place_id,
            "fields": "name,formatted_address,geometry,website,formatted_phone_number,business_status,opening_hours,rating,reviews,types,editorial_summary",
            "key": self.google_api_key
        }

        response = requests.get(details_url, params=params, timeout=self.REQUEST_TIMEOUT)

        if response.status_code == 200:
            data = response.json()
            if data.get("status") == "OK":
                return data.get("result", {})

        return {}

    @staticmethod
    def _fallback_analysis() -> Dict:
        """Return the neutral analysis used when the AI call or parsing fails."""
        return {
            "description": "Analysis unavailable",
            "potential_jobs": [],
            "required_skills": [],
            "job_categories": [],
            "hiring_likelihood": 5
        }

    @staticmethod
    def _as_str_list(value) -> List[str]:
        """Coerce an AI-supplied field into a list of strings."""
        if value is None:
            return []
        if isinstance(value, str):
            # A bare string would otherwise be joined character by character.
            return [value] if value else []
        if isinstance(value, (list, tuple)):
            return [str(item) for item in value if item is not None]
        return [str(value)]

    def analyze_business_with_ai(self, business_info: Dict) -> Dict:
        """Use OpenAI to analyze business and identify potential job opportunities and required skills.

        Args:
            business_info: Merged search/details data; reads ``name``,
                ``types``, ``reviews`` and ``editorial_summary``.

        Returns:
            The JSON object parsed from the model reply (the prompt asks for
            ``description``, ``potential_jobs``, ``required_skills``,
            ``job_categories`` and ``hiring_likelihood``). If the request
            fails or the reply is not a JSON object, a fixed fallback dict
            with those keys is returned and the error is printed.
        """

        business_name = business_info.get("name", "")
        business_types = business_info.get("types", [])
        reviews = business_info.get("reviews", [])
        editorial_summary = (business_info.get("editorial_summary") or {}).get("overview", "")

        # Create context from reviews and business types; only the first three
        # reviews, 200 characters each, to keep the prompt small.
        review_text = " ".join([(review.get("text") or "")[:200] for review in reviews[:3]])
        business_context = f"Business types: {', '.join(business_types)}. Summary: {editorial_summary}. Recent reviews: {review_text}"

        prompt = f"""
        Analyze this business and identify potential job opportunities and required skills:

        Business Name: {business_name}
        Business Context: {business_context}

        Available Skill Categories and Skills:
        {json.dumps(self.skill_dictionary, indent=2)}

        Please provide a JSON response with:
        1. "description": A brief description of the business and what they likely do
        2. "potential_jobs": List of specific job roles this business might hire for
        3. "required_skills": List of skills from the skill dictionary that would be relevant
        4. "job_categories": List of job categories from the skill dictionary that apply
        5. "hiring_likelihood": Score from 1-10 indicating how likely they are to hire entry-level workers

        Focus on entry-level positions that don't require extensive experience.
        """

        try:
            response = self.openai_client.chat.completions.create(
                model=self.OPENAI_MODEL,
                messages=[
                    {"role": "system", "content": "You are an expert at analyzing businesses and identifying job opportunities. Always respond with valid JSON."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.3
            )

            content = response.choices[0].message.content
            # Try to extract JSON from the response (the model may wrap it in prose)
            json_match = re.search(r'\{.*\}', content, re.DOTALL)
            parsed = json.loads(json_match.group() if json_match else content)

            # Callers use .get() on the result, so anything but an object is unusable.
            if not isinstance(parsed, dict):
                raise ValueError("AI response is not a JSON object")
            return parsed

        except Exception as e:
            # Deliberate best effort: one failed analysis must not stop the run.
            print(f"AI analysis error: {e}")
            return self._fallback_analysis()

    def process_business(self, place_data: Dict) -> "Business":
        """Process a single business and create Business object.

        Args:
            place_data: One entry of the text-search ``results`` list.

        Returns:
            A ``Business`` combining search data, details (which override
            search data on key clashes) and the AI analysis. The
            ``hiring_likelihood`` score from the analysis is not stored.
        """

        # Get detailed information
        place_id = place_data.get("place_id")
        details = self.get_business_details(place_id) if place_id else {}

        # Merge basic and detailed data
        business_info = {**place_data, **details}

        # Get AI analysis
        ai_analysis = self.analyze_business_with_ai(business_info)

        # Extract location
        geometry = business_info.get("geometry", {})
        location = geometry.get("location", {})

        return Business(
            name=business_info.get("name", ""),
            address=business_info.get("formatted_address", ""),
            location={"lat": location.get("lat"), "lng": location.get("lng")},
            website=business_info.get("website"),
            phone=business_info.get("formatted_phone_number"),
            description=ai_analysis.get("description", ""),
            place_id=place_id,
            rating=business_info.get("rating"),
            business_type=business_info.get("types", []),
            # Model output is untrusted: normalise so later str.join calls are safe.
            potential_jobs=self._as_str_list(ai_analysis.get("potential_jobs")),
            required_skills=self._as_str_list(ai_analysis.get("required_skills")),
            job_categories=self._as_str_list(ai_analysis.get("job_categories"))
        )

    def scrape_businesses(self, query: str, max_results: int = 20) -> List["Business"]:
        """Main method to scrape businesses based on query.

        Args:
            query: Free-text search passed to ``search_businesses``.
            max_results: Upper bound on the number of search results processed.

        Returns:
            The successfully processed businesses; a business whose
            processing raises is reported with ``print`` and skipped.
        """

        print(f"Searching for: {query}")

        # Search for businesses
        places = self.search_businesses(query)
        selected = places[:max_results]

        businesses = []
        for i, place in enumerate(selected):
            print(f"Processing business {i+1}/{min(len(places), max_results)}: {place.get('name', 'Unknown')}")

            try:
                businesses.append(self.process_business(place))

                # Rate limiting
                time.sleep(self.RATE_LIMIT_DELAY)

            except Exception as e:
                print(f"Error processing business: {e}")
                continue

        return businesses

    def export_results(self, businesses: List["Business"], filename: str = "businesses.json"):
        """Export results to CSV file.

        List-valued fields are flattened with ``'; '``.

        NOTE(review): the default filename ends in ``.json`` although the
        content is CSV; it is kept unchanged so existing callers that rely on
        the default keep writing to the same path.

        Args:
            businesses: Records to write, one row each.
            filename: Destination path; overwritten if it exists.
        """

        import csv

        def blank_if_none(value):
            # Only a missing value becomes an empty cell; 0 / 0.0 are kept.
            return '' if value is None else value

        with open(filename, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)

            # Write header
            writer.writerow([
                'name', 'address', 'latitude', 'longitude', 'website', 'phone',
                'description', 'rating', 'business_type', 'potential_jobs',
                'required_skills', 'job_categories'
            ])

            # Write data
            for business in businesses:
                writer.writerow([
                    business.name,
                    business.address,
                    business.location.get('lat', ''),
                    business.location.get('lng', ''),
                    business.website or '',
                    business.phone or '',
                    business.description,
                    blank_if_none(business.rating),
                    '; '.join(business.business_type),
                    '; '.join(business.potential_jobs),
                    '; '.join(business.required_skills),
                    '; '.join(business.job_categories)
                ])

        print(f"Results exported to {filename}")


# Usage example
def _results_filename(query: str) -> str:
    """Build the CSV filename for a query, dropping characters unsafe in paths."""
    slug = query.replace(' ', '_').replace(',', '').lower()
    # Keep word characters, dots and dashes; this removes e.g. "/" and ":".
    slug = re.sub(r'[^\w.\-]', '', slug)
    return f"businesses_{slug}.csv"


def main():
    """Run the example queries, print each business and export one CSV per query.

    API keys are read from the ``GOOGLE_API_KEY`` and ``OPENAI_API_KEY``
    environment variables so that no credential is stored in the notebook.

    Raises:
        RuntimeError: If either environment variable is missing or empty.
    """
    import os

    # Initialize with your API keys (taken from the environment, never hard-coded)
    google_api_key = os.environ.get("GOOGLE_API_KEY", "")
    openai_api_key = os.environ.get("OPENAI_API_KEY", "")
    if not google_api_key or not openai_api_key:
        raise RuntimeError(
            "Set the GOOGLE_API_KEY and OPENAI_API_KEY environment variables before running."
        )

    scraper = BusinessScraper(google_api_key, openai_api_key)

    # Example queries
    queries = [
        "Cafes in downtown LA",

    ]

    for query in queries:
        print(f"\n{'='*50}")
        print(f"Processing: {query}")
        print(f"{'='*50}")

        businesses = scraper.scrape_businesses(query, max_results=10)

        # Display results
        for business in businesses:
            print(f"\n📍 {business.name}")
            print(f"   Address: {business.address}")
            print(f"   Website: {business.website or 'Not available'}")
            print(f"   Description: {business.description}")
            print(f"   Potential Jobs: {', '.join(business.potential_jobs)}")
            print(f"   Required Skills: {', '.join(business.required_skills)}")
            print(f"   Job Categories: {', '.join(business.job_categories)}")

        # Export results
        scraper.export_results(businesses, _results_filename(query))

if __name__ == "__main__":
    main()


Processing: Cafes in downtown LA
Searching for: Cafes in downtown LA
Processing business 1/10: Dalian's Café
Processing business 2/10: Urth Caffe
Processing business 3/10: Cafe Dulce
Processing business 4/10: Fleur Café
Processing business 5/10: Tierra Mia Coffee
Processing business 6/10: OwWhoo Cafe
Processing business 7/10: ilCaffè
Processing business 8/10: Cafe Fig
Processing business 9/10: Hilltop Coffee + Kitchen - DTLA
Processing business 10/10: Le Cafe Bonjour

📍 Dalian's Café
   Address: 530 S Grand Ave, Los Angeles, CA 90071, USA
   Website: http://dalianscafe.com/
   Description: Dalian's Café is a cozy establishment that serves as a café, bakery, and store. They offer a variety of food and beverages, including sandwiches, pastries, and hot drinks. The café prides itself on its quality food, excellent customer service, and inviting atmosphere.
   Potential Jobs: Barista, Cashier, Food Preparation Worker, Bakery Assistant, Dishwasher, Server, Retail Sales Associate
   Require