In [6]:
import asyncio
from playwright.async_api import async_playwright
import json

async def run():
    """Scrape checklist links from the SafetyCulture public-library search page.

    Opens the "Safety" search results in a visible Chromium window, scrolls to
    trigger lazy loading, collects every anchor whose href contains
    "/products/", and writes the de-duplicated records to
    ``my_checklists.json`` in the working directory as a list of
    ``{"title": ..., "url": ...}`` objects.

    Returns:
        None. If the result cards never appear, a screenshot is saved to
        ``debug_error_3.png`` and no JSON file is written.
    """
    # Local import keeps this cell self-contained.
    from urllib.parse import urljoin

    base_url = "https://public-library.safetyculture.io"
    card_selector = 'a[href*="/products/"]'
    blocked_resource_types = {"image", "media", "font"}

    async def block_heavy_resources(route):
        """Abort image/media/font requests and let every other request through."""
        if route.request.resource_type in blocked_resource_types:
            await route.abort()
        else:
            await route.continue_()

    async def extract_title(card):
        """Return the best available title for a card link, or "Untitled"."""
        heading = await card.query_selector("h3, h4")
        if heading:
            heading_text = (await heading.inner_text()).strip()
            if heading_text:
                return heading_text
        # The recorded run labelled every card "Untitled", so the heading
        # lookup alone is not enough. NOTE(review): the fallbacks below assume
        # the anchor carries its title as an attribute or as visible text --
        # confirm against the live DOM.
        for attribute in ("aria-label", "title"):
            value = await card.get_attribute(attribute)
            if value and value.strip():
                return value.strip()
        for line in (await card.inner_text()).splitlines():
            if line.strip():
                return line.strip()
        return "Untitled"

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        try:
            context = await browser.new_context()
            page = await context.new_page()

            # Optimize: block heavy assets so the page loads faster.
            await page.route("**/*", block_heavy_resources)

            # Go directly to the results page instead of via the homepage.
            print("üöÄ Bypassing homepage... going straight to search results.")
            await page.goto(f"{base_url}/search?q=Safety", timeout=60000)

            print("‚è≥ Waiting for results to load...")

            # Wait for the first checklist card link to become visible.
            try:
                await page.wait_for_selector(card_selector, state="visible", timeout=30000)
            except Exception:
                print("‚ùå Results didn't load. Taking debug screenshot.")
                await page.screenshot(path="debug_error_3.png")
                print("Check debug_error_3.png")
                return

            print("üëÄ Extracting checklist links...")

            # Scroll down 5 times so lazily loaded cards get a chance to render.
            for i in range(5):
                print(f"   Scrolling batch {i+1}...")
                await page.mouse.wheel(0, 4000)
                await asyncio.sleep(2)  # Wait for network to catch up

            # Keyed by URL so de-duplication is a dict lookup; insertion order
            # (page order) is preserved.
            titles_by_url = {}
            for card in await page.query_selector_all(card_selector):
                href = await card.get_attribute("href")
                if not href:
                    continue
                # urljoin handles both relative and already-absolute hrefs;
                # plain concatenation would double the host for the latter.
                full_link = urljoin(base_url, href)
                # Keep the first real title seen for a URL; a later anchor may
                # replace a placeholder left by a text-less anchor.
                if titles_by_url.get(full_link, "Untitled") == "Untitled":
                    titles_by_url[full_link] = await extract_title(card)

            checklists = [
                {"title": title, "url": url} for url, title in titles_by_url.items()
            ]
            for entry in checklists:
                print(f"   Found: {entry['title']}")

            print(f"‚úÖ Successfully scraped {len(checklists)} templates.")

            # Save results
            with open('my_checklists.json', 'w', encoding='utf-8') as f:
                json.dump(checklists, f, indent=2)
        finally:
            # Single exit point: the browser is closed on success, on the
            # early return above, and on any unexpected exception.
            await browser.close()

# Run the scraper. Top-level `await` works here because this is a notebook
# cell (IPython runs cells inside an active event loop); in a plain .py script
# this call would need to be wrapped as asyncio.run(run()).
await run()

üöÄ Bypassing homepage... going straight to search results.
‚è≥ Waiting for results to load...
üëÄ Extracting checklist links...
   Scrolling batch 1...
   Scrolling batch 2...
   Scrolling batch 3...
   Scrolling batch 4...
   Scrolling batch 5...
   Found: Untitled
   Found: Untitled
   Found: Untitled
   Found: Untitled
   Found: Untitled
   Found: Untitled
   Found: Untitled
   Found: Untitled
   Found: Untitled
   Found: Untitled
   Found: Untitled
   Found: Untitled
   Found: Untitled
   Found: Untitled
   Found: Untitled
   Found: Untitled
   Found: Untitled
   Found: Untitled
   Found: Untitled
   Found: Untitled
   Found: Untitled
   Found: Untitled
   Found: Untitled
   Found: Untitled
   Found: Untitled
   Found: Untitled
   Found: Untitled
   Found: Untitled
   Found: Untitled
   Found: Untitled
‚úÖ Successfully scraped 30 templates.


In [13]:
# --- 0. IMPORTS ---
import google.generativeai as genai
import json
import time
import os
import random

# --- 1. CONFIGURATION ---
# The API key is read from the environment and is never stored in the notebook.
# Set GOOGLE_API_KEY before starting Jupyter. Any key that was previously
# committed in this cell must be treated as compromised and revoked.
if not os.environ.get("GOOGLE_API_KEY"):
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Export it in the environment that launches "
        "Jupyter before running this cell."
    )
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
model = genai.GenerativeModel('gemini-2.5-flash')

# --- 2. INDUSTRY CATALOGUE ---
# Maps each sector heading to the checklist titles generated for it.
# Sectors are grouped by major segments of the Indian economy; insertion
# order is the order in which they are processed and saved.
categories = {}

# Factories and process plants.
categories["üè≠ Manufacturing & Production"] = [
    "Textile/Garment Factory (Needle Policy & Cutting Room)",
    "Pharma Manufacturing (GMP & Clean Room Audit)",
    "Automotive Assembly Line (5S & Safety)",
    "Chemical Plant (Hazmat Storage & Spill Kits)",
    "Food Processing Plant (FSSAI Schedule 4)",
    "Steel/Heavy Engineering (Crane & Hoist Safety)",
]

# Warehousing, transport and delivery.
categories["üöö Logistics & Supply Chain"] = [
    "E-commerce Warehouse (Picking Accuracy & racking)",
    "Cold Chain/Cold Storage (Temperature & Compressors)",
    "Fleet Management (Truck Tyre & Driver Chalan Check)",
    "Last Mile Delivery Hub (Bike Condition & Bag Hygiene)",
    "Godown/Depot Pre-Monsoon Leakage Check",
]

# Building sites and residential complexes.
categories["üèóÔ∏è Construction & Real Estate"] = [
    "High-Rise Construction Site (Fall Protection & Netting)",
    "Excavation & Piling Work (Soil Stability)",
    "Residential Society/RWA (Security Gate & Lift Audit)",
    "Labour Camp Hygiene (Water & Sanitation)",
    "Electrical Safety (Temporary DB & Cabling)",
]

# Food service and hotels.
categories["ü•ó Hospitality & Cloud Kitchens"] = [
    "Cloud Kitchen/Dark Kitchen (Zomato/Swiggy Compliance)",
    "Fine Dining Restaurant (Front of House & Washrooms)",
    "Corporate Cafeteria (Bulk Food Handling)",
    "Hotel Housekeeping (Room Turnaround & Mini-bar)",
    "Bar & Pub Safety (Fire Exits & Stock Audit)",
]

# Stores, showrooms and mall common areas.
categories["üõçÔ∏è Retail & Malls"] = [
    "Supermarket/Hypermarket (Expiry Check & FIFO)",
    "Fashion Retail Store (Trial Room & loss Prevention)",
    "Jewelry Store (Opening/Closing Security Protocol)",
    "Mall Common Area (Escalators & Fire Hydrants)",
    "Electronics Showroom (Demo Unit Functionality)",
]

# Building services and upkeep.
categories["üè¢ Facility Management (FM)"] = [
    "IT Park Server Room (AC Cooling & Fire Gas)",
    "Corporate Washroom Cleaning Log (Hourly)",
    "DG Set (Diesel Generator) Maintenance Check",
    "STP/WTP (Sewage Treatment Plant) Daily Log",
    "Pest Control Service Report (Cockroach/Rodent)",
]

# Accumulates one {"category_name", "checklists"} object per sector.
final_database = []
# (category, title, error text) for every checklist that could not be
# generated, so gaps in the saved file are reported instead of being lost
# in the progress log.
failed_generations = []

# Ask the model for raw JSON; identical for every request, so built once.
generation_config = {"response_mime_type": "application/json"}

print(f"üöÄ Starting Generation for {sum(len(v) for v in categories.values())} Indian Industries...")

# --- 3. GENERATION LOOP ---
for category, checklist_titles in categories.items():

    category_obj = {
        "category_name": category,
        "checklists": []
    }

    print(f"\nüìÇ Processing: {category}")

    for title in checklist_titles:
        print(f"   ‚ö° Generating: {title}...")

        # --- 4. THE PROMPT (Indian Context + Mixed Inputs) ---
        prompt = f"""
        Act as a Senior Auditor in India. Create a JSON checklist for: "{title}".
        
        CONTEXTUAL INSTRUCTIONS:
        1. **Indian Reality:** Use terms relevant to India (e.g., 'DG Set', 'Maid/Housekeeping', 'Security Guard', 'Chalan', 'LPG Bank', 'FSSAI License Display', 'IST Standard').
        2. **Variety:** You MUST use a mix of these input types:
           - 'radio' (Pass/Fail)
           - 'text' (Readings like Temp, Pressure, Serial No)
           - 'slider' (Percentage 0-100% or Scale 1-10)
           - 'checkbox' (Multi-select for defects)
           - 'rating' (1-5 Stars)
        
        OUTPUT JSON FORMAT:
        {{
            "title": "{title}",
            "description": "Short professional description.",
            "items": [
                {{
                    "question": "Specific audit question",
                    "type": "radio | text | slider | checkbox | rating",
                    "options": ["Opt1", "Opt2"],
                    "placeholder": "e.g. Enter value",
                    "min": 0, "max": 100, "unit": "%",
                    "allow_notes": true,
                    "allow_media": true
                }}
            ]
        }}
        
        Generate exactly 6 items. Make them sound professional and technical.
        """

        try:
            # Generate content
            response = model.generate_content(prompt, generation_config=generation_config)

            # Parse JSON
            checklist_data = json.loads(response.text)

            # Add to category
            category_obj["checklists"].append(checklist_data)
            print("      ‚úÖ Success.")

        except Exception as e:
            # Best effort: one failed request must not abort the whole run,
            # but the failure is recorded for the summary below.
            failed_generations.append((category, title, str(e)))
            print(f"      ‚ùå Error: {e}")

        # Sleep to avoid rate limits
        time.sleep(2)

    final_database.append(category_obj)

# --- 5. SAVE ---
with open('data.json', 'w', encoding='utf-8') as f:
    json.dump(final_database, f, indent=2)

if failed_generations:
    # Do not claim completion when checklists are missing from data.json.
    print(f"\nWARNING: {len(failed_generations)} checklist(s) failed and are missing from 'data.json':")
    for failed_category, failed_title, failed_error in failed_generations:
        print(f"   - [{failed_category}] {failed_title}: {failed_error}")
else:
    print("\nüéâ BHARAT DATABASE COMPLETE! Saved to 'data.json'")

üöÄ Starting Generation for 31 Indian Industries...

üìÇ Processing: üè≠ Manufacturing & Production
   ‚ö° Generating: Textile/Garment Factory (Needle Policy & Cutting Room)...
      ‚úÖ Success.
   ‚ö° Generating: Pharma Manufacturing (GMP & Clean Room Audit)...
      ‚úÖ Success.
   ‚ö° Generating: Automotive Assembly Line (5S & Safety)...
      ‚úÖ Success.
   ‚ö° Generating: Chemical Plant (Hazmat Storage & Spill Kits)...
      ‚úÖ Success.
   ‚ö° Generating: Food Processing Plant (FSSAI Schedule 4)...
      ‚úÖ Success.
   ‚ö° Generating: Steel/Heavy Engineering (Crane & Hoist Safety)...
      ‚úÖ Success.

üìÇ Processing: üöö Logistics & Supply Chain
   ‚ö° Generating: E-commerce Warehouse (Picking Accuracy & racking)...
      ‚úÖ Success.
   ‚ö° Generating: Cold Chain/Cold Storage (Temperature & Compressors)...
      ‚úÖ Success.
   ‚ö° Generating: Fleet Management (Truck Tyre & Driver Chalan Check)...
      ‚úÖ Success.
   ‚ö° Generating: Last Mile Delivery Hub (Bike Condi