In [1]:
import sys
!{sys.executable} -m pip install playwright pandas
!playwright install
!apt-get install libxcomposite1 libgtk-3-0 libatk1.0-0 -y

Collecting playwright
  Downloading playwright-1.57.0-py3-none-manylinux1_x86_64.whl.metadata (3.5 kB)
Collecting pyee<14,>=13 (from playwright)
  Downloading pyee-13.0.0-py3-none-any.whl.metadata (2.9 kB)
Downloading playwright-1.57.0-py3-none-manylinux1_x86_64.whl (46.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 MB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyee-13.0.0-py3-none-any.whl (15 kB)
Installing collected packages: pyee, playwright
Successfully installed playwright-1.57.0 pyee-13.0.0
Downloading Chromium 143.0.7499.4 (playwright build v1200)[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/chromium/1200/chromium-linux.zip[22m
[1G164.7 MiB [] 0% 0.0s[0K[1G164.7 MiB [] 0% 18.1s[0K[1G164.7 MiB [] 0% 11.9s[0K[1G164.7 MiB [] 0% 6.2s[0K[1G164.7 MiB [] 1% 3.7s[0K[1G164.7 MiB [] 2% 2.6s[0K[1G164.7 MiB [] 4% 2.3s[0K[1G164.7 MiB [] 4% 2.4s[0K[1G164.7 MiB [] 5% 2.0s[0K[1G164.7 MiB [] 7% 1.8s[

In [2]:
import asyncio
from playwright.async_api import async_playwright
import pandas as pd
import time

# Number of company profiles to collect from the directory (roughly 500).
TARGET_COUNT = 500
# Company directory page that the scraper loads and scrolls through.
YC_URL = "https://www.ycombinator.com/companies"

# Give up scrolling after this many consecutive scrolls that add no new cards.
_MAX_STALLED_SCROLLS = 10
# <h3> headings on a profile page that are section titles, not founder names.
_NON_FOUNDER_HEADINGS = ("Latest News", "Similar Companies")
# Column order of the exported CSV (also used when nothing was collected).
_CSV_COLUMNS = [
    "Company Name",
    "URL",
    "Short Description",
    "Batch",
    "Founder Name(s)",
    "Founder LinkedIn URL(s)",
]


def _unique_in_order(items):
    """Return ``items`` without duplicates, keeping first-seen order."""
    return list(dict.fromkeys(items))


async def _scroll_until_loaded(page, company_locator, target_count):
    """Scroll the directory until ``target_count`` links exist or loading stalls.

    Returns the number of elements matched by ``company_locator`` when the
    scrolling stopped. Stopping on a stall prevents an endless loop when the
    directory holds fewer entries than requested or stops loading more.
    """
    count = await company_locator.count()
    stalled_scrolls = 0
    while count < target_count and stalled_scrolls < _MAX_STALLED_SCROLLS:
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        await page.wait_for_timeout(1000)  # Wait for content to load

        new_count = await company_locator.count()
        print(f"Loaded {new_count} companies...")
        stalled_scrolls = stalled_scrolls + 1 if new_count <= count else 0
        count = new_count

    if count < target_count:
        print(f"No new companies after {stalled_scrolls} scrolls; continuing with {count}.")
    return count


async def _collect_company_cards(company_locator, target_count):
    """Build one record per unique company link, up to ``target_count`` records.

    De-duplication happens while iterating, so repeated links do not reduce
    the number of companies collected.
    """
    companies = []
    seen_urls = set()

    for card in await company_locator.all():
        if len(companies) >= target_count:
            break

        href = await card.get_attribute("href")
        if not href:
            continue
        full_url = f"https://www.ycombinator.com{href}"
        if full_url in seen_urls:
            continue
        seen_urls.add(full_url)

        # Heuristic parsing of the card text: first non-blank line is treated
        # as the name, last one as the description.
        card_text = await card.inner_text()
        lines = [line.strip() for line in card_text.splitlines() if line.strip()]
        name = lines[0] if lines else "N/A"
        description = lines[-1] if len(lines) > 1 else "N/A"

        companies.append({
            "Company Name": name,
            "URL": full_url,
            "Short Description": description,
            # Filled in from the profile page; founder fields start as empty
            # strings so the CSV column type is the same whether or not the
            # profile scrape succeeds.
            "Batch": "Pending",
            "Founder Name(s)": "",
            "Founder LinkedIn URL(s)": "",
        })

    return companies


async def _scrape_profile(page, company):
    """Visit ``company['URL']`` and fill batch, founder and LinkedIn fields in place."""
    # goto() already waits for the page's load event by default, so no extra
    # wait_for_load_state('domcontentloaded') is needed afterwards.
    await page.goto(company["URL"])

    # The list card can fuse name and location into a single line
    # (e.g. "DoorDashSan Francisco, CA, USA"). Use the profile heading
    # instead, but only when it is a prefix of the card text, so an
    # unrelated <h1> can never overwrite the name.
    # NOTE(review): assumes the first <h1> is the company name -- confirm.
    heading = page.locator("h1")
    if await heading.count() > 0:
        title = (await heading.first.inner_text()).strip()
        if title and company["Company Name"].startswith(title):
            company["Company Name"] = title

    # Batch: first element that links to a batch filter or mentions a season.
    batch_locator = page.locator('a[href*="batch="], span:has-text("Winter"), span:has-text("Summer"), span:has-text("Spring"), span:has-text("Fall")')
    if await batch_locator.count() > 0:
        company["Batch"] = (await batch_locator.first.inner_text()).strip()

    # LinkedIn: every linkedin.com link on the page except YC's own.
    linkedins = []
    for link in await page.locator('a[href*="linkedin.com"]').all():
        url = await link.get_attribute("href")
        if url and "ycombinator" not in url:
            linkedins.append(url)

    # Founder names: every <h3> on the page except known section titles.
    # This is a heuristic and may include headings that are not founders.
    founders = []
    for heading_el in await page.locator("h3").all():
        text = (await heading_el.inner_text()).strip()
        if text and text not in _NON_FOUNDER_HEADINGS:
            founders.append(text)

    # Order-preserving de-duplication keeps the CSV stable between runs.
    company["Founder Name(s)"] = ", ".join(_unique_in_order(founders))
    company["Founder LinkedIn URL(s)"] = ", ".join(_unique_in_order(linkedins))


async def scrape_yc(target_count=None, output_path="yc_startups_500.csv"):
    """Scrape company profiles from the YC directory and save them as CSV.

    The directory page is scrolled until enough company links are loaded,
    each unique link becomes a record, every profile page is then visited to
    fill in batch / founder / LinkedIn details, and the result is written to
    ``output_path``.

    Args:
        target_count: Maximum number of companies to collect. Defaults to the
            module-level ``TARGET_COUNT`` (looked up at call time).
        output_path: Destination CSV file.

    Returns:
        None. Progress is printed and the data is written to ``output_path``.

    Errors while scraping an individual profile are printed and skipped
    (that record keeps its placeholder values); errors while loading the
    directory itself propagate to the caller.
    """
    if target_count is None:
        target_count = TARGET_COUNT

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()

            print(f"Loading {YC_URL}...")
            await page.goto(YC_URL)

            # --- PHASE 1: SCROLL & LOAD COMPANIES ---
            # Attribute selector instead of class names, which are hashed.
            company_locator = page.locator('a[href^="/companies/"]')
            print("Scrolling to load companies...")
            await _scroll_until_loaded(page, company_locator, target_count)

            # --- PHASE 2: PARSE MAIN LIST ---
            print("Parsing loaded list...")
            companies_data = await _collect_company_cards(company_locator, target_count)
            print(f"Collected {len(companies_data)} unique profiles. Starting detailed scrape...")

            # --- PHASE 3: DETAILED SCRAPING (Profile Visits) ---
            # Profiles are visited one at a time on a single page.
            total = len(companies_data)
            for position, company in enumerate(companies_data, start=1):
                try:
                    print(f"[{position}/{total}] Scraping {company['Company Name']}...")
                    await _scrape_profile(page, company)
                except Exception as e:
                    # Best effort: keep going so one bad profile does not
                    # discard everything collected so far.
                    print(f"Error scraping {company['Company Name']}: {e}")
        finally:
            # Always release the browser, even when scraping fails midway.
            await browser.close()

    # --- PHASE 4: SAVE TO CSV ---
    df = pd.DataFrame(companies_data, columns=_CSV_COLUMNS)
    df.to_csv(output_path, index=False)
    print(f"Done! Data saved to {output_path}")

# Run the scraper. Top-level `await` works here because the notebook kernel
# already runs an event loop; in a plain Python script use
# `asyncio.run(scrape_yc())` instead.
await scrape_yc()

Loading https://www.ycombinator.com/companies...
Scrolling to load companies...
Loaded 40 companies...
Loaded 60 companies...
Loaded 80 companies...
Loaded 100 companies...
Loaded 120 companies...
Loaded 140 companies...
Loaded 160 companies...
Loaded 180 companies...
Loaded 200 companies...
Loaded 220 companies...
Loaded 240 companies...
Loaded 260 companies...
Loaded 280 companies...
Loaded 300 companies...
Loaded 320 companies...
Loaded 340 companies...
Loaded 360 companies...
Loaded 380 companies...
Loaded 400 companies...
Loaded 420 companies...
Loaded 440 companies...
Loaded 460 companies...
Loaded 480 companies...
Loaded 500 companies...
Parsing loaded list...
Collected 500 unique profiles. Starting detailed scrape...
[1/500] Scraping DoorDashSan Francisco, CA, USA...
[2/500] Scraping AirbnbSan Francisco, CA, USA...
[3/500] Scraping CoinbaseSan Francisco, CA, USA...
[4/500] Scraping OkloSanta Clara, CA, USA...
[5/500] Scraping GrowwBengaluru, KA, India...
[6/500] Scraping Instac