In [1]:
import pandas as pd
from playwright.async_api import async_playwright

# Faculty directory page that scrape_stanford_gsb_manual_fast() opens via page.goto().
URL = "https://www.gsb.stanford.edu/faculty-research/faculty"

async def scrape_stanford_gsb_manual_fast(manual_scroll_ms=30000):
    """Scrape the faculty cards from the page at ``URL`` after a manual scroll.

    A visible (non-headless) Chromium window is opened on ``URL``. The
    function then pauses so the user can scroll the list to the bottom by
    hand, and finally reads every faculty card currently in the DOM.

    Args:
        manual_scroll_ms: How long, in milliseconds, to wait for the user to
            scroll before the cards are read. Defaults to 30000 (30 seconds).

    Returns:
        pandas.DataFrame with the columns ``name``, ``title``, ``area`` and
        ``url`` (one row per matched card). The columns are present even
        when no card matched, so downstream column access does not fail on
        an empty scrape.
    """
    columns = ["name", "title", "area", "url"]

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False, slow_mo=50)
        try:
            page = await browser.new_page()

            # Do not use networkidle; a fixed pause after navigation is used instead.
            await page.goto(URL, timeout=120000)
            await page.wait_for_timeout(4000)

            # Close cookie banner (if exists). This is best-effort: a failure
            # here must not abort the scrape, but it is reported rather than
            # silently swallowed.
            try:
                btn = page.get_by_role("button", name="Accept")
                if await btn.count() > 0:
                    # `.first` avoids a strict-mode error when more than one
                    # button matches the accessible name "Accept".
                    await btn.first.click()
                    await page.wait_for_timeout(1000)
            except Exception as exc:
                print(f"Could not dismiss the cookie banner ({exc!r}); continuing.")

            print("👉 Browser is ready. Please scroll down manually until the bottom of the list.")
            print(f"   I will wait for {manual_scroll_ms / 1000:g} seconds before starting the scraping.")

            # Wait for manual scrolling — 30 seconds by default.
            await page.wait_for_timeout(manual_scroll_ms)

            raw_list = await page.eval_on_selector_all(
                "div.c-person.c-person--faculty-main-list.views-row",
                """
            els => els.map(el => {
                // 1. Typical structure: <h2 class="c-person__name"><a>Name</a></h2>
                let nameEl =
                    el.querySelector('h2.c-person__name') ||
                    el.querySelector('.c-person__name a') ||
                    el.querySelector('a.c-person__name');

                // 2. Fallback: any faculty profile link
                let linkEl = el.querySelector('a[href*="/faculty-research/faculty/"]');

                const titleEl = el.querySelector('div.c-person__position');
                const areaEl  = el.querySelector('div.c-person__area');

                const nameText = nameEl
                    ? nameEl.textContent.trim()
                    : (linkEl ? linkEl.textContent.trim() : '');

                return {
                    name: nameText,
                    title: titleEl ? titleEl.textContent.trim() : '',
                    area:  areaEl  ? areaEl.textContent.trim()  : '',
                    url:   linkEl  ? linkEl.href : ''
                };
            })
            """
            )

            print(f"Scraping finished. Total {len(raw_list)} records collected.")
        finally:
            # Close the browser even when navigation or evaluation raised.
            await browser.close()

    # Explicit columns keep the schema stable when raw_list is empty.
    df = pd.DataFrame(raw_list, columns=columns)
    return df


In [2]:
df = await scrape_stanford_gsb_manual_fast()
len(df)
df.head()

👉 Browser is ready. Please scroll down manually until the bottom of the list.
   I will wait for 30 seconds before starting the scraping.
Scraping finished. Total 341 records collected.


Unnamed: 0,name,title,area,url
0,Jennifer Aaker,"Professor, Marketing",Marketing,https://www.gsb.stanford.edu/faculty-research/...
1,Douglas Abbey,"Lecturer, Finance",Finance,https://www.gsb.stanford.edu/faculty-research/...
2,Matt Abrahams,"Lecturer, Organizational Behavior",Organizational Behavior,https://www.gsb.stanford.edu/faculty-research/...
3,Avidit Acharya,"Professor (by courtesy), Political E...",Political Economy,https://www.gsb.stanford.edu/faculty-research/...
4,Anat R. Admati,"Professor, Finance",Finance,https://www.gsb.stanford.edu/faculty-research/...


In [None]:
# Write the scraped faculty table to an Excel workbook, leaving out the index column.
df.to_excel(excel_writer="stanford_gsb_faculty.xlsx", index=False)