In [2]:
# NOTE: The Stanford GSB site provides abstracts but also employs anti-scraping protection.

import asyncio
from pathlib import Path
import re

import pandas as pd
from tqdm import tqdm
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError

# ----------------- Scraper settings -----------------
# Site origin; prepended to relative links found on listing cards.
ROOT = "https://www.gsb.stanford.edu"
# Publications listing; later pages are requested as "?page=N".
BASE_URL = "https://www.gsb.stanford.edu/faculty-research/publications"

# Year window: rows dated before YEAR_KEEP_FROM are dropped, and pagination
# halts at the first card dated YEAR_STOP_AT or earlier.
YEAR_KEEP_FROM = 2015
YEAR_STOP_AT = 2014

# Output workbooks; every save writes the same rows to both files.
OUT_EXCEL = "stanford_gsb_publications_2015plus.xlsx"
OUT_AUTOSAVE = "stanford_gsb_publications_2015plus_autosave.xlsx"
# Number of newly scraped records between two autosaves.
SAVE_EVERY = 10


def parse_year_from_text(text: str):
    """Return the first 19xx/20xx year found in *text* as an int, else None.

    The pattern is deliberately unanchored, so a year glued to neighbouring
    characters (e.g. "November72025") is still found. A falsy *text*
    (None or "") yields None.
    """
    cleaned = text.strip() if text else ""
    match = re.search(r"(19|20)\d{2}", cleaned)
    if match is None:
        return None
    return int(match.group(0))


async def _first_text(locator):
    """Return the stripped inner text of the first element matched by *locator*.

    Returns an empty string when the locator matches nothing.
    """
    if await locator.count():
        return (await locator.first.inner_text()).strip()
    return ""


def _load_existing_records():
    """Load rows saved by a previous run from OUT_EXCEL so scraping can resume.

    Returns:
        A ``(records, processed_urls)`` tuple: the saved rows as a list of
        dicts, and the set of their non-missing "Detail URL" values. Both are
        empty when OUT_EXCEL does not exist.
    """
    if not Path(OUT_EXCEL).exists():
        print("No existing Excel found, start from scratch.")
        return [], set()

    existing_df = pd.read_excel(OUT_EXCEL)
    records = existing_df.to_dict(orient="records")
    # A workbook without the URL column (e.g. an empty sheet) must not abort
    # the run with a KeyError; treat it as "nothing processed yet".
    if "Detail URL" in existing_df.columns:
        processed_urls = {
            str(u).strip() for u in existing_df["Detail URL"].dropna()
        }
    else:
        processed_urls = set()
    print(f"Loaded {len(records)} existing records from {OUT_EXCEL}")
    return records, processed_urls


def _save_records(records):
    """Write the rows with Year >= YEAR_KEEP_FROM to OUT_EXCEL and OUT_AUTOSAVE.

    Returns:
        The number of rows written, or None when *records* yields no "Year"
        column (i.e. there is nothing to filter or write); in that case the
        files on disk are left untouched.
    """
    df = pd.DataFrame(records)
    if "Year" not in df.columns:
        return None
    df = df[df["Year"].notna() & (df["Year"] >= YEAR_KEEP_FROM)]
    df.to_excel(OUT_EXCEL, index=False)
    df.to_excel(OUT_AUTOSAVE, index=False)
    return len(df)


async def _fetch_abstract(browser, detail_url):
    """Open *detail_url* in a fresh page and read the publication description.

    Returns:
        An ``(abstract, error)`` tuple of strings. Any failure is reported in
        ``error`` (and printed) instead of being raised, so one bad detail
        page does not stop the whole scrape.
    """
    abstract = ""
    error = ""
    dp = await browser.new_page()
    try:
        try:
            await dp.goto(
                detail_url,
                wait_until="domcontentloaded",
                timeout=120000,
            )
        except PlaywrightTimeoutError:
            print(f"[Detail] Timeout on domcontentloaded, retry simple goto: {detail_url}")
            await dp.goto(detail_url, timeout=120000)

        # `.first` (inside _first_text) avoids a strict-mode error when more
        # than one description node matches.
        abstract = await _first_text(
            dp.locator("div.c-node-publication__description")
        )
    except Exception as e:  # deliberate best-effort: record the error and move on
        error = f"abstract_error: {e}"
        print(f"[Error] {detail_url}: {e}")
    finally:
        await dp.close()
    return abstract, error


async def scrape():
    """
    Scrape the publication listing and each publication's abstract into Excel.

    Pagination rule:
    - page = 0 => first page (latest)
    - page = 1 => second page
    - page = 2 => third page
    ...
    Stop pagination once any card on a page shows year <= YEAR_STOP_AT.

    Rows already present in OUT_EXCEL are loaded first and their detail URLs
    are skipped. Results are autosaved every SAVE_EVERY new records, and a
    final save runs even if scraping is aborted by an exception; the browser
    is always closed.
    """

    # 0️⃣ Resume capability
    records, processed_urls = _load_existing_records()
    new_since_save = 0

    try:
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=False)  # set True for headless
            try:
                page = await browser.new_page()

                stop_due_to_old_year = False
                page_idx = 0  # start from page 0

                while True:
                    # 1️⃣ Construct URL
                    url = BASE_URL if page_idx == 0 else f"{BASE_URL}?page={page_idx}"
                    print(f"\n=== Now scraping list page {page_idx} : {url} ===")

                    # 2️⃣ Open list page
                    try:
                        await page.goto(url, wait_until="domcontentloaded", timeout=120000)
                    except PlaywrightTimeoutError:
                        print(f"[List] Timeout loading page {url}, stop here.")
                        break

                    # Slight scroll to ensure listing blocks render fully
                    for _ in range(8):
                        await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                        await page.wait_for_timeout(200)

                    cards = page.locator("div.views-row")
                    n = await cards.count()
                    print(f"List page {page_idx}: found {n} cards.")

                    if n == 0:
                        print("No cards on this page, stop.")
                        break

                    # 3️⃣ Process each card on the page
                    for i in tqdm(range(n), desc=f"Scraping list page {page_idx}"):
                        card = cards.nth(i)

                        # ---------- Title + Link ----------
                        title_el = card.locator("h2 a")
                        if not await title_el.count():
                            continue

                        title = (await title_el.first.inner_text()).strip()
                        href = await title_el.first.get_attribute("href")
                        if href:
                            detail_url = href if href.startswith("http") else ROOT + href
                        else:
                            detail_url = ""

                        # Skip if already scraped (checked before any further
                        # lookups so resumed runs stay cheap)
                        if detail_url and detail_url in processed_urls:
                            continue

                        # ---------- Authors ----------
                        authors_el = card.locator("div.field.publication-authors")
                        if not await authors_el.count():
                            # Alternative markup used by some cards
                            authors_el = card.locator(".c-node-publication__authors")
                        authors = await _first_text(authors_el)

                        # ---------- Source / Journal + Year ----------
                        pubinfo = card.locator("div.c-node-publication__publication-info")
                        pubinfo_raw = await _first_text(pubinfo)
                        journal = await _first_text(pubinfo.locator("span.source-publication"))
                        year_text = await _first_text(pubinfo.locator("span.year"))
                        try:
                            year = int(year_text)
                        except ValueError:
                            # Missing or non-numeric year span: fall back to
                            # scanning the raw publication-info text.
                            year = parse_year_from_text(pubinfo_raw)

                        # ⚠️ Stop when encountering an old year (older pages follow)
                        if (year is not None) and (year <= YEAR_STOP_AT):
                            print(f"Encountered year {year} <= {YEAR_STOP_AT} on list page {page_idx}, stop further pages.")
                            stop_due_to_old_year = True
                            break

                        # Skip if year parsing failed or the year is too old to keep
                        if year is None or year < YEAR_KEEP_FROM:
                            continue

                        # ---------- Scrape abstract ----------
                        abstract = ""
                        error = ""
                        if detail_url:
                            abstract, error = await _fetch_abstract(browser, detail_url)

                        records.append(
                            {
                                "Title": title,
                                "Authors": authors,
                                "Journal": journal,
                                "Year": year,
                                "Publication Info Raw": pubinfo_raw,
                                "Detail URL": detail_url,
                                "Abstract": abstract,
                                "Error": error,
                                "List Page Index": page_idx,
                            }
                        )
                        if detail_url:
                            processed_urls.add(detail_url)

                        new_since_save += 1

                        # Autosave
                        if new_since_save >= SAVE_EVERY:
                            saved = _save_records(records)
                            print(f"\nAutosaved {saved} rows to {OUT_EXCEL} / {OUT_AUTOSAVE}")
                            new_since_save = 0

                    # Stop pagination if reached old year
                    if stop_due_to_old_year:
                        print(f"Stop due to encountering {YEAR_STOP_AT} or earlier.")
                        break

                    # Otherwise go to next page
                    page_idx += 1
            finally:
                # Always release the browser, even when scraping fails midway.
                await browser.close()
    finally:
        # 4️⃣ Final save — also runs when an exception aborts the scrape, so
        # records collected since the last autosave are not lost.
        saved = _save_records(records)
        if saved is None:
            print("\nFinal save: no records collected, nothing written.")
        else:
            print(f"\nFinal save (Year >= {YEAR_KEEP_FROM}): {saved} rows to {OUT_EXCEL} / {OUT_AUTOSAVE}")


In [None]:
import asyncio
# Run the scraper defined above.
# NOTE(review): a bare top-level `await` presumably relies on the notebook
# (IPython) supporting top-level await — confirm before moving this into a
# plain script.
await scrape()

In [3]:
import pandas as pd

# Re-open the saved workbook to sanity-check what was written.
df = pd.read_excel("stanford_gsb_publications_2015plus.xlsx")
print("Total rows:", len(df))
# Last expression of the cell: shows the first 20 rows.
df.head(20)

Total rows: 2157


Unnamed: 0,Title,Authors,Journal,Year,Publication Info Raw,Detail URL,Abstract,Error,List Page Index
0,Why Do People Choose Extreme Candidates? The R...,"Mohamed Hussein, Zakary Tormala, S. Christian ...",Journal of Experimental Social Psychology,2026,Journal of Experimental Social Psychology Janu...,https://www.gsb.stanford.edu/faculty-research/...,Elected officials are increasingly extreme. Re...,,
1,Fifteen Reasons You Should Read This Paper: Ho...,"Abigail Bergman, Mohamed Hussein, Rhia Catapan...",Personality and Social Psychology Bulletin,2026,Personality and Social Psychology Bulletin 202...,https://www.gsb.stanford.edu/faculty-research/...,People generally believe more is better in per...,,
2,Talking About What We Support Versus Oppose Af...,"Rhia Catapano, Zakary Tormala",Journal of Personality and Social Psychology,2026,Journal of Personality and Social Psychology 2026,https://www.gsb.stanford.edu/faculty-research/...,People’s unwillingness to engage with others w...,,
3,Veto Players and Policy Development,"Alexander V. Hirsch, Ken Shotts",American Journal of Political Science (forthco...,2026,American Journal of Political Science (forthco...,https://www.gsb.stanford.edu/faculty-research/...,We analyze the effects of veto players when th...,,
4,Putting Climate into Context: How Culture and ...,"Alison V. Hall, Derek R. Avery, Michele J. Gel...",Organization Science,2025,Organization Science November72025,https://www.gsb.stanford.edu/faculty-research/...,Existing evidence demonstrates that an organiz...,,
5,Bond Convenience Yields in the Eurozone Curren...,"Zhengyang Jiang, Hanno Lustig, Stijn Van Nieuw...",Review of Financial Studies (forthcoming),2025,Review of Financial Studies (forthcoming) Nove...,https://www.gsb.stanford.edu/faculty-research/...,This paper analyzes bond convenience yields in...,,
6,Comparing Experimental and Nonexperimental Met...,"Guido W. Imbens, Yiqing Xu",Journal of Economic Perspectives,2025,Journal of Economic Perspectives November2025 ...,https://www.gsb.stanford.edu/faculty-research/...,"In 1986, Robert LaLonde published an article c...",,
7,Insurance Versus Moral Hazard in Income-Contin...,Tim de Silva,The Quarterly Journal of Economics,2025,The Quarterly Journal of Economics November202...,https://www.gsb.stanford.edu/faculty-research/...,Student loans with income-contingent repayment...,,
8,Japan's Debt Puzzle: Sovereign Wealth Fund fro...,"Yili Chien, Wenxin Du, Hanno Lustig",Journal of Economic Perspectives,2025,Journal of Economic Perspectives November2025 ...,https://www.gsb.stanford.edu/faculty-research/...,We analyze the risks associated with Japan’s p...,,
9,Just a Few Seeds More: The Value of Network Da...,"Mohammad Akbarpour, Suraj Malladi, Amin Saberi",American Economic Review,2025,American Economic Review November2025 Vol. 115...,https://www.gsb.stanford.edu/faculty-research/...,Identifying the optimal set of individuals to ...,,
