In [None]:
# STEP 1 — Scrape the Kellogg listing page (research cards)
# This script collects all publication cards from the listing
# page, extracts basic metadata (title, type, year, journal,
# authors) and especially the detail_url.

# -*- coding: utf-8 -*-
from __future__ import annotations
import re, time
from typing import List, Dict, Optional

import pandas as pd
from tqdm import tqdm

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import TimeoutException, ElementClickInterceptedException, StaleElementReferenceException


# =============== Configurable Parameters ===============
# Listing page URL; the query string filters on publicationType "Journal Article".
BASE_LIST_URL = (
    "https://www.kellogg.northwestern.edu/academics-research/research/"
    "?publicationType=Journal%20Article"
)
# Output workbook name (same value as the default ``output_xlsx`` of run_manual).
OUTPUT_XLSX = "kellogg_journal_articles.xlsx"
# =======================================================

def make_driver(headless: bool = False) -> webdriver.Chrome:
    """Build the Chrome WebDriver used by the listing scraper.

    Args:
        headless: When true, the "--headless=new" flag and a fixed
            1280x1800 window size are added ahead of the common flags.

    Returns:
        A ``webdriver.Chrome`` instance whose driver binary path comes from
        ``ChromeDriverManager().install()``.
    """
    user_agent_flag = (
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36"
    )
    # "--headless=new" can be swapped for the classic "--headless" if needed.
    headless_flags = ["--headless=new", "--window-size=1280,1800"] if headless else []
    common_flags = [
        "--disable-gpu",
        "--no-sandbox",
        "--start-maximized",
        "--disable-blink-features=AutomationControlled",
        user_agent_flag,
    ]

    chrome_options = Options()
    for flag in headless_flags + common_flags:
        chrome_options.add_argument(flag)

    chrome_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=chrome_service, options=chrome_options)


def gentle_scroll(driver, px: int = 800, pause: float = 0.3):
    """Scroll the window down by ``px`` pixels, then sleep for ``pause`` seconds.

    Args:
        driver: Object exposing ``execute_script`` (a Selenium WebDriver).
        px: Vertical offset passed to ``window.scrollBy``.
        pause: Seconds to sleep after issuing the scroll.
    """
    scroll_js = "window.scrollBy(0, {});".format(px)
    driver.execute_script(scroll_js)
    time.sleep(pause)


def close_cookie_banner(driver):
    """Best-effort dismissal of a cookie/consent banner.

    Scrolls slightly, then walks a list of common "Accept/Continue" button
    selectors. The first selector whose first match can be clicked (via a
    JavaScript click) ends the search. Every failure is swallowed on purpose:
    a banner that cannot be closed must not abort the scrape.
    """
    accept_selectors = (
        "button[mode='primary']",
        "button[aria-label*='accept']",
        "button[aria-label*='consent']",
        "#onetrust-accept-btn-handler",
    )
    try:
        gentle_scroll(driver, 400, 0.2)
        for css in accept_selectors:
            matches = driver.find_elements(By.CSS_SELECTOR, css)
            if not matches:
                continue
            try:
                driver.execute_script("arguments[0].click();", matches[0])
                time.sleep(0.4)
            except Exception:
                # Click failed for this selector; try the next one.
                continue
            else:
                break
    except Exception:
        pass


def wait_first_batch(driver, timeout: int = 30):
    """Block until at least one publication card link is present in the DOM.

    Dismisses any consent banner, nudges the page down four times to trigger
    lazy loading, jumps back to the top, and then waits up to ``timeout``
    seconds for any of the known card-link selectors to appear. The wait is
    delegated to ``WebDriverWait(...).until``, so its timeout error propagates
    to the caller.
    """
    close_cookie_banner(driver)

    # A few small scrolls to trigger lazy loading, then back to the top.
    nudges_left = 4
    while nudges_left:
        gentle_scroll(driver, 500, 0.25)
        nudges_left -= 1
    driver.execute_script("window.scrollTo(0, 0);")

    card_selectors = ", ".join([
        "a.publication-card__link",
        ".publication-card a[href*='/academics-research/research/detail/']",
        "a.card__link",
    ])
    card_present = EC.presence_of_element_located((By.CSS_SELECTOR, card_selectors))
    WebDriverWait(driver, timeout).until(card_present)


def find_more_button(driver):
    """Locate the "MORE RESEARCH" control on the listing page.

    First looks for an ``<a>`` or ``<button>`` whose normalized text is exactly
    'More Research' or 'MORE RESEARCH'. Failing that, scans call-to-action /
    "more"-classed anchors for one whose text contains "more research"
    (case-insensitive).

    Returns:
        The first matching element, or ``None`` when nothing matches.
    """
    exact_label_xpath = (
        "//a[normalize-space()='More Research' or normalize-space()='MORE RESEARCH']"
        " | //button[normalize-space()='More Research' or normalize-space()='MORE RESEARCH']"
    )
    exact_matches = driver.find_elements(By.XPATH, exact_label_xpath)
    if exact_matches:
        return exact_matches[0]

    # Fallback: anchors whose class suggests a "more"/call-to-action link.
    loose_matches = driver.find_elements(By.CSS_SELECTOR, "a.call-to-action, a[class*='more']")
    return next(
        (el for el in loose_matches if "more research" in (el.text or "").strip().lower()),
        None,
    )


def click_more_until_end(driver, sleep: float = 0.8, max_clicks: Optional[int] = None) -> int:
    """Keep clicking "MORE RESEARCH" until the card count stops growing or the button disappears.

    Args:
        driver: Selenium WebDriver already pointed at the listing page.
        sleep: Seconds to pause after each scroll so the page can settle.
        max_clicks: Optional cap on the number of clicks. ``None`` (or 0)
            means no cap.

    Returns:
        The last observed number of elements matching the card selectors.
        The selectors cover both card containers and card links, so this is
        a growth indicator rather than an exact publication count.
    """
    wait_first_batch(driver)

    def count_cards() -> int:
        return len(driver.find_elements(
            By.CSS_SELECTOR,
            ".publication-card, a.publication-card__link, a.card__link"
        ))

    # Start from the cards already rendered. Starting from 0 would let the
    # first post-click wait succeed instantly on the initial batch instead of
    # waiting for newly loaded cards.
    last = count_cards()
    idle = 0
    clicks = 0
    pbar = tqdm(desc="Loading pages", unit="click")

    try:
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(sleep)

            btn = find_more_button(driver)
            if not btn:
                # The button may only be located after scrolling back up a little.
                driver.execute_script("window.scrollBy(0, -400);")
                time.sleep(sleep)
                btn = find_more_button(driver)

            if not btn:
                # No button: stop unless scrolling alone produced new cards.
                now = count_cards()
                if now <= last:
                    break
                last = now
                continue

            try:
                driver.execute_script("arguments[0].scrollIntoView({block:'center'});", btn)
                time.sleep(0.3)
                driver.execute_script("arguments[0].click();", btn)
            except (ElementClickInterceptedException, StaleElementReferenceException):
                # Best-effort native click; a failure here is picked up by the
                # idle counter below rather than aborting the run.
                try:
                    btn.click()
                except Exception:
                    pass

            clicks += 1
            pbar.update(1)

            # Wait (up to ~4 s) for the click to add cards.
            for _ in range(10):
                time.sleep(0.4)
                now = count_cards()
                if now > last:
                    last = now
                    idle = 0
                    break
            else:
                idle += 1

            # Check the cap only after waiting, so the returned count includes
            # whatever the final click loaded.
            if max_clicks and clicks >= max_clicks:
                break

            # Two consecutive clicks without growth: assume the list is exhausted.
            if idle >= 2:
                break
    finally:
        # Always release the progress bar, even if the driver raises mid-loop.
        pbar.close()

    return last


def clean_spaces(s: str) -> str:
    """Trim ``s`` and collapse each run of whitespace into a single space.

    Falsy input (``None`` or an empty string) yields an empty string.
    """
    trimmed = (s if s else "").strip()
    return re.sub(r"\s+", " ", trimmed)


def parse_card_block(card_el) -> Dict[str, str]:
    """Extract listing fields from one publication card element.

    Args:
        card_el: Selenium element for a single card; only ``find_elements``,
            ``text`` and ``get_attribute`` are used.

    Returns:
        A dict with the keys ``type``, ``year``, ``title``, ``journal``,
        ``authors`` and ``detail_url``. Any field that cannot be found is
        left as an empty string.
    """
    row = {"type": "", "year": "", "title": "", "journal": "", "authors": "", "detail_url": ""}

    # Title + detail link: the first selector with a match wins.
    for sel in ("a.publication-card__link", "a.card__link", "a[href*='/academics-research/research/detail/']"):
        links = card_el.find_elements(By.CSS_SELECTOR, sel)
        if links:
            title_a = links[0]
            row["title"] = clean_spaces(title_a.text)
            row["detail_url"] = title_a.get_attribute("href") or ""
            break

    # Type
    for sel in (".publication-card__type", "[class*='publication'] [class*='type']"):
        els = card_el.find_elements(By.CSS_SELECTOR, sel)
        if els:
            row["type"] = clean_spaces(els[0].text)
            break

    # Year: prefer a short div/span (<= 6 chars), then fall back to the whole card text.
    year_pattern = r"(19|20)\d{2}"
    short_nodes = card_el.find_elements(
        By.XPATH, ".//*[self::div or self::span][string-length(normalize-space())<=6]"
    )
    m = None
    for el in short_nodes[:6]:
        m = re.search(year_pattern, el.text or "")
        if m:
            break
    if not m:
        # ``or ""`` guards against a driver returning None for the text.
        m = re.search(year_pattern, card_el.text or "")
    row["year"] = m.group(0) if m else ""

    # Journal (italic text). Skip elements with empty text so that an empty
    # <i>/<em> (e.g. an icon) does not end the search with a blank journal.
    for sel in ("i", "em", ".publication-card__journal"):
        texts = (clean_spaces(el.text) for el in card_el.find_elements(By.CSS_SELECTOR, sel))
        journal = next((t for t in texts if t), "")
        if journal:
            row["journal"] = journal
            break

    # Authors: faculty profile links if present, else the last non-empty
    # descendant text as a rough fallback.
    author_links = card_el.find_elements(By.XPATH, ".//a[contains(@href,'/academics-research/faculty/')]")
    if author_links:
        row["authors"] = ", ".join(clean_spaces(a.text) for a in author_links)
    else:
        # Read each descendant's text once; every ``.text`` access is a
        # separate call to the driver.
        texts = (clean_spaces(x.text) for x in card_el.find_elements(By.XPATH, ".//*"))
        lines = [t for t in texts if t]
        if lines:
            row["authors"] = lines[-1]

    return row


import pandas as pd

def collect_all_cards_via_js(driver):
    """Collect one record per research detail link by running JavaScript in the page.

    The script gathers anchors whose href contains
    ``/academics-research/research/detail/``, de-duplicates them by href, and
    for each one derives fields from the nearest card-like ancestor (or the
    anchor's parent element when no such ancestor exists).

    Args:
        driver: Object exposing ``execute_script`` (a Selenium WebDriver) with
            the listing page loaded.

    Returns:
        Whatever ``driver.execute_script`` returns for the script. The script
        itself returns an array of objects with the keys ``title``, ``type``,
        ``year``, ``journal``, ``authors`` and ``detail_url``.
    """
    # NOTE(review): in the authors fallback, the split regex contains what looks
    # like mis-encoded dash characters (probably em/en dashes) — confirm the
    # file encoding. Its "\n" alternative cannot match either, because allText
    # has already had all whitespace collapsed to single spaces.
    # NOTE(review): `type` is matched case-insensitively against a fixed list of
    # labels anywhere in the card text; `year` is the first standalone 19xx/20xx.
    script = r"""
    // Find all links pointing to research detail pages
    const links = Array.from(
      document.querySelectorAll('a[href*="/academics-research/research/"]')
    ).filter(a => /\/academics-research\/research\/detail\//.test(a.href));

    // Deduplicate
    const seen = new Set();
    const uniq = [];
    for (const a of links) {
      if (!seen.has(a.href)) { seen.add(a.href); uniq.push(a); }
    }

    // Extract fields from card-like containers
    const rows = uniq.map(a => {
      const card = a.closest('.publication-card, article, li, .card, .academic-research-listing__item') || a.parentElement;

      const title = (a.innerText || '').trim();

      const allText = (card ? card.innerText : a.innerText || '').replace(/\s+/g,' ').trim();
      const mYear = allText.match(/\b(19|20)\d{2}\b/);
      const year = mYear ? mYear[0] : '';

      const journal = card ? ((card.querySelector('em, i') || {}).innerText || '').trim() : '';

      let authors = '';
      if (card) {
        const nameLinks = Array.from(card.querySelectorAll('a[href*="/faculty/"], a[href*="/academics/faculty/"]'))
                               .map(n => (n.innerText || '').trim())
                               .filter(Boolean);
        if (nameLinks.length) {
          authors = nameLinks.join(', ');
        } else {
          const parts = allText.split(/‚Äî+|‚Äì+|‚Äî|\n/).map(s=>s.trim()).filter(Boolean);
          authors = parts.length ? parts[parts.length-1] : '';
        }
      }

      let type = '';
      const mType = allText.match(/\b(Journal Article|Book Chapter|Book|Case|Working Paper)\b/i);
      if (mType) type = mType[0];

      return {
        title, type, year, journal, authors,
        detail_url: a.href
      };
    });

    return rows;
    """
    return driver.execute_script(script)

def run_manual(output_xlsx="kellogg_journal_articles.xlsx", headless=False):
    """Open the listing page, let the user expand it by hand, then export the cards.

    The browser is opened on ``BASE_LIST_URL``. The user scrolls and clicks
    'MORE RESEARCH' manually, presses Enter in the console, and the cards
    currently in the DOM are parsed and written to an Excel workbook.

    Args:
        output_xlsx: Path of the workbook to write.
        headless: Forwarded to ``make_driver``. The manual workflow needs a
            visible window, so this normally stays ``False``.
    """
    driver = make_driver(headless=headless)
    try:
        # Use the shared config constant so that editing BASE_LIST_URL actually
        # changes which page is scraped (it holds the same URL that was
        # previously repeated here as a literal).
        driver.get(BASE_LIST_URL)
        print("üëâ Listing page opened: scroll manually and repeatedly click 'MORE RESEARCH' until no new cards appear; also close any cookie/consent banners.")
        input("‚úÖ Ready? Press Enter to parse the current page... ")

        # Treat a missing script result as "no rows" instead of failing on len(None).
        rows = collect_all_cards_via_js(driver) or []
        print(f"[debug] collected {len(rows)} rows from DOM")

        df = pd.DataFrame(rows, columns=["title", "type", "year", "journal", "authors", "detail_url"])
        df = df.drop_duplicates(subset=["detail_url"])
        df.to_excel(output_xlsx, index=False)
        print(f"Saved {len(df)} rows -> {output_xlsx}")
    finally:
        # Always shut the browser down, even if parsing or export fails.
        driver.quit()


# Entry point: run the manual-assist listing scrape with a visible (non-headless) browser.
if __name__ == "__main__":
    run_manual("kellogg_journal_articles.xlsx", headless=False)


In [2]:
import pandas as pd

# Load the listing workbook (same file name the scraping cell writes) and preview the first 10 rows.
df = pd.read_excel("kellogg_journal_articles.xlsx")
df.head(10)


Unnamed: 0,title,type,year,journal,authors,detail_url
0,Do Auditors Understand the Implications of ESG...,,,,Aaron Yoon,https://www.kellogg.northwestern.edu/academics...
1,Liquidity Crises and the Market-Maker of Last ...,,,,Robert L. McDonald,https://www.kellogg.northwestern.edu/academics...
2,The role of pilot studies in financial regulation,,,,Robert L. McDonald,https://www.kellogg.northwestern.edu/academics...
3,Slippery slope thinking links religiosity to p...,,,,Maryam Kouchaki,https://www.kellogg.northwestern.edu/academics...
4,Online Causal Inference for Advertising in Rea...,,,,Caio Waisman,https://www.kellogg.northwestern.edu/academics...
5,Physical Fit: The Role of Sports in Elite Hiri...,,,,Lauren Rivera,https://www.kellogg.northwestern.edu/academics...
6,Parallel Experimentation and Competitive Inter...,,,,Caio Waisman,https://www.kellogg.northwestern.edu/academics...
7,Essentializing Merit: Disability and Exclusion...,,,,Lauren Rivera,https://www.kellogg.northwestern.edu/academics...
8,From Stigma to Support: ‘Black-Owned’ Labels a...,,,,Chethana Achar,https://www.kellogg.northwestern.edu/academics...
9,When the Principal is the Firm's Problem: Prin...,,,,Edward J. Zajac,https://www.kellogg.northwestern.edu/academics...


In [None]:
# STEP 2 — Detail Page Scraper
# This script uses the output generated in the previous cell
# (kellogg_journal_articles.xlsx) and visits each detail_url
# to scrape additional information from the official website,
# including abstract, citation year, authors, and full title.

import re
import time
from typing import List, Dict, Optional, Tuple

import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter, Retry
from tqdm.auto import tqdm

INPUT_XLSX = "kellogg_journal_articles.xlsx"  # Workbook read by main(); same file name the listing step writes
URL_COLUMN = "detail_url"   # Change here if your column name is different
N = None                      # Intended cap: scrape only the first N rows (None presumably means no cap — confirm)

# ---------------- HTTP session (with retry) ----------------
def make_session() -> requests.Session:
    """Create a ``requests.Session`` with retrying adapters and browser-like headers.

    GET requests are retried (up to 5 in total, 3 each for connect and read
    errors, backoff factor 0.8) on HTTP 429/500/502/503/504. The same adapter,
    with a pool size of 50, is mounted for both http:// and https://.
    """
    retry_policy = Retry(
        total=5,
        connect=3,
        read=3,
        backoff_factor=0.8,
        status_forcelist=(429, 500, 502, 503, 504),
        allowed_methods=frozenset(["GET"]),
    )
    retrying_adapter = HTTPAdapter(
        max_retries=retry_policy,
        pool_connections=50,
        pool_maxsize=50,
    )
    browser_headers = {
        "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/123.0 Safari/537.36"),
        "Accept-Language": "en,zh-CN;q=0.9,zh;q=0.8",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    }

    session = requests.Session()
    for scheme in ("http://", "https://"):
        session.mount(scheme, retrying_adapter)
    session.headers.update(browser_headers)
    return session

# ---------------- Text utilities ----------------
# Matches a standalone four-digit year from 1900 to 2099. The capture group holds
# only the century prefix, so callers should read the full match (group 0).
YEAR_RE = re.compile(r"\b(19|20)\d{2}\b")

def normspace(s: str) -> str:
    """Collapse each whitespace run in ``s`` to one space and trim the ends.

    Falsy input (``None`` or an empty string) yields an empty string.
    """
    text = s if s else ""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()

def text_or_empty(node) -> str:
    """Return the whitespace-normalized text of ``node``, or "" when ``node`` is falsy.

    The text is read with ``node.get_text(" ", strip=True)``, so text
    fragments are joined with single spaces before normalization.
    """
    if not node:
        return ""
    raw_text = node.get_text(" ", strip=True)
    return normspace(raw_text)

# ---------------- Parsing functions ----------------
def extract_title(soup: BeautifulSoup) -> str:
    """Return the page title, trying progressively weaker sources.

    Order: the first ``<h1>``; then ``.page-header__content``; then
    ``.publication-detail__title``; finally the ``og:title`` meta tag.
    Returns an empty string when none of them is present.
    """
    heading = soup.find("h1")
    if heading:
        return text_or_empty(heading)

    for css in (".page-header__content", ".publication-detail__title"):
        node = soup.select_one(css)
        if node:
            return text_or_empty(node)

    og = soup.find("meta", property="og:title")
    if og and og.get("content"):
        return normspace(og["content"])
    return ""

def extract_authors(soup: BeautifulSoup) -> List[str]:
    """Return the de-duplicated author names found on a detail page.

    Sources are tried in order until one yields names:
      1. ``.publication-detail__author`` nodes inside ``.publication-detail__left``;
      2. ``.publication-detail__author`` nodes anywhere in the document;
      3. a heuristic around any text node containing "Author": up to six
         following siblings of its parent, plus the direct ``<p>`` children
         of its grandparent.

    Texts that start with "author" (case-insensitive) are treated as labels
    and skipped. First-seen order is preserved.
    """

    def _names(nodes) -> List[str]:
        # Non-empty node texts that are not "Author..." labels.
        texts = (text_or_empty(n) for n in nodes)
        return [t for t in texts if t and not t.lower().startswith("author")]

    authors: List[str] = []
    left = soup.select_one(".publication-detail__left")
    if left:
        authors = _names(left.select(".publication-detail__author"))
    if not authors:
        authors = _names(soup.select(".publication-detail__author"))
    if not authors:
        # ``string=`` is the current name of the keyword Beautiful Soup used to
        # call ``text=``; the old spelling is deprecated.
        for label in soup.find_all(string=re.compile(r"Author", re.I)):
            parent = label.parent
            if not parent:
                continue
            candidates = list(parent.find_next_siblings(limit=6))
            if parent.parent:
                candidates.extend(parent.parent.find_all("p", recursive=False))
            authors.extend(_names(candidates))

    # Deduplicate, preserving order.
    seen, clean = set(), []
    for raw in authors:
        # The first literal looks like a mis-encoded bullet (assumption — confirm
        # the file encoding); it is kept for backward compatibility. The real
        # bullet U+2022 is stripped as well, written as an escape so that it is
        # independent of the source file's encoding.
        name = normspace(
            raw.replace("‚Ä¢", " ").replace("\u2022", " ").replace("|", " ")
        )
        if name and name not in seen:
            seen.add(name)
            clean.append(name)
    return clean

def extract_citation_year(soup: BeautifulSoup) -> Optional[str]:
    """Return the first four-digit year (19xx/20xx) found in the citation area.

    Text is taken from ``.publication-detail__citations`` when present,
    otherwise from the parent of the first text node mentioning "Citation".
    If that yields no text, the first 4000 characters of
    ``.publication-detail__right`` (or of ``<body>``) are searched instead.

    Returns:
        The year as a string, or ``None`` when no year is found.
    """
    node = soup.select_one(".publication-detail__citations")
    if not node:
        # ``string=`` is the current name of the keyword Beautiful Soup used to
        # call ``text=``; the old spelling is deprecated.
        label = soup.find(string=re.compile(r"Citation|Citations", re.I))
        node = label.parent if label else None
    text = text_or_empty(node) if node else ""
    if not text:
        # Last resort: scan a bounded slice of the right column or whole body.
        around = soup.select_one(".publication-detail__right") or soup.body
        text = text_or_empty(around)[:4000] if around else ""
    m = YEAR_RE.search(text)
    return m.group(0) if m else None

def extract_abstract(soup: BeautifulSoup) -> str:
    """Return the abstract text, or an empty string when the page has none.

    Prefers ``.publication-detail__abstract`` with surrounding quotes and
    whitespace trimmed. Otherwise falls back to the first ``<div>``/``<p>``
    whose text starts with "abstract" and removes that leading label.
    """
    node = soup.select_one(".publication-detail__abstract")
    if node:
        # Trim straight quotes, curly double quotes (U+201C/U+201D) and
        # whitespace. The curly quotes are written as escapes: the previous
        # literal was mis-encoded, which made strip() remove legitimate letters
        # such as "Ä", "ú" and "ù" while leaving real curly quotes in place.
        t = text_or_empty(node).strip("\u201c\u201d\"' \n\t")
        if t:
            return t
    cand = soup.find(lambda tag: tag.name in ("div", "p")
                     and tag.get_text(strip=True).lower().startswith("abstract"))
    if cand:
        t = text_or_empty(cand)
        return re.sub(r"^\s*abstract[:\s-]*", "", t, flags=re.I).strip()
    return ""  # Allow empty abstracts

def parse_detail(html: str) -> Tuple[str, List[str], Optional[str], str]:
    """Parse one detail page into ``(title, authors, citation_year, abstract)``.

    The HTML is parsed with the "lxml" parser and handed to the individual
    ``extract_*`` helpers, in that order.
    """
    soup = BeautifulSoup(html, "lxml")
    title = extract_title(soup)
    authors = extract_authors(soup)
    citation_year = extract_citation_year(soup)
    abstract = extract_abstract(soup)
    return title, authors, citation_year, abstract

# ---------------- Main process (scrape first N rows) ----------------
def main():
    """Scrape the detail pages listed in ``INPUT_XLSX`` and save the results.

    Reads the URL column, fetches each unique URL (limited to the first ``N``
    when ``N`` is set), parses title/authors/year/abstract, and writes one row
    per URL to ``scraped_kellogg.csv`` and ``scraped_kellogg.xlsx``. A failed
    fetch or parse is recorded in the row's ``status``/``error`` columns
    instead of stopping the run.

    Raises:
        SystemExit: If the URL column is missing or contains no usable URLs.
    """
    df = pd.read_excel(INPUT_XLSX, engine="openpyxl")
    if URL_COLUMN not in df.columns:
        raise SystemExit(f"Column '{URL_COLUMN}' not found, please check the Excel file.")

    # Drop missing cells BEFORE casting to str: casting first turns NaN into the
    # literal string "nan", which dropna() cannot remove and which would then
    # be fetched as if it were a URL.
    url_series = df[URL_COLUMN].dropna().astype(str).str.strip()
    urls = url_series[url_series != ""].drop_duplicates().tolist()
    if N is not None:
        # Honor the module-level row cap, which was previously never applied.
        urls = urls[:N]
    if not urls:
        raise SystemExit("No valid URLs detected.")

    records: List[Dict] = []

    # The context manager closes the session's pooled connections on exit.
    with make_session() as session:
        for url in tqdm(urls, desc=f"Scraping first {len(urls)} URLs", ncols=88):
            row = {"url": url, "title": "", "authors": "", "year": "", "abstract": "", "status": "error", "error": ""}
            try:
                resp = session.get(url, timeout=15)
                if resp.status_code == 403:
                    # Adaptive fallback: retry once without Accept-Language.
                    # The header stays removed for the rest of the session.
                    session.headers.pop("Accept-Language", None)
                    time.sleep(1.0)
                    resp = session.get(url, timeout=15)
                resp.raise_for_status()
                title, authors, year, abstract = parse_detail(resp.text)
                row.update({
                    "title": title,
                    "authors": "; ".join(authors) if authors else "",
                    "year": year or "",
                    "abstract": abstract,
                    "status": "ok",
                })
            except Exception as e:
                # Per-URL boundary: record the failure and move on.
                row["error"] = str(e)
            records.append(row)
            time.sleep(0.4)  # Courtesy delay to avoid hitting server too fast

    out = pd.DataFrame(records, columns=["url", "title", "authors", "year", "abstract", "status", "error"])
    out.to_csv("scraped_kellogg.csv", index=False, encoding="utf-8-sig")
    with pd.ExcelWriter("scraped_kellogg.xlsx", engine="openpyxl") as w:
        out.to_excel(w, index=False, sheet_name="scraped")


# Entry point: run the detail-page scrape when this cell/script is executed directly.
if __name__ == "__main__":
    main()


In [3]:
import pandas as pd

# Load the detail-scrape workbook (same file name main() writes) and preview the first 10 rows.
df = pd.read_excel("scraped_kellogg.xlsx")
df.head(10)

Unnamed: 0,url,title,authors,year,abstract,status,error
0,https://www.kellogg.northwestern.edu/academics...,Do Auditors Understand the Implications of ESG...,Aaron Yoon; Daniel Aobdia,2026.0,,ok,
1,https://www.kellogg.northwestern.edu/academics...,Liquidity Crises and the Market-Maker of Last ...,Charles Kahn; David Marshall; Robert L. McDonald,2026.0,We study market illiquidity in an economy subj...,ok,
2,https://www.kellogg.northwestern.edu/academics...,The role of pilot studies in financial regulation,Lawrence Harris; Charles Kahn; Robert L. McDon...,2026.0,Financial regulators considering the desirabil...,ok,
3,https://www.kellogg.northwestern.edu/academics...,Slippery slope thinking links religiosity to p...,Rajen Anderson; Benjamin Ruisch; Maryam Kouchaki,2026.0,,ok,
4,https://www.kellogg.northwestern.edu/academics...,Online Causal Inference for Advertising in Rea...,Caio Waisman; Harikesh Nair; Carlos Carrion,2025.0,"Real-time bidding systems, which utilize aucti...",ok,
5,https://www.kellogg.northwestern.edu/academics...,Physical Fit: The Role of Sports in Elite Hiri...,Lisa Sølvberg; Lauren Rivera,2025.0,Sports participation serves as an important ma...,ok,
6,https://www.kellogg.northwestern.edu/academics...,Parallel Experimentation and Competitive Inter...,Caio Waisman; Navdeep Sahni; Harikesh Nair; Xi...,2025.0,This paper studies the measurement of advertis...,ok,
7,https://www.kellogg.northwestern.edu/academics...,Essentializing Merit: Disability and Exclusion...,Estela Diaz; Lauren Rivera,2025.0,"Historically, elite schools have selected stud...",ok,
8,https://www.kellogg.northwestern.edu/academics...,From Stigma to Support: ‘Black-Owned’ Labels a...,Chethana Achar; Nidhi Agrawal; Keyaira Adweumnni,2025.0,We examine the effect of ‘Black-owned’ labelin...,ok,
9,https://www.kellogg.northwestern.edu/academics...,When the Principal is the Firm's Problem: Prin...,Edward J. Zajac; Maria Goranova,2025.0,,ok,
