In [6]:
import asyncio
import re
import os

import pandas as pd
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright, TimeoutError
from IPython.display import display
from tqdm.notebook import tqdm  # progress bar for Jupyter

# Directory landing page; this is the URL get_directory_df navigates to.
DIR_URL = "https://giesbusiness.illinois.edu/about/faculty-staff-directory"
# Sizes the asyncio.Semaphore that caps simultaneous profile-page fetches.
CONCURRENCY = 10  # number of profiles fetched in parallel


def norm(s: str) -> str:
    """Squeeze every run of whitespace in *s* to one space and trim both ends.

    Falsy input (``None`` or an empty string) produces ``""``.
    """
    text = s if s else ""
    squeezed = re.sub(r"\s+", " ", text)
    return squeezed.strip()


def first_year(s: str):
    """Return the first standalone 4-digit year (19xx or 20xx) found in *s*.

    The year is returned as a string; ``""`` is returned when *s* is falsy
    or contains no such year.
    """
    found = re.search(r"\b(19|20)\d{2}\b", s or "")
    if found is None:
        return ""
    return found.group(0)


# ========== Step 1: scrape faculty directory ==========

def _parse_directory_rows(html):
    """Extract one ``{"Name", "ProfileURL"}`` dict per profile link in *html*."""
    from urllib.parse import urljoin

    soup = BeautifulSoup(html, "html.parser")
    rows = []
    for a in soup.select('a[href*="/profile/"]'):
        href = a.get("href", "")
        name = norm(a.get_text())
        if not href or not name:
            continue
        # urljoin resolves site-relative ("/profile/x"), protocol-relative
        # ("//host/profile/x") and already-absolute links against the
        # directory page, instead of blindly prefixing the host name.
        rows.append({"Name": name, "ProfileURL": urljoin(DIR_URL, href)})
    return rows


async def get_directory_df(playwright):
    """
    Visit the directory page and collect all faculty names + profile URLs.

    Args:
        playwright: a started async Playwright object (as yielded by
            ``async_playwright()``); a headless Chromium is launched from it
            and closed again before returning, even if loading fails.

    Returns:
        DataFrame with columns: Name, ProfileURL, de-duplicated on
        ProfileURL. The frame is empty (but still has both columns) when the
        page contains no profile links.
    """
    browser = await playwright.chromium.launch(headless=True)
    try:
        ctx = await browser.new_context()
        page = await ctx.new_page()

        await page.goto(DIR_URL, wait_until="networkidle", timeout=60_000)

        # Best effort: try to show as many entries as possible on one page.
        # These controls may be absent, so a failure is ignored -- but only
        # ordinary exceptions; cancellation / KeyboardInterrupt still
        # propagate, unlike with a bare ``except:``.
        for selector, value in (("#pagination-top", "999"), ("#display-type", "list")):
            try:
                await page.select_option(selector, value)
            except Exception:
                pass

        # Give the page a moment to re-render after the option changes.
        await page.wait_for_timeout(1500)
        html = await page.content()
    finally:
        # Release the browser process even when navigation raised.
        await browser.close()

    # Passing ``columns`` keeps drop_duplicates("ProfileURL") from raising
    # KeyError when no links were found.
    df = pd.DataFrame(_parse_directory_rows(html), columns=["Name", "ProfileURL"])
    df = df.drop_duplicates("ProfileURL").reset_index(drop=True)
    print(f"[directory] Parsed {len(df)} faculty profiles.")
    return df


# ========== Step 2: parse one profile page ==========
def parse_profile_publications(html: str):
    """
    Pull the publication lists out of a single profile page.

    Returns a ``(recent, other)`` pair:
      - recent: list of entry strings from the "Recent Publications" list
      - other:  list of (Subsection, Entry) tuples from the "allpub" block

    Both lists are empty when the page has no <details> element whose <h3>
    mentions "Recent Publications".

    Expected DOM layout:
      <details>
        <summary><h3>Recent Publications</h3></summary>
        <ul>  <!-- recent list -->
          <li>...</li>
        </ul>
        <details class="allpub">
          <summary>...</summary>
          <h4>Articles</h4>
          <ul><li>...</li></ul>
          <h4>Working Papers</h4>
          <ul><li>...</li></ul>
          ...
        </details>
      </details>
    """
    document = BeautifulSoup(html, "html.parser")

    def has_recent_heading(block):
        # True when the block contains an <h3> mentioning "recent publications".
        heading = block.find("h3")
        if heading is None:
            return False
        return "recent publications" in heading.get_text(strip=True).lower()

    def item_texts(ul):
        # Normalized text of each <li> that is a direct child of the given <ul>.
        return [
            norm(li.get_text(" ", strip=True))
            for li in ul.find_all("li", recursive=False)
        ]

    def tag_name(node):
        # Text nodes have no usable tag name; report None for them.
        return getattr(node, "name", None)

    # The first <details> (in document order) with the right heading wins.
    container = next(
        (block for block in document.find_all("details") if has_recent_heading(block)),
        None,
    )
    if container is None:
        return [], []

    # Recent entries: the first <ul> that is a direct child of the container.
    first_list = next(
        (node for node in container.children if tag_name(node) == "ul"),
        None,
    )
    recent = item_texts(first_list) if first_list is not None else []

    # Remaining entries: walk the direct children of <details class="allpub">
    # in order; each <h4> names the subsection for the <ul> blocks after it.
    other = []
    archive = container.find("details", class_="allpub")
    if archive is not None:
        heading_text = None
        for node in archive.children:
            kind = tag_name(node)
            if kind == "h4":
                heading_text = node.get_text(strip=True)
            elif kind == "ul" and heading_text:
                # Lists seen before any (non-empty) heading are skipped.
                other.extend((heading_text, text) for text in item_texts(node))

    return recent, other



async def fetch_one_profile(sem, ctx, name, url):
    """
    Fetch one profile page under a semaphore (for concurrency control),
    then return a list of dict rows for all publications of this person.

    Args:
        sem: semaphore limiting how many pages are open at once.
        ctx: browser context used to open a new page for this profile.
        name: person's name, copied into every row.
        url: profile URL to load, copied into every row.

    Returns:
        A list of dicts with keys Name, ProfileURL, Section, Subsection,
        Entry, Year. Section is "Recent" or "Other" for publications. If
        loading or parsing fails, a single row with Section "ERROR" is
        returned instead, with the failure described in Entry.
    """

    def make_row(section, subsection, entry, year):
        # Single place that defines the row schema shared by all branches.
        return {
            "Name": name,
            "ProfileURL": url,
            "Section": section,
            "Subsection": subsection,
            "Entry": entry,
            "Year": year,
        }

    async with sem:
        page = await ctx.new_page()
        try:
            await page.goto(url, wait_until="networkidle", timeout=60_000)
            html = await page.content()
            recent, other = parse_profile_publications(html)

            rows = [
                make_row("Recent", "", entry, first_year(entry))
                for entry in recent
            ]
            rows.extend(
                make_row("Other", sub, entry, first_year(entry))
                for sub, entry in other
            )
            return rows

        except TimeoutError:
            return [make_row("ERROR", "", "TimeoutError", "")]
        except Exception as e:
            return [make_row("ERROR", "", str(e), "")]
        finally:
            # One cleanup point for every exit path, including task
            # cancellation, so the page is never left open.
            await page.close()


# ========== Step 3: concurrently scrape all profiles ==========

async def scrape_all_publications(df_dir, playwright):
    """
    Given a directory DataFrame (Name, ProfileURL),
    visit all profile pages concurrently and collect all publications.

    All fetches are scheduled at once and the semaphore keeps at most
    CONCURRENCY of them active, so one slow profile no longer holds up a
    whole batch. Rows are still returned in directory order.

    Args:
        df_dir: DataFrame with columns Name and ProfileURL.
        playwright: a started async Playwright object; a headless Chromium
            is launched from it and closed before returning, even on error.

    Returns:
        DataFrame with columns:
        Name, ProfileURL, Section, Subsection, Entry, Year
    """
    total = len(df_dir)
    # Only touch the columns when there are rows, so an empty frame without
    # the expected columns still yields an empty result instead of KeyError.
    targets = list(zip(df_dir["Name"], df_dir["ProfileURL"])) if total else []

    browser = await playwright.chromium.launch(headless=True)
    try:
        ctx = await browser.new_context()
        sem = asyncio.Semaphore(CONCURRENCY)

        print(f"[scrape] Total profiles to scrape: {total}")

        with tqdm(total=total, desc="Scraping profiles") as pbar:

            async def fetch_and_tick(name, url):
                # Advance the bar as each profile finishes, not per batch.
                rows = await fetch_one_profile(sem, ctx, name, url)
                pbar.update(1)
                return rows

            # gather() returns results in submission order regardless of
            # completion order, which keeps the output deterministic.
            per_profile = await asyncio.gather(
                *(fetch_and_tick(name, url) for name, url in targets)
            )
    finally:
        # Shut the browser down even if a fetch raised unexpectedly.
        await browser.close()

    # Flatten the per-profile row lists into one list of rows.
    all_results = [row for rows in per_profile for row in rows]

    pubs_df = pd.DataFrame(
        all_results,
        columns=["Name", "ProfileURL", "Section", "Subsection", "Entry", "Year"],
    )
    print(f"[scrape] Collected {len(pubs_df)} publication rows.")
    return pubs_df


# ========== Step 4: main orchestration ==========

async def main():
    """
    Run the full pipeline: scrape the directory, scrape every profile,
    and display short summaries in the notebook.

    Returns:
        The publications DataFrame (columns Name, ProfileURL, Section,
        Subsection, Entry, Year), including any "ERROR" placeholder rows, so
        it can be filtered or grouped further, e.g.
        ``pubs_df = await main()``.
    """
    async with async_playwright() as p:
        # 1) scrape the directory
        df_dir = await get_directory_df(p)
        print("\nSample of directory (first 5 rows):")
        display(df_dir.head())

        # 2) scrape all publications
        pubs_df = await scrape_all_publications(df_dir, p)

    print("\nSample of publications (first 20 rows):")
    display(pubs_df.head(20))

    # Profiles that failed to load are recorded as one Section == "ERROR"
    # row each; they are not publications and must not be counted as such.
    is_error = pubs_df["Section"] == "ERROR"
    failed = int(is_error.sum())
    if failed:
        print(f"\n[scrape] {failed} profile(s) failed and are excluded from the counts below.")

    # Publications per person, largest first.
    print("\nTop 10 by number of publications:")
    display(
        pubs_df[~is_error]
        .groupby("Name")["Entry"]
        .count()
        .sort_values(ascending=False)
        .head(10)
        .to_frame("PublicationCount")
    )

    return pubs_df




In [7]:
# Run the whole pipeline. NOTE(review): top-level `await` assumes this cell
# runs in a Jupyter/IPython session with an active event loop -- confirm
# before reusing it in a plain script.
await main()


[directory] Parsed 891 faculty profiles.

Sample of directory (first 5 rows):


Unnamed: 0,Name,ProfileURL
0,"Abdel-Khalik, A. Rashad",https://giesbusiness.illinois.edu/profile/a-ra...
1,"Abed, Emad",https://giesbusiness.illinois.edu/profile/emad...
2,"Abel, Richard",https://giesbusiness.illinois.edu/profile/rich...
3,"Abolt, Kaci",https://giesbusiness.illinois.edu/profile/kaci...
4,"Adair, Jennifer",https://giesbusiness.illinois.edu/profile/jenn...


[scrape] Total profiles to scrape: 891


Scraping profiles:   0%|          | 0/891 [00:00<?, ?it/s]

[scrape] Collected 8263 publication rows.

Sample of publications (first 20 rows):


Unnamed: 0,Name,ProfileURL,Section,Subsection,Entry,Year
0,"Abdel-Khalik, A. Rashad",https://giesbusiness.illinois.edu/profile/a-ra...,Recent,,"Abdel-Khalik, A. (2019). Failing Faithful Repr...",2019
1,"Abdel-Khalik, A. Rashad",https://giesbusiness.illinois.edu/profile/a-ra...,Recent,,"Abdel-Khalik, A. (2017). How Enron Used Accoun...",2017
2,"Abdel-Khalik, A. Rashad",https://giesbusiness.illinois.edu/profile/a-ra...,Recent,,"Abdel-Khalik, A., & Chen, P. (2015). Growth in...",2015
3,"Abdel-Khalik, A. Rashad",https://giesbusiness.illinois.edu/profile/a-ra...,Other,Articles,"Abdel-Khalik, A. (2014). Prospect Theory Predi...",2014
4,"Abdel-Khalik, A. Rashad",https://giesbusiness.illinois.edu/profile/a-ra...,Other,Articles,"Abdel-Khalik, A. (2014). CEO Risk Preference a...",2014
5,"Abdel-Khalik, A. Rashad",https://giesbusiness.illinois.edu/profile/a-ra...,Other,Articles,"Abdel-Khalik, A. (2009). Fair Value Accounting...",2009
6,"Abdel-Khalik, A. Rashad",https://giesbusiness.illinois.edu/profile/a-ra...,Other,Articles,"Abdel-Khalik, A. (2007). An Empirical Investig...",2007
7,"Abdel-Khalik, A. Rashad",https://giesbusiness.illinois.edu/profile/a-ra...,Other,Articles,"Abdel-Khalik, A. (2003). Self Sorting, Incenti...",2003
8,"Abdel-Khalik, A. Rashad",https://giesbusiness.illinois.edu/profile/a-ra...,Other,Articles,"Abdel-Khalik, A. (2002). Reforming Corporate G...",2002
9,"Abdel-Khalik, A. Rashad",https://giesbusiness.illinois.edu/profile/a-ra...,Other,Articles,"Abdel-Khalik, A., Wong, K., & Wu, A. (1999). C...",1999



Top 10 by number of publications:


Unnamed: 0_level_0,PublicationCount
Name,Unnamed: 1_level_1
"Kindt, John",337
"Shavitt, Sharon",248
"Shaw, Michael J.",233
"Monroe, Kent",230
"Otnes, Cele",205
"Kahn, Charles",186
"Shah, Sonali",180
"Brunner, Robert",177
"Pearson, Neil",140
"Peecher, Mark",136
