In [14]:
import re
from urllib.parse import urljoin

import pandas as pd
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
from tqdm.notebook import tqdm

URL = "https://giesbusiness.illinois.edu/about/faculty-staff-directory"


# -----------------------------
# Small utilities
# -----------------------------
def norm(s: str) -> str:
    """Normalize whitespace: collapse multiple spaces and strip."""
    return re.sub(r"\s+", " ", s or "").strip()


def extract_email_phone(td):
    """Extract unique email(s) and phone(s) from a <td> element."""
    emails = {a.get_text(strip=True) for a in td.select('a[href^="mailto:"]')}
    phones = {a.get_text(strip=True) for a in td.select('a[href^="tel:"]')}
    return list(emails), list(phones)


def split_name_title_basic(full_name: str, explicit_title: str = ""):
    """
    First-stage split of name & title using common patterns.

    This does NOT handle all weird cases. Further clean-up will be done
    by fix_prefix_in_title() and fix_trailing_in_name().
    """
    full_name = norm(full_name)

    # 1) If title is provided by a separate node, trust it.
    if explicit_title:
        return full_name, norm(explicit_title)

    # 2) Try em/en dash: 'John Doe — Clinical Professor'
    for sep in ["—", "–"]:
        if sep in full_name:
            left, right = map(norm, full_name.split(sep, 1))
            return left, right

    # 3) Try comma + job keywords: 'Brown, Senior Lecturer'
    title_keywords = (
        "professor", "lecturer", "instructor", "faculty", "researcher",
        "scientist", "chair", "dean", "director", "associate", "assistant",
        "senior", "advisor", "adviser", "coordinator", "manager",
        "specialist", "officer", "administrator", "fellow", "executive",
        "engineer", "analyst",
    )

    if "," in full_name:
        left, right = map(norm, full_name.split(",", 1))
        if any(k in right.lower() for k in title_keywords):
            return left, right

    return full_name, ""


# -----------------------------
# Clean-up helpers for Name/Title
# -----------------------------

def fix_prefix_in_title(df: pd.DataFrame) -> pd.DataFrame:
    """
    Fix cases where a given name fragment appears at the beginning of Title.

    Examples:
      1) Abdel-Khalik row
         Name  = 'Abdel-Khalik, A.'
         Title = 'Rashad Professor of Accountancy and V.K. Zimmer...'
         -> Name  = 'Abdel-Khalik, A. Rashad'
            Title = 'Professor of Accountancy and V.K. Zimmer...'

      2) Zulauf row
         Name  = 'Zulauf'
         Title = 'Ning Associate Director Employer Engagement'
         -> Name  = 'Zulauf, Ning'
            Title = 'Associate Director Employer Engagement'
    """

    # Job-title keywords. First occurrence marks the start of the real title.
    kw_re = re.compile(
        r"\b(professor|lecturer|instructor|director|manager|coordinator|"
        r"advisor|adviser|dean|chair|specialist|officer|administrator|"
        r"fellow|executive|scientist|researcher|engineer|analyst)\b",
        re.I,
    )

    # Words that are typical title modifiers, not part of a first name
    NON_NAME_WORDS = {
        "senior", "assistant", "associate", "adjunct",
        "visiting", "clinical", "interim", "emeritus",
        "distinguished", "co"
    }

    cleaned = []

    for _, row in df.iterrows():
        name = str(row.get("Name", "")).strip()
        title = str(row.get("Title", "")).strip()

        if not title:
            cleaned.append(row)
            continue

        m = kw_re.search(title)
        if not m:
            cleaned.append(row)
            continue

        prefix = title[:m.start()].strip()  # before the first job keyword
        rest = title[m.start():].strip()    # from keyword to the end

        if not prefix:
            cleaned.append(row)
            continue

        tokens = prefix.split()

        # Case 1: Name already like 'Last, A.'  -> entire prefix is extra given name(s)
        if "," in name:
            if prefix.lower() not in name.lower():
                name = f"{name} {prefix}".strip()
                title = rest or title

        # Case 2: Name is just a surname (no comma)
        else:
            if (
                len(tokens) >= 2
                and tokens[1].lower() in NON_NAME_WORDS  # 'Ning Associate Director...'
            ):
                # treat first token as first name, second+ as part of title
                first_name = tokens[0]
                title_prefix = " ".join(tokens[1:])  # 'Associate'
                if first_name.lower() not in name.lower():
                    name = f"{name}, {first_name}".strip()
                title = f"{title_prefix} {rest}".strip()
            else:
                # treat the whole prefix as extra name(s), e.g. 'A. Rashad'
                if prefix.lower() not in name.lower():
                    name = f"{name}, {prefix}".strip()
                title = rest or title

        row = dict(row)
        row["Name"] = name
        row["Title"] = title
        cleaned.append(row)

    return pd.DataFrame(cleaned, columns=df.columns)


def fix_trailing_in_name(df: pd.DataFrame) -> pd.DataFrame:
    """
    Fix cases where job-fragments leak to the end of Name, e.g.:

        Name  = 'Agnoletti, Brian Alumni Engagement'
        Title = 'Coordinator'

    We interpret 'Alumni Engagement' as title fragment:
        -> Name  = 'Agnoletti, Brian'
           Title = 'Alumni Engagement Coordinator'
    """
    cleaned = []

    for _, row in df.iterrows():
        name = str(row.get("Name", "")).strip()
        title = str(row.get("Title", "")).strip()

        if "," in name:
            last, rest = name.split(",", 1)
            last, rest = last.strip(), rest.strip()
            tokens = rest.split()

            if len(tokens) > 1:
                first_name = tokens[0]
                trailing = tokens[1:]
                if trailing:
                    trailing_title = " ".join(trailing)
                    name = f"{last}, {first_name}"
                    title = (trailing_title + " " + title).strip()

        row = dict(row)
        row["Name"] = name
        row["Title"] = title
        cleaned.append(row)

    return pd.DataFrame(cleaned, columns=df.columns)


# -----------------------------
# Parse TABLE layout
# -----------------------------
def parse_table(html: str) -> pd.DataFrame:
    soup = BeautifulSoup(html, "html.parser")
    needed = ["name", "department", "office", "contact"]

    for table in soup.find_all("table"):
        headers = [norm(th.get_text()) for th in table.find_all("th")]
        lower = [h.lower() for h in headers]

        if not all(any(n in h for h in lower) for n in needed):
            continue

        col = {}
        for key in ["name", "title", "department", "office", "contact"]:
            for i, h in enumerate(lower):
                if key in h:
                    col[key] = i
                    break

        rows = []
        trs = table.find_all("tr")

        for tr in tqdm(trs, desc="Parsing table rows"):
            tds = tr.find_all("td")
            if not tds:
                continue

            # Profile URL from name cell
            profile_url = ""
            name_idx = col.get("name")
            if name_idx is not None and name_idx < len(tds):
                name_td = tds[name_idx]
                a = name_td.find("a", href=True)
                if a and not a["href"].startswith(("mailto:", "tel:")):
                    profile_url = urljoin(URL, a["href"])

            def cell(i):
                if i is None or i >= len(tds):
                    return ""
                return norm(tds[i].get_text(" ", strip=True))

            def contact_cell(i):
                if i is None or i >= len(tds):
                    return [], []
                return extract_email_phone(tds[i])

            full_name = cell(col.get("name"))
            explicit_title = cell(col.get("title"))
            name, title = split_name_title_basic(full_name, explicit_title)

            emails, phones = contact_cell(col.get("contact"))

            rows.append({
                "Name": name,
                "Title": title,
                "Department": cell(col.get("department")),
                "Office": cell(col.get("office")),
                "Email": "; ".join(emails),
                "Phone": "; ".join(phones),
                "URL": profile_url,
            })

        df = pd.DataFrame(
            rows,
            columns=["Name", "Title", "Department", "Office", "Email", "Phone", "URL"],
        )
        df = df[df["Name"] != ""].reset_index(drop=True)
        return df

    # No valid table found
    return pd.DataFrame(columns=["Name", "Title", "Department", "Office", "Email", "Phone", "URL"])


# -----------------------------
# Parse CARD layout (fallback)
# -----------------------------
def parse_cards(html: str) -> pd.DataFrame:
    soup = BeautifulSoup(html, "html.parser")
    container = soup.select_one(".faculty-results")

    if not container:
        return pd.DataFrame(columns=["Name", "Title", "Department", "Office", "Email", "Phone", "URL"])

    rows = []
    cards = container.select("div, li")

    for card in tqdm(cards, desc="Parsing card rows"):
        name_el = card.select_one("h3, .name, .person-name, a.person-name, .card-title")
        if not name_el:
            continue

        full_name = norm(name_el.get_text())
        profile_url = ""

        if name_el.name == "a" and name_el.has_attr("href"):
            profile_url = urljoin(URL, name_el["href"])
        else:
            a = name_el.find("a", href=True)
            if a and not a["href"].startswith(("mailto:", "tel:")):
                profile_url = urljoin(URL, a["href"])

        title_el = card.select_one(".title, .position, .job-title, .card-subtitle, .field-title")
        explicit_title = norm(title_el.get_text()) if title_el else ""
        name, title = split_name_title_basic(full_name, explicit_title)

        dept_el = card.select_one(".department, .dept, .field-department")
        office_el = card.select_one(".office, .field-office")

        emails = {a.get_text(strip=True) for a in card.select('a[href^="mailto:"]')}
        phones = {a.get_text(strip=True) for a in card.select('a[href^="tel:"]')}

        rows.append({
            "Name": name,
            "Title": title,
            "Department": norm(dept_el.get_text()) if dept_el else "",
            "Office": norm(office_el.get_text()) if office_el else "",
            "Email": "; ".join(emails),
            "Phone": "; ".join(phones),
            "URL": profile_url,
        })

    df = pd.DataFrame(
        rows,
        columns=["Name", "Title", "Department", "Office", "Email", "Phone", "URL"],
    )
    df = df[df["Name"] != ""].reset_index(drop=True)
    return df


# -----------------------------
# Main async scraper
# -----------------------------
async def scrape():
    """Scrape the directory and return a cleaned DataFrame."""
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        ctx = await browser.new_context()
        page = await ctx.new_page()

        await page.goto(URL, wait_until="networkidle")

        # Try to show as many results as possible in one page
        try:
            await page.select_option("#pagination-top", "999")
        except:
            pass
        try:
            await page.select_option("#display-type", "list")
        except:
            pass

        await page.wait_for_timeout(1500)
        html = await page.content()
        await browser.close()

    df = parse_table(html)
    if df.empty:
        df = parse_cards(html)

    # Clean up name/title anomalies
    df = fix_prefix_in_title(df)
    df = fix_trailing_in_name(df)

    print(f"\nTotal profiles found: {len(df)}\n")
    return df


In [15]:
df = await scrape()
df

Parsing table rows:   0%|          | 0/892 [00:00<?, ?it/s]


Total profiles found: 891



Unnamed: 0,Name,Title,Department,Office,Email,Phone,URL
0,"Abdel-Khalik, A.",Rashad Professor of Accountancy and V.K. Zimme...,Accountancy,2037 D Business Instructional Facility,rashad@illinois.edu,217-265-0539,https://giesbusiness.illinois.edu/profile/a-ra...
1,"Abed, Emad",,Business Administration,,eabed2@illinois.edu,,https://giesbusiness.illinois.edu/profile/emad...
2,Abel,Richard Senior Human Resource Generalist,Office of the Dean,320 Wohlers Hall,abel@illinois.edu,217-244-1109,https://giesbusiness.illinois.edu/profile/rich...
3,"Abolt, Kaci","Sr Associate Director, Undergraduate Admission...",Undergraduate Affairs,1055 A Business Instructional Facility,klabolt@illinois.edu,217-300-3735,https://giesbusiness.illinois.edu/profile/kaci...
4,"Adair, Jennifer",,Online Programs,,jjadair@illinois.edu,,https://giesbusiness.illinois.edu/profile/jenn...
...,...,...,...,...,...,...,...
886,"Zhou, Yucheng",PhD Student in Finance,Finance,107 J Irwin Hall,yz33@illinois.edu,,https://giesbusiness.illinois.edu/profile/yuch...
887,"Zhu, Wenting",,Online Programs,,wz49@illinois.edu,,https://giesbusiness.illinois.edu/profile/went...
888,"Zhuang, Xiao",,Online Programs,,xzhuang8@illinois.edu,,https://giesbusiness.illinois.edu/profile/xiao...
889,"Ziegler, Emily",Director of Residential Degrees,Graduate Programs,3019 M Business Instructional Facility,ekrickl@illinois.edu,217-300-5603,https://giesbusiness.illinois.edu/profile/emil...
