<a href="https://colab.research.google.com/github/Lsyyy623/Case-Competition-Pillar-1-Team-11/blob/main/code/UCLA/UCLA_faculty_profile.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import re
import time
import random
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Faculty directory page that main() downloads and parses.
URL = "https://www.anderson.ucla.edu/faculty-and-research/faculty-directory"
# Base URL against which main() resolves each "Read Bio" href with urljoin.
BASE = "https://www.anderson.ucla.edu"
# Path of the CSV file that main() writes.
OUT_CSV = "ucla_anderson_faculty_directory.csv"

# HTTP headers that fetch_html() sends with every GET request.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/120.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
}

def clean(s: str) -> str:
    """Collapse each whitespace run in ``s`` to one space and trim both ends.

    A falsy ``s`` (for example ``None`` or ``""``) yields an empty string.
    """
    text = s if s else ""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()

def fetch_html(session: requests.Session, url: str) -> str:
    """Download ``url`` through ``session`` and return the decoded body text.

    The request carries the module-level ``HEADERS`` and a 30 second timeout.
    ``raise_for_status()`` is called on the response before it is decoded, so
    an HTTP error status raises instead of returning an error page.
    """
    response = session.get(url, headers=HEADERS, timeout=30)
    response.raise_for_status()
    # Decode using the encoding detected from the body, overriding whatever
    # encoding the response was initially assigned.
    response.encoding = response.apparent_encoding
    return response.text

def card_text_tokens(card):
    """
    Return the visible text fragments of a card element, minus known noise.

    Every string from ``card.stripped_strings`` is normalised with ``clean``.
    A fragment is dropped when it is empty, equals "read bio", starts with
    "image:", or contains "portrait image" (all compared case-insensitively).
    The remaining fragments are returned as a list in document order.
    """
    def is_noise(token):
        # Lower-case once and test against every noise pattern.
        lowered = token.lower()
        return (
            lowered == "read bio"
            or lowered.startswith("image:")
            or "portrait image" in lowered
        )

    cleaned = (clean(raw) for raw in card.stripped_strings)
    return [token for token in cleaned if token and not is_noise(token)]

def infer_group(card):
    """
    Return the text of the nearest h2/h3 heading that precedes ``card``.

    The heading text (for example "Current Faculty") is normalised with
    ``clean``. An empty string is returned when no such heading exists or
    when its normalised text is shorter than three characters.
    """
    heading = card.find_previous(["h2", "h3"])
    if heading:
        label = clean(heading.get_text(" ", strip=True))
        # Very short headings are treated as too generic to be a group name.
        if len(label) >= 3:
            return label
    return ""

def find_card_container(a_tag):
    """
    Locate the element that wraps the card owning the given "Read Bio" link.

    Candidates are tried in order and the first truthy one wins: the closest
    ``li``/``article`` ancestor, then the closest ``div`` ancestor, and
    finally the link's direct parent.
    """
    return (
        a_tag.find_parent(["li", "article"])
        or a_tag.find_parent("div")
        or a_tag.parent
    )

def main():
    """Scrape the faculty directory page and save one CSV row per profile.

    Steps:
      1. Download ``URL`` with a ``requests`` session.
      2. Collect every anchor whose visible text is "Read Bio".
      3. For each distinct profile link, read the name (first text token of
         the surrounding card), the title (remaining tokens joined by a
         space) and the group (nearest preceding h2/h3 heading).
      4. Write the rows to ``OUT_CSV`` and print a short preview.

    Exceptions raised by ``fetch_html`` (for example on an HTTP error
    status) are not caught here.
    """
    # Use the session as a context manager so its pooled connections are
    # released even when the request raises.
    with requests.Session() as session:
        print(f"Fetching: {URL}")
        html = fetch_html(session, URL)
    soup = BeautifulSoup(html, "lxml")

    # Cards are located through their "Read Bio" anchors (the page is
    # expected to carry one per faculty card).
    read_bio_links = [
        a
        for a in soup.select("a[href]")
        if clean(a.get_text(" ", strip=True)).lower() == "read bio"
    ]

    print(f"Found Read Bio links: {len(read_bio_links)}")

    rows = []
    seen = set()

    for a in read_bio_links:
        profile_link = urljoin(BASE, a.get("href", ""))

        # De-duplicate on the resolved profile URL before doing any card
        # parsing, so repeated links cost nothing beyond this lookup.
        if profile_link in seen:
            continue
        seen.add(profile_link)

        card = find_card_container(a)
        toks = card_text_tokens(card)

        # The first token is taken as the name; any further tokens are the
        # title, which may span several text fragments.
        name = toks[0] if toks else ""
        title = " ".join(toks[1:])

        rows.append({
            "name": name,
            "title": title,
            "group": infer_group(card),
            "profile_link": profile_link,
        })

    df = pd.DataFrame(rows, columns=["name", "title", "group", "profile_link"])
    df.to_csv(OUT_CSV, index=False, encoding="utf-8-sig")
    print(f"Saved: {OUT_CSV} | rows={len(df)}")

    # Show the first few rows as a quick sanity check.
    print(df.head(10).to_string(index=False))

if __name__ == "__main__":
    # Optional politeness pause of 0.2-0.6 seconds before the scrape starts.
    startup_delay = random.uniform(0.2, 0.6)
    time.sleep(startup_delay)
    main()


Fetching: https://www.anderson.ucla.edu/faculty-and-research/faculty-directory
Found Read Bio links: 128
Saved: ucla_anderson_faculty_directory.csv | rows=128
              name                                                                  title           group                                                                                                    profile_link
    Derek Alderton                                             Senior Continuing Lecturer Current Faculty                                    https://www.anderson.ucla.edu/faculty-and-research/strategy/faculty/alderton
 Corinne Bendersky                                                Professor of Management Current Faculty               https://www.anderson.ucla.edu/faculty-and-research/management-and-organizations/faculty/bendersky
Daniel J. Benjamin                    Professor of Behavioral Economics and Genoeconomics Current Faculty                  https://www.anderson.ucla.edu/faculty-and-research/behavioral-de