<a href="https://colab.research.google.com/github/Lsyyy623/Case-Competition-Pillar-1-Team-11/blob/main/code/UCSD/UCSD_faculty_profile.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import re
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

URL = "https://rady.ucsd.edu/faculty-research/faculty-directory/index.html"
BASE = "https://rady.ucsd.edu"
OUT_CSV = "rady_faculty_directory.csv"

EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/120.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
}

def clean(s: str) -> str:
    return re.sub(r"\s+", " ", (s or "")).strip()

def fetch_html(url: str) -> str:
    r = requests.get(url, headers=HEADERS, timeout=30)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    return r.text

def best_container_for_h3(h3):
    """
    从 h3 往上找一个“像 faculty 卡片”的容器：
    - 容器文本里有 email
    - 容器里 h3 数量不多（避免抓到整页）
    """
    cur = h3
    for _ in range(8):
        cur = cur.parent
        if not cur or not hasattr(cur, "get_text"):
            break
        txt = cur.get_text(" ", strip=True)
        if EMAIL_RE.search(txt):
            h3s = cur.find_all("h3")
            if 1 <= len(h3s) <= 3:
                return cur
    return h3.parent

def parse_title_email(container, name_text):
    """
    卡片里通常是：Title ... email
    用 email 作为锚点，把 title 从全文里扣出来
    """
    full = clean(container.get_text(" ", strip=True))
    email_match = EMAIL_RE.search(full)
    email = email_match.group(0) if email_match else ""

    # 去掉 name 与 email 后剩下的尽量当 title
    title = full
    if name_text:
        title = title.replace(name_text, " ").strip()
    if email:
        title = title.replace(email, " ").strip()

    title = clean(title)
    return title, email

def parse():
    html = fetch_html(URL)
    soup = BeautifulSoup(html, "lxml")

    rows = []
    seen = set()

    # 目录正文里每个人的名字基本都在 h3（有时带 <a>，有时纯文本）
    for h3 in soup.find_all("h3"):
        name = clean(h3.get_text(" ", strip=True))
        if not name:
            continue

        container = best_container_for_h3(h3)
        container_text = container.get_text(" ", strip=True) if container else ""
        if not EMAIL_RE.search(container_text):
            # 没有邮箱的一般不是 faculty 卡片，跳过
            continue

        # 个人主页链接：如果 h3 里有 a，就拼完整链接；否则留空
        a = h3.find("a", href=True)
        profile_link = urljoin(BASE, a["href"]) if a else ""

        title, email = parse_title_email(container, name)

        key = (name, profile_link, email)
        if key in seen:
            continue
        seen.add(key)

        rows.append({
            "name": name,
            "title": title,
            "email": email,
            "profile_link": profile_link
        })

    df = pd.DataFrame(rows, columns=["name", "title", "email", "profile_link"])
    df.to_csv(OUT_CSV, index=False, encoding="utf-8-sig")
    print(f"Saved: {OUT_CSV} (rows={len(df)})")

    # 展示前几行（你也可以改成 df.head(20)）
    print(df.head(10).to_string(index=False))

if __name__ == "__main__":
    parse()


Saved: rady_faculty_directory.csv (rows=117)
             name                                                                          title               email                                                                    profile_link
     Erik Johnson                         Visiting Assistant Professor of Economics and Strategy     erj013@ucsd.edu                                                                                
      Amos Schurr                                     Visiting Associate Professor of Management    aschurr@ucsd.edu                                                                                
      Ron Shachar                                               Visiting Professor of Management roschachar@ucsd.edu                                                                                
        Sujit Dey Adjunct Professor Jacobs Family Chair in Management and Engineering Leadership    dey@eng.ucsd.edu                              https://rady.ucsd.edu