<a href="https://colab.research.google.com/github/Lsyyy623/Case-Competition-Pillar-1-Team-11/blob/main/code/UCB/UCB_FAC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import re
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Site root and the first page of the paginated faculty directory.
BASE = "https://haas.berkeley.edu"
START_URL = f"{BASE}/faculty/"
# File name of the CSV written by main().
OUT_CSV = "haas_faculty_basic.csv"

# Headers sent with every request made by fetch(): a desktop-browser
# User-Agent string and an English Accept-Language preference.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/120.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
}

# Title vocabulary. split_name_title_dept() tries these in list order and
# returns on the first match, so each multi-word title is listed ahead of the
# shorter title it contains (e.g. "Professor Emeritus" before "Professor").
# Do not reorder without checking that precedence.
TITLES = [
    "Professor of the Graduate School",
    "Continuing Professional Faculty",
    "Visiting Associate Professor",
    "Acting Associate Professor",
    "Professor Emeritus",
    "Associate Professor",
    "Assistant Professor",
    "Teaching Professor",
    "Adjunct Professor",
    "Continuing Lecturer",
    "Professional Faculty",
    "Executive Director",
    "Affiliated Faculty",
    "Lecturer",
    "Director",
    "Dean",
    "Professor",
]

def clean(s: str) -> str:
    """Collapse every run of whitespace in *s* to one space and trim the ends.

    Falsy input (``None`` or an empty string) yields an empty string.
    """
    if not s:
        return ""
    collapsed = re.sub(r"\s+", " ", s)
    return collapsed.strip()

def fetch(session: requests.Session, url: str) -> str:
    """Download *url* through *session* and return the response body as text.

    The request is sent with HEADERS and a 30-second timeout. A 404 response
    is reported as an empty string; any other error status is raised by
    ``raise_for_status()``.
    """
    response = session.get(url, headers=HEADERS, timeout=30)
    if response.status_code != 404:
        response.raise_for_status()
        return response.text
    return ""

def split_name_title_dept(text: str):
    """Split a directory-link label into ``(name, title, dept)``.

    Example labels:
      'Vinod K Aggarwal Professor Business & Public Policy'
      'Andrew G. Campbell Executive Director Energy Institute'
      'Jennifer A. Chatman Dean Management of Organizations'

    The entries of TITLES are tried in list order and the first one that
    matches decides the split. ``dept`` is an empty string when nothing
    follows the title. Returns ``None`` when the label is empty or contains
    no known title.
    """
    label = clean(text)
    if label == "":
        return None

    for candidate in TITLES:
        escaped = re.escape(candidate)
        # Shape of the label: ^(name)\s+(title)(\s+dept)?$
        hit = re.match(
            rf"^(?P<name>.+?)\s+(?P<title>{escaped})(?:\s+(?P<dept>.+))?$",
            label,
        )
        if hit is None:
            continue
        # An unmatched optional dept group is reported as "".
        parts = hit.groupdict(default="")
        return clean(parts["name"]), clean(parts["title"]), clean(parts["dept"])

    return None

def parse_directory_page(html: str):
    """Extract faculty entries from the HTML of one directory page.

    Every ``<a href>`` is resolved against START_URL. A link is kept only when
    its URL contains "/faculty/", is not a pagination link ("/faculty/page/"),
    is not the directory root itself, and its text can be split by
    split_name_title_dept().

    Returns a list of ``(profile_url, name, title, dept)`` tuples in document
    order.
    """
    soup = BeautifulSoup(html, "lxml")
    directory_root = START_URL.rstrip("/")
    found = []

    for anchor in soup.select("a[href]"):
        href = anchor.get("href", "")
        if not href:
            continue

        profile_url = urljoin(START_URL, href)

        is_faculty_link = "/faculty/" in profile_url
        is_pagination = "/faculty/page/" in profile_url
        is_directory_root = profile_url.rstrip("/") == directory_root
        if not is_faculty_link or is_pagination or is_directory_root:
            continue

        label = clean(anchor.get_text(" ", strip=True))
        parsed = split_name_title_dept(label)
        if parsed:
            # parsed is the (name, title, dept) triple.
            found.append((profile_url, *parsed))

    return found

def main():
    """Crawl the paginated faculty directory and write the result to OUT_CSV.

    Page 1 is START_URL; later pages are ``START_URL + "page/<n>/"``. The
    crawl stops when fetch() returns an empty string (404 / empty page), when
    a page after the first contributes no new profile, or after page 80.
    Profiles are de-duplicated by their URL. The CSV has the columns
    ``name``, ``title`` and ``department``.

    Raises:
        Whatever fetch() raises for a failed request (non-404 error status,
        timeout, connection error); nothing is written in that case.
    """
    seen_profiles = set()
    all_rows = []

    # Use the session as a context manager so its pooled connections are
    # released when the crawl ends, including when a request raises.
    with requests.Session() as session:
        page = 1
        while True:
            url = START_URL if page == 1 else f"{START_URL}page/{page}/"
            print(f"Fetching page {page}: {url}")

            html = fetch(session, url)
            if not html:
                # 404 or empty body: treat as the end of the directory.
                break

            new_count = 0
            for profile_link, name, title, dept in parse_directory_page(html):
                if profile_link in seen_profiles:
                    continue
                seen_profiles.add(profile_link)
                all_rows.append({"name": name, "title": title, "department": dept})
                new_count += 1

            print(f"  + new profiles: {new_count}")

            # A later page with nothing new usually means we ran past the end
            # or the page structure changed.
            if new_count == 0 and page > 1:
                break

            page += 1
            if page > 80:  # safety cap so the loop can never run forever
                break

            time.sleep(0.6)  # polite delay between requests

    df = pd.DataFrame(all_rows, columns=["name", "title", "department"])
    df.to_csv(OUT_CSV, index=False, encoding="utf-8-sig")
    print(f"Saved {OUT_CSV} | rows={len(df)}")

# Entry point: start the crawl when this code runs as the top-level module.
if __name__ == "__main__":
    main()


Fetching page 1: https://haas.berkeley.edu/faculty/
  + new profiles: 35
Fetching page 2: https://haas.berkeley.edu/faculty/page/2/
  + new profiles: 35
Fetching page 3: https://haas.berkeley.edu/faculty/page/3/
  + new profiles: 35
Fetching page 4: https://haas.berkeley.edu/faculty/page/4/
  + new profiles: 35
Fetching page 5: https://haas.berkeley.edu/faculty/page/5/
  + new profiles: 34
Fetching page 6: https://haas.berkeley.edu/faculty/page/6/
  + new profiles: 34
Fetching page 7: https://haas.berkeley.edu/faculty/page/7/
  + new profiles: 35
Fetching page 8: https://haas.berkeley.edu/faculty/page/8/
  + new profiles: 34
Fetching page 9: https://haas.berkeley.edu/faculty/page/9/
  + new profiles: 35
Fetching page 10: https://haas.berkeley.edu/faculty/page/10/
  + new profiles: 19
Fetching page 11: https://haas.berkeley.edu/faculty/page/11/
Saved haas_faculty_basic.csv | rows=331


In [4]:
# Reload the CSV produced by the crawl and export it as an Excel workbook.
df = pd.read_csv("haas_faculty_basic.csv")
# NOTE(review): "/content/" is hard-coded — presumably the Colab working
# directory; adjust the path when running elsewhere.
df.to_excel("/content/haas_faculty_basic.xlsx", index=False)  # index=False: omit the row index
# Last expression of the cell, so the notebook renders the first rows.
df.head()

Unnamed: 0,name,title,department
0,David A. Aaker,Professor Emeritus,Marketing
1,Kai Adams,Professional Faculty,Persuasive Communication & Interpersonal Dynamics
2,Mark Adams,Professional Faculty,Finance
3,Vinod K Aggarwal,Professor,Business & Public Policy
4,Mario Alvarez,Professional Faculty,Economic Analysis & Policy
