<a href="https://colab.research.google.com/github/Lsyyy623/Case-Competition-Pillar-1-Team-11/blob/main/code/ucla/ucla.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# -*- coding: utf-8 -*-
import re
import time
import random
from urllib.parse import urljoin, urldefrag

import requests
import pandas as pd
from bs4 import BeautifulSoup
from bs4.element import Tag

# Directory page the crawl starts from, and the CSV file the results go to.
DIRECTORY_URL = "https://www.anderson.ucla.edu/faculty-and-research/faculty-directory"
OUT_CSV = "ucla_anderson_pubs_papers_only.csv"

# Browser-like headers installed on the shared requests session.
HEADERS = {
    "User-Agent": " ".join([
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/120.0 Safari/537.36",
    ]),
    "Accept-Language": "en-US,en;q=0.9",
}

# Publication types we do not want in the output.
EXCLUDE_KEYWORDS = [
    "working paper",
    "working papers",
    "book",
    "books",
    "book chapter",
    "book chapters",
    "case",
    "cases",
    "monograph",
    "edited volume",
]

# An entry "looks like a paper" when it carries a year, or one of
# forthcoming / in press / accepted.
PAPER_LIKE = re.compile(r"(19\d{2}|20\d{2}|forthcoming|in press|accepted)", re.I)
YEAR_RE = re.compile(r"\b(?:19|20)\d{2}\b")

# Key point: section titles are matched with regexes rather than exact
# strings (this also covers "Select Published Papers").
HEADER_PATTERNS = [
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r"\barticles\s+published\s+or\s+accepted\b",
        r"\b(select(ed)?\s+)?published\s+papers?\b",  # Select/Selected Published Papers
        r"\bselected\s+publications?\b",
        r"\bpublications?\b",
        r"\bpapers?\b",
    )
]

def clean(s: str) -> str:
    """Collapse every run of whitespace in *s* to one space and trim the ends.

    A falsy *s* (``None`` or ``""``) yields the empty string.
    """
    text = s if s else ""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()

def normalize_url(base: str, href: str) -> str:
    """Resolve *href* against *base* and return a canonical absolute URL.

    The result has its fragment removed and an ``http://`` scheme upgraded
    to ``https://``.

    Args:
        base: URL of the page the link was found on.
        href: Raw ``href`` value; surrounding whitespace is ignored.

    Returns:
        The normalized absolute URL, or ``""`` when *href* is falsy.
    """
    if not href:
        return ""
    absolute, _fragment = urldefrag(urljoin(base, href.strip()))

    # Upgrade the scheme prefix only. A blanket str.replace would also
    # rewrite any "http://" embedded later in the URL (e.g. a redirect
    # target carried in the query string) and corrupt that value.
    insecure = "http://"
    if absolute[:len(insecure)].lower() == insecure:
        absolute = "https://" + absolute[len(insecure):]
    return absolute

def make_soup(html: str) -> BeautifulSoup:
    """Parse *html*, trying the ``lxml`` parser first.

    If building the soup with ``lxml`` raises for any reason, the built-in
    ``html.parser`` is used instead.
    """
    try:
        soup = BeautifulSoup(html, "lxml")
    except Exception:
        soup = BeautifulSoup(html, "html.parser")
    return soup

def make_session() -> requests.Session:
    """Create a ``requests.Session`` with the module's ``HEADERS`` applied."""
    session = requests.Session()
    session.headers.update(HEADERS)
    return session

def fetch_html(session: requests.Session, url: str) -> str:
    """GET *url* through *session* and return the decoded response body.

    The request uses a 30 second timeout and ``raise_for_status()`` is called
    on the response before the body is read.
    """
    response = session.get(url, timeout=30)
    response.raise_for_status()
    # Decode using the encoding detected from the body itself, overriding
    # whatever encoding requests had picked for the response.
    response.encoding = response.apparent_encoding
    return response.text

def extract_year(entry: str) -> str:
    """Return the first 19xx/20xx year found in *entry*, or ``""`` if none."""
    match = YEAR_RE.search(entry)
    if match is None:
        return ""
    return match.group(0)

def looks_excluded(entry: str) -> bool:
    """Return True when *entry* mentions one of the ``EXCLUDE_KEYWORDS``.

    Matching is case-insensitive and on whole words/phrases, so a keyword
    such as "book" or "case" does not fire on "Facebook" or "showcase".

    Args:
        entry: Text of one publication entry (or a section title).

    Returns:
        True if any excluded keyword occurs as a standalone word or phrase.
    """
    text = (entry or "").lower()
    return any(
        # \b on both sides keeps the keyword from matching inside a longer
        # word; re.escape guards against regex metacharacters in keywords.
        re.search(rf"\b{re.escape(keyword.lower())}\b", text)
        for keyword in EXCLUDE_KEYWORDS
    )

# -------------------------
# 1) Collect every faculty profile link from the directory page
# -------------------------
def collect_profile_links(session: requests.Session):
    """Fetch the faculty directory and return the profile URLs it links to.

    Each ``<a href>`` on the directory page is normalized against
    ``DIRECTORY_URL``. A link is kept when its URL contains both
    "/faculty-and-research/" and "/faculty/", does not contain
    "/faculty-directory", and does not end in .pdf/.doc/.docx.

    Returns:
        The kept URLs as a list, de-duplicated, in first-seen order.
    """
    soup = make_soup(fetch_html(session, DIRECTORY_URL))
    document_suffixes = (".pdf", ".doc", ".docx")

    candidates = []
    for anchor in soup.select("a[href]"):
        raw_href = (anchor.get("href") or "").strip()
        if not raw_href:
            continue
        url = normalize_url(DIRECTORY_URL, raw_href)

        # Typical UCLA Anderson profile URL:
        # .../faculty-and-research/<area>/faculty/<slug>
        is_profile = (
            "/faculty-and-research/" in url
            and "/faculty/" in url
            and "/faculty-directory" not in url
        )
        if is_profile and not url.lower().endswith(document_suffixes):
            candidates.append(url)

    # dict keys keep insertion order, giving an order-preserving de-dup.
    return list(dict.fromkeys(candidates))

# -------------------------
# 2) Extract section headers + entries from each profile page
# -------------------------
def get_name(soup: BeautifulSoup) -> str:
    """Return the cleaned text of the first ``<h1>``, else of the first ``<h2>``.

    Returns ``""`` when the document has neither heading.
    """
    for tag_name in ("h1", "h2"):
        heading = soup.find(tag_name)
        if heading:
            return clean(heading.get_text(" ", strip=True))
    return ""

def is_target_header(title: str) -> bool:
    """Decide whether a section heading introduces a list of published papers.

    Args:
        title: Raw heading text; it is whitespace-normalized before matching.

    Returns:
        True when the cleaned title is at least 5 characters long, does not
        name an excluded publication type, and matches one of
        ``HEADER_PATTERNS``.
    """
    t = clean(title)
    # Very short titles (e.g. "About") are never publication sections;
    # this also covers the empty title.
    if len(t) < 5:
        return False
    # The generic patterns (e.g. r"\bpapers?\b") would otherwise accept a
    # heading such as "Working Papers", pulling in a whole section of a type
    # the output is meant to exclude.
    if looks_excluded(t):
        return False
    return any(p.search(t) for p in HEADER_PATTERNS)

def collect_entries_under_header(header: Tag):
    """
    Collect the paper-like text entries that follow *header*.

    Tags after *header* in document order are scanned up to (not including)
    the next h2/h3/h4. Each non-empty <p>/<li> text is a candidate entry. If
    there are none, the text of the <div>/<section> tags in the same range is
    split into lines and those lines become the candidates instead.

    Candidates are dropped when ``looks_excluded`` flags them or when
    ``PAPER_LIKE`` does not match; the rest are returned as a list,
    de-duplicated, in first-seen order.
    """
    stop_names = ("h2", "h3", "h4")

    # Tags belonging to this section: everything after the header up to the
    # next heading. The walk follows document order and is not limited to
    # the header's own parent container.
    section = []
    for node in header.find_all_next():
        if not isinstance(node, Tag):
            continue
        if node is not header and node.name in stop_names:
            break
        section.append(node)

    # Primary pass: one candidate per non-empty paragraph / list item.
    candidates = []
    for node in section:
        if node.name in ("p", "li"):
            text = clean(node.get_text(" ", strip=True))
            if text:
                candidates.append(text)

    # Fallback: no <p>/<li> text at all, so split the block-level text of
    # the section into individual lines.
    if not candidates:
        chunks = []
        for node in section:
            if node.name in ("div", "section"):
                raw = node.get_text("\n", strip=True)
                if raw:
                    chunks.append(raw)
        for raw_line in "\n".join(chunks).split("\n"):
            line = clean(raw_line)
            if line:
                candidates.append(line)

    # Keep only paper-like entries of a wanted type; dict keys give an
    # order-preserving de-dup.
    kept = {}
    for text in candidates:
        if looks_excluded(text) or not PAPER_LIKE.search(text):
            continue
        kept.setdefault(text, None)
    return list(kept)

def extract_entries_from_profile(session: requests.Session, profile_url: str):
    """Download one profile page and pull its publication entries.

    Headings (h2/h3/h4) are searched inside ``<main>``, else ``<article>``,
    else ``<body>``, else the whole document. Every heading accepted by
    ``is_target_header`` contributes the entries found beneath it.

    Returns:
        A tuple ``(name, entries, matched_titles)``: the name from
        ``get_name``, the de-duplicated entries in first-seen order, and the
        cleaned text of every matched heading.
    """
    soup = make_soup(fetch_html(session, profile_url))
    name = get_name(soup)
    scope = soup.find("main") or soup.find("article") or soup.body or soup

    # Titles are matched loosely via is_target_header rather than by exact
    # string equality.
    headers = [
        heading
        for heading in scope.find_all(["h2", "h3", "h4"])
        if is_target_header(heading.get_text(" ", strip=True))
    ]

    # Merge the entries of all matched sections, dropping repeats while
    # keeping first-seen order.
    merged = {}
    for heading in headers:
        for entry in collect_entries_under_header(heading):
            merged.setdefault(entry, None)

    matched_titles = [clean(heading.get_text(" ", strip=True)) for heading in headers]
    return name, list(merged), matched_titles

# -------------------------
# main
# -------------------------
def main():
    """Crawl every faculty profile and write the collected entries to CSV.

    Progress is printed per profile. An exception raised while handling one
    profile is printed and the crawl moves on to the next profile. The
    result is written to ``OUT_CSV`` with the columns name, profile_link,
    entry and year.
    """
    session = make_session()

    print("Fetching directory:", DIRECTORY_URL)
    profile_links = collect_profile_links(session)
    print(f"Found {len(profile_links)} profile links.")

    columns = ["name", "profile_link", "entry", "year"]
    rows = []
    for i, url in enumerate(profile_links, start=1):
        try:
            # Random pause before each profile request.
            time.sleep(random.uniform(0.4, 0.9))
            name, entries, matched_headers = extract_entries_from_profile(session, url)

            if entries:
                print(f"[{i}/{len(profile_links)}] {name} -> {len(entries)} entries")
                for entry in entries:
                    row = {
                        "name": name,
                        "profile_link": url,
                        "entry": entry,
                        "year": extract_year(entry),
                    }
                    rows.append(row)
            else:
                # The matched headings are printed as a debugging aid: they
                # show which section titles were recognised on this page.
                print(f"[{i}/{len(profile_links)}] {name or '(no name)'} -> no entries (matched headers={matched_headers[:3]})")

        except Exception as ex:
            print(f"[{i}/{len(profile_links)}] ERROR on {url}: {type(ex).__name__}: {ex}")

    df = pd.DataFrame(rows, columns=columns)
    df.to_csv(OUT_CSV, index=False, encoding="utf-8-sig")
    print(f"\nSaved: {OUT_CSV} (rows={len(df)})")


if __name__ == "__main__":
    main()


Fetching directory: https://www.anderson.ucla.edu/faculty-and-research/faculty-directory
Found 126 profile links.
[1/126] Derek Alderton -> no entries (matched headers=[])
[2/126] Corinne Bendersky -> 16 entries
[3/126] Daniel J. Benjamin -> 21 entries
[4/126] Tamara Berges -> no entries (matched headers=[])
[5/126] Antonio Bernardo -> 4 entries
[6/126] Anand V. Bodapati -> 5 entries
[7/126] Clement Bohr -> 14 entries
[8/126] Casey Borman -> no entries (matched headers=[])
[9/126] Maria Boss -> 3 entries
[10/126] Fernanda Bravo -> 8 entries
[11/126] Felipe Caro -> no entries (matched headers=[])
[12/126] Heather Maiirhe Caruso -> 5 entries
[13/126] Eugene M. Caruso -> 50 entries
[14/126] Francisco Castro -> 8 entries
[15/126] M. Keith Chen -> 9 entries
[16/126] Mikhail Chernov -> 7 entries
[17/126] Hanne Collins -> no entries (matched headers=[])
[18/126] Charles Corbett -> 29 entries
[19/126] Samuel Culbert -> 5 entries
[20/126] Hengchen Dai -> 10 entries
[21/126] Magali (Maggie) Delm

In [4]:
import pandas as pd

# Load the scraped CSV back in for a quick look at the result.
df = pd.read_csv("ucla_anderson_pubs_papers_only.csv")  # change this to your actual file name
# Preview the first 20 rows.
df.head(20)


Unnamed: 0,name,profile_link,entry,year
0,Corinne Bendersky,https://www.anderson.ucla.edu/faculty-and-rese...,"Pai, J. & Bendersky, C. (2020) “ Team Status C...",2020.0
1,Corinne Bendersky,https://www.anderson.ucla.edu/faculty-and-rese...,"Danbold, F. & Bendersky, C. (2020) “ Balancing...",2020.0
2,Corinne Bendersky,https://www.anderson.ucla.edu/faculty-and-rese...,"Bendersky, C. and Pai, J. (2018). “Status Dyna...",2018.0
3,Corinne Bendersky,https://www.anderson.ucla.edu/faculty-and-rese...,"Bendersky, C. (2018). “ Making U.S. Fire Depar...",2018.0
4,Corinne Bendersky,https://www.anderson.ucla.edu/faculty-and-rese...,"Bendersky, C., & Hays, N. A. 2017. The Positiv...",2017.0
5,Corinne Bendersky,https://www.anderson.ucla.edu/faculty-and-rese...,"Tsai, M., and Bendersky, C. (2016) “The pursui...",2016.0
6,Corinne Bendersky,https://www.anderson.ucla.edu/faculty-and-rese...,"Hays, N. A., & Bendersky, C. (2015). Not at al...",2015.0
7,Corinne Bendersky,https://www.anderson.ucla.edu/faculty-and-rese...,"Weingart, L., Behfar, K., Bendersky, C., Todor...",2015.0
8,Corinne Bendersky,https://www.anderson.ucla.edu/faculty-and-rese...,"Bendersky, C. (2014). Resolving Ideological Co...",2014.0
9,Corinne Bendersky,https://www.anderson.ucla.edu/faculty-and-rese...,"Bendersky, C. and Shah, N. (2013). The downfal...",2013.0
