<a href="https://colab.research.google.com/github/Lsyyy623/Case-Competition-Pillar-1-Team-11/blob/main/code/UCSD/ucsd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# -*- coding: utf-8 -*-
import re
import time
import random
from urllib.parse import urljoin, urldefrag

import requests
import pandas as pd
from bs4 import BeautifulSoup
from bs4.element import Tag

# Entry point of the crawl: the faculty directory listing page.
DIRECTORY_URL = "https://rady.ucsd.edu/faculty-research/faculty-directory/index.html"

# Output CSV files: one row per scraped entry, and one row per faculty member.
OUT_PAPERS, OUT_ROSTER = "rady_papers_only.csv", "rady_roster_check.csv"

# Browser-like request headers sent with every page fetch.
HEADERS = {
    "User-Agent": " ".join([
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/120.0 Safari/537.36",
    ]),
    "Accept-Language": "en-US,en;q=0.9",
}

# A text block is treated as a paper entry only if it mentions a four-digit
# year (19xx/20xx) or a publication status word.
YEAR_OR_STATUS = re.compile(
    r"(19\d{2}|20\d{2}|forthcoming|in press|accepted)",
    flags=re.IGNORECASE,
)

# Unwanted sections: once a heading contains one of these, scraping stops and
# nothing after that heading is collected.
STOP_KEYWORDS = [
    "working paper",
    "working papers",
    "book",
    "books",
    "book chapter",
    "book chapters",
    "case",
    "cases",
]

# Headings regarded as the "papers / journal articles" section (Jun Liu's page
# falls in this category). Not referenced by the code visible in this file.
PAPER_SECTION_HINTS = [
    "papers",
    "articles published or accepted",
    "articles",
    "journal articles",
    "published",
    "published or accepted",
    "published/accepted",
]


def clean(s: str) -> str:
    """Collapse every run of whitespace in *s* to one space and trim the ends.

    A falsy argument (``None`` or ``""``) yields the empty string.
    """
    text = s if s else ""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()


def norm_url(base: str, href: str) -> str:
    """Resolve *href* against *base* and normalise the result.

    The returned URL is absolute, has its ``#fragment`` removed, and has a
    leading ``http://`` scheme upgraded to ``https://``.

    Args:
        base: URL of the page the link was found on.
        href: Raw ``href`` attribute value; may be relative, or empty/``None``.

    Returns:
        The normalised absolute URL, or ``""`` when *href* is empty/``None``.
    """
    if not href:
        return ""

    resolved, _fragment = urldefrag(urljoin(base, href.strip()))

    # Upgrade the scheme only. A blanket str.replace would also rewrite any
    # "http://" that appears later in the URL (for example inside a query
    # parameter carrying another address) and silently corrupt the link.
    plain_scheme = "http://"
    if resolved[:len(plain_scheme)].lower() == plain_scheme:
        resolved = "https://" + resolved[len(plain_scheme):]
    return resolved


def make_soup(html: str) -> BeautifulSoup:
    """Parse *html* into a BeautifulSoup tree.

    The ``lxml`` parser is tried first; if building the tree with it raises
    any ``Exception``, the markup is parsed with ``html.parser`` instead.
    """
    try:
        tree = BeautifulSoup(html, "lxml")
    except Exception:
        tree = BeautifulSoup(html, "html.parser")
    return tree


def fetch_html(session: requests.Session, url: str) -> str:
    """Download *url* through *session* and return the decoded body text.

    The request carries the module-level ``HEADERS`` and a 30-second timeout.
    ``raise_for_status()`` is called on the response, so an error HTTP status
    surfaces as an exception instead of being returned as page text.
    """
    response = session.get(url, headers=HEADERS, timeout=30)
    response.raise_for_status()
    # Decode using the encoding detected from the body itself rather than the
    # one the response declared.
    response.encoding = response.apparent_encoding
    return response.text


# A standalone four-digit year (1900-2099). The digit look-arounds stop the
# pattern from matching inside longer numbers such as page ranges or DOIs.
_YEAR_TOKEN = re.compile(r"(?<!\d)(?:19|20)\d{2}(?!\d)")


def extract_year(entry: str):
    """Return the last standalone 19xx/20xx year mentioned in *entry*.

    Args:
        entry: Text of one scraped entry; ``None`` or ``""`` is accepted.

    Returns:
        The year as an ``int``, or ``pd.NA`` when the text contains no
        standalone four-digit year (for example a "forthcoming" entry).
    """
    years = _YEAR_TOKEN.findall(entry or "")
    # When several years appear, the last one is used, as before.
    return int(years[-1]) if years else pd.NA


# -------------------------
# 1) Collect every professor profile link from the directory page
# -------------------------
def get_faculty_profiles(dir_soup: BeautifulSoup):
    """List the faculty profile links found on the directory page.

    Every ``<a href>`` in *dir_soup* is resolved against ``DIRECTORY_URL``.
    A link is kept when its normalised URL contains
    ``/faculty-research/faculty/``, ends with ``.html``, and the anchor has
    non-empty text.

    Returns:
        A list of ``{"name": ..., "profile_link": ...}`` dicts in page order,
        with exact duplicate (name, link) pairs collapsed to their first
        occurrence.
    """
    # A dict keyed by (name, link) keeps insertion order and drops repeats.
    unique_pairs = {}
    for anchor in dir_soup.select("a[href]"):
        raw_href = (anchor.get("href") or "").strip()
        if not raw_href:
            continue

        target = norm_url(DIRECTORY_URL, raw_href)
        # Only individual faculty pages (of the form .../faculty/xxx.html).
        is_profile = target.endswith(".html") and "/faculty-research/faculty/" in target
        if not is_profile:
            continue

        label = clean(anchor.get_text(" ", strip=True))
        if label:
            unique_pairs.setdefault((label, target), None)

    return [
        {"name": label, "profile_link": target}
        for label, target in unique_pairs
    ]


# -------------------------
# 2) Locate the Publications section/page from a personal profile page
# -------------------------
def locate_publications_scope(session: requests.Session, profile_url: str, profile_soup: BeautifulSoup):
    """Find the part of a profile that holds the publication list.

    The first link whose text is exactly "publications" (case-insensitive) is
    treated as the Publications tab, and its ``href`` decides the result:

    - It targets the profile page itself (a bare ``#id``, or a relative or
      absolute URL resolving to this page): the element with that id is the
      scope. No request is made.
    - It targets another URL: that page is fetched and its default scope used.
    - No such link, an empty ``href``, an id that is not present, or a failed
      fetch: the profile page's default scope is used.

    The default scope is the first of ``<main>``, ``<article>``, ``<body>``,
    or the document itself.

    Args:
        session: Session used if a separate publications page must be fetched.
        profile_url: URL of the profile page that *profile_soup* was parsed from.
        profile_soup: Parsed profile page.

    Returns:
        A ``(pub_url, pub_soup, scope_tag)`` tuple.
    """

    def default_scope(soup):
        return soup.find("main") or soup.find("article") or soup.body or soup

    def same_page_fallback():
        return profile_url, profile_soup, default_scope(profile_soup)

    # Find the "Publications" tab link.
    pub_a = None
    for a in profile_soup.select("a[href]"):
        if clean(a.get_text(" ", strip=True)).lower() == "publications":
            pub_a = a
            break
    if pub_a is None:
        return same_page_fallback()

    href = (pub_a.get("href") or "").strip()
    if not href:
        return same_page_fallback()

    # Work out whether the link stays on this page and which id it names.
    # norm_url drops the fragment, so it is read separately here; otherwise a
    # link such as "profile.html#publications" would lose its anchor and the
    # same page would be downloaded a second time.
    if href.startswith("#"):
        section_id = href[1:]
        same_document = True
    else:
        section_id = urldefrag(urljoin(profile_url, href))[1]
        same_document = norm_url(profile_url, href) == norm_url(profile_url, profile_url)

    # 1) Anchor within the profile page.
    if same_document:
        # An empty fragment (a bare "#") names no element, so skip the lookup.
        scope = profile_soup.find(id=section_id) if section_id else None
        if scope is not None:
            return profile_url, profile_soup, scope
        return same_page_fallback()

    # 2) Separate publications URL.
    pub_url = norm_url(profile_url, href)
    try:
        soup2 = make_soup(fetch_html(session, pub_url))
    except Exception:
        # Best effort: if the separate page cannot be fetched or parsed,
        # fall back to the profile page rather than failing this profile.
        return same_page_fallback()
    return pub_url, soup2, default_scope(soup2)


# -------------------------
# 3) Extract the "papers" entries from the Publications scope
# -------------------------
def extract_paper_entries_from_scope(scope: Tag):
    """Collect the text blocks inside *scope* that look like paper entries.

    The descendants of *scope* are visited in document order:

    - An ``h2``/``h3``/``h4`` whose text contains any ``STOP_KEYWORDS`` item
      ends the collection (what follows is working papers, books, etc.).
      Other headings are ignored; no particular "papers" heading is required.
    - Every ``p`` / ``li`` before that point is a candidate entry (on pages
      like Jun Liu's the entries are usually ``p`` elements).
    - A candidate is kept when it has at least 12 characters, does not look
      like a bare e-mail line, and matches ``YEAR_OR_STATUS`` (a year, or
      forthcoming / in press / accepted).

    Returns:
        The kept entry texts in first-seen order, without duplicates.
    """

    def looks_like_entry(text):
        # Too short (this also covers empty text).
        if len(text) < 12:
            return False
        # Contact lines such as e-mail addresses: an "@" with no space in the
        # first 20 characters.
        if "@" in text and " " not in text[:20]:
            return False
        return YEAR_OR_STATUS.search(text) is not None

    # Dict keys give order-preserving de-duplication.
    kept = {}
    for node in scope.descendants:
        if not isinstance(node, Tag):
            continue

        tag_name = node.name
        if tag_name in ("h2", "h3", "h4"):
            heading = clean(node.get_text(" ", strip=True)).lower()
            if any(keyword in heading for keyword in STOP_KEYWORDS):
                # Nothing after a stop heading is ever collected.
                break
        elif tag_name in ("p", "li"):
            text = clean(node.get_text(" ", strip=True))
            if looks_like_entry(text):
                kept.setdefault(text, None)

    return list(kept)


def _collect_entries(session, profile_url):
    """Fetch one profile page and return the paper entries found for it."""
    soup = make_soup(fetch_html(session, profile_url))
    _pub_url, pub_soup, scope = locate_publications_scope(session, profile_url, soup)
    entries = extract_paper_entries_from_scope(scope)

    # If the located scope produced nothing, retry once on the whole page
    # (main / article / body) as a fallback.
    if not entries:
        scope2 = pub_soup.find("main") or pub_soup.find("article") or pub_soup.body or pub_soup
        entries = extract_paper_entries_from_scope(scope2)
    return entries


def main():
    """Scrape every faculty profile in the directory and write two CSV files.

    ``OUT_PAPERS`` receives one row per entry (name, profile link, entry text,
    year). ``OUT_ROSTER`` receives one row per faculty member with a status of
    ``ok``, ``no_papers`` or ``error:<ExceptionName>`` and the entry count.
    A failure on one profile is recorded in the roster and does not stop the
    run; a failure to fetch the directory page itself propagates.
    """
    paper_rows = []
    roster_rows = []

    # The context manager closes the session's pooled connections when the
    # crawl finishes, including when an exception escapes.
    with requests.Session() as session:
        print("Fetching directory:", DIRECTORY_URL)
        dir_soup = make_soup(fetch_html(session, DIRECTORY_URL))

        faculty = get_faculty_profiles(dir_soup)
        print(f"Directory: found {len(faculty)} faculty profiles with links.")

        for i, f in enumerate(faculty, 1):
            name = f["name"]
            profile_url = f["profile_link"]

            status = "no_papers"
            n = 0

            try:
                # Small random pause between profiles to avoid hammering the site.
                time.sleep(random.uniform(0.2, 0.6))
                entries = _collect_entries(session, profile_url)
            except Exception as e:
                # Best effort: note the failure type in the roster and move on.
                status = f"error:{type(e).__name__}"
            else:
                n = len(entries)
                status = "ok" if n > 0 else "no_papers"
                for entry in entries:
                    paper_rows.append({
                        "name": name,
                        "profile link": profile_url,
                        "entry": entry,
                        "year": extract_year(entry),
                    })

            roster_rows.append({
                "name": name,
                "profile link": profile_url,
                "status": status,
                "papers_found": n,
            })

            # Progress is reported for every tenth profile only.
            if i % 10 == 0:
                print(f"[{i}/{len(faculty)}] {name} -> {status} (papers={n})")

    df_papers = pd.DataFrame(paper_rows, columns=["name", "profile link", "entry", "year"])
    # Nullable integer dtype keeps missing years as <NA> instead of floats.
    df_papers["year"] = df_papers["year"].astype("Int64")
    df_papers.to_csv(OUT_PAPERS, index=False, encoding="utf-8-sig")

    df_roster = pd.DataFrame(roster_rows, columns=["name", "profile link", "status", "papers_found"])
    df_roster.to_csv(OUT_ROSTER, index=False, encoding="utf-8-sig")

    print("=" * 60)
    print(f"Saved: {OUT_PAPERS} (rows={len(df_papers)})")
    print(f"Saved: {OUT_ROSTER} (rows={len(df_roster)})")

# Run the scraper only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()


Fetching directory: https://rady.ucsd.edu/faculty-research/faculty-directory/index.html
Directory: found 52 faculty profiles with links.
[10/52] Sanjiv Erat -> ok (papers=13)
[20/52] Ania Jaroszewicz -> no_papers (papers=0)
[30/52] Michael Reher -> ok (papers=2)
[40/52] Richard Townsend -> ok (papers=7)
[50/52] Bob Sullivan -> ok (papers=4)
Saved: rady_papers_only.csv (rows=603)
Saved: rady_roster_check.csv (rows=52)


In [2]:
import pandas as pd

# Load the entries CSV (same file name as OUT_PAPERS in the scraper above).
df = pd.read_csv("rady_papers_only.csv")  # change this to your actual file name
# Preview the first 20 rows.
df.head(20)

Unnamed: 0,name,profile link,entry,year
0,Marcus Brooks,https://rady.ucsd.edu/faculty-research/faculty...,After receiving his Ph.D. in Accounting from U...,2024.0
1,Marcus Brooks,https://rady.ucsd.edu/faculty-research/faculty...,"Before pursuing his doctoral studies, Brooks w...",2009.0
2,Daniel Fragiadakis,https://rady.ucsd.edu/faculty-research/faculty...,After graduating from Stanford University in 2...,2014.0
3,Daniel Fragiadakis,https://rady.ucsd.edu/faculty-research/faculty...,"From 2021-23, Fragiadakis worked at Upwork as ...",2021.0
4,Daniel Fragiadakis,https://rady.ucsd.edu/faculty-research/faculty...,"In 2024, he returned to academia, teaching cou...",2024.0
5,Daniel Fragiadakis,https://rady.ucsd.edu/faculty-research/faculty...,"In 2025, Fragiadakis started his current role ...",2025.0
6,Vincent R. Nijs,https://rady.ucsd.edu/faculty-research/faculty...,For 2023-2024,2024.0
7,Michael Meyer,https://rady.ucsd.edu/faculty-research/faculty...,"Since 2006, Meyer has been teaching “New Produ...",2006.0
8,Michael Meyer,https://rady.ucsd.edu/faculty-research/faculty...,"In 2004, Meyer was a Batten Fellow at the Dard...",2023.0
9,Michael Meyer,https://rady.ucsd.edu/faculty-research/faculty...,Product-Service Systems Design Education: Norm...,2023.0
