<a href="https://colab.research.google.com/github/Lsyyy623/Case-Competition-Pillar-1-Team-11/blob/main/code/UCB/ucb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import re
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

BASE = "https://haas.berkeley.edu"
SEARCH_URL = BASE + "/faculty/?action=faculty-search&letter={letter}"

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36"
    )
}


def get_soup(url: str) -> BeautifulSoup:
    r = requests.get(url, headers=HEADERS, timeout=30)
    r.raise_for_status()
    return BeautifulSoup(r.text, "html.parser")


def collect_all_profile_links() -> list[str]:

    links = set()

    for letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
        url = SEARCH_URL.format(letter=letter)
        soup = get_soup(url)

        for a in soup.select("a[href]"):
            href = a.get("href", "").strip()
            if not href:
                continue

            full = urljoin(BASE, href)


            if re.match(r"^https://haas\.berkeley\.edu/faculty/[^/]+/?$", full):

                if full.rstrip("/") == (BASE + "/faculty").rstrip("/"):
                    continue
                links.add(full.rstrip("/") + "/")

        time.sleep(0.4)

    return sorted(links)


def extract_selected_papers(profile_url: str) -> tuple[str, list[str]]:

    soup = get_soup(profile_url)


    h1 = soup.find("h1")
    name = h1.get_text(" ", strip=True) if h1 else ""

    header = None
    for tag in soup.find_all(["h2", "h3", "h4"]):
        t = tag.get_text(" ", strip=True)
        if "Selected Papers and Publications" in t:
            header = tag
            break

    if not header:
        return name, []

    entries = []


    for elem in header.find_all_next():
        if elem == header:
            continue
        if elem.name in ["h2", "h3", "h4"]:
            break

        if elem.name == "li":
            txt = " ".join(elem.stripped_strings)
            if txt:
                entries.append(txt)
        elif elem.name == "p" and not entries:
            txt = " ".join(elem.stripped_strings)
            if txt:
                entries.append(txt)

    cleaned = []
    seen = set()
    for t in entries:
        t = re.sub(r"\s+", " ", t).strip()
        if t and t not in seen:
            cleaned.append(t)
            seen.add(t)

    return name, cleaned


def extract_year(entry: str):

    years = re.findall(r"(?:19|20)\d{2}", entry)
    return int(years[-1]) if years else pd.NA


def main():
    profile_links = collect_all_profile_links()
    print(f"Found {len(profile_links)} profile pages.")

    rows = []
    for i, url in enumerate(profile_links, 1):
        print(f"[{i}/{len(profile_links)}] {url}")
        try:
            name, entries = extract_selected_papers(url)
            for entry in entries:
                rows.append(
                    {
                        "name": name,
                        "profile link": url,
                        "entry": entry,
                        "year": extract_year(entry),
                    }
                )
        except Exception as e:
            print("  ERROR:", e)

        time.sleep(0.6)

    df = pd.DataFrame(rows, columns=["name", "profile link", "entry", "year"])
    df["year"] = df["year"].astype("Int64")
    out = "haas_selected_papers.csv"
    df.to_csv(out, index=False, encoding="utf-8-sig")
    print(f"Saved: {out}  (rows={len(df)})")


if __name__ == "__main__":
    main()


Found 359 profile pages.
[1/359] https://haas.berkeley.edu/faculty/?action=faculty-search&letter=A/
[2/359] https://haas.berkeley.edu/faculty/?action=faculty-search&letter=B/
[3/359] https://haas.berkeley.edu/faculty/?action=faculty-search&letter=C/
[4/359] https://haas.berkeley.edu/faculty/?action=faculty-search&letter=D/
[5/359] https://haas.berkeley.edu/faculty/?action=faculty-search&letter=E/
[6/359] https://haas.berkeley.edu/faculty/?action=faculty-search&letter=F/
[7/359] https://haas.berkeley.edu/faculty/?action=faculty-search&letter=G/
[8/359] https://haas.berkeley.edu/faculty/?action=faculty-search&letter=H/
[9/359] https://haas.berkeley.edu/faculty/?action=faculty-search&letter=I/
[10/359] https://haas.berkeley.edu/faculty/?action=faculty-search&letter=J/
[11/359] https://haas.berkeley.edu/faculty/?action=faculty-search&letter=K/
[12/359] https://haas.berkeley.edu/faculty/?action=faculty-search&letter=L/
[13/359] https://haas.berkeley.edu/faculty/?action=faculty-search&letter

In [3]:
import pandas as pd

df = pd.read_csv("/content/haas_selected_papers.csv")
df.head(20)

Unnamed: 0,name,profile link,entry,year
0,Adam Rosenzweig,https://haas.berkeley.edu/faculty/adam-rosenzw...,"Andrew Isaacs, Natalia Costa i Coromina, and A...",2019.0
1,Adam Rosenzweig,https://haas.berkeley.edu/faculty/adam-rosenzw...,"Rosenzweig, A. (2017). Understanding and Under...",2017.0
2,Vinod K Aggarwal,https://haas.berkeley.edu/faculty/aggarwal-vinod/,Responding to China’s Rise: US and EU Strategi...,2014.0
3,Vinod K Aggarwal,https://haas.berkeley.edu/faculty/aggarwal-vinod/,“Do WTO rules preclude industrial policy? Evid...,2014.0
4,Vinod K Aggarwal,https://haas.berkeley.edu/faculty/aggarwal-vinod/,"“A Fragmenting Global Economy: A Weakened WTO,...",2013.0
5,Vinod K Aggarwal,https://haas.berkeley.edu/faculty/aggarwal-vinod/,“The ASEAN Economic Community 2015: Implicatio...,2013.0
6,Vinod K Aggarwal,https://haas.berkeley.edu/faculty/aggarwal-vinod/,“U.S. Free Trade Agreements and Linkages.” Int...,2013.0
7,Vinod K Aggarwal,https://haas.berkeley.edu/faculty/aggarwal-vinod/,Linking Trade and Security: Evolving Instituti...,2012.0
8,Vinod K Aggarwal,https://haas.berkeley.edu/faculty/aggarwal-vinod/,“Industrial Policy Choice during the Crisis Er...,2012.0
9,Ahmed Badruzzaman,https://haas.berkeley.edu/faculty/ahmed-badruz...,"Badruzzaman, A. (2024). Nuclear Logging in Geo...",2024.0
