In [1]:
# scrape_scotus_2025.py
import json
import re
import time
from dataclasses import asdict, dataclass, field
from typing import List, Optional, Dict
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Site root; relative case links from the listing page are resolved against it.
BASE_URL = "https://supreme.justia.com"
# Year listing page the scrape starts from.
YEAR_PAGE = f"{BASE_URL}/cases/federal/us/year/2025.html"
# Path of the JSON file written at the end of the run.
OUTPUT_FILE = "supreme_court_2025_cases.json"

# Browser-like identification string sent with every request.
_USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)
HEADERS = {"User-Agent": _USER_AGENT, "Accept-Language": "en-US,en;q=0.9"}

# --------- Helpers ---------
def clean_text(s: Optional[str]) -> Optional[str]:
    """Return *s* with invisible characters removed and whitespace normalized.

    Zero-width characters and soft hyphens are deleted, control characters
    (including CR, LF and TAB) are turned into spaces, and every run of
    whitespace -- even a single non-ASCII one such as a no-break space --
    is collapsed to one ASCII space. Leading/trailing whitespace is stripped.
    ``None`` is passed through unchanged.
    """
    if s is None:
        return None
    # Zero-width chars / soft hyphen carry no content: drop them outright.
    s = re.sub(r"[\u200B-\u200D\uFEFF\u2060\u00AD]", "", s)
    # Control characters (CR/LF/TAB included) become spaces so that words on
    # either side of them stay separated.
    s = re.sub(r"[\x00-\x1F\x7F]", " ", s)
    # `\s+` rather than `\s{2,}`: a lone NBSP or em-space must also end up as
    # a plain space, otherwise it leaks into the output unnormalized.
    s = re.sub(r"\s+", " ", s)
    return s.strip()

def get_soup(url: str, session: requests.Session, sleep_sec: float = 0.8) -> BeautifulSoup:
    """Fetch a URL with polite delay and return BeautifulSoup object.

    Sleeps ``sleep_sec`` seconds before the request, sends ``HEADERS`` with a
    30-second timeout, and parses the body with the ``lxml`` parser.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.RequestException: for connection errors and timeouts
            (raised by ``session.get`` itself).
    """
    time.sleep(sleep_sec)
    resp = session.get(url, headers=HEADERS, timeout=30)
    # Fail loudly on error statuses: parsing an error page as if it were a
    # case page would silently yield records with every field missing.
    resp.raise_for_status()
    # requests assigns ISO-8859-1 to text/* responses that declare no charset,
    # so `resp.encoding` alone is not evidence that the server named one.
    # Only trust it when the Content-Type header actually carries a charset.
    content_type = resp.headers.get("Content-Type", "")
    if not resp.encoding or "charset" not in content_type.lower():
        resp.encoding = resp.apparent_encoding or "utf-8"
    return BeautifulSoup(resp.text, "lxml")

# --------- Data Model ---------
# Bias categories pre-seeded (as None) in every record for later human review.
_BIAS_CATEGORIES = (
    "Gender Bias",
    "Religious Bias",
    "Racial Bias",
    "Age Bias",
    "Nationality Bias",
    "Sexual Orientation Bias",
    "Appearance Bias",
    "Socio-Economic Status Bias",
)


@dataclass
class CaseRecord:
    """One scraped case, in the shape that is serialized to the output JSON."""

    case_title: Optional[str] = None
    docket_number: Optional[str] = None
    court_name: Optional[str] = "U.S. Supreme Court"
    date: Optional[str] = None
    case_url: Optional[str] = None
    justia_opinion_summary: Optional[str] = None
    # Free-form side data; each instance gets its own dict.
    extra: Dict[str, Optional[str]] = field(default_factory=dict)
    # Review scaffold: one entry per bias category, all None until filled in.
    legal_bias_analysis: Dict[str, Optional[str]] = field(
        default_factory=lambda: {category: None for category in _BIAS_CATEGORIES}
    )

# --------- Parsing: Year Listing Page ---------
def parse_year_listing(session: requests.Session) -> List[Dict]:
    """
    Parse the 2025 year page to get per-case stubs.

    Each stub is a dict with keys ``title``, ``url``, ``docket`` and ``date``.
    ``docket`` and ``date`` are None when the listing does not show them.
    Several selector/regex strategies are tried so that small markup changes
    do not break the scrape. Stubs sharing a URL are merged into one, in
    first-seen order.
    """
    soup = get_soup(YEAR_PAGE, session)

    # The page typically has a wrapper like: div.results.zebra... > div (each case)
    containers = soup.select("div.results.zebra.has-negative-sides-30.-overflow-hidden > div")
    if not containers:
        # Fallback: try a more generic approach
        containers = soup.select("div.results div, div.results > *")

    docket_number = re.compile(r"\d{2}-\d+")
    cases = []
    for div in containers:
        # We expect at least a link to the case
        a = div.find("a", href=True)
        if not a:
            continue

        title = clean_text(a.get_text())
        href = urljoin(BASE_URL, a["href"])
        text = clean_text(div.get_text()) or ""

        # Docket number often appears in <strong> tags, either as
        # "Docket: 23-123" or as a raw "23-123".
        docket = None
        for st in div.find_all("strong"):
            txt = clean_text(st.get_text()) or ""
            if "Docket" in txt or docket_number.match(txt):
                m = docket_number.search(txt)
                # A <strong> holding only the label ("Docket:") has no number;
                # keep looking instead of storing the label as the docket.
                if m:
                    docket = m.group(0)
                    break
        if not docket:
            # Try inline text, e.g. "<strong>Docket:</strong> 23-123"
            m = re.search(r"Docket[:\s]+(\d{2}-\d+)", text, flags=re.I)
            if m:
                docket = m.group(1)

        # Date sometimes present in small/span tags (any text with 4 digits)
        date = None
        maybe_date = div.find(["small", "span"], string=re.compile(r"\d{4}"))
        if maybe_date:
            date = clean_text(maybe_date.get_text())
        else:
            # Very forgiving date catch (Month Day, 2025)
            m = re.search(
                r"(January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+2025",
                text
            )
            if m:
                date = m.group(0)

        cases.append({
            "title": title,
            "url": href,
            "docket": docket,
            "date": date
        })

    # De-dup if the page has ads/duplicates. A later stub for the same URL only
    # overrides fields it actually has a value for, so a sparse duplicate
    # (e.g. a nested container) cannot erase a docket/date found earlier.
    unique: Dict[str, Dict] = {}
    for c in cases:
        seen = unique.get(c["url"])
        if seen is None:
            unique[c["url"]] = c
        else:
            seen.update({k: v for k, v in c.items() if v})
    return list(unique.values())

# --------- Parsing: Individual Case Page ---------
def parse_case_detail(case_url: str, session: requests.Session) -> Dict[str, Optional[str]]:
    """
    Visit the case page and retrieve Justia Opinion Summary (and possibly other bits).

    Returns a dict with keys ``page_title``, ``summary``, ``docket`` and
    ``date``; each value is None when nothing matching was found on the page.
    A few selector patterns are tried in turn for the summary.
    """
    soup = get_soup(case_url, session)

    # Try common locations for the Justia Opinion Summary
    # 1) A heading "Justia Opinion Summary" followed by a paragraph or div
    summary = None
    heading = soup.find(lambda tag: tag.name in ["h2", "h3"] and "Justia Opinion Summary" in tag.get_text())
    if heading:
        # grab next sibling that has text
        ns = heading.find_next_sibling()
        while ns and clean_text(ns.get_text()) in (None, ""):
            ns = ns.find_next_sibling()
        if ns:
            summary = clean_text(ns.get_text())

    # 2) Some pages use a dedicated box/section; try a few known classes
    if not summary:
        summary_box = soup.select_one(".opinion-summary, .justia-opinion-summary, section#opinion-summary, div#opinion-summary")
        if summary_box:
            summary = clean_text(summary_box.get_text())

    # 3) Fallback: search for a paragraph containing "Justia Opinion Summary:"
    if not summary:
        p = soup.find("p", string=re.compile(r"Justia Opinion Summary", re.I))
        if p:
            summary = clean_text(p.get_text())

    # Title on the detail page (sometimes more exact)
    page_title = None
    h1 = soup.find("h1")
    if h1:
        page_title = clean_text(h1.get_text())

    # Docket sometimes appears clearly on detail page. The separator after the
    # label is optional: besides "Docket: 23-123" and "Docket No.: 23-123"
    # this also accepts "Docket No. 23-123", which a mandatory colon rejected.
    # NOTE(review): presumably detail pages use the colon-less form -- confirm
    # against live markup.
    docket = None
    text = clean_text(soup.get_text()) or ""
    m = re.search(r"Docket(?:\s+No\.?)?[:\s]*(\d{2}-\d+)", text, flags=re.I)
    if m:
        docket = m.group(1)

    # Date: first "Month Day, 2025" anywhere in the page text.
    date = None
    m2 = re.search(
        r"(January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+2025",
        text
    )
    if m2:
        date = m2.group(0)

    return {
        "page_title": page_title,
        "summary": summary,
        "docket": docket,
        "date": date,
    }

# --------- Main Scrape ---------
def main():
    """Scrape the year listing plus every case page and dump the records as JSON.

    Values found on the detail page take precedence over the listing stub.
    A detail page that cannot be fetched is reported and the case is still
    recorded from its listing data, so one bad request does not discard the
    whole run. The result is written to ``OUTPUT_FILE`` as UTF-8 JSON.
    """
    session = requests.Session()
    session.headers.update(HEADERS)

    listing = parse_year_listing(session)
    records: List[CaseRecord] = []

    for idx, item in enumerate(listing, start=1):
        url = item["url"]
        try:
            detail = parse_case_detail(url, session)
        except requests.RequestException as exc:
            # Keep going with what the listing gave us for this case.
            print(f"[{idx}/{len(listing)}] detail fetch failed for {url}: {exc}")
            detail = {}

        title = detail.get("page_title") or item.get("title")
        docket = detail.get("docket") or item.get("docket")
        date = detail.get("date") or item.get("date")

        rec = CaseRecord(
            case_title=title,
            docket_number=docket,
            date=date,
            case_url=url,
            # Court name fixed for this collection
            court_name="U.S. Supreme Court",
            justia_opinion_summary=detail.get("summary"),
            extra={}
        )

        # Extra fields you might want to capture if present on listing
        if item.get("title") and item.get("title") != title:
            rec.extra["listing_title"] = item.get("title")

        # Clean everything one more time. Walk the live attributes rather than
        # asdict(rec): asdict returns deep copies, so cleaning the nested
        # dicts through it would never reach the record itself.
        for name, value in list(vars(rec).items()):
            if isinstance(value, str):
                setattr(rec, name, clean_text(value))
            elif isinstance(value, dict):
                for dk, dv in list(value.items()):
                    if isinstance(dv, str):
                        value[dk] = clean_text(dv)

        print(f"[{idx}/{len(listing)}] {rec.case_title or '(no title)'}")

        records.append(rec)

    # Serialize to JSON (ensure UTF-8; escape control chars removed already)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump([asdict(r) for r in records], f, ensure_ascii=False, indent=2)

    print(f"\nSaved {len(records)} cases to {OUTPUT_FILE}")

# Run the scrape only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


[1/63] Goldey v. Fields, 606 U.S. ___ (2025)
[2/63] Trump v. CASA, Inc., 606 U.S. ___ (2025)
[3/63] FCC v. Consumers' Research, 606 U.S. ___ (2025)
[4/63] Free Speech Coalition, Inc. v. Paxton, 606 U.S. ___ (2025)
[5/63] Mahmoud v. Taylor, 606 U.S. ___ (2025)
[6/63] Kennedy v. Braidwood Management, Inc., 606 U.S. ___ (2025)
[7/63] Riley v. Bondi, 606 U.S. ___ (2025)
[8/63] Medina v. Planned Parenthood South Atlantic, 606 U.S. ___ (2025)
[9/63] Hewitt v. United States, 606 U.S. ___ (2025)
[10/63] Gutierrez v. Saenz, 606 U.S. ___ (2025)
[11/63] Stanley v. City of Sanford, 606 U.S. ___ (2025)
[12/63] Fuld v. Palestine Liberation Organization, 606 U.S. ___ (2025)
[13/63] Diamond Alternative Energy, LLC v. Environmental Protection Agency, 606 U.S. ___ (2025)
[14/63] Esteras v. United States, 606 U.S. ___ (2025)
[15/63] McLaughlin Chiropractic Associates, Inc. v. McKesson Corp., 606 U.S. ___ (2025)
[16/63] Food and Drug Administration v. R.J. Reynolds Vapor Co., 606 U.S. ___ (2025)
[17/63] P