ER Wait Times Scraper for HospitalStats.org

This module fetches metro ER pages from HospitalStats.org, parses tabular/ semi-structured HTML into a normalized DataFrame, and outputs these to a csv.



In [1]:
# import statements
import re, time, csv, argparse, datetime as dt, difflib
from urllib.parse import urljoin
import requests
from pathlib import Path
from bs4 import BeautifulSoup, NavigableString, Tag
from requests.adapters import HTTPAdapter, Retry
import pandas as pd

In [5]:
MIDWEST_ABBR = {"IL","IN", "IA","KS","MI","MN","MO","NE","ND","OH","SD","WI"} 

Scraping the data from Chicago

In [2]:
# url and headers configuration
BASE_URL  = globals().get("BASE_URL", "https://www.hospitalstats.org")
METRO_URL = globals().get("METRO_URL", "https://www.hospitalstats.org/ER-Wait-Time/Chicago-IL-Metro.htm")
HEADERS   = globals().get("HEADERS", {"User-Agent": "ERWaitTimes/1.0 (+contact@example.com)"})

In [3]:
# configuration
STAMP      = dt.datetime.utcnow().strftime("%Y%m%d%H%M%S")
OUTDIR     = Path("out") / STAMP
OUTDIR.mkdir(parents=True, exist_ok=True)
DELAY = 0.8

# regex patterns
TIME_RE = re.compile(r'\b(?:(\d+)\s*h)?\s*(\d+)\s*m\b', re.I)
PCT_RE  = re.compile(r'(\d+(?:\.\d+)?)\s*%')
WS      = re.compile(r'\s+')


  STAMP      = dt.datetime.utcnow().strftime("%Y%m%d%H%M%S")


In [5]:
# convert (hour, minute) string to integer for minutes
def to_minutes(s: str | None) -> int | None:
    # Convert '2h 15m' or '54m' to minutes, 'N/A' -> None
    if s is None:
        return None
    s = str(s)
    if "N/A" in s:
        return None
    m = TIME_RE.search(s)
    if not m:
        return None
    h = int(m.group(1) or 0)
    mins = int(m.group(2))
    return h * 60 + mins

In [6]:
# strips and normalizes whitespace
def clean(s): 
    return WS.sub(" ", (s or "").strip())

In [7]:
# integer to percent 
def to_percent(s):
    if not s: return None
    m = PCT_RE.search(str(s))
    return float(m.group(1)) if m else None

In [8]:
# GET a URL using a shared Session with retry/backoff and polite headers
def fetch_html(url: str) -> str:
    with requests.Session() as s:
        s.headers.update(HEADERS)
        r = s.get(url, timeout=25)
        r.raise_for_status()
        return r.text

In [9]:
# try fast-path parsing with pandas.read_html for well-formed tables
def try_read_html_tables(html: str) -> pd.DataFrame | None:
    """
    Try to parse any tables directly with pandas.
    Returns a DataFrame with at least hospital name + wait text if possible.
    """
    try:
        tables = pd.read_html(html) 
    except ValueError:
        return None

    if not tables:
        return None

    # look for a table that lists hospitals & times
    candidates = []
    for t in tables:
        cols_lower = [str(c).lower() for c in t.columns]
        if any("hospital" in c for c in cols_lower) or any("wait" in c for c in cols_lower):
            candidates.append(t)

    if not candidates:
        return None

    # Take the largest table
    df = max(candidates, key=len).copy()
    df.columns = [str(c).strip().lower().replace(" ", "_") for c in df.columns]
    name_col = next((c for c in df.columns if "hospital" in c or c == "name"), None)

    if name_col is None:
        return None

    # Reduce to a minimal schema
    out = pd.DataFrame({
        "hospital_name": df[name_col].astype(str).str.strip(),
    })

    
    return out

In [10]:
def node_text(n) -> str:
    # Return visible text for any BeautifulSoup node (Tag or string)
    if n is None:
        return ""
    if isinstance(n, NavigableString):
        return str(n)
    if hasattr(n, "get_text"):
        return n.get_text(" ", strip=True)
    return str(n)

def safe_join(parts, sep=" "):
    # Join any list/iterable of nodes/strings safely as text
    return sep.join([node_text(p) for p in parts if p is not None and node_text(p)])


In [11]:
def bs4_fallback_extract(html: str) -> pd.DataFrame:
    """ Parse a metro page using BeautifulSoup when read_html is unreliable.
    Parameters: 
        html : str
            Raw HTML for a metro ER page (e.g., Chicago-IL-Metro).

    Returns:
        pd.DataFrame
            Frame with normalized columns such as:
            - 'Hospital'
            - 'City'
            - 'State'
            - 'WaitMinutes' (int or None)
            - 'LeftBeforeSeen' (float % or None)
            - 'NotRecommended' (float % or None) """
    if BeautifulSoup is None:
        raise RuntimeError("BeautifulSoup not installed; `pip install beautifulsoup4` or rely on read_html tables.")

    from urllib.parse import urljoin
    soup = BeautifulSoup(html, "html.parser")

    # Hospital links: /hospital-ratings/....
    links = soup.select('a[href*="/hospital-ratings/"]')
    rows = []
    # "Comment <City> <Wait/N/A>"
    row_re = re.compile(r'Comment\s+(.*?)\s+(?:(\d+\s*h\s*\d+\s*m|\d+\s*m)|N/A)\b', re.I)

    for a in links:
        name = a.get_text(strip=True)
        href = urljoin(BASE_URL, a.get("href","").strip())

        parent_text = a.parent.get_text(" ", strip=True)
        tail = parent_text.replace(name, "", 1).strip()
        m = row_re.search(tail)

        city = m.group(1).strip() if m else None
        wait_text = (m.group(2).strip() if (m and m.group(2)) else "N/A")

        rows.append({
            "hospital_name": name,
            "city": city,
            "wait_text": wait_text,
            "wait_minutes": to_minutes(wait_text),
            "detail_url": href,          
        })

    return pd.DataFrame(rows)


In [12]:
# find the text that follows a bold/label element within a soup tree
def text_after_b(soup, label_regex):
    """
    Find <b>Label:</b> VALUE in the same parent; collect text from following
    siblings up to the first <br>. Robust to Tag objects.
    Parameters:
        soup: bs4.BeautifulSoup | Tag
        label_regex: Pattern[str]
    """
    for b in soup.find_all("b"):
        if re.search(label_regex, b.get_text(" ", strip=True), re.I):
            parts = []
            for sib in b.next_siblings:
                if isinstance(sib, Tag) and sib.name == "br":
                    break
                parts.append(sib)
            txt = clean(safe_join(parts))
            if txt:
                return txt
    return None


In [13]:
# create a normalized key for hospital names to improve joins across pages
def norm_name(s: str) -> str:
    if not s: return ""
    s = s.strip()
    s = re.sub(r'(?i)\s*comment\s*$', '', s)  
    s = re.sub(r'\s+', ' ', s)             
    s = re.sub(r"[’'`]", "", s)         
    return s.lower()

In [14]:
def series_or_na(frame: pd.DataFrame, col: str) -> pd.Series:
    # Return frame[col] if it exists, else a NA Series aligned to frame.index
    return frame[col] if col in frame.columns else pd.Series(pd.NA, index=frame.index)

In [15]:
# annotate the metro DataFrame with per-hospital detail page URLs
def attach_detail_urls(html: str, df: pd.DataFrame) -> pd.DataFrame:
    soup = BeautifulSoup(html, "html.parser")
    anchors = soup.select('a[href*="/hospital-ratings/"]')
    link_rows = []
    for a in anchors:
        text = a.get_text(strip=True)
        href = (a.get("href") or "").strip()
        if text and href:
            link_rows.append({
                "__key": norm_name(text),
                "anchor_name": text,
                "detail_url_from_anchor": urljoin(BASE_URL, href),
            })
    df_links = pd.DataFrame(link_rows).drop_duplicates(subset="__key")

    # Clean names and key
    if "hospital_name" not in df.columns:
        raise RuntimeError("Need a 'hospital_name' column on df.")
    df = df.copy()
    df["hospital_name"] = (
        df["hospital_name"].astype(str)
          .str.replace(r'(?i)\s*comment\s*$', '', regex=True)
          .str.strip()
    )
    df["__key"] = df["hospital_name"].map(norm_name)

    # Safe merge
    if "detail_url" in df.columns:
        df = df.rename(columns={"detail_url": "detail_url_existing"})

    df = df.merge(df_links[["__key", "anchor_name", "detail_url_from_anchor"]],
                  on="__key", how="left")
    df["hospital_name"] = df["anchor_name"].fillna(df["hospital_name"])
    existing = series_or_na(df, "detail_url_existing")
    df["detail_url"] = df["detail_url_from_anchor"].where(df["detail_url_from_anchor"].notna(), existing)

    # Cleanup
    for col in ["anchor_name", "detail_url_from_anchor", "detail_url_existing"]:
        if col in df.columns:
            df.drop(columns=[col], inplace=True)
    return df

In [16]:
html = requests.get(METRO_URL, headers=HEADERS, timeout=25).text
soup = BeautifulSoup(html, "html.parser")

# read tables and pick the one with a Hospital column
tables = pd.read_html(html)
base = None
for t in tables:
    cols = [str(c).lower() for c in t.columns.astype(str)]
    if any("hospital" in c for c in cols):
        base = t.copy(); break
if base is None:
    raise RuntimeError("Could not find the hospital table on the metro page.")

# normalize and keep core fields
base.columns = [str(c).strip().lower().replace("name", "hospital_name") for c in base.columns]
name_col = [c for c in base.columns if "hospital" in c][0]
city_col = next((c for c in base.columns if "city" in c), None)
wait_col = next((c for c in base.columns if "wait" in c and "time" in c), None)

df = pd.DataFrame({
    "hospital_name": base[name_col].astype(str).str.strip(),
    "city": base[city_col] if city_col else pd.NA,
    "wait_text": base[wait_col] if wait_col else pd.NA,
})
df["wait_minutes"] = df["wait_text"].apply(to_minutes)

# attach detail_url from anchors
anchors = soup.select('a[href*="/hospital-ratings/"]')
link_rows = []
for a in anchors:
    text = a.get_text(strip=True)
    href = (a.get("href") or "").strip()
    if text and href:
        link_rows.append({
            "__key": norm_name(text),
            "anchor_name": text,
            "detail_url": urljoin(BASE_URL, href),
        })
df_links = pd.DataFrame(link_rows).drop_duplicates(subset="__key")

df["hospital_name"] = df["hospital_name"].str.replace(r'(?i)\s*comment\s*$', '', regex=True).str.strip()
df["__key"] = df["hospital_name"].map(norm_name)
df = df.merge(df_links, on="__key", how="left")
df["hospital_name"] = df["anchor_name"].fillna(df["hospital_name"])
df.drop(columns=["anchor_name","__key"], inplace=True)

# write staging + timeseries
now_iso = dt.datetime.now(dt.timezone.utc).isoformat()
df["scrape_ts"] = now_iso
df["source_url"] = METRO_URL

staging = pd.DataFrame({
    "source": "hospitalstats",
    "name": df["hospital_name"],
    "city": df["city"],
    "wait_minutes": df["wait_minutes"].astype("Int64"),
    "source_url": df["source_url"],
    "detail_url": df["detail_url"],
    "raw_wait_text": df["wait_text"],
    "scrape_ts": df["scrape_ts"],
})

timeseries = pd.DataFrame({
    "source": staging["source"],
    "name": staging["name"],
    "city": staging["city"],
    "ts": staging["scrape_ts"],
    "wait_minutes": staging["wait_minutes"],
    "source_url": staging["source_url"],
})

stamp = dt.datetime.utcnow().strftime("%Y%m%d%H%M%S")
outdir = Path("out") / stamp
outdir.mkdir(parents=True, exist_ok=True)

staging.to_csv(outdir / "staging_chicago.csv", index=False)
timeseries.to_csv(outdir / "wait_times_timeseries.csv", index=False)
staging.to_csv("staging_chicago.csv", index=False)
timeseries.to_csv("wait_times_timeseries.csv", index=False)

  tables = pd.read_html(html)
  stamp = dt.datetime.utcnow().strftime("%Y%m%d%H%M%S")


In [17]:
# Extract additional metrics from a hospital's detail page:
def parse_detail_page(html: str, url: str) -> dict:
    """ 
        address, phone, hospital type, emergency services,
        mortality rates (overall, heart attack, stroke, heart failure, pneumonia),
        infection cases (C. Diff, MRSA),
        average ER wait time,
        patient ratings (overall, positive points, negative points)
    """
    soup = BeautifulSoup(html, "html.parser")
    h1 = soup.find(["h1","h2"])
    name = clean(h1.get_text(" ", strip=True)) if h1 else None

    # address/phone block
    address=city=state=postal=phone=None
    left = soup.select('div[style*="float:left"][style*="width:40%"]')
    if left:
        addr_html = left[0]
        lines = [x for x in addr_html.get_text("\n", strip=True).split("\n") if x]
        if lines: address = clean(lines[0])
        if len(lines)>=2:
            m = re.search(r"(.+?),\s*([A-Z]{2})\s+(\d{5}(?:-\d{4})?)", lines[1])
            if m: city, state, postal = clean(m.group(1)), m.group(2), m.group(3)
        b_phone = addr_html.find("b", string=re.compile(r"^\s*Phone\s*:\s*$", re.I))
        if b_phone:
            parts = []
            for sib in b_phone.next_siblings:
                if isinstance(sib, Tag) and sib.name == "br":
                    break
                parts.append(sib)
            maybe_phone = safe_join(parts)          
            pm = re.search(r"\(?\d{3}\)?[ -]?\d{3}[ -]?\d{4}", maybe_phone)
            if pm:
                phone = pm.group(0)

    hosp_type = text_after_b(soup, r"Hospital\s*Type")
    emergency_services = text_after_b(soup, r"Emergency\s*Services")
    if emergency_services:
        es = emergency_services.upper()
        emergency_services = "YES" if "YES" in es else ("NO" if "NO" in es else emergency_services)

    # quality section
    quality_hdr = soup.find(id="Quality")
    mort_text=mort_pct=mort_dir=None
    ha=st=hf=pn=None
    if quality_hdr:
        span = quality_hdr.find_next("span", class_="bigstat")
        if span:
            mort_text = clean(span.get_text(" ", strip=True))
            m = re.search(r"(\d+(?:\.\d+)?)\s*%", mort_text);  mort_pct = float(m.group(1)) if m else None
            d = re.search(r"\b(better|worse)\b", mort_text, re.I); mort_dir = d.group(1).lower() if d else None
        tbl = quality_hdr.find_next("table")
        if tbl:
            for tr in tbl.find_all("tr"):
                th = tr.find("th"); tds = tr.find_all("td")
                if not th or not tds: continue
                label = clean(th.get_text(" ", strip=True))
                pct = to_percent(tds[0].get_text(" ", strip=True))
                if   re.search(r"Heart Attack", label, re.I): ha = pct
                elif re.search(r"Stroke", label, re.I):       st = pct
                elif re.search(r"Heart Failure", label, re.I): hf = pct
                elif re.search(r"Pneumonia", label, re.I):     pn = pct

    # infections
    c_diff=mrsa=None
    inf = soup.find(id="infectious")
    if inf:
        tbl = inf.find_next("table")
        if tbl:
            for tr in tbl.find_all("tr"):
                tds = tr.find_all("td")
                if len(tds)!=2: continue
                label = tds[0].get_text(" ", strip=True)
                val = re.sub(r"[^\d]","", tds[1].get_text(" ", strip=True)) or None
                val = int(val) if val else None
                if re.search(r"C\.\s*Diff", label, re.I): c_diff = val
                if re.search(r"MRSA", label, re.I):       mrsa   = val

    # ER wait
    avg_ed_min=None
    er = soup.find(id="erwait")
    if er:
        span = (er.find_parent() or er).find("span","bigstat")
        if span: avg_ed_min = to_minutes(span.get_text(" ", strip=True))

    # patient ratings
    overall_patient_rating = None
    positive_points = negative_points = None

    pr_hdr = soup.find(id="patientratings")
    if pr_hdr:
        # Overall rating 
        span = pr_hdr.find_next("span", class_="bigstat")
        if span:
            overall_patient_rating = clean(span.get_text(" ", strip=True))

        # positive box
        pos_h3 = soup.find("h3", string=re.compile(r"Positive\s+Patient\s+Ratings", re.I))
        if pos_h3:
            pos_box = pos_h3.find_parent("div")
            if pos_box:
                ul = pos_box.find("ul") 
                if ul and ul.find_all("li"):
                    positive_points = "; ".join(
                        clean(li.get_text(" ", strip=True)) for li in ul.find_all("li")
                    )
                else:
                    # Some pages have no <ul>, just "No consistently positive ratings"
                    raw = clean(pos_box.get_text(" ", strip=True))
                    raw = re.sub(r"^\s*Positive\s+Patient\s+Ratings\s*", "", raw, flags=re.I).strip()
                    if re.search(r"No\s+consistently\s+positive\s+ratings", raw, re.I):
                        positive_points = None  
                    elif raw:
                        positive_points = raw  

        # negative box 
        neg_h3 = soup.find("h3", string=re.compile(r"Negative\s+Patient\s+Ratings", re.I))
        if neg_h3:
            neg_box = neg_h3.find_parent("div")
            if neg_box:
                ul = neg_box.find("ul")
                if ul and ul.find_all("li"):
                    negative_points = "; ".join(
                        clean(li.get_text(" ", strip=True)) for li in ul.find_all("li")
                    )

        # Safety: if positive == negative (selector leak), blank out positive
        if positive_points and negative_points and positive_points == negative_points:
            positive_points = None

    return {
        "detail_url": url,
        "detail_name": name,
        "detail_address": address, "detail_city": city, "detail_state": state, "detail_zip": postal, "detail_phone": phone,
        "detail_hospital_type": hosp_type,
        "detail_emergency_services": emergency_services,
        "detail_mortality_overall_text": mort_text,
        "detail_mortality_overall_percent": mort_pct,
        "detail_mortality_overall_direction": mort_dir,
        "detail_mortality_heart_attack_percent": ha,
        "detail_mortality_stroke_percent": st,
        "detail_mortality_heart_failure_percent": hf,
        "detail_mortality_pneumonia_percent": pn,
        "detail_c_diff_cases": c_diff,
        "detail_mrsa_cases": mrsa,
        "detail_avg_time_in_ed_minutes": avg_ed_min,
        "detail_overall_patient_rating": overall_patient_rating,
        "detail_positive_patient_ratings": positive_points,
        "detail_negative_patient_ratings": negative_points,
    }

In [18]:
# visit each row's 'detail_url' (if present) and enrich the DataFrame in place
def enrich_df_inline(df: pd.DataFrame, delay: float = 0.8) -> pd.DataFrame:
    """ 
    Parameters
        df : pd.DataFrame
            Frame expected to contain a 'detail_url' column with absolute URLs.
        delay : float, default 0.8
            Sleep duration between requests to avoid overloading the site.
    Returns
        pd.DataFrame
            A new DataFrame with added/updated columns from detail pages.
    """
    if "detail_url" not in df.columns:
        raise ValueError("DataFrame must contain a 'detail_url' column.")
    urls = df["detail_url"].dropna().astype(str).unique().tolist()
    rows = []
    with requests.Session() as s:
        s.headers.update(HEADERS)
        for i, url in enumerate(urls, 1):
            try:
                time.sleep(delay)
                r = s.get(url, timeout=30)
                r.raise_for_status()
                rows.append(parse_detail_page(r.text, url))
            except Exception as e:
                rows.append({"detail_url": url, "detail_error": str(e)})
            print(f"[{i}/{len(urls)}] {url}")
    df_detail = pd.DataFrame(rows)
    return df.merge(df_detail, on="detail_url", how="left")

df_staging = pd.read_csv("staging_chicago.csv")
df_enriched = enrich_df_inline(df_staging, delay=0.8)
df_enriched.to_csv("staging_chicago_enriched.csv", index=False)
df_enriched.to_csv(outdir / "staging_chicago_enriched.csv", index=False)


[1/30] https://www.hospitalstats.org/hospital-ratings/presence-saint-joseph-hospital--chicago-chicago-il.htm
[2/30] https://www.hospitalstats.org/hospital-ratings/loretto-hospital-chicago-il.htm
[3/30] https://www.hospitalstats.org/hospital-ratings/thorek-memorial-hospital-chicago-il.htm
[4/30] https://www.hospitalstats.org/hospital-ratings/insight-hospital-and-medical-center-chicago-chicago-il.htm
[5/30] https://www.hospitalstats.org/hospital-ratings/holy-cross-hospital-chicago-il.htm
[6/30] https://www.hospitalstats.org/hospital-ratings/jackson-park-hospital-chicago-il.htm
[7/30] https://www.hospitalstats.org/hospital-ratings/humboldt-park-health-chicago-il.htm
[8/30] https://www.hospitalstats.org/hospital-ratings/provident-hospital-of-chicago-chicago-il.htm
[9/30] https://www.hospitalstats.org/hospital-ratings/saint-anthony-hospital-chicago-il.htm
[10/30] https://www.hospitalstats.org/hospital-ratings/methodist-hospital-of-chicago-chicago-il.htm
[11/30] https://www.hospitalstats.org

Scraping for the Midwest

In [6]:
BASE_URL_MIDWEST = "https://www.hospitalstats.org/ER-Wait-Time/"

def get_midwest_state_links(base_url: str = BASE_URL_MIDWEST) -> pd.DataFrame:
    """Build the table: state_abbr + absolute link to '<ABBR>-Counties.htm'."""
    r = requests.get(base_url, timeout=30, headers={"User-Agent": "MidwestERScraper/1.0"})
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    # Find the section with the state list
    container = None
    for h3 in soup.find_all("h3"):
        if "Browse Emergency Room Stats by State" in h3.get_text(strip=True):
            container = h3.parent
            break
    anchors = container.find_all("a", href=True) if container else soup.find_all("a", href=True)

    rows = []
    for a in anchors:
        href = a["href"]
        # Looks like "IL-Counties.htm"
        if re.fullmatch(r"[A-Z]{2}-Counties\.htm", href):
            abbr = a.get_text(strip=True).upper() or href.split("-")[0]
            if abbr in MIDWEST_ABBR:
                rows.append({
                    "state_abbr": abbr,
                    "midwest_state_link": urljoin(base_url, href)
                })
    return pd.DataFrame(rows).drop_duplicates().sort_values("state_abbr").reset_index(drop=True)

def discover_counties_for_state(state_abbr: str, state_url: str) -> pd.DataFrame:
    """
    For a given state '<ABBR>-Counties.htm' page, return county name + absolute URL.
    It targets anchors like:
        <a href="Ada-County-ID.htm">Ada</a>
    """
    r = requests.get(state_url, timeout=30, headers={"User-Agent": "MidwestERScraper/1.0"})
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    # Focus on the main content area if present; fallback to whole page
    content = soup.find(id="content") or soup
    out = []
    for a in content.find_all("a", href=True):
        href = a["href"]
        name = a.get_text(" ", strip=True)
        # County links typically contain "-County-" and end with ".htm"
        if "-County-" in href and href.lower().endswith(".htm"):
            out.append({
                "state_abbr": state_abbr,
                "county_name": name,
                "county_url": urljoin(BASE_URL, href)
            })
    return pd.DataFrame(out).drop_duplicates(subset=["county_url"]).reset_index(drop=True)

def build_midwest_counties_df(include_county_page_html: bool = False) -> pd.DataFrame:
    """
    1) Gets the Midwest states table (abbr + link)
    2) Visits each state page to list counties (name + absolute URL)
    3) Optionally fetches every county page HTML into a 'county_html' column
    """
    df_states = get_midwest_state_links()
    all_parts = []
    for _, row in df_states.iterrows():
        part = discover_counties_for_state(row["state_abbr"], row["midwest_state_link"])
        all_parts.append(part)
    df_counties = pd.concat(all_parts, ignore_index=True) if all_parts else pd.DataFrame(
        columns=["state_abbr","county_name","county_url"]
    )

    if include_county_page_html and not df_counties.empty:
        htmls = []
        for u in df_counties["county_url"].tolist():
            rr = requests.get(u, timeout=30, headers={"User-Agent": "MidwestERScraper/1.0"})
            rr.raise_for_status()
            htmls.append(rr.text)
        df_counties["county_html"] = htmls

    return df_counties

# Table of Midwest counties with names + absolute URLs:
df_midwest_counties = build_midwest_counties_df(include_county_page_html=False)
df_midwest_counties.head(10)


Unnamed: 0,state_abbr,county_name,county_url
0,IA,Adair,https://www.hospitalstats.org/Adair-County-IA.htm
1,IA,Adams,https://www.hospitalstats.org/Adams-County-IA.htm
2,IA,Allamakee,https://www.hospitalstats.org/Allamakee-County...
3,IA,Appanoose,https://www.hospitalstats.org/Appanoose-County...
4,IA,Audubon,https://www.hospitalstats.org/Audubon-County-I...
5,IA,Black Hawk,https://www.hospitalstats.org/Black-Hawk-Count...
6,IA,Boone,https://www.hospitalstats.org/Boone-County-IA.htm
7,IA,Bremer,https://www.hospitalstats.org/Bremer-County-IA...
8,IA,Buchanan,https://www.hospitalstats.org/Buchanan-County-...
9,IA,Buena Vista,https://www.hospitalstats.org/Buena-Vista-Coun...


In [None]:
def parse_county(county_url: str, county_html: str | None = None, delay: float = 0.8):
    """
    Return an enriched DataFrame for one county by:
      1) parsing hospitals from county HTML (read_html fast-path, else BS4 fallback)
      2) attaching detail page URLs
      3) visiting each detail page to enrich metrics
    """
    html = county_html if county_html is not None else fetch_html(county_url)

    # 1) county hospitals list
    df = try_read_html_tables(html)
    if df is None or getattr(df, "empty", True):
        df = bs4_fallback_extract(html)

    # 2) attach detail page links & 3) enrich in-line
    df = attach_detail_urls(html, df)
    df_enriched = enrich_df_inline(df, delay=delay)
    return df_enriched


In [None]:
county_frames = {}  
errors = []

for i, row in df_midwest_counties.iterrows():
    state_abbr = row["state_abbr"]
    county_name = row["county_name"]
    county_url  = row["county_url"]
    county_html = row.get("county_html", None)

    key = (state_abbr, county_name)
    try:
        df_enriched = parse_county(county_url, county_html=county_html, delay=0.8)
        # stamp county/state context onto each row for downstream grouping/filters
        if not df_enriched.empty:
            df_enriched = df_enriched.copy()
            if "State" not in df_enriched.columns:
                df_enriched["State"] = state_abbr  # harmonize with existing schema
            if "County" not in df_enriched.columns:
                df_enriched["County"] = county_name
        county_frames[key] = df_enriched

        print(f"[OK] {state_abbr} – {county_name}: {len(df_enriched)} rows")

    except Exception as e:
        errors.append({"state_abbr": state_abbr, "county_name": county_name,
                       "county_url": county_url, "error": str(e)[:500]})
        print(f"[ERR] {state_abbr} – {county_name}: {e}")


In [None]:
if county_frames:
    df_all_midwest_hospitals = pd.concat(
        [v for v in county_frames.values() if v is not None],
        ignore_index=True
    )
else:
    df_all_midwest_hospitals = pd.DataFrame()

print("All Midwest hospitals:", df_all_midwest_hospitals.shape)
if not df_all_midwest_hospitals.empty:
    if "detail_error" in df_all_midwest_hospitals.columns:
        df_all_midwest_hospitals = df_all_midwest_hospitals[
            df_all_midwest_hospitals["detail_error"].isna() | (df_all_midwest_hospitals["detail_error"] == "")
        ].drop(columns=["detail_error"])
    if "hospital" in df_all_midwest_hospitals.columns:
        df_all_midwest_hospitals = df_all_midwest_hospitals[
            df_all_midwest_hospitals["hospital"].notna() & (df_all_midwest_hospitals["hospital"].astype(str).str.strip() != "")
        ].reset_index(drop=True)
df_all_midwest_hospitals.head(20)
#export as a CSV
df_all_midwest_hospitals.to_csv("midwest_hospitals_enriched.csv", index=False)


All Midwest hospitals: (1059, 27)


  df_all_midwest_hospitals = pd.concat(
