In [40]:
import os
import re
import time
import requests
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from urllib.parse import urlparse
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException


holdings = pd.read_excel("Higher Ed cusips.xlsx",sheet_name='Holdings')
index = pd.read_excel("Higher Ed cusips.xlsx",sheet_name='Index')

df = pd.concat([holdings[['Cusip 8','CREDIT']],index[['Cusip 8','CREDIT']]]).reset_index(drop=True)

df = df.groupby('CREDIT')['Cusip 8'].first().reset_index() #get unique universities and a cusip for each university

list_cusip = df['Cusip 8'].to_list()

In [42]:
# check number of unique names\
print(f"# of unique universities {len(df)}")

# of unique universities 529


In [29]:
def click_disclosure_tab_with_retry(driver, retries=3):
    for attempt in range(retries):
        try:
            disclosure_tab = WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.XPATH, '//a[@href="#tabDisclosureDocuments"]'))
            )
            driver.execute_script('arguments[0].scrollIntoView({block: "center"});', disclosure_tab)
            time.sleep(0.4)
            driver.execute_script("arguments[0].click();", disclosure_tab)
            return
        except StaleElementReferenceException:
            print(f"Attempt {attempt + 1}: Disclosure tab became stale. Retrying...")
            time.sleep(1)
    raise Exception("Could not click Disclosure tab after several attempts.")


def extract_tooltip_pdfs(driver):
    soup = BeautifulSoup(driver.page_source, "html.parser")
    results = []

    for tooltip in soup.select("a.ihpQtipHelp.rtTip[help]"):
        help_html = tooltip.get("help")
        section_name = tooltip.get_text(strip=True)  # This gives "Annual Financial Disclosures Posted 11/25/2024"
        if not help_html:
            continue

        inner_soup = BeautifulSoup(help_html, "html.parser")
        for a in inner_soup.find_all("a"):
            href = a.get("href")
            doc_text = a.text.strip()
            if href and href.endswith(".pdf"):
                full_url = f"https://emma.msrb.org{href}" if not href.startswith("http") else href
                # Prepend the section name to the document text
                combined_name = f"{section_name} - {doc_text}"
                results.append({
                    "document_name": combined_name,
                    "pdf_url": full_url
                })
    return results

def handle_cookie_consent(driver):
    try:
        accept_button = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.ID, "ctl00_mainContentArea_disclaimerContent_yesButton"))
        )
        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", accept_button)
        time.sleep(0.3)
        accept_button.click()
        print("Clicked 'Accept' button")

        # Simulate a click to release focus/overlay trap
        time.sleep(0.5)
        body = driver.find_element(By.TAG_NAME, "body")
        webdriver.ActionChains(driver).move_to_element_with_offset(body, 0, 0).click().perform()
        print("Performed dummy click to re-enable page interaction")

    except TimeoutException:
        print("Cookie disclaimer not found")
    except Exception as e:
        print(f"Error during cookie handling: {e}")



## Scraper

In [32]:

# Initialize chrome driver
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
driver = webdriver.Chrome(options=options)
driver.get("https://emma.msrb.org/")

cookie_handled = False


final_df = pd.DataFrame()

start_time = time.time()

for c in list_cusip:
    print(f"\nProcessing CUSIP: {c}")
    try:
        search_box = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "quickSearchText"))
        )
        search_box.clear()
        search_box.send_keys(c)
        search_box.send_keys(Keys.RETURN)

        # Handle cookies only once
        if not cookie_handled:
            handle_cookie_consent(driver)
            cookie_handled = True

        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.XPATH, '//ul[contains(@class, "ui-tabs-nav")]'))
        )

        click_disclosure_tab_with_retry(driver)

        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.ID, "tabDisclosureDocuments"))
        )

        print("Disclosure tab opened")

        # Select filter "All"
        try:
            radio = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, 'input[name="Filter.SelectedPredefinedDateRange"][value="All"]'))
            )
            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", radio)
            time.sleep(0.3)
            if not radio.get_attribute("checked"):
                radio.click()
            print("Clicked 'All' filter")
        except Exception as e:
            print(f"Could not click 'All' filter: {e}")

        pdf_data = []
        pdf_links = driver.find_elements(By.XPATH, '//a[contains(@ga-name, "clickDisclosureDocuments") and contains(@href, ".pdf")]')

        for i, link in enumerate(pdf_links):
            try:
                text = link.text.strip()
                href = link.get_attribute("href")
                if text and href and href.endswith(".pdf"):
                    full_url = href if href.startswith("http") else f"https://emma.msrb.org{href}"
                    pdf_data.append({
                        "CUSIP": c,
                        "document_name": text,
                        "pdf_url": full_url
                    })
            except StaleElementReferenceException:
                print(f"Skipping stale link at index {i} for CUSIP {c}")

        hidden_pdfs = extract_tooltip_pdfs(driver)
        for pdf in hidden_pdfs:
            pdf["CUSIP"] = c
            pdf_data.append(pdf)

        if pdf_data:
            temp_df = pd.DataFrame(pdf_data)
            final_df = pd.concat([final_df, temp_df], ignore_index=True)
            print(f"Collected {len(pdf_data)} PDFs for CUSIP {c}")
        else:
            print(f"No PDFs found for CUSIP {c}")

    except Exception as e:
        print(f"Error while processing {c}: {e}")

end_time = time.time()
execution_time = end_time - start_time
print(f"The loop took {execution_time} seconds to execute.")
print("\nFinished. Here's a preview of your results:")
print(final_df.head())
driver.quit()

# output pdf
final_df = pd.merge(final_df,df,how='left',left_on='CUSIP',right_on='Cusip 8')
final_ df = final_df[['CUSIP','document_name','pdf_url','CREDIT']]
final_df.to_csv('disclosure_document_list.csv')



Processing CUSIP: 650348AR
Clicked 'Accept' button
Performed dummy click to re-enable page interaction
Disclosure tab opened
Clicked 'All' filter
Collected 26 PDFs for CUSIP 650348AR

Processing CUSIP: 006383AU
Attempt 1: Disclosure tab became stale. Retrying...
Disclosure tab opened
Clicked 'All' filter
Collected 10 PDFs for CUSIP 006383AU

Processing CUSIP: 424682KJ
Attempt 1: Disclosure tab became stale. Retrying...
Disclosure tab opened
Clicked 'All' filter
Collected 11 PDFs for CUSIP 424682KJ

Processing CUSIP: 01025QAW
Attempt 1: Disclosure tab became stale. Retrying...
Disclosure tab opened
Clicked 'All' filter
Skipping stale link at index 35 for CUSIP 01025QAW
Skipping stale link at index 36 for CUSIP 01025QAW
Skipping stale link at index 37 for CUSIP 01025QAW
Skipping stale link at index 38 for CUSIP 01025QAW
Skipping stale link at index 39 for CUSIP 01025QAW
Skipping stale link at index 40 for CUSIP 01025QAW
Skipping stale link at index 41 for CUSIP 01025QAW
Skipping stale l

KeyboardInterrupt: 

In [44]:
# ------------------------- CONFIG -------------------------
CSV_FILE = "disclosure_document_list.csv"
ROOT_DIR = Path("university_pdfs")
TMP_DIR = Path("__tmp_downloads")           # Selenium download dir
WAIT_TIME = 10                              # Wait seconds after opening PDF
TIMEOUT = 20
SLEEP = 0.3
# ----------------------------------------------------------

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
    )
}

def slugify(text):
    return re.sub(r"[^\w\-. ]", "_", text).strip().replace(" ", "_")

def is_direct_pdf(url):
    return url.lower().endswith(".pdf")

def setup_browser(download_dir):
    chrome_opts = Options()
    # REMOVE HEADLESS TO SEE WHAT’S HAPPENING
    # chrome_opts.add_argument("--headless")
    chrome_opts.add_argument("--no-sandbox")
    chrome_opts.add_argument("--disable-gpu")
    chrome_opts.add_argument("--disable-dev-shm-usage")
    chrome_opts.add_experimental_option("prefs", {
        "download.default_directory": str(download_dir.resolve()),
        "download.prompt_for_download": False,
        "plugins.always_open_pdf_externally": True,  # Don't render PDF in-browser
    })
    return webdriver.Chrome(options=chrome_opts)

def download_via_requests(url, dest_path):
    try:
        r = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
        if r.status_code == 403:
            return False
        r.raise_for_status()
        dest_path.parent.mkdir(parents=True, exist_ok=True)
        with open(dest_path, "wb") as f:
            f.write(r.content)
        return True
    except Exception as e:
        print(f"[requests fail] {url} → {e}")
        return False

def download_via_chrome(driver, url, dest_path):
    TMP_DIR.mkdir(exist_ok=True)
    for f in TMP_DIR.glob("*"):
        f.unlink()

    try:
        driver.get(url)
        time.sleep(WAIT_TIME)

        pdf_files = list(TMP_DIR.glob("*.pdf"))
        if not pdf_files:
            print(f"[chrome fail] No PDF found for {url}")
            return False

        pdf_file = max(pdf_files, key=os.path.getctime)
        dest_path.parent.mkdir(parents=True, exist_ok=True)
        pdf_file.rename(dest_path)
        return True

    except Exception as e:
        print(f"[chrome error] {url} → {e}")
        return False

def main():
    df = pd.read_csv(CSV_FILE).dropna(subset=["CREDIT", "pdf_url", "document_name"])
    driver = setup_browser(TMP_DIR)

    try:
        for credit, group in tqdm(df.groupby("CREDIT"), desc="Universities"):
            credit_folder = ROOT_DIR / slugify(credit)

            for _, row in group.iterrows():
                url = row["pdf_url"]
                name = slugify(row["document_name"])
                ext = Path(urlparse(url).path).suffix or ".pdf"
                target = credit_folder / f"{name}{ext}"

                if target.exists():
                    continue

                success = download_via_requests(url, target)
                if not success:
                    print(f"[Fallback → Chrome UI] {url}")
                    success = download_via_chrome(driver, url, target)

                time.sleep(SLEEP)
    finally:
        driver.quit()
        for f in TMP_DIR.glob("*"):
            f.unlink()
        TMP_DIR.rmdir()



if __name__ == "__main__":
    main()

Universities:   0%|                                     | 0/508 [00:00<?, ?it/s]

[Fallback → Chrome UI] https://emma.msrb.org/P21556096-P21202302-P21622256.pdf
[Fallback → Chrome UI] https://emma.msrb.org/P21908059-P21458935-P21907560.pdf
[Fallback → Chrome UI] https://emma.msrb.org/P21854951-P21420199-P21863705.pdf


Universities:   0%|                                     | 0/508 [00:32<?, ?it/s]


KeyboardInterrupt: 