In [43]:
import os
import re
import time
import requests
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from urllib.parse import urlparse
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
from selenium.webdriver.common.action_chains import ActionChains




In [45]:
# ------------------------- CONFIG -------------------------
# TEST_MODE selects between the "_test" inputs/outputs and the full ones.
TEST_MODE = True

if TEST_MODE:
    EXCEL_PATH = "Higher Ed cusips_test.xlsx"
    EXCEL_OUTPUT = "disclosure_document_list_test.csv"
    ROOT_DIR = Path("university_pdfs_test")
    FAILED_LOG_PATH = "failed_downloads_test.csv"
else:
    EXCEL_PATH = "Higher Ed cusips.xlsx"
    EXCEL_OUTPUT = "disclosure_document_list.csv"
    ROOT_DIR = Path("university_pdfs")
    FAILED_LOG_PATH = "failed_downloads.csv"

# Directory Chrome downloads into during the browser fallback.
TMP_DIR = Path("__tmp_downloads")

WAIT_TIME = 10  # seconds download_via_chrome waits for a Chrome download
TIMEOUT = 20    # seconds, passed as the requests.get timeout
SLEEP = 0.3     # seconds paused after each document in download_pdfs

# Sent with every requests download.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
}
# ----------------------------------------------------------

# Load CUSIP data
# 'Cusip 8' is read as text: left to type inference, a column whose
# identifiers are all digits can be parsed as numbers and lose leading
# zeros, which would send the wrong search term to EMMA.
_CUSIP_DTYPES = {'Cusip 8': str}
holdings = pd.read_excel(EXCEL_PATH, sheet_name='Holdings', dtype=_CUSIP_DTYPES)
index = pd.read_excel(EXCEL_PATH, sheet_name='Index', dtype=_CUSIP_DTYPES)
df_cusips = pd.concat([holdings[['Cusip 8', 'CREDIT']], index[['Cusip 8', 'CREDIT']]]).reset_index(drop=True)
# Keep one representative CUSIP per credit (the first non-null one).
df_cusips = df_cusips.groupby('CREDIT')['Cusip 8'].first().reset_index()
# A credit with no CUSIP at all has nothing to search for; drop it rather
# than feeding NaN into the search box later.
df_cusips = df_cusips.dropna(subset=['Cusip 8']).reset_index(drop=True)
list_cusip = df_cusips['Cusip 8'].to_list()

def slugify(text):
    """Turn *text* into a token usable as a file or folder name.

    Every character that is not a word character, hyphen, dot or space is
    replaced by an underscore; the result is stripped of surrounding
    whitespace and each remaining space becomes an underscore.
    """
    sanitized = re.sub(r"[^\w\-. ]", "_", text)
    trimmed = sanitized.strip()
    return "_".join(trimmed.split(" "))

def setup_browser(download_dir):
    """Launch a Chrome WebDriver whose downloads land in *download_dir*.

    Args:
        download_dir: a path object; its resolved absolute form is handed to
            Chrome as the default download directory.

    Returns:
        The ``webdriver.Chrome`` instance created with these options.
    """
    options = Options()
    for flag in ("--no-sandbox", "--disable-gpu", "--disable-dev-shm-usage"):
        options.add_argument(flag)

    # Download silently into download_dir, and ask Chrome to hand PDFs to the
    # downloader rather than its built-in viewer.
    download_prefs = {
        "download.default_directory": str(download_dir.resolve()),
        "download.prompt_for_download": False,
        "plugins.always_open_pdf_externally": True,
    }
    options.add_experimental_option("prefs", download_prefs)
    return webdriver.Chrome(options=options)

def handle_cookie_consent(driver):
    """Accept the disclaimer prompt if it appears, then click the page body.

    Waits up to 5 seconds for the disclaimer's "yes" button to become
    clickable. Any failure is reported with ``print`` and swallowed so the
    caller can carry on; nothing is returned.
    """
    try:
        button_locator = (By.ID, "ctl00_mainContentArea_disclaimerContent_yesButton")
        yes_button = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable(button_locator)
        )
        # Centre the button in the viewport before clicking it.
        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", yes_button)
        time.sleep(0.3)
        yes_button.click()
        print("Clicked 'Accept' button")

        time.sleep(0.5)
        # Follow up with a click on <body> at offset (0, 0).
        page_body = driver.find_element(By.TAG_NAME, "body")
        dummy_click = ActionChains(driver).move_to_element_with_offset(page_body, 0, 0).click()
        dummy_click.perform()
        print("Performed dummy click")
    except TimeoutException:
        # The button never became clickable within the wait.
        print("No cookie banner found")
    except Exception as e:
        print(f"Cookie error: {e}")

def click_disclosure_tab_with_retry(driver, retries=3):
    """Click the "Disclosure Documents" tab link, retrying on stale elements.

    Args:
        driver: the Selenium WebDriver showing the security's page.
        retries: maximum number of attempts.

    Raises:
        Exception: when every attempt hit a StaleElementReferenceException.
        Any other error (for example the wait timing out) propagates
        immediately without a retry.
    """
    tab_xpath = '//a[@href="#tabDisclosureDocuments"]'
    for attempt_number in range(1, retries + 1):
        try:
            tab_link = WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.XPATH, tab_xpath))
            )
            driver.execute_script('arguments[0].scrollIntoView({block: "center"});', tab_link)
            time.sleep(0.4)
            # Click through JavaScript instead of WebElement.click().
            driver.execute_script("arguments[0].click();", tab_link)
        except StaleElementReferenceException:
            print(f"Attempt {attempt_number}: Stale tab. Retrying...")
            time.sleep(1)
        else:
            return
    raise Exception("Could not click Disclosure tab")

def extract_tooltip_pdfs(driver):
    """Collect PDF links embedded in tooltip markup on the current page.

    Each ``a.ihpQtipHelp.rtTip`` anchor carries an HTML fragment in its
    ``help`` attribute; that fragment is parsed and every anchor inside it
    whose href ends in ".pdf" is reported.

    Args:
        driver: Selenium WebDriver whose ``page_source`` is parsed.

    Returns:
        list[dict]: one ``{"document_name", "pdf_url"}`` dict per PDF anchor,
        where ``document_name`` is "<tooltip text> - <link text>" and
        ``pdf_url`` is absolute.
    """
    # Local import: the file-level import only brings in urlparse.
    from urllib.parse import urljoin

    soup = BeautifulSoup(driver.page_source, "html.parser")
    results = []
    for tooltip in soup.select("a.ihpQtipHelp.rtTip[help]"):
        help_html = tooltip.get("help")
        if not help_html:
            continue
        section_name = tooltip.get_text(strip=True)
        inner_soup = BeautifulSoup(help_html, "html.parser")
        for a in inner_soup.find_all("a"):
            href = (a.get("href") or "").strip()
            if not href.endswith(".pdf"):
                continue
            results.append({
                "document_name": f"{section_name} - {a.text.strip()}",
                # urljoin leaves absolute URLs untouched and resolves every
                # relative form (with or without a leading slash, or
                # protocol-relative) against the site root; plain string
                # concatenation only worked for hrefs starting with "/".
                "pdf_url": urljoin("https://emma.msrb.org/", href),
            })
    return results

def download_via_requests(url, dest_path):
    """Download *url* to *dest_path* with ``requests``.

    Args:
        url: address of the document.
        dest_path: ``pathlib.Path`` of the file to create; parent directories
            are created as needed.

    Returns:
        bool: True when the file was written. False on HTTP 403, on any
        other request or filesystem error (which is printed), or when the
        target is a ``.pdf`` but the response body does not look like a PDF.
        Callers treat False as the cue to try another download method.
    """
    try:
        r = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
        if r.status_code == 403:
            return False
        r.raise_for_status()

        # A 200 response is not necessarily the document (it may be an HTML
        # page). Saving that under a .pdf name would look like a successful
        # download, so require the PDF signature near the start of the body.
        if dest_path.suffix.lower() == ".pdf" and b"%PDF" not in r.content[:1024]:
            print(f"[requests fail] {url} → response is not a PDF")
            return False

        dest_path.parent.mkdir(parents=True, exist_ok=True)
        # Write to a sibling ".part" file and rename once complete, so an
        # interrupted write never leaves a truncated file at dest_path that a
        # later existence check would mistake for a finished download.
        part_path = dest_path.with_name(dest_path.name + ".part")
        with open(part_path, "wb") as f:
            f.write(r.content)
        part_path.replace(dest_path)
        return True
    except Exception as e:
        print(f"[requests fail] {url} → {e}")
        return False

def download_via_chrome(driver, url, dest_path):
    """Download *url* by navigating Chrome to it, then move the file.

    The driver is expected to download into ``TMP_DIR``. The directory is
    emptied first so that only the file produced by this navigation can be
    picked up.

    Args:
        driver: Selenium WebDriver used for the navigation.
        url: address of the document.
        dest_path: ``pathlib.Path`` the downloaded file is renamed to.

    Returns:
        bool: True when a ``.pdf`` appeared in ``TMP_DIR`` within
        ``WAIT_TIME`` seconds and was moved to *dest_path*; False otherwise
        (the reason is printed).
    """
    try:
        # Cleanup lives inside the try so a filesystem error here is reported
        # as a failed download instead of aborting the caller's whole run.
        TMP_DIR.mkdir(exist_ok=True)
        for leftover in TMP_DIR.glob("*"):
            leftover.unlink()

        driver.get(url)

        # Poll instead of always sleeping the full WAIT_TIME: return as soon
        # as a finished PDF is present. While Chrome is still writing, the
        # file carries a ".crdownload" suffix.
        deadline = time.monotonic() + WAIT_TIME
        while True:
            still_downloading = any(TMP_DIR.glob("*.crdownload"))
            pdf_files = list(TMP_DIR.glob("*.pdf"))
            if pdf_files and not still_downloading:
                break
            if time.monotonic() >= deadline:
                break
            time.sleep(0.25)

        if not pdf_files:
            print(f"[chrome fail] No PDF for {url}")
            return False
        pdf_file = max(pdf_files, key=os.path.getctime)
        dest_path.parent.mkdir(parents=True, exist_ok=True)
        pdf_file.rename(dest_path)
        return True
    except Exception as e:
        print(f"[chrome error] {url} → {e}")
        return False

def download_pdfs(final_df):
    """Download every document listed in *final_df*, grouped by credit.

    Files are written to ``ROOT_DIR/<slugified CREDIT>/<slugified
    document_name><ext>``. Each URL is first fetched with
    ``download_via_requests``; if that fails, ``download_via_chrome`` is
    tried. Documents that fail both ways are written to ``FAILED_LOG_PATH``.

    A target that already exists is skipped, which makes reruns resume. It
    also means that two documents of the same credit whose names slugify to
    the same string produce a single file.

    Args:
        final_df: DataFrame with ``CREDIT``, ``document_name`` and
            ``pdf_url`` columns.
    """
    if final_df.empty:
        # Nothing to group; an empty frame may not even have a CREDIT column.
        print("No documents to download")
        return

    failed_downloads = []
    driver = None  # Chrome is launched only if a requests download fails
    try:
        for credit, group in tqdm(final_df.groupby("CREDIT"), desc="Universities"):
            folder = ROOT_DIR / slugify(credit)
            for _, row in group.iterrows():
                url = row["pdf_url"]
                name = slugify(row["document_name"])
                ext = Path(urlparse(url).path).suffix or ".pdf"
                target = folder / f"{name}{ext}"
                if target.exists():
                    continue
                success = download_via_requests(url, target)
                if not success:
                    print(f"[Chrome fallback] {url}")
                    if driver is None:
                        driver = setup_browser(TMP_DIR)
                    success = download_via_chrome(driver, url, target)
                if not success:
                    failed_downloads.append({
                        "CREDIT": credit,
                        "document_name": row["document_name"],
                        "pdf_url": url
                    })
                time.sleep(SLEEP)
    finally:
        try:
            if driver is not None:
                driver.quit()
        finally:
            # TMP_DIR is only created by the Chrome fallback; calling rmdir()
            # on a directory that was never created raises FileNotFoundError.
            if TMP_DIR.exists():
                for leftover in TMP_DIR.glob("*"):
                    leftover.unlink()
                TMP_DIR.rmdir()

    if failed_downloads:
        fail_df = pd.DataFrame(failed_downloads)
        fail_df.to_csv(FAILED_LOG_PATH, index=False)
        print(f"\nLogged {len(failed_downloads)} failed downloads to {FAILED_LOG_PATH}")

def filter_documents(df):
    """Keep the rows whose ``document_name`` mentions a financial-disclosure keyword.

    Matching is a case-insensitive substring test against a fixed keyword
    list. Rows with a missing ``document_name`` are dropped rather than
    raising, and an empty input comes back empty with its columns intact.

    Args:
        df: DataFrame with a ``document_name`` column.

    Returns:
        DataFrame: the matching rows of *df*, original index preserved.
    """
    keywords = [
        "annual disclosure",
        "financial statement",
        "financial disclosure",
        "audited financials",
        "continuing disclosure"
    ]
    pattern = "|".join(re.escape(keyword) for keyword in keywords)
    # The "string" dtype lets .str work even when the column is empty or
    # holds missing values; na=False turns a missing name into "no match".
    names = df["document_name"].astype("string").str.lower()
    # Cast to plain bool so the mask is always a row filter; an empty
    # object-dtype mask would be read as a (zero-length) column selection.
    mask = names.str.contains(pattern, regex=True, na=False).astype(bool)
    return df[mask]

# ------------------ MAIN SCRAPING SECTION ------------------
# For every CUSIP: search it on EMMA, open the disclosure tab, select the
# "All" date range, and record each PDF link (visible or tooltip-embedded).
cookie_handled = False
# One dict per PDF link. The DataFrame is built once after the loop instead
# of re-concatenating a growing frame on every CUSIP.
scraped_records = []
driver = webdriver.Chrome()
try:
    driver.get("https://emma.msrb.org/")

    start_time = time.time()

    for c in tqdm(list_cusip, desc="Scraping CUSIPs"):
        print(f"\nProcessing CUSIP: {c}")
        try:
            search_box = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.ID, "quickSearchText"))
            )
            search_box.clear()
            search_box.send_keys(c)
            search_box.send_keys(Keys.RETURN)

            # The disclaimer is only dealt with once, after the first search.
            if not cookie_handled:
                handle_cookie_consent(driver)
                cookie_handled = True

            # NOTE(review): from the second CUSIP on, this wait might be
            # satisfied by the previous result page before the new one has
            # loaded — confirm against the live site.
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.XPATH, '//ul[contains(@class, "ui-tabs-nav")]'))
            )

            click_disclosure_tab_with_retry(driver)

            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.ID, "tabDisclosureDocuments"))
            )

            # Best effort: widen the date filter to "All"; carry on if it fails.
            try:
                radio = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, 'input[name="Filter.SelectedPredefinedDateRange"][value="All"]'))
                )
                driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", radio)
                time.sleep(0.3)
                if not radio.get_attribute("checked"):
                    radio.click()
            except Exception as e:
                print(f"Filter click failed: {e}")

            pdf_data = []
            pdf_links = driver.find_elements(By.XPATH, '//a[contains(@ga-name, "clickDisclosureDocuments") and contains(@href, ".pdf")]')

            for i, link in enumerate(pdf_links):
                try:
                    text = link.text.strip()
                    href = link.get_attribute("href")
                    if text and href and href.endswith(".pdf"):
                        full_url = href if href.startswith("http") else f"https://emma.msrb.org{href}"
                        pdf_data.append({"CUSIP": c, "document_name": text, "pdf_url": full_url})
                except StaleElementReferenceException:
                    print(f"Skipping stale link {i} for CUSIP {c}")

            hidden_pdfs = extract_tooltip_pdfs(driver)
            for pdf in hidden_pdfs:
                pdf["CUSIP"] = c
                pdf_data.append(pdf)

            if pdf_data:
                scraped_records.extend(pdf_data)
                print(f"Collected {len(pdf_data)} PDFs for {c}")
            else:
                print(f"No PDFs found for {c}")

        except Exception as e:
            # One bad CUSIP must not stop the run.
            print(f"Error for CUSIP {c}: {e}")

    end_time = time.time()
    print(f"\nScraping took {end_time - start_time:.2f} seconds")
finally:
    # Close the browser even when the loop is interrupted (e.g. Ctrl-C).
    driver.quit()

if scraped_records:
    final_df = pd.merge(pd.DataFrame(scraped_records), df_cusips, how='left', left_on='CUSIP', right_on='Cusip 8')
    final_df = final_df[['CUSIP', 'document_name', 'pdf_url', 'CREDIT']]
else:
    # Nothing was collected: build the empty frame with its columns directly,
    # since merging a column-less frame on 'CUSIP' raises KeyError.
    final_df = pd.DataFrame(columns=['CUSIP', 'document_name', 'pdf_url', 'CREDIT'])
final_df.to_csv(EXCEL_OUTPUT)

if final_df.empty:
    print("No documents collected; nothing to download")
else:
    final_df = filter_documents(final_df)
    download_pdfs(final_df)


FileNotFoundError: [Errno 2] No such file or directory: 'Higher Ed cusips_test.xlsx'

In [48]:
# Debug check: show the working directory that the notebook's relative paths
# (EXCEL_PATH, ROOT_DIR, TMP_DIR, ...) are resolved against.
import os
os.getcwd()


FileNotFoundError: [Errno 2] No such file or directory