In [3]:
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
from bs4 import BeautifulSoup


holdings = pd.read_excel("Higher Ed cusips.xlsx",sheet_name='Holdings')
index = pd.read_excel("Higher Ed cusips.xlsx",sheet_name='Index')

df = pd.concat([holdings[['Cusip 8','CREDIT']],index[['Cusip 8','CREDIT']]]).reset_index(drop=True)

df = df.groupby('CREDIT')['Cusip 8'].first().reset_index() #get unique universities and a cusip for each university

list_cusip = df['Cusip 8'].to_list()

In [5]:
# check number of unique names\
print(f"# of unique universities {len(df)}")

# of unique universities 529


In [51]:
# Initialize chrome driver
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
driver = webdriver.Chrome(options=options)
driver.get("https://emma.msrb.org/")

## Scraper

In [53]:
def click_disclosure_tab_with_retry(driver, retries=3):
    for attempt in range(retries):
        try:
            disclosure_tab = WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.XPATH, '//a[@href="#tabDisclosureDocuments"]'))
            )
            driver.execute_script('arguments[0].scrollIntoView({block: "center"});', disclosure_tab)
            time.sleep(0.4)
            driver.execute_script("arguments[0].click();", disclosure_tab)
            return
        except StaleElementReferenceException:
            print(f"Attempt {attempt + 1}: Disclosure tab became stale. Retrying...")
            time.sleep(1)
    raise Exception("Could not click Disclosure tab after several attempts.")


def extract_tooltip_pdfs(driver):
    soup = BeautifulSoup(driver.page_source, "html.parser")
    results = []

    for tooltip in soup.select("a.ihpQtipHelp.rtTip[help]"):
        help_html = tooltip.get("help")
        section_name = tooltip.get_text(strip=True)  # This gives "Annual Financial Disclosures Posted 11/25/2024"
        if not help_html:
            continue

        inner_soup = BeautifulSoup(help_html, "html.parser")
        for a in inner_soup.find_all("a"):
            href = a.get("href")
            doc_text = a.text.strip()
            if href and href.endswith(".pdf"):
                full_url = f"https://emma.msrb.org{href}" if not href.startswith("http") else href
                # Prepend the section name to the document text
                combined_name = f"{section_name} - {doc_text}"
                results.append({
                    "document_name": combined_name,
                    "pdf_url": full_url
                })
    return results



final_df = pd.DataFrame()

start_time = time.time()

for c in list_cusip:
    print(f"\nProcessing CUSIP: {c}")
    try:
        search_box = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "quickSearchText"))
        )
        search_box.clear()
        search_box.send_keys(c)
        search_box.send_keys(Keys.RETURN)

        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.XPATH, '//ul[contains(@class, "ui-tabs-nav")]'))
        )

        click_disclosure_tab_with_retry(driver)

        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.ID, "tabDisclosureDocuments"))
        )

        print("Disclosure tab opened")

        # Select filter "All"
        try:
            radio = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, 'input[name="Filter.SelectedPredefinedDateRange"][value="All"]'))
            )
            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", radio)
            time.sleep(0.3)
            if not radio.get_attribute("checked"):
                radio.click()
            print("Clicked 'All' filter")
        except Exception as e:
            print(f"Could not click 'All' filter: {e}")

        pdf_data = []
        pdf_links = driver.find_elements(By.XPATH, '//a[contains(@ga-name, "clickDisclosureDocuments") and contains(@href, ".pdf")]')

        for i, link in enumerate(pdf_links):
            try:
                text = link.text.strip()
                href = link.get_attribute("href")
                if text and href and href.endswith(".pdf"):
                    full_url = href if href.startswith("http") else f"https://emma.msrb.org{href}"
                    pdf_data.append({
                        "CUSIP": c,
                        "document_name": text,
                        "pdf_url": full_url
                    })
            except StaleElementReferenceException:
                print(f"Skipping stale link at index {i} for CUSIP {c}")

        hidden_pdfs = extract_tooltip_pdfs(driver)
        for pdf in hidden_pdfs:
            pdf["CUSIP"] = c
            pdf_data.append(pdf)

        if pdf_data:
            temp_df = pd.DataFrame(pdf_data)
            final_df = pd.concat([final_df, temp_df], ignore_index=True)
            print(f"Collected {len(pdf_data)} PDFs for CUSIP {c}")
        else:
            print(f"No PDFs found for CUSIP {c}")

    except Exception as e:
        print(f"Error while processing {c}: {e}")

end_time = time.time()
execution_time = end_time - start_time
print(f"The loop took {execution_time} seconds to execute.")
print("\nFinished. Here's a preview of your results:")
print(final_df.head())




Processing CUSIP: 650348AR
Disclosure tab opened
Clicked 'All' filter
Collected 26 PDFs for CUSIP 650348AR

Processing CUSIP: 006383AU
Attempt 1: Disclosure tab became stale. Retrying...
Disclosure tab opened
Clicked 'All' filter
Collected 10 PDFs for CUSIP 006383AU

Processing CUSIP: 424682KJ
Attempt 1: Disclosure tab became stale. Retrying...
Disclosure tab opened
Clicked 'All' filter
Collected 11 PDFs for CUSIP 424682KJ

Processing CUSIP: 01025QAW
Attempt 1: Disclosure tab became stale. Retrying...
Disclosure tab opened
Clicked 'All' filter
Skipping stale link at index 34 for CUSIP 01025QAW
Skipping stale link at index 35 for CUSIP 01025QAW
Skipping stale link at index 36 for CUSIP 01025QAW
Skipping stale link at index 37 for CUSIP 01025QAW
Skipping stale link at index 38 for CUSIP 01025QAW
Skipping stale link at index 39 for CUSIP 01025QAW
Skipping stale link at index 40 for CUSIP 01025QAW
Skipping stale link at index 41 for CUSIP 01025QAW
Skipping stale link at index 42 for CUSIP

In [55]:
driver.quit()

In [56]:
final_df = pd.merge(final_df,df,how='left',left_on='CUSIP',right_on='Cusip 8')

In [59]:
final_df[['CUSIP','document_name','pdf_url','CREDIT']].to_csv('disclosure_document_list_L3Y_v3.csv')