In [None]:
!pip install selenium

Collecting selenium
  Downloading selenium-4.23.1-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.26.2-py3-none-any.whl.metadata (8.6 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting attrs>=23.2.0 (from trio~=0.17->selenium)
  Downloading attrs-24.2.0-py3-none-any.whl.metadata (11 kB)
Collecting sortedcontainers (from trio~=0.17->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->selenium)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading selenium-4.23.1-py3-none-any.whl (9.4 MB)
   --------------

In [None]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# Scrape one column of the CDC WONDER NNDSS annual tables for each year in
# `years` and write it to leptospirosis_<year>.csv (one file per year).

# Entry page that hosts the year dropdown and the list of annual tables.
MENU_URL = "https://wonder.cdc.gov/nndss/nndss_annual_tables_menu.asp"

# Zero-based position, among a data row's <td> cells, of the value to extract.
# NOTE(review): assumed to be the Leptospirosis column in both "Table 2h" and
# "Table 2i" -- confirm against the live table headers before trusting the data.
LEPTOSPIROSIS_TD_INDEX = 7

# Years whose data is read from "Table 2h"; every other year uses "Table 2i".
TABLE_2H_YEARS = [2016, 2017, 2018]

# Initialize the Chrome driver
driver = webdriver.Chrome()

# List of years to scrape
years = [2021, 2020, 2019, 2018, 2017, 2016]

try:
    # A single explicit-wait helper (10 s timeout) is reused for every page load
    wait = WebDriverWait(driver, 10)

    # Iterate over each year
    for year in years:
        # Navigate to the initial page
        driver.get(MENU_URL)

        # Wait for the year dropdown to be present, then select the current year
        year_dropdown = wait.until(EC.presence_of_element_located((By.NAME, "mmwr_year")))
        Select(year_dropdown).select_by_visible_text(str(year))

        # Click the Change Year button
        change_year_button = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "input[type='submit'][value='Change Year']"))
        )
        change_year_button.click()

        # Wait for the table list to load; the wait returns the located element
        table = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "tables-list")))
        rows = table.find_elements(By.TAG_NAME, "tr")

        # Determine the table to use based on the year
        table_name = "Table 2h" if year in TABLE_2H_YEARS else "Table 2i"

        # Reset per year so that a missing row can never fall through to the
        # URL found for the previous year (or to an undefined name).
        link_url = None

        # Extract the link from the first row that mentions the table name
        for row in rows:
            cells = row.find_elements(By.TAG_NAME, "td")
            if cells and table_name in row.text:
                # The first <td> of the row contains the main link
                link = cells[0].find_element(By.TAG_NAME, "a")
                link_url = link.get_attribute("href")
                print(f"Link for {table_name} for year {year}: {link_url}")
                break

        if not link_url:
            raise RuntimeError(f"No link found for {table_name} for year {year}")

        # Navigate to the extracted link and wait for the data table to load
        driver.get(link_url)
        data_table = wait.until(EC.presence_of_element_located((By.TAG_NAME, "table")))

        # Initialize lists to store the extracted data
        reporting_areas = []
        leptospirosis_data = []

        # Extract the data for Reporting Area and Leptospirosis
        for data_row in data_table.find_elements(By.TAG_NAME, "tr")[1:]:  # Skip header row
            # find_elements returns an empty list instead of raising when a
            # row has no matching cells, so malformed rows can be skipped.
            th_elements = data_row.find_elements(By.TAG_NAME, "th")
            td_elements = data_row.find_elements(By.TAG_NAME, "td")

            # Skip rows without a row header or with too few cells to hold the
            # target column (e.g. extra header or note rows).
            if not th_elements or len(td_elements) <= LEPTOSPIROSIS_TD_INDEX:
                continue

            reporting_areas.append(th_elements[0].text)  # First <th> is the Reporting Area
            leptospirosis_data.append(td_elements[LEPTOSPIROSIS_TD_INDEX].text)

        # Create a DataFrame using pandas; the value column is named after the year
        df = pd.DataFrame({
            "Reporting Area": reporting_areas,
            year: leptospirosis_data
        })

        # Save the DataFrame to a CSV file with the year in the filename
        df.to_csv(f"leptospirosis_{year}.csv", index=False)
        print(f"Data saved to leptospirosis_{year}.csv")
finally:
    # Close the browser even if a year fails part-way through
    driver.quit()

Link for Table 2i for year 2021: https://wonder.cdc.gov/nndss/static/2021/annual/2021-table2i.html
Data saved to leptospirosis_2021.csv
Link for Table 2i for year 2020: https://wonder.cdc.gov/nndss/static/2020/annual/2020-table2i.html
Data saved to leptospirosis_2020.csv
Link for Table 2i for year 2019: https://wonder.cdc.gov/nndss/static/2019/annual/2019-table2i.html
Data saved to leptospirosis_2019.csv
Link for Table 2h for year 2018: https://wonder.cdc.gov/nndss/static/2018/annual/2018-table2h.html
Data saved to leptospirosis_2018.csv
Link for Table 2h for year 2017: https://wonder.cdc.gov/nndss/static/2017/annual/2017-table2h.html
Data saved to leptospirosis_2017.csv
Link for Table 2h for year 2016: https://wonder.cdc.gov/nndss/static/2016/annual/2016-table2h.html
Data saved to leptospirosis_2016.csv
