In [63]:
import requests
from bs4 import BeautifulSoup, Tag
import os

In [None]:
# Root of the site being scraped; the landing page lists the top-level code ranges.
BASE_URL = "https://www.icd10data.com"
# Browser-like User-Agent header sent with the request.
headers={'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win 64 ; x64) Apple WeKit /537.36(KHTML , like Gecko) Chrome/80.0.3987.162 Safari/537.36'}
# Without a timeout requests can wait forever on a stalled connection, and
# without raise_for_status() an HTTP error page would be cached to disk below
# as if it were the real landing page.
response = requests.get(BASE_URL, headers=headers, timeout=30)
response.raise_for_status()

In [None]:
# Directory where downloaded pages and derived files are stored.
folder = 'downloaded_html_pages'
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create
# race of testing os.path.exists() first.
os.makedirs(folder, exist_ok=True)

In [55]:
def read_file(file_name: str, folder: str = folder):
    """Return the full UTF-8 text of ``file_name`` located inside ``folder``.

    Args:
        file_name: Name of the file, relative to ``folder``.
        folder: Directory to read from; defaults to the value the
            module-level ``folder`` had when this function was defined.

    Returns:
        The file contents as a single string.
    """
    source_path = os.path.join(folder, file_name)
    with open(source_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

def write_file(file_name: str, content: str, folder: str = folder):
    """Write ``content`` as UTF-8 text to ``file_name`` inside ``folder``.

    An existing file of the same name is overwritten. A confirmation line is
    printed once the file has been closed.

    Args:
        file_name: Name of the file, relative to ``folder``.
        content: Text to store.
        folder: Directory to write into; defaults to the value the
            module-level ``folder`` had when this function was defined.
    """
    target_path = os.path.join(folder, file_name)
    with open(target_path, mode='w', encoding='utf-8') as handle:
        handle.write(content)
    print(f"File written: {file_name}")


In [None]:
# Build a flat file name from the URL: drop the scheme/"www." prefix and turn
# any remaining slashes into underscores.
raw_file_name = f'{BASE_URL}.html'
file_name = raw_file_name.replace('https://www.', '').replace('/', '_')

# Cache the downloaded landing page on disk.
write_file(file_name, response.text)

In [None]:
# Parse the cached copy of the landing page rather than the live response.
landing_html = read_file(file_name)
soup = BeautifulSoup(landing_html, 'lxml')
# First <div class="body-content"> in the document (None if absent).
body_content = soup.find('div', class_ = "body-content")


In [41]:
# Show the parsed "body-content" div as the cell output.
body_content

<div class="body-content">
<h1 class="pageHeading">2025 ICD-10-CM Codes</h1>
<ul>
<li>
<a class="identifier" href="/ICD10CM/Codes/A00-B99">A00-B99</a> <div class="tip images-note" data-textdivname="x1" data-titledivname="z1"></div> Certain infectious and parasitic diseases

    </li>
<li>
<a class="identifier" href="/ICD10CM/Codes/C00-D49">C00-D49</a> <div class="tip images-note" data-textdivname="x24" data-titledivname="z24"></div> Neoplasms

    </li>
<li>
<a class="identifier" href="/ICD10CM/Codes/D50-D89">D50-D89</a> <div class="tip images-note" data-textdivname="x46" data-titledivname="z46"></div> Diseases of the blood and blood-forming organs and certain disorders involving the immune mechanism

    </li>
<li>
<a class="identifier" href="/ICD10CM/Codes/E00-E89">E00-E89</a> <div class="tip images-note" data-textdivname="x54" data-titledivname="z54"></div> Endocrine, nutritional and metabolic diseases

    </li>
<li>
<a class="identifier" href="/ICD10CM/Codes/F01-F99">F01-F99</a> <

In [58]:
# Module-level set that get_links() fills with absolute code-page URLs.
visited = set()

In [59]:
def get_links(body_content, base_url="https://www.icd10data.com",
              href_prefix="/ICD10CM/Codes", collected=None):
    """Add the absolute URL of every matching anchor in ``body_content`` to a set.

    Args:
        body_content: Object exposing ``find_all('a')`` whose items expose
            ``get('href')`` (e.g. a parsed BeautifulSoup tag).
        base_url: Prefix prepended to each matching href.
        href_prefix: Only hrefs starting with this prefix are collected.
        collected: Set that receives the URLs. When omitted, the module-level
            ``visited`` set is updated, which is the original behaviour.

    Returns:
        None. Results are delivered by mutating the target set, so calling
        this as the last line of a cell produces no output.
    """
    target = visited if collected is None else collected
    for link in body_content.find_all('a'):
        href = link.get('href')
        # Anchors without an href yield None and are skipped.
        if href and href.startswith(href_prefix):
            target.add(f"{base_url}{href}")

In [60]:
# Populate `visited` from the anchors found in the landing page body.
get_links(body_content)

In [61]:
# Sort before joining: set iteration order is arbitrary, so sorting keeps the
# saved file identical from one run to the next.
write_file("unique_links.txt", "\n".join(sorted(visited)))

File written: unique_links.txt


In [62]:
# Number of distinct URLs collected so far.
len(visited)

307

In [64]:
from urllib.parse import urljoin

def extract_links_from_codes_section(
    html: str,
    base_url: str = "https://www.icd10data.com",
    href_prefix: str = "/ICD10CM/Codes",
    make_absolute: bool = True
):
    """Collect the links listed under the page's "Codes" heading.

    The function locates the first ``<div>`` whose stripped text is exactly
    ``"Codes"``, takes the ``<ul>`` that follows it, and reads the first
    ``<a href=...>`` found inside each direct ``<li>`` child of that list.

    Args:
        html: HTML document to parse (parsed with the ``lxml`` parser).
        base_url: Base used to resolve hrefs when ``make_absolute`` is true.
        href_prefix: Only hrefs starting with this prefix are kept; pass
            ``None`` to keep every href.
        make_absolute: When true, each result is ``urljoin(base_url, href)``;
            otherwise the stripped href is returned as found.

    Returns:
        A list of unique links in document order. The list is empty when no
        "Codes" div, or no ``<ul>`` after it, exists.
    """
    soup = BeautifulSoup(html, "lxml")

    # 1) First div (in document order) whose visible text is exactly "Codes".
    codes_div = next(
        (div for div in soup.find_all("div") if div.get_text(strip=True) == "Codes"),
        None,
    )
    if codes_div is None:
        return []

    # 2) Prefer a following sibling <ul>; if there is none, fall back to the
    #    next <ul> that appears after the div anywhere in the document.
    ul = codes_div.find_next_sibling("ul")
    if ul is None:
        ul = codes_div.find_next("ul")
    if ul is None:
        return []

    # 3) One link per top-level <li>, de-duplicated while preserving order.
    results = []
    seen = set()
    for li in ul.find_all("li", recursive=False):
        # find() searches all descendants, so one lookup already covers
        # anchors nested at any depth inside the <li>.
        anchor = li.find("a", href=True)
        if anchor is None:
            continue
        href = anchor["href"].strip()
        if href_prefix is not None and not href.startswith(href_prefix):
            continue
        link = urljoin(base_url, href) if make_absolute else href
        if link not in seen:
            seen.add(link)
            results.append(link)

    return results
