In [1]:
import os
import random
import time

import requests
from tqdm import tqdm
from bs4 import BeautifulSoup, Tag

In [None]:
BASE_URL = "https://www.icd10data.com"

# The User-Agent is assembled with implicit string concatenation. A backslash
# continuation *inside* a string literal keeps the following line's leading
# indentation as part of the value, which put stray runs of spaces into the
# header that was actually sent.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.3; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/80.0.3987.162 Safari/537.36"
    )
}

# timeout: without one, requests waits indefinitely on a stalled connection.
response = requests.get(BASE_URL, headers=headers, timeout=30)
# Fail here rather than saving and parsing an HTTP error page further down.
response.raise_for_status()

In [3]:
# Directory that read_file()/write_file() use by default for cached pages and
# link lists.
folder = 'downloaded_html_pages'
# exist_ok=True makes this a no-op when the directory already exists, without
# a separate (race-prone) existence check.
os.makedirs(folder, exist_ok=True)

In [34]:
def read_file(file_name: str, folder: str = folder):
    """Return the UTF-8 text stored in ``file_name`` inside ``folder``.

    Args:
        file_name: Name of the file, relative to ``folder``.
        folder: Directory to read from; defaults to the notebook's ``folder``.

    Returns:
        The full file content as a string.
    """
    path = os.path.join(folder, file_name)
    with open(path, mode='r', encoding='utf-8') as handle:
        return handle.read()

def write_file(file_name: str, content: str, folder: str = folder):
    """Write ``content`` as UTF-8 text to ``file_name`` inside ``folder``.

    The file is opened in ``'w'`` mode, so existing content is replaced.

    Args:
        file_name: Name of the file, relative to ``folder``.
        content: Text to store.
        folder: Directory to write into; defaults to the notebook's ``folder``.
    """
    target = os.path.join(folder, file_name)
    with open(target, mode='w', encoding='utf-8') as handle:
        handle.write(content)


In [None]:
# Turn the site URL into a flat file name: drop the "https://www." prefix and
# replace any remaining slashes with underscores ("icd10data.com.html").
file_name = (BASE_URL + '.html').replace('https://www.', '').replace('/', '_')

# Save the landing page; the next cell parses this saved copy.
write_file(file_name, response.text)

In [None]:
# Parse the saved landing page and keep only its <div class="body-content">.
soup = BeautifulSoup(read_file(file_name), 'lxml')
body_content = soup.find('div', attrs={"class": "body-content"})


In [41]:
# Display the parsed "body-content" container to inspect its markup.
body_content

<div class="body-content">
<h1 class="pageHeading">2025 ICD-10-CM Codes</h1>
<ul>
<li>
<a class="identifier" href="/ICD10CM/Codes/A00-B99">A00-B99</a> <div class="tip images-note" data-textdivname="x1" data-titledivname="z1"></div> Certain infectious and parasitic diseases

    </li>
<li>
<a class="identifier" href="/ICD10CM/Codes/C00-D49">C00-D49</a> <div class="tip images-note" data-textdivname="x24" data-titledivname="z24"></div> Neoplasms

    </li>
<li>
<a class="identifier" href="/ICD10CM/Codes/D50-D89">D50-D89</a> <div class="tip images-note" data-textdivname="x46" data-titledivname="z46"></div> Diseases of the blood and blood-forming organs and certain disorders involving the immune mechanism

    </li>
<li>
<a class="identifier" href="/ICD10CM/Codes/E00-E89">E00-E89</a> <div class="tip images-note" data-textdivname="x54" data-titledivname="z54"></div> Endocrine, nutritional and metabolic diseases

    </li>
<li>
<a class="identifier" href="/ICD10CM/Codes/F01-F99">F01-F99</a> <

In [58]:
# Set of absolute URLs; get_links() adds every matching link it finds to it.
visited = set()

In [5]:
def get_links(
    body_content,
    collected=None,
    base_url="https://www.icd10data.com",
    href_prefix="/ICD10CM/Codes",
):
    """Add the absolute URL of every matching ``<a>`` in ``body_content`` to a set.

    Args:
        body_content: Parsed element (anything offering ``find_all('a')``) to
            scan. ``None`` -- what ``soup.find`` returns when the container is
            missing -- is accepted and simply adds nothing.
        collected: Set that receives the URLs. When omitted, the notebook-level
            ``visited`` set is used, so existing ``get_links(body_content)``
            calls behave as before.
        base_url: Prefix put in front of each site-relative ``href``.
        href_prefix: Only hrefs starting with this string are kept.

    Returns:
        None. The target set is modified in place.
    """
    # Compare against None explicitly: an empty set passed by the caller is a
    # valid target and must not fall back to the global.
    target = visited if collected is None else collected

    if body_content is None:
        return

    for anchor in body_content.find_all('a'):
        href = anchor.get('href')
        if href and href.startswith(href_prefix):
            target.add(f"{base_url}{href}")

In [60]:
# Fill `visited` with the matching links found in `body_content`.
get_links(body_content)

In [61]:
# Sort before joining: set iteration order is arbitrary, so an unsorted dump
# would reorder the file from one run to the next.
write_file("unique_links.txt", "\n".join(sorted(visited)))

File written: unique_links.txt


In [62]:
# Number of distinct URLs collected in `visited`.
len(visited)

307

In [None]:
from urllib.parse import urljoin


def extract_links_from_codes_section(
    html: str,
    base_url: str = "https://www.icd10data.com",
    href_prefix: str = "/ICD10CM/Codes",
    make_absolute: bool = True
):
    """Collect the links listed under the "Codes" heading of a page.

    The first ``<div>`` whose stripped text is exactly ``"Codes"`` is located,
    the ``<ul>`` that follows it is taken, and the first ``<a href=...>`` of
    each direct ``<li>`` child is returned.

    Args:
        html: HTML markup to parse.
        base_url: Base used to resolve site-relative hrefs when
            ``make_absolute`` is true.
        href_prefix: Only hrefs starting with this string are kept; pass
            ``None`` to keep every href.
        make_absolute: Return absolute URLs (via ``urljoin``) instead of the
            raw href values.

    Returns:
        A list of links, de-duplicated, in document order. The list is empty
        when no "Codes" div or no following ``<ul>`` is found.
    """
    soup = BeautifulSoup(html, "lxml")

    codes_div = next(
        (div for div in soup.find_all("div") if div.get_text(strip=True) == "Codes"),
        None,
    )
    if codes_div is None:
        return []

    # Prefer a <ul> that is a later sibling of the heading; if there is none,
    # fall back to the next <ul> anywhere after it in the document.
    ul = codes_div.find_next_sibling("ul")
    if ul is None:
        ul = codes_div.find_next("ul")
    if ul is None:
        return []

    results = []
    seen = set()
    # recursive=False: only the list's own items, not items of nested lists.
    for li in ul.find_all("li", recursive=False):
        anchor = li.find("a", href=True)
        if anchor is None:
            continue
        href = anchor["href"].strip()
        if href_prefix is not None and not href.startswith(href_prefix):
            continue
        link = urljoin(base_url, href) if make_absolute else href
        if link not in seen:
            seen.add(link)
            results.append(link)

    return results


In [None]:
test_url = "https://www.icd10data.com/ICD10CM/Codes/R00-R99/R30-R39"

# NOTE: from here on `response` holds the page text (a str), not a
# requests.Response object.
# timeout: without one, requests waits indefinitely on a stalled connection.
response = requests.get(test_url, headers=headers, timeout=30).text
write_file("test_page.html", response)

# `soup` and `body_content` are rebound to this test page.
soup = BeautifulSoup(read_file("test_page.html"), 'lxml')
body_content = soup.find('div', class_ = "body-content")

results = extract_links_from_codes_section(str(body_content))

results

['https://www.icd10data.com/ICD10CM/Codes/R00-R99/R30-R39/R30-', 'https://www.icd10data.com/ICD10CM/Codes/R00-R99/R30-R39/R31-', 'https://www.icd10data.com/ICD10CM/Codes/R00-R99/R30-R39/R32-', 'https://www.icd10data.com/ICD10CM/Codes/R00-R99/R30-R39/R33-', 'https://www.icd10data.com/ICD10CM/Codes/R00-R99/R30-R39/R34-', 'https://www.icd10data.com/ICD10CM/Codes/R00-R99/R30-R39/R35-', 'https://www.icd10data.com/ICD10CM/Codes/R00-R99/R30-R39/R36-', 'https://www.icd10data.com/ICD10CM/Codes/R00-R99/R30-R39/R37-', 'https://www.icd10data.com/ICD10CM/Codes/R00-R99/R30-R39/R39-']


In [None]:
# Show the links extracted from the test page.
results

['https://www.icd10data.com/ICD10CM/Codes/R00-R99/R30-R39/R30-',
 'https://www.icd10data.com/ICD10CM/Codes/R00-R99/R30-R39/R31-',
 'https://www.icd10data.com/ICD10CM/Codes/R00-R99/R30-R39/R32-',
 'https://www.icd10data.com/ICD10CM/Codes/R00-R99/R30-R39/R33-',
 'https://www.icd10data.com/ICD10CM/Codes/R00-R99/R30-R39/R34-',
 'https://www.icd10data.com/ICD10CM/Codes/R00-R99/R30-R39/R35-',
 'https://www.icd10data.com/ICD10CM/Codes/R00-R99/R30-R39/R36-',
 'https://www.icd10data.com/ICD10CM/Codes/R00-R99/R30-R39/R37-',
 'https://www.icd10data.com/ICD10CM/Codes/R00-R99/R30-R39/R39-']

In [None]:
def extract_unique_category_links(input_file: str, output_file: str, headers: dict = headers) -> None:
    """Crawl the URLs listed in ``input_file`` and collect their "Codes" links.

    Each page is downloaded, saved as an HTML file through ``write_file`` and
    passed to ``extract_links_from_codes_section``. The accumulated links are
    rewritten to ``output_file`` after every page, so an interrupted run keeps
    what was collected up to that point.

    Args:
        input_file: Text file with one URL per line; blank lines are ignored.
            The path is tried as given first and, if it does not exist, looked
            up inside ``folder``.
        output_file: File name handed to ``write_file`` (so it is created
            inside ``folder``); links are written sorted, one per line.
        headers: HTTP headers sent with every request.

    Pages that answer with an HTTP error status are reported and skipped
    instead of being saved and parsed.
    """
    # write_file() stores its output under `folder`, so a list produced by an
    # earlier step lives there rather than in the working directory.
    input_path = input_file
    if not os.path.exists(input_path):
        input_path = os.path.join(folder, input_file)

    with open(input_path, 'r', encoding='utf-8') as f:
        links = [line.strip() for line in f if line.strip()]

    unique_extracted_links = set()

    for link in links:
        # timeout: without one, a stalled connection blocks the crawl forever.
        page = requests.get(link, headers=headers, timeout=30)

        if page.ok:
            html = page.text
            # rstrip('/') keeps the last path segment non-empty for URLs that
            # end with a slash.
            file_name = f"{link.rstrip('/').split('/')[-1]}.html".replace('-', '_')
            write_file(file_name, html)

            # Parse the text already in memory; the saved copy stays on disk.
            soup = BeautifulSoup(html, 'lxml')
            body_content = soup.find('div', class_="body-content")

            extracted_links = extract_links_from_codes_section(str(body_content))
            unique_extracted_links.update(extracted_links)

            # Sorted so the file content does not depend on set iteration order.
            write_file(output_file, "\n".join(sorted(unique_extracted_links)))
        else:
            print(f"Skipping {link}: HTTP {page.status_code}")

        # Pause briefly between requests, as extract_leaf_code_links does, to
        # avoid hammering the server.
        time.sleep(random.uniform(0.7, 1.3))

File written: E70_E88.html
File written: unique_extracted_links.txt
File written: P10_P15.html
File written: unique_extracted_links.txt
File written: R30_R39.html
File written: unique_extracted_links.txt
File written: Q38_Q45.html
File written: unique_extracted_links.txt
File written: M50_M54.html
File written: unique_extracted_links.txt
File written: V10_V19.html
File written: unique_extracted_links.txt
File written: A80_A89.html
File written: unique_extracted_links.txt
File written: S10_S19.html
File written: unique_extracted_links.txt
File written: Q50_Q56.html
File written: unique_extracted_links.txt
File written: R25_R29.html
File written: unique_extracted_links.txt
File written: A90_A99.html
File written: unique_extracted_links.txt
File written: A20_A28.html
File written: unique_extracted_links.txt
File written: U50_U85.html
File written: unique_extracted_links.txt
File written: O20_O29.html
File written: unique_extracted_links.txt
File written: G50_G59.html
File written: unique_

In [16]:
test_url = "https://www.icd10data.com/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-"
# `response` holds the page text (a str), not a requests.Response object.
# timeout: without one, requests waits indefinitely on a stalled connection.
response = requests.get(test_url, headers=headers, timeout=30).text
soup = BeautifulSoup(response, 'lxml')
write_file("test_page_z37.html", response)

File written: test_page_z37.html


In [17]:
# Load the saved Z37 test page back from disk.
content= read_file("test_page_z37.html")

In [24]:
# Parse the saved page and list the hrefs inside its <ul class="codeHierarchy">.
content_soup = BeautifulSoup(content, 'lxml')
# find() returns None when the page has no such list; the next line would then
# raise AttributeError.
code_hierarchy = content_soup.find('ul', class_='codeHierarchy')
# The trailing [1:] drops the first href.
# NOTE(review): presumably the first anchor is the Z37- category itself - confirm.
code_links = [a['href'] for a in code_hierarchy.find_all('a')][1:]
code_links

['/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-/Z37.0',
 '/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-/Z37.1',
 '/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-/Z37.2',
 '/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-/Z37.3',
 '/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-/Z37.4',
 '/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-/Z37.5',
 '/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-/Z37.50',
 '/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-/Z37.51',
 '/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-/Z37.52',
 '/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-/Z37.53',
 '/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-/Z37.54',
 '/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-/Z37.59',
 '/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-/Z37.6',
 '/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-/Z37.60',
 '/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-/Z37.61',
 '/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-/Z37.62',
 '/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-/Z37.63',
 '/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-/Z37.64',
 '/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-/Z37.69',
 '/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-/Z37.7',
 '/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-/Z37.9']

In [None]:
def extract_leaf_code_links(input_file: str, output_file: str, headers: dict = headers) -> None:
    """Collect the code links listed in the ``codeHierarchy`` list of each page.

    Every URL from ``input_file`` is downloaded and saved as an HTML file
    through ``write_file``. From its ``<ul class="codeHierarchy">`` every
    anchor except the first contributes ``BASE_URL + href``. The accumulated
    links are rewritten to ``output_file`` after every page, so an interrupted
    run keeps what was collected up to that point.

    NOTE(review): despite the function name, all anchors after the first are
    taken, not only leaf codes -- confirm whether parent codes (e.g. both
    Z37.5 and Z37.50) are wanted in the output.

    Args:
        input_file: Text file with one URL per line; blank lines are ignored.
            The path is tried as given first and, if it does not exist, looked
            up inside ``folder``.
        output_file: File name handed to ``write_file`` (so it is created
            inside ``folder``); links are written sorted, one per line.
        headers: HTTP headers sent with every request.

    Pages that answer with an HTTP error status are reported and skipped
    instead of being saved and parsed.
    """
    # write_file() stores its output under `folder`, so a list produced by an
    # earlier step lives there rather than in the working directory.
    input_path = input_file
    if not os.path.exists(input_path):
        input_path = os.path.join(folder, input_file)

    with open(input_path, 'r', encoding='utf-8') as f:
        links = [line.strip() for line in f if line.strip()]

    leaf_codes_links = set()

    for link in tqdm(links, desc="Processing ICD category pages", unit="page"):
        # timeout: without one, a stalled connection blocks the crawl forever.
        page = requests.get(link, headers=headers, timeout=30)

        if page.ok:
            html = page.text
            # rstrip('/') keeps the last path segment non-empty for URLs that
            # end with a slash.
            file_name = f"{link.rstrip('/').split('/')[-1]}.html".replace('-', '_')
            write_file(file_name, html)

            # Parse the text already in memory; the saved copy stays on disk.
            soup = BeautifulSoup(html, 'lxml')
            code_hierarchy = soup.find('ul', class_='codeHierarchy')

            if code_hierarchy is not None:
                # Drop the first anchor, then ignore anchors without an href
                # instead of failing on them with KeyError.
                anchors = code_hierarchy.find_all('a')[1:]
                leaf_codes_links.update(
                    f"{BASE_URL}{a['href']}" for a in anchors if a.get('href')
                )

            # Sorted so the file content does not depend on set iteration order.
            write_file(output_file, "\n".join(sorted(leaf_codes_links)))
        else:
            # tqdm.write prints without breaking the progress bar.
            tqdm.write(f"Skipping {link}: HTTP {page.status_code}")

        # Randomised pause between requests to avoid hammering the server.
        time.sleep(random.uniform(0.7, 1.3))

# Run the crawl over the URLs listed in "unique_extracted_links.txt"; results go to
# "leaf_code_links.txt" via write_file(), i.e. inside `folder`.
# NOTE(review): write_file() saves under `folder` - confirm where the input list actually lives.
extract_leaf_code_links("unique_extracted_links.txt", "leaf_code_links.txt")