In [108]:
import os
import re
import random
import time

import requests
from tqdm import tqdm
from bs4 import BeautifulSoup, Tag

In [None]:
# Root of the site being scraped.
BASE_URL = "https://www.icd10data.com"

# Browser-like User-Agent sent with the requests in this notebook.
# The value is assembled with implicit string concatenation: a backslash
# continuation *inside* a string literal would embed the next line's leading
# indentation in the header value.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.3; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/80.0.3987.162 Safari/537.36"
    )
}

# Fetch the landing page.  The timeout keeps the cell from hanging forever on
# a stalled connection, and raise_for_status() stops an HTTP error page from
# being treated as the real page by the cells that use `response`.
response = requests.get(BASE_URL, headers=headers, timeout=30)
response.raise_for_status()

In [63]:
# Directory that write_file()/read_file() use by default.
folder = 'downloaded_html_pages'
# exist_ok=True makes the cell safe to re-run and avoids the race between
# checking for the directory and creating it.
os.makedirs(folder, exist_ok=True)

In [64]:
def read_file(file_name: str, folder: str = folder):
    """Read a UTF-8 text file from ``folder`` and return its full contents.

    Args:
        file_name: Name of the file, relative to ``folder``.
        folder: Directory containing the file.  The default is the value the
            module-level ``folder`` had when this function was defined.

    Returns:
        The entire file content as one string.
    """
    source_path = os.path.join(folder, file_name)
    with open(source_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

def write_file(file_name: str, content: str, folder: str = folder):
    """Write ``content`` to ``file_name`` inside ``folder`` as UTF-8 text.

    The file is opened in ``'w'`` mode, so an existing file is overwritten.

    Args:
        file_name: Name of the file, relative to ``folder``.
        content: Text to store.
        folder: Destination directory.  The default is the value the
            module-level ``folder`` had when this function was defined.
    """
    target_path = os.path.join(folder, file_name)
    with open(target_path, mode='w', encoding='utf-8') as handle:
        handle.write(content)


In [None]:
# Derive a local file name from BASE_URL: append ".html", drop the
# "https://www." prefix and replace every "/" with "_".
file_name = f'{BASE_URL}.html'.replace('https://www.', '').replace('/', '_')

# Store the text of the fetched response under that name.
write_file(file_name, response.text)

In [65]:
# Parse the saved page from disk (not the live response) with the lxml parser.
# NOTE: needs `file_name` from the previous cell; the captured NameError below
# shows this cell was run without it.
soup = BeautifulSoup(read_file(file_name), 'lxml')
# First <div class="body-content"> of the page; the cells below work on it.
body_content = soup.find('div', class_ = "body-content")


NameError: name 'file_name' is not defined

In [41]:
# Display the extracted body container to inspect its markup.
body_content

<div class="body-content">
<h1 class="pageHeading">2025 ICD-10-CM Codes</h1>
<ul>
<li>
<a class="identifier" href="/ICD10CM/Codes/A00-B99">A00-B99</a> <div class="tip images-note" data-textdivname="x1" data-titledivname="z1"></div> Certain infectious and parasitic diseases

    </li>
<li>
<a class="identifier" href="/ICD10CM/Codes/C00-D49">C00-D49</a> <div class="tip images-note" data-textdivname="x24" data-titledivname="z24"></div> Neoplasms

    </li>
<li>
<a class="identifier" href="/ICD10CM/Codes/D50-D89">D50-D89</a> <div class="tip images-note" data-textdivname="x46" data-titledivname="z46"></div> Diseases of the blood and blood-forming organs and certain disorders involving the immune mechanism

    </li>
<li>
<a class="identifier" href="/ICD10CM/Codes/E00-E89">E00-E89</a> <div class="tip images-note" data-textdivname="x54" data-titledivname="z54"></div> Endocrine, nutritional and metabolic diseases

    </li>
<li>
<a class="identifier" href="/ICD10CM/Codes/F01-F99">F01-F99</a> <

In [58]:
# Module-level set that get_links() fills with absolute code URLs.
visited = set()

In [5]:
def get_links(body_content):
    """Record every ICD-10-CM code link found in ``body_content``.

    Each ``<a>`` whose ``href`` starts with ``/ICD10CM/Codes`` is turned into
    an absolute ``https://www.icd10data.com`` URL and added to the
    module-level ``visited`` set.  Nothing is returned.

    Args:
        body_content: Parsed element exposing ``find_all('a')``.
    """
    for anchor in body_content.find_all('a'):
        target = anchor.get('href')
        # Skip anchors without an href and links outside the code tree.
        if not target or not target.startswith('/ICD10CM/Codes'):
            continue
        visited.add(f"https://www.icd10data.com{target}")

In [60]:
# Add the code links found in the current body_content to `visited`.
get_links(body_content)

In [61]:
# Persist the collected URLs, one per line (a set has no defined order).
write_file("unique_links.txt", "\n".join(visited))

File written: unique_links.txt


In [62]:
# Number of distinct URLs collected so far.
len(visited)

307

In [None]:
from urllib.parse import urljoin


def extract_links_from_codes_section(
    html: str,
    base_url: str = "https://www.icd10data.com",
    href_prefix: str = "/ICD10CM/Codes",
    make_absolute: bool = True
):
    """Collect the links listed under the "Codes" heading of a page.

    The function finds the first ``<div>`` whose stripped text is exactly
    ``"Codes"``, takes the ``<ul>`` that follows it (a later sibling if there
    is one, otherwise the next ``<ul>`` in document order) and reads the first
    ``<a href=...>`` inside each direct ``<li>`` child of that list.

    Args:
        html: HTML markup to parse.
        base_url: Base that each href is joined onto when ``make_absolute``
            is true.
        href_prefix: Only hrefs starting with this prefix are kept; pass
            ``None`` to keep every href.
        make_absolute: If true, return ``urljoin(base_url, href)``; otherwise
            return the stripped href as found.

    Returns:
        A list of unique links in order of first appearance.  The list is
        empty when no "Codes" div or no following ``<ul>`` exists.
    """
    soup = BeautifulSoup(html, "lxml")

    codes_div = next(
        (div for div in soup.find_all("div") if div.get_text(strip=True) == "Codes"),
        None,
    )
    if codes_div is None:
        return []

    # Prefer a <ul> that is a later sibling of the heading div ...
    ul = codes_div.find_next_sibling("ul")
    if ul is None:
        # ... and otherwise fall back to the next <ul> anywhere after it.
        ul = codes_div.find_next("ul")
        if ul is None:
            return []

    results = []
    seen = set()
    # recursive=False: only the list's own items, not items of nested lists.
    for li in ul.find_all("li", recursive=False):
        a = li.find("a", href=True)
        if not a:
            continue
        href = a["href"].strip()
        if href_prefix is not None and not href.startswith(href_prefix):
            continue
        link = urljoin(base_url, href) if make_absolute else href
        if link not in seen:
            seen.add(link)
            results.append(link)

    return results


In [None]:
# Try extract_links_from_codes_section() on a single sub-chapter page.
test_url = "https://www.icd10data.com/ICD10CM/Codes/R00-R99/R30-R39"

# NOTE: `response` is rebound to the page text (a str) here, not a Response object.
response = requests.get(test_url, headers=headers).text
write_file("test_page.html", response)

# Parse the copy that was just saved and take its body container.
soup = BeautifulSoup(read_file("test_page.html"), 'lxml')
body_content = soup.find('div', class_ = "body-content")

# The extractor takes markup as a string, so the element is serialised first.
results = extract_links_from_codes_section(str(body_content))

results

['https://www.icd10data.com/ICD10CM/Codes/R00-R99/R30-R39/R30-', 'https://www.icd10data.com/ICD10CM/Codes/R00-R99/R30-R39/R31-', 'https://www.icd10data.com/ICD10CM/Codes/R00-R99/R30-R39/R32-', 'https://www.icd10data.com/ICD10CM/Codes/R00-R99/R30-R39/R33-', 'https://www.icd10data.com/ICD10CM/Codes/R00-R99/R30-R39/R34-', 'https://www.icd10data.com/ICD10CM/Codes/R00-R99/R30-R39/R35-', 'https://www.icd10data.com/ICD10CM/Codes/R00-R99/R30-R39/R36-', 'https://www.icd10data.com/ICD10CM/Codes/R00-R99/R30-R39/R37-', 'https://www.icd10data.com/ICD10CM/Codes/R00-R99/R30-R39/R39-']


In [None]:
# Display the extracted links again (rich list repr, one per line).
results

['https://www.icd10data.com/ICD10CM/Codes/R00-R99/R30-R39/R30-',
 'https://www.icd10data.com/ICD10CM/Codes/R00-R99/R30-R39/R31-',
 'https://www.icd10data.com/ICD10CM/Codes/R00-R99/R30-R39/R32-',
 'https://www.icd10data.com/ICD10CM/Codes/R00-R99/R30-R39/R33-',
 'https://www.icd10data.com/ICD10CM/Codes/R00-R99/R30-R39/R34-',
 'https://www.icd10data.com/ICD10CM/Codes/R00-R99/R30-R39/R35-',
 'https://www.icd10data.com/ICD10CM/Codes/R00-R99/R30-R39/R36-',
 'https://www.icd10data.com/ICD10CM/Codes/R00-R99/R30-R39/R37-',
 'https://www.icd10data.com/ICD10CM/Codes/R00-R99/R30-R39/R39-']

In [None]:
def extract_unique_category_links(input_file: str, output_file: str, headers: dict = headers) -> None:
    """Download each listed page and collect the links of its "Codes" section.

    Args:
        input_file: Text file with one URL per line (blank lines are ignored).
            The path is used as given when it exists; otherwise it is looked
            up inside the module-level ``folder``.
        output_file: File name handed to ``write_file`` for the collected
            links, one per line, sorted.
        headers: HTTP headers sent with every request.

    Side effects:
        Every page is saved through ``write_file`` under a name built from the
        last URL segment with ``-`` replaced by ``_``.  ``output_file`` is
        rewritten after every page, so partial progress survives an
        interrupted run.
    """
    # write_file() stores link lists inside `folder`, so a bare file name
    # would not be found relative to the current working directory.
    input_path = input_file if os.path.exists(input_file) else os.path.join(folder, input_file)
    with open(input_path, 'r', encoding='utf-8') as f:
        links = [link.strip() for link in f if link.strip()]

    unique_extracted_links = set()

    for link in links:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        html = requests.get(link, headers=headers, timeout=30).text
        file_name = f"{link.split('/')[-1]}.html".replace('-', '_')
        write_file(file_name, html)

        # Parse the text already in memory instead of reading the file back.
        soup = BeautifulSoup(html, 'lxml')
        body_content = soup.find('div', class_="body-content")

        extracted_links = extract_links_from_codes_section(str(body_content))
        unique_extracted_links.update(extracted_links)

        # Sorted so the file content is stable between runs (set order is arbitrary).
        write_file(output_file, "\n".join(sorted(unique_extracted_links)))

        # Same randomised pause as extract_leaf_code_links, to avoid hammering the site.
        time.sleep(random.uniform(0.7, 1.3))

File written: E70_E88.html
File written: unique_extracted_links.txt
File written: P10_P15.html
File written: unique_extracted_links.txt
File written: R30_R39.html
File written: unique_extracted_links.txt
File written: Q38_Q45.html
File written: unique_extracted_links.txt
File written: M50_M54.html
File written: unique_extracted_links.txt
File written: V10_V19.html
File written: unique_extracted_links.txt
File written: A80_A89.html
File written: unique_extracted_links.txt
File written: S10_S19.html
File written: unique_extracted_links.txt
File written: Q50_Q56.html
File written: unique_extracted_links.txt
File written: R25_R29.html
File written: unique_extracted_links.txt
File written: A90_A99.html
File written: unique_extracted_links.txt
File written: A20_A28.html
File written: unique_extracted_links.txt
File written: U50_U85.html
File written: unique_extracted_links.txt
File written: O20_O29.html
File written: unique_extracted_links.txt
File written: G50_G59.html
File written: unique_

In [16]:
# Download one category page (Z37-) and keep a copy on disk for offline work.
test_url = "https://www.icd10data.com/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-"
# NOTE: `response` holds the page text (a str) here, not a Response object.
response = requests.get(test_url, headers=headers).text
soup = BeautifulSoup(response, 'lxml')
write_file("test_page_z37.html", response)

File written: test_page_z37.html


In [17]:
# Reload the saved Z37- page from disk.
content= read_file("test_page_z37.html")

In [24]:
content_soup = BeautifulSoup(content, 'lxml')
# The <ul class="codeHierarchy"> element holds the code anchors of the page.
code_hierarchy = content_soup.find('ul', class_='codeHierarchy')
# [1:] drops the first anchor — presumably the category's own entry; TODO confirm.
code_links = [a['href'] for a in code_hierarchy.find_all('a')][1:]
code_links

['/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-/Z37.0',
 '/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-/Z37.1',
 '/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-/Z37.2',
 '/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-/Z37.3',
 '/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-/Z37.4',
 '/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-/Z37.5',
 '/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-/Z37.50',
 '/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-/Z37.51',
 '/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-/Z37.52',
 '/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-/Z37.53',
 '/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-/Z37.54',
 '/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-/Z37.59',
 '/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-/Z37.6',
 '/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-/Z37.60',
 '/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-/Z37.61',
 '/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-/Z37.62',
 '/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-/Z37.63',
 '/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-/Z37.64',
 '/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-/Z37.69',
 '/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-/Z37.7',
 '/ICD10CM/Codes/Z00-Z99/Z30-Z39/Z37-/Z37.9']

In [None]:
def extract_leaf_code_links(input_file: str, output_file: str, headers: dict = headers) -> None:
    """Extract leaf-level ICD-10 links from a list of category URLs.

    Args:
        input_file: Text file with one category URL per line (blank lines are
            ignored).  The path is used as given when it exists; otherwise it
            is looked up inside the module-level ``folder``.
        output_file: File name handed to ``write_file`` for the collected
            links, one per line, sorted.
        headers: HTTP headers sent with every request.

    Side effects:
        Every page is saved through ``write_file`` under a name built from the
        last URL segment with ``-`` replaced by ``_``.  ``output_file`` is
        rewritten after every page, so partial progress survives an
        interrupted run.  A random 0.7-1.3 s pause follows each page.
    """
    # write_file() stores link lists inside `folder`, so a bare name such as
    # "unique_extracted_links.txt" would not be found relative to the current
    # working directory.
    input_path = input_file if os.path.exists(input_file) else os.path.join(folder, input_file)
    with open(input_path, 'r', encoding='utf-8') as f:
        links = [link.strip() for link in f if link.strip()]

    leaf_codes_links = set()

    for link in tqdm(links, desc="Processing ICD category pages", unit="page"):
        # A timeout keeps one stalled connection from hanging the whole crawl.
        html = requests.get(link, headers=headers, timeout=30).text
        file_name = f"{link.split('/')[-1]}.html".replace('-', '_')
        write_file(file_name, html)

        # Parse the text already in memory instead of reading the file back.
        soup = BeautifulSoup(html, 'lxml')
        code_hierarchy = soup.find('ul', class_='codeHierarchy')

        if code_hierarchy:
            # The first anchor is skipped as before; anchors without an href
            # are ignored instead of raising KeyError.
            leaf_codes_links.update(
                f"{BASE_URL}{a['href']}"
                for a in code_hierarchy.find_all('a')[1:]
                if a.get('href')
            )

        # Sorted so the file content is stable between runs (set order is arbitrary).
        write_file(output_file, "\n".join(sorted(leaf_codes_links)))

        # Randomised pause between requests to avoid hammering the site.
        time.sleep(random.uniform(0.7, 1.3))

# Crawl every category URL in the list and write the leaf links found on them.
extract_leaf_code_links("unique_extracted_links.txt", "leaf_code_links.txt")

In [66]:
# Fetch a single leaf code page (J45.31) to explore its structure.
url = "https://www.icd10data.com/ICD10CM/Codes/J00-J99/J40-J4A/J45-/J45.31"
# NOTE: `response` holds the page text (a str) here, not a Response object.
response = requests.get(url, headers=headers).text
soup = BeautifulSoup(response, 'lxml')
body_content = soup.find('div', class_ = "body-content")

# prettify() re-indents the markup so the nesting is readable.
print(body_content.prettify())

<div class="body-content">
 <ol class="breadcrumb2">
  <li>
   <a href="/ICD10CM/Codes">
    ICD-10-CM Codes
   </a>
  </li>
  ›
  <li>
   <a href="/ICD10CM/Codes/J00-J99">
    J00-J99
   </a>
  </li>
  ›
  <li>
   <a href="/ICD10CM/Codes/J00-J99/J40-J4A">
    J40-J4A
   </a>
  </li>
  ›
  <li>
   <a href="/ICD10CM/Codes/J00-J99/J40-J4A/J45-">
    J45-
   </a>
  </li>
  ›
  <li>
   2025 ICD-10-CM Diagnosis Code
   <span class="identifierDetail">
    J45.31
   </span>
  </li>
 </ol>
 <div class="headingContainer">
  <i class="glyphicon glyphicon-triangle-right success">
  </i>
  <h1>
   2025 ICD-10-CM Diagnosis Code
   <span class="identifierDetail">
    J45.31
   </span>
  </h1>
  <div class="tip i26" data-textdivname="x13509" data-titledivname="z13509">
  </div>
  <img src="/images/us.webp" style="padding-left:4px;"/>
 </div>
 <h2 class="codeDescription">
  Mild persistent asthma with (acute) exacerbation
 </h2>
 <ul id="badgeList">
  <span class="label codebadge label-default">
   20

In [67]:
# Save only the body container (serialised with str()), not the full page.
write_file("test_page_j45_31.html", str(body_content))

In [99]:
# The badge list is the <ul id="badgeList"> element of the page.
ul_list = body_content.find('ul', id="badgeList")
# True when any badge <span> reads exactly "Billable/Specific Code".
tag = any(span.get_text(strip=True) == "Billable/Specific Code" for span in ul_list.find_all('span'))
tag

True

In [100]:
# Locate the <span> whose text is exactly "Approximate Synonyms".
approx_span = body_content.find('span', string="Approximate Synonyms")

synonyms = []

if approx_span:
    # The synonyms are read from the first <ul> that follows that span.
    ul_tag = approx_span.find_next('ul')
    
    if ul_tag:
        for li in ul_tag.find_all('li'):
            synonyms.append(li.get_text(strip=True))
            
synonyms

['Acute exacerbation of mild persistent allergic asthma',
 'Acute exacerbation of mild persistent asthma',
 'Acute exacerbation of mild persistent asthma with allergic rhinitis',
 'Asthma, persistent, mild with acute exacerbation',
 'Mild persistent allergic asthma with acute exacerbation',
 'Mild persistent asthma with allergic rhinitis with acute exacerbation']

In [101]:
# Locate the <a> whose text is exactly "Diagnosis Index" ...
diagnosis_anchor = body_content.find('a', string = "Diagnosis Index")
# ... and take the first <ul> after it (rebinds `ul_tag` from the synonyms cell).
ul_tag = diagnosis_anchor.find_next('ul')
ul_tag

<ul>
<li class="codeLine"> <span id="15875"><a name="15875"></a><a href="/ICD10CM/Index/A/Asthma%2c_asthmatic">Asthma, asthmatic</a> (bronchial) (catarrh) (spasmodic) <a class="identifier" href="/ICD10CM/Codes/J00-J99/J40-J4A/J45-/J45.909">J45.909</a></span><ul class="tree" id="tree"><li class="codeLine"> <span id="15894"><a name="15894"></a><a href="/ICD10CM/Index/A/Asthma%2c_asthmatic#15894">mild persistent</a> <a class="identifier" href="/ICD10CM/Codes/J00-J99/J40-J4A/J45-/J45.30">J45.30</a></span><ul class="tree" id="tree"><li class="codeLine"> <span id="15895"><a name="15895"></a><a href="/ICD10CM/Index/A/Asthma%2c_asthmatic#15895">with</a></span><ul class="tree" id="tree"><li class="codeLine"> <span id="15947"><a name="15947"></a>exacerbation <span class="identifier highlight">J45.31</span> (acute)</span></li></ul></li></ul></li><li class="codeLine"> <span id="15902"><a name="15902"></a><a href="/ICD10CM/Index/A/Asthma%2c_asthmatic#15902">persistent</a></span><ul class="tree" id=

In [102]:
# Display the Diagnosis Index list again.
ul_tag

<ul>
<li class="codeLine"> <span id="15875"><a name="15875"></a><a href="/ICD10CM/Index/A/Asthma%2c_asthmatic">Asthma, asthmatic</a> (bronchial) (catarrh) (spasmodic) <a class="identifier" href="/ICD10CM/Codes/J00-J99/J40-J4A/J45-/J45.909">J45.909</a></span><ul class="tree" id="tree"><li class="codeLine"> <span id="15894"><a name="15894"></a><a href="/ICD10CM/Index/A/Asthma%2c_asthmatic#15894">mild persistent</a> <a class="identifier" href="/ICD10CM/Codes/J00-J99/J40-J4A/J45-/J45.30">J45.30</a></span><ul class="tree" id="tree"><li class="codeLine"> <span id="15895"><a name="15895"></a><a href="/ICD10CM/Index/A/Asthma%2c_asthmatic#15895">with</a></span><ul class="tree" id="tree"><li class="codeLine"> <span id="15947"><a name="15947"></a>exacerbation <span class="identifier highlight">J45.31</span> (acute)</span></li></ul></li></ul></li><li class="codeLine"> <span id="15902"><a name="15902"></a><a href="/ICD10CM/Index/A/Asthma%2c_asthmatic#15902">persistent</a></span><ul class="tree" id=

In [None]:
def ensure_space_around_codes(s: str) -> str:
    """Ensure a space before/after ICD-10 codes if missing.

    A code is three characters (letter, digit, letter-or-digit) optionally
    followed by ``.`` and one to four letters/digits.  Codes are located in a
    single left-to-right pass, so a complete code such as ``T38.3X1A`` or
    ``S72.001A`` is always treated as one unit and never split.

    Args:
        s: Text that may contain codes glued to neighbouring words.

    Returns:
        ``s`` with a space inserted before a code that directly follows a
        non-whitespace character, a space inserted after a code that is
        directly followed by an ASCII letter or ``(``, runs of whitespace
        collapsed to one space, and surrounding whitespace stripped.
    """
    icd_code = re.compile(r'[A-Z][0-9][A-Z0-9](?:\.[A-Z0-9]{1,4})?')

    def _pad(match):
        # Decide the padding from the characters next to the *whole* match;
        # lookaround-based substitutions could backtrack into a shorter code.
        text = match.string
        start, end = match.span()
        lead = ' ' if start > 0 and not text[start - 1].isspace() else ''
        trail = ' ' if re.fullmatch(r'[A-Za-z(]', text[end:end + 1]) else ''
        return f'{lead}{match.group(0)}{trail}'

    padded = icd_code.sub(_pad, s)
    return re.sub(r'\s{2,}', ' ', padded).strip()

def clean_li(li):
    """Return the text of ``li``'s direct children, leaving out nested lists.

    Tooltip/badge elements (``div``/``span`` with class ``tip``, ``z32``,
    ``codebadge`` or ``images-note``) are removed from ``li`` first, so the
    element passed in is modified.  Each remaining direct child other than a
    ``<ul>`` contributes its stripped text; children without a ``get_text``
    method contribute ``str(child)``.  Non-empty pieces are joined with one
    space.
    """
    noise_classes = ["tip", "z32", "codebadge", "images-note"]
    for noisy in li.find_all(["div", "span"], class_=noise_classes):
        noisy.decompose()

    fragments = []
    for node in li.contents:
        # Nested lists are handled by the caller, not flattened into this text.
        if getattr(node, "name", None) == "ul":
            continue
        if hasattr(node, "get_text"):
            piece = node.get_text(strip=True)
        else:
            piece = str(node)
        if piece:
            fragments.append(piece)

    joined = " ".join(fragments)
    return joined.strip()

def parse_ul(ul, level=0):
    """Turn a nested ``<ul>`` tree into a flat list of indented text lines.

    Every direct ``<li>`` of ``ul`` yields one line, indented by two spaces
    per ``level``.  The line text comes from ``clean_li``; if the item has a
    direct child ``<a>``/``<span>`` with class ``identifier``, that code is
    moved to the end of the line.  The text is then passed through
    ``ensure_space_around_codes``.  A direct child ``<ul class="tree">`` is
    processed recursively at ``level + 1`` and its lines follow the parent's.

    Args:
        ul: List element to walk.
        level: Nesting depth of ``ul``; controls the indentation.

    Returns:
        The lines in document order.
    """
    indent = "  " * level
    lines = []
    for item in ul.find_all("li", recursive=False):
        label = clean_li(item)

        # Only identifiers that are immediate children of the item count.
        direct_ids = item.find_all(["a", "span"], class_="identifier", recursive=False)
        code = direct_ids[0].get_text(strip=True) if direct_ids else None

        if code:
            without_code = label.replace(code, "").rstrip()
            label = f"{without_code} {code}".strip()

        lines.append(indent + ensure_space_around_codes(label))

        # Descend into the item's own sub-list, if it has one.
        nested = item.find("ul", class_="tree", recursive=False)
        if nested:
            lines += parse_ul(nested, level + 1)

    return lines


In [105]:
# Flatten the Diagnosis Index tree into indented lines and show them.
parsed = parse_ul(ul_tag)
print("\n".join(parsed))

Asthma, asthmatic(bronchial) (catarrh) (spasmodic) J45.909
  mild persistent J45.30
    with
      exacerbation J45.31 (acute)
  persistent
    mild J45.30
      with
        exacerbation J45.31 (acute)


In [106]:
def flatten_icd(lines):
    """Flatten an indented index outline into one full phrase per coded line.

    Each input line is a term indented by leading whitespace, optionally
    containing an ICD code.  For every line that contains a code, the terms of
    all its ancestors (lines with smaller indentation above it) and its own
    term are joined with single spaces, followed by the code.

    Args:
        lines: Iterable of outline lines, e.g. the output of ``parse_ul``.
            Blank lines are skipped.

    Returns:
        A list of ``"<ancestor terms> <term> <code>"`` strings in input order.
        Terms are whitespace-normalised, so removing the code from the middle
        of a line leaves no double space.
    """
    ICD_CODE = re.compile(r"\b[A-Z][0-9][A-Z0-9](?:\.[A-Z0-9]{1,4})?\b")
    results = []
    stack = []  # (indent, term, code) for the current line and its ancestors

    for line in lines:
        text = line.strip()
        if not text:
            # A blank line carries no term and must not reset the ancestry.
            continue
        indent = len(line) - len(line.lstrip())

        # The first code-like token on the line is taken as its code.
        match = ICD_CODE.search(text)
        if match:
            code = match.group(0)
            # Cut out only the matched occurrence, not every copy of the string.
            remainder = text[:match.start()] + " " + text[match.end():]
        else:
            code = None
            remainder = text
        term = " ".join(remainder.split())

        # Pop siblings and deeper entries so the stack holds only ancestors.
        while stack and stack[-1][0] >= indent:
            stack.pop()

        stack.append((indent, term, code))

        if code:
            # Empty terms (code-only lines) are left out to avoid stray spaces.
            phrase = [t for _, t, _ in stack if t]
            results.append(" ".join(phrase + [code]))

    return results


In [115]:
# Print each flattened phrase followed by a comma.
for line in flatten_icd(parsed):
    print(line + ",")

Asthma, asthmatic(bronchial) (catarrh) (spasmodic) J45.909,
Asthma, asthmatic(bronchial) (catarrh) (spasmodic) mild persistent J45.30,
Asthma, asthmatic(bronchial) (catarrh) (spasmodic) mild persistent with exacerbation  (acute) J45.31,
Asthma, asthmatic(bronchial) (catarrh) (spasmodic) persistent mild J45.30,
Asthma, asthmatic(bronchial) (catarrh) (spasmodic) persistent mild with exacerbation  (acute) J45.31,


In [None]:
# Fetch another code page (N52) to test the excludes extraction below.
url = "https://www.icd10data.com/ICD10CM/Codes/N00-N99/N40-N53/N52-/N52"
# NOTE: here `response` is the Response object itself; `.text` is read on the next line.
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'lxml')
body_content = soup.find('div', class_ = "body-content")





In [113]:
def extract_excludes_list(soup):
    """Return the entries of the first "Type 1 Excludes" list in ``soup``.

    The first ``<span>`` whose string contains ``"Type 1 Excludes"`` is
    located and the next ``<ul>`` after it is read.  For each direct ``<li>``
    of that list, tooltip/badge elements (``div``/``ul``/``span`` with class
    ``tip``, ``z32``, ``codebadge`` or ``images-note``) are removed from the
    tree, and the remaining text is returned with whitespace runs collapsed
    to single spaces.

    Args:
        soup: Parsed document or element to search; it is modified by the
            removal of the noise elements.

    Returns:
        A list with one string per list item, or an empty list when the
        heading span or the following ``<ul>`` is missing.
    """
    def _is_type1_label(text):
        return text and "Type 1 Excludes" in text

    heading = soup.find("span", string=_is_type1_label)
    if not heading:
        return []

    listing = heading.find_next("ul")
    if not listing:
        return []

    noise_tags = ["div", "ul", "span"]
    noise_classes = ["tip", "z32", "codebadge", "images-note"]

    entries = []
    for item in listing.find_all("li", recursive=False):
        # Strip badges/tooltips before reading the text.
        for noisy in item.find_all(noise_tags, class_=noise_classes):
            noisy.decompose()

        words = item.get_text(" ", strip=True).split()
        entries.append(" ".join(words))

    return entries


In [114]:
# Run the extractor on the N52 page body and show the result.
excludes_list = extract_excludes_list(body_content)
print(excludes_list)

['psychogenic impotence ( F52.21 )']
